# ROIC Preprocessing Ver.2

This notebook executes the data preprocessing pipeline for ROIC analysis using the `roic_analysis` package.

**Steps:**

1. Load Data (FactSet, Bloomberg, Index Constituents)
2. Feature Engineering (WACC, Economic Profit, Incremental ROIC (iROIC), DuPont Decomposition, ROIC Ranks, and optional R&D Capitalization)
3. Data Validation
4. Save Processed Data


In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path

from dotenv import load_dotenv

# Locate the project root so the `src` package can be imported: when the
# kernel's working directory is named "notebook" the root is its parent,
# otherwise the working directory itself is taken to be the root.
current_dir = Path.cwd()
root_dir = current_dir.parent if current_dir.name == "notebook" else current_dir

# Re-running this cell must not keep growing sys.path, so the root is only
# added when it is not already present.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

from src.roic_analysis.data_loader import ROICDataLoader
from src.roic_analysis.feature_engineering import FactorEngineer
from src.validate_data import validate_data

# Read the project's .env file (located at the project root) so that the
# *_DIR settings looked up through os.environ in the path-configuration cell
# can be supplied from that file.
load_dotenv(root_dir / ".env")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

## 1. Load Data


In [None]:
# Path configuration. Each directory is read from the environment variable of
# the same name when it is set; otherwise a default under root_dir is used.
UNIVERSE_CODE = "MSXJPN_AD"

FACTSET_FINANCIALS_DIR = Path(
    os.getenv("FACTSET_FINANCIALS_DIR", root_dir / "data/Factset/Financials")
)
FACTSET_INDEX_CONSTITUENTS_DIR = Path(
    os.getenv("FACTSET_INDEX_CONSTITUENTS_DIR", root_dir / "data/Factset/Index")
)
BLOOMBERG_DATA_DIR = Path(
    os.getenv("BLOOMBERG_DATA_DIR", root_dir / "data/Bloomberg")
)

# Database files handed to the loader; the financials database sits in a
# sub-directory named after the universe code.
financials_db_path = FACTSET_FINANCIALS_DIR.joinpath(
    UNIVERSE_CODE, "Financials_and_Price.db"
)
index_constituents_db_path = FACTSET_INDEX_CONSTITUENTS_DIR.joinpath(
    "Index_Constituents.db"
)
bloomberg_db_path = BLOOMBERG_DATA_DIR.joinpath("Index_Price_and_Returns.db")

# Echo the resolved locations so a wrong override is visible before loading.
print(
    f"Financials DB: {financials_db_path}",
    f"Index DB: {index_constituents_db_path}",
    f"Bloomberg DB: {bloomberg_db_path}",
    sep="\n",
)

Financials DB: C:\Users\Yuki Hata\Desktop\papers\Quants\data\FactSet\Financials\MSXJPN_AD\Financials_and_Price.db
Index DB: C:\Users\Yuki Hata\Desktop\papers\Quants\data\FactSet\Index_Constituents\Index_Constituents.db
Bloomberg DB: C:\Users\Yuki Hata\Desktop\papers\Quants\data\Bloomberg\data\Index_Price_and_Returns.db


In [None]:
# Collect the loader's keyword arguments in one place: the three database
# locations plus the universe code configured earlier in the notebook.
loader_kwargs = {
    "financials_db_path": financials_db_path,
    "index_constituents_db_path": index_constituents_db_path,
    "bloomberg_db_path": bloomberg_db_path,
    "universe_code": UNIVERSE_CODE,
}
loader = ROICDataLoader(**loader_kwargs)

# Run the loader's load-and-preprocess step, then show the resulting shape and
# the first rows as a quick sanity check.
df = loader.load_and_preprocess()
print(f"Loaded Data Shape: {df.shape}")
display(df.head())

Loading index constituents...
Loading financials...
Merging data...
Handling missing values...
Loaded Data Shape: (400864, 20)


Unnamed: 0,date,GICS Sector,GICS Industry,GICS Industry Group,Weight (%),Mkt Value,Forward_Return_12M_annlzd,Forward_Return_1M_annlzd,Forward_Return_3M_annlzd,Forward_Return_3Y_annlzd,Forward_Return_5Y_annlzd,Forward_Return_6M_annlzd,ROIC_label_Past5Y,Return_12M_annlzd,Return_1M_annlzd,Return_3M_annlzd,Return_3Y_annlzd,Return_5Y_annlzd,Return_6M_annlzd,P_SYMBOL
0,2000-01-31,Information Technology,Communications Equipment,Technology Hardware & Equipment,0.233095,4210900000000.0,,,,,,,,,,,,,,0HSW-GB
1,2000-02-29,Information Technology,Communications Equipment,Technology Hardware & Equipment,0.196482,3665690000000.0,,,,,,,,,,,,,,0HSW-GB
2,2000-03-31,Information Technology,Communications Equipment,Technology Hardware & Equipment,0.175928,3281114000000.0,,,,,,,,,,,,,,0HSW-GB
3,2000-04-30,Information Technology,Communications Equipment,Technology Hardware & Equipment,0.191451,3628872000000.0,,,,,,,,,,,,,,0HSW-GB
4,2000-05-31,Information Technology,Communications Equipment,Technology Hardware & Equipment,0.187935,3468668000000.0,,,,,,,,,,,,,,0HSW-GB


## 2. Feature Engineering


In [None]:
engineer = FactorEngineer()

# Every stage prints a progress message and then threads `df` through the
# engineer's transform(s); each transform receives the frame and returns the
# frame used by the next stage.

# Stage 1: WACC, then Economic Profit.
print("Calculating WACC and Economic Profit...")
df = df.pipe(engineer.calculate_wacc).pipe(engineer.calculate_economic_profit)

# Stage 2: Incremental ROIC.
print("Calculating Incremental ROIC...")
df = df.pipe(engineer.calculate_incremental_roic)

# Stage 3: DuPont decomposition.
print("Performing DuPont Decomposition...")
df = df.pipe(engineer.decompose_dupont)

# Stage 4: ROIC rank columns.
print("Adding ROIC Ranks...")
df = df.pipe(engineer.add_roic_rank_cols)

# Stage 5 (optional): intangible capitalization runs only when the data
# carries an "RD_Expense" column.
if "RD_Expense" in df.columns:
    print("Capitalizing R&D...")
    df = df.pipe(engineer.capitalize_intangibles)

display(df.head())

## 3. Validation


In [None]:
# Run the project's validation on the engineered frame and report the outcome;
# a failed validation is only reported here and does not stop the notebook.
is_valid = validate_data(df)
if is_valid:
    print("Data validation passed.")
else:
    print("WARNING: Data validation found issues. Check logs.")

## 4. Save Data


In [None]:
# Persist the engineered dataset as a Parquet file under <root_dir>/data.
output_path = root_dir / "data" / "MSCI_KOKUSAI_enhanced_data.parquet"

# The input directories can all be redirected through environment variables,
# so <root_dir>/data is not guaranteed to exist; create it before writing
# instead of letting the write fail on a missing directory.
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_parquet(output_path)
print(f"Saved processed data to: {output_path}")