### Build Optimal Field Set for Barra Model

In [5]:
# Get fundamental data using the fields that work with your subscription
print("Fetching fundamental data (latest values only)...")

working_fields = [
    'TR.PriceClose',              # For ratios and returns
    'TR.CompanyMarketCap',        # Size factor
    'TR.BookValuePerShare',       # Value factor (B/M)
    'TR.EPSMean',                 # Value factor (E/P)
    'TR.Revenue',                 # For calculations
    'TR.RevenuePerShare',         # Value factor (S/P)
    'TR.TotalDebt',               # Leverage factor
    'TR.TotalEquity',             # Leverage factor
    'TR.ShareholdersEquity',      # Alternative equity
    'TR.TotalAssets',             # For ROA calculation
    'TR.NetProfitMargin',         # Quality factor
    'TR.OperatingMargin',         # Quality factor
    'TR.EBITDA',                  # Quality factor
]

# Request only most recent fiscal period data
fundamentals = ld.get_data(
    universe=tickers,
    fields=working_fields,
    parameters={'Period': 'FY0'}  # FY0 = Most recent fiscal year
)

# If the column is named 'Instrument'
fundamentals.set_index('Instrument', inplace=True)

print(f"\nFundamentals shape: {fundamentals.shape}")
print(f"Expected shape: ({len(tickers)}, {len(working_fields)})")

# If still getting multiple rows per ticker, take the latest
if len(fundamentals) > len(tickers):
    print(f"\n⚠ Got {len(fundamentals)} rows instead of {len(tickers)}")
    print("Extracting most recent value per ticker...")
    
    # Reset index to make ticker a column if needed
    if fundamentals.index.name is None:
        fundamentals = fundamentals.reset_index()
    
    # Get the last (most recent) entry for each ticker
    fundamentals = fundamentals.groupby('Instrument', as_index=True).last()
    
    print(f"✓ Reduced to {len(fundamentals)} rows (one per ticker)")

print(f"\nFinal shape: {fundamentals.shape}")
print("\nColumn names:")
print(fundamentals.columns.tolist())
# print("\nFirst few rows:")
# print(fundamentals.head())
# print("\nIndex (tickers):")
# print(fundamentals.index[:10])
print("\nMissing values per column:")
print(fundamentals.isna().sum())

Fetching fundamental data (latest values only)...

Fundamentals shape: (1025, 13)
Expected shape: (101, 13)

⚠ Got 1025 rows instead of 101
Extracting most recent value per ticker...
✓ Reduced to 101 rows (one per ticker)

Final shape: (101, 13)

Column names:
['Price Close', 'Company Market Cap', 'Book Value Per Share', 'Earnings Per Share - Mean', 'Revenue', 'Revenue Per Share', 'Total Debt', 'Total Equity', 'Shareholders Equity - Broker Estimate', 'Total Assets', 'Net Profit Margin, (%)', 'Operating Margin, Percent', 'EBITDA']

Missing values per column:
Price Close                               0
Company Market Cap                        0
Book Value Per Share                      0
Earnings Per Share - Mean                 0
Revenue                                  10
Revenue Per Share                         0
Total Debt                                0
Total Equity                              0
Shareholders Equity - Broker Estimate     0
Total Assets                            

### Get Historical Data for Growth Calculation

In [6]:
# Get data from 1 year ago to calculate YoY growth
print("="*70)
print("FETCHING HISTORICAL DATA FOR GROWTH CALCULATIONS")
print("="*70)



# Calculate date 1 year ago
one_year_ago = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')

print(f"\nFetching data from: {one_year_ago}")

# Fields we want historical values for
growth_fields = [
    'TR.Revenue',
    'TR.EPSMean',
    'TR.EBITDA'
]

print("\nFetching historical data...")
try:
    fundamentals_historical = ld.get_data(
        universe=tickers,
        fields=growth_fields,
        parameters={
            'SDate': one_year_ago,
            'Period': 'FY0'  # Most recent fiscal year as of that date
        }
    )
    
    print(f"✓ Historical data retrieved: {fundamentals_historical.shape}")
    print(f"Expected: ({len(tickers)}, {len(growth_fields)})")
    
    print("\nHistorical columns:")
    print(fundamentals_historical.columns.tolist())
    print("\nFirst few rows:")
    print(fundamentals_historical.head())
    
    # Check if 'Instrument' column exists and set it as index
    if 'Instrument' in fundamentals_historical.columns:
        print("\n✓ Setting 'Instrument' as index...")
        fundamentals_historical = fundamentals_historical.set_index('Instrument')
        print(f"New shape after setting index: {fundamentals_historical.shape}")
    
    # If still got multiple rows per ticker, take the first
    if len(fundamentals_historical) > len(tickers):
        print(f"\n⚠ Got {len(fundamentals_historical)} rows, extracting one per ticker...")
        fundamentals_historical = fundamentals_historical.groupby(fundamentals_historical.index).first()
        print(f"✓ Reduced to {len(fundamentals_historical)} rows")
    
    print(f"\nFinal historical shape: {fundamentals_historical.shape}")
    print("Columns after index set:")
    print(fundamentals_historical.columns.tolist())
    
    # Now rename the data columns (excluding index)
    # Map the actual column names to our desired names
    column_mapping = {}
    for col in fundamentals_historical.columns:
        if 'revenue' in col.lower():
            column_mapping[col] = 'Revenue_1Y_Ago'
        elif 'eps' in col.lower() or 'earning' in col.lower():
            column_mapping[col] = 'EPS_1Y_Ago'
        elif 'ebitda' in col.lower():
            column_mapping[col] = 'EBITDA_1Y_Ago'
    
    fundamentals_historical = fundamentals_historical.rename(columns=column_mapping)
    
    print("\n✓ Renamed columns:")
    print(fundamentals_historical.columns.tolist())
    print("\nSample historical data:")
    print(fundamentals_historical.head())
    
    has_historical = True
    print("\n✓ Historical data successfully loaded!")
    
except Exception as e:
    print(f"\n✗ Could not fetch historical data: {e}")
    print("\nFull error details:")
    import traceback
    traceback.print_exc()
    print("\nWill use momentum as growth proxy instead")
    has_historical = False
    fundamentals_historical = None

FETCHING HISTORICAL DATA FOR GROWTH CALCULATIONS

Fetching data from: 2024-12-19

Fetching historical data...
✓ Historical data retrieved: (101, 4)
Expected: (101, 3)

Historical columns:
['Instrument', 'Revenue', 'Earnings Per Share - Mean', 'EBITDA']

First few rows:
  Instrument      Revenue  Earnings Per Share - Mean         EBITDA
0      DUK.N  29060000000                    5.59042  13203000000.0
1      UNH.N         <NA>                   24.96073  36258000000.0
2      NEE.N  28114000000                    3.12484  16128000000.0
3        V.N  35926000000                    9.92009  25373000000.0
4      CRM.N  34857000000                    8.19759   9958000000.0

✓ Setting 'Instrument' as index...
New shape after setting index: (101, 3)

Final historical shape: (101, 3)
Columns after index set:
['Revenue', 'Earnings Per Share - Mean', 'EBITDA']

✓ Renamed columns:
['Revenue_1Y_Ago', 'EPS_1Y_Ago', 'EBITDA_1Y_Ago']

Sample historical data:
            Revenue_1Y_Ago  EPS_1Y_Ago  E

### Calculate Year-over-Year Growth Rates

In [7]:
# Calculate YoY growth rates
print("="*70)
print("CALCULATING YEAR-OVER-YEAR GROWTH RATES")
print("="*70)

columns_to_drop = ['Revenue_1Y_Ago', 'EPS_1Y_Ago', 'EBITDA_1Y_Ago']
fundamentals = fundamentals.drop(columns=columns_to_drop, errors='ignore')

if has_historical and fundamentals_historical is not None:
    print("\n✓ Using actual fundamental growth data\n")
    
    # Merge historical data with current data
    fundamentals = fundamentals.join(fundamentals_historical)
    
    # Calculate Revenue Growth (YoY)
    fundamentals['Revenue_Growth'] = (
        (fundamentals['Revenue'] - fundamentals['Revenue_1Y_Ago']) / 
        fundamentals['Revenue_1Y_Ago'].abs() * 100
    )
    
    # Calculate EPS Growth (YoY)
    fundamentals['EPS_Growth'] = (
        (fundamentals['Earnings Per Share - Mean'] - fundamentals['EPS_1Y_Ago']) / 
        fundamentals['EPS_1Y_Ago'].abs() * 100
    )
    
    # Calculate EBITDA Growth (YoY)
    fundamentals['EBITDA_Growth'] = (
        (fundamentals['EBITDA'] - fundamentals['EBITDA_1Y_Ago']) / 
        fundamentals['EBITDA_1Y_Ago'].abs() * 100
    )
    
    # Clean growth rates (remove infinities and extreme outliers)
    for col in ['Revenue_Growth', 'EPS_Growth', 'EBITDA_Growth']:
        fundamentals[col] = fundamentals[col].replace([np.inf, -np.inf], np.nan)
        # Cap extreme values at +/- 500% to avoid outliers
        fundamentals[col] = fundamentals[col].clip(-500, 500)
    
    print("Growth Metrics Calculated:")
    print("-" * 70)
    print(f"Revenue Growth:  {fundamentals['Revenue_Growth'].notna().sum()} valid values")
    print(f"EPS Growth:      {fundamentals['EPS_Growth'].notna().sum()} valid values")
    print(f"EBITDA Growth:   {fundamentals['EBITDA_Growth'].notna().sum()} valid values")
    
    print("\nGrowth Statistics:")
    print(fundamentals[['Revenue_Growth', 'EPS_Growth', 'EBITDA_Growth']].describe())
    
    print("\nTop 10 Revenue Growth Companies:")
    print(fundamentals.nlargest(10, 'Revenue_Growth')[['Revenue_Growth', 'EPS_Growth']])
    
    print("\n✓ Growth factors ready to build!")
    
else:
    print("\n⚠ Using momentum as growth proxy")
    fundamentals['Revenue_Growth'] = np.nan
    fundamentals['EPS_Growth'] = np.nan
    fundamentals['EBITDA_Growth'] = np.nan

print("\nFinal fundamentals shape:", fundamentals.shape)
print("\nAll available columns:")
print(fundamentals.columns.tolist())

CALCULATING YEAR-OVER-YEAR GROWTH RATES

✓ Using actual fundamental growth data

Growth Metrics Calculated:
----------------------------------------------------------------------
Revenue Growth:  91 valid values
EPS Growth:      101 valid values
EBITDA Growth:   101 valid values

Growth Statistics:
       Revenue_Growth  EPS_Growth  EBITDA_Growth
count            91.0       101.0          101.0
mean         7.182684   15.834507      10.459667
std         14.070936   68.311495      39.936102
min        -14.495977 -186.938645    -266.628396
25%          0.777579   -2.006243       0.355784
50%           5.00791    7.477437       7.927016
75%         11.225804   18.851729      20.919447
max        114.203408  447.860706     139.147651

Top 10 Revenue Growth Companies:
            Revenue_Growth  EPS_Growth
Instrument                            
NVDA.OQ         114.203408  137.445828
LLY.N            31.996741  109.085862
PLTR.OQ          28.786137   53.476217
AVGO.OQ          23.874433   3

### Process and Calculate Derived Metrics

In [8]:
# Process and calculate additional metrics
print("Processing and calculating derived metrics...")

# 1. Debt-to-Equity
fundamentals['Debt_to_Equity'] = fundamentals['Total Debt'] / fundamentals['Total Equity']
fundamentals['Debt_to_Equity'] = fundamentals['Debt_to_Equity'].replace([np.inf, -np.inf], np.nan)
print("✓ Debt-to-Equity calculated")

# 2. Debt-to-Assets
fundamentals['Debt_to_Assets'] = fundamentals['Total Debt'] / fundamentals['Total Assets']
fundamentals['Debt_to_Assets'] = fundamentals['Debt_to_Assets'].replace([np.inf, -np.inf], np.nan)
print("✓ Debt-to-Assets calculated")

# 3. Asset Turnover (Revenue / Total Assets)
fundamentals['Asset_Turnover'] = fundamentals['Revenue'] / fundamentals['Total Assets']
fundamentals['Asset_Turnover'] = fundamentals['Asset_Turnover'].replace([np.inf, -np.inf], np.nan)
print("✓ Asset Turnover calculated")

# 4. ROE (calculated using DuPont formula approximation)
# ROE = Net Profit Margin × Asset Turnover × Equity Multiplier
# Equity Multiplier = Total Assets / Total Equity
fundamentals['Equity_Multiplier'] = fundamentals['Total Assets'] / fundamentals['Total Equity']

fundamentals['ROE'] = (
    fundamentals['Net Profit Margin, (%)'] / 100 *  # Convert percentage to decimal
    fundamentals['Asset_Turnover'] * 
    fundamentals['Equity_Multiplier'] * 
    100  # Convert back to percentage
)
fundamentals['ROE'] = fundamentals['ROE'].replace([np.inf, -np.inf], np.nan)
print("✓ ROE calculated (DuPont formula)")

# 5. ROA (calculated)
# ROA = Net Profit Margin × Asset Turnover
fundamentals['ROA'] = (
    fundamentals['Net Profit Margin, (%)'] / 100 * 
    fundamentals['Asset_Turnover'] * 
    100
)
fundamentals['ROA'] = fundamentals['ROA'].replace([np.inf, -np.inf], np.nan)
print("✓ ROA calculated")

# 6. EBITDA Margin
fundamentals['EBITDA_Margin'] = (fundamentals['EBITDA'] / fundamentals['Revenue']) * 100
fundamentals['EBITDA_Margin'] = fundamentals['EBITDA_Margin'].replace([np.inf, -np.inf], np.nan)
print("✓ EBITDA Margin calculated")

print("\nFinal fundamentals shape:", fundamentals.shape)
print("\nAll columns:")
print(fundamentals.columns.tolist())
# print("\nData summary:")
# print(fundamentals.describe())
# print("\nMissing values:")
# print(fundamentals.isna().sum())

Processing and calculating derived metrics...
✓ Debt-to-Equity calculated
✓ Debt-to-Assets calculated
✓ Asset Turnover calculated
✓ ROE calculated (DuPont formula)
✓ ROA calculated
✓ EBITDA Margin calculated

Final fundamentals shape: (101, 26)

All columns:
['Price Close', 'Company Market Cap', 'Book Value Per Share', 'Earnings Per Share - Mean', 'Revenue', 'Revenue Per Share', 'Total Debt', 'Total Equity', 'Shareholders Equity - Broker Estimate', 'Total Assets', 'Net Profit Margin, (%)', 'Operating Margin, Percent', 'EBITDA', 'Revenue_1Y_Ago', 'EPS_1Y_Ago', 'EBITDA_1Y_Ago', 'Revenue_Growth', 'EPS_Growth', 'EBITDA_Growth', 'Debt_to_Equity', 'Debt_to_Assets', 'Asset_Turnover', 'Equity_Multiplier', 'ROE', 'ROA', 'EBITDA_Margin']


### Build Value Factor

In [12]:
# Value Factor = combination of B/M, E/P, S/P
print("Building Value Factor...")

# Extract needed columns (using our renamed columns)
book_value = fundamentals['Total Equity']
price = fundamentals['Price Close']
eps = fundamentals['Earnings Per Share - Mean']
revenue = fundamentals['Revenue']
market_cap = fundamentals['Company Market Cap']

# Calculate ratios
BM = book_value / market_cap  # Book-to-Market
EP =  eps /price       # Earnings-to-Price
SP = revenue /  market_cap    # Sales-to-Price

# Remove infinite and NA values
BM = BM.replace([np.inf, -np.inf], np.nan).dropna()
EP = EP.replace([np.inf, -np.inf], np.nan).dropna()
SP = SP.replace([np.inf, -np.inf], np.nan).dropna()

# Standardize each component
BM_std = standardize(winsorize(BM))
EP_std = standardize(winsorize(EP))
SP_std = standardize(winsorize(SP))

# Align all series
common_index = BM_std.index.intersection(EP_std.index).intersection(SP_std.index)

# Composite Value Factor (average)
value_factor = (BM_std[common_index] + EP_std[common_index] + SP_std[common_index]) / 3

print(f"\nValue Factor Statistics:")
print(value_factor.describe())
print(f"\nTop 10 Value stocks:")
print(value_factor.nlargest(10))

Building Value Factor...

Value Factor Statistics:
count        91.0
mean    -0.082449
std      0.817906
min     -1.138195
25%     -0.615094
50%     -0.254481
75%      0.075833
max       3.38971
dtype: Float64

Top 10 Value stocks:
Instrument
GM.N         3.38971
CVS.N       2.595521
CMCSA.OQ    2.335659
TGT.N       2.101118
VZ.N        1.646217
PFE.N       1.494348
T.N         1.332107
FDX.N       1.119505
COP.N       1.019853
XOM.N       0.900282
dtype: Float64


### Build Growth Factor (Using Real YoY Growth!)

In [13]:
# Growth Factor = combination of Revenue growth, EPS growth, and EBITDA growth
print("Building Growth Factor...")

# Check if we have actual growth data
if 'Revenue_Growth' in fundamentals.columns and fundamentals['Revenue_Growth'].notna().sum() > 10:
    print("✓ Using actual year-over-year growth data\n")
    
    revenue_growth = fundamentals['Revenue_Growth']
    eps_growth = fundamentals['EPS_Growth']
    ebitda_growth = fundamentals['EBITDA_Growth']
    
    
    print(f"Revenue Growth: {len(revenue_growth)} valid values")
    print(f"EPS Growth:     {len(eps_growth)} valid values")
    print(f"EBITDA Growth:  {len(ebitda_growth)} valid values")
    
    # Standardize each component
    revenue_growth_std = standardize(winsorize(revenue_growth))
    eps_growth_std = standardize(winsorize(eps_growth))
    ebitda_growth_std = standardize(winsorize(ebitda_growth))
    
    # Find common stocks across all growth metrics. Gives the list of stocks that exist in ALL three Series
    common_index = (
        revenue_growth_std.index
        .intersection(eps_growth_std.index)
        .intersection(ebitda_growth_std.index)
    )
    
    # Composite Growth Factor (equal-weighted average)
    growth_factor = (
        revenue_growth_std[common_index] + 
        eps_growth_std[common_index] + 
        ebitda_growth_std[common_index]
    ) / 3
    
    print(f"\n✓ Growth factor created for {len(growth_factor)} stocks")
    

if len(growth_factor) > 0:
    print(f"\nTop 10 Growth stocks:")
    top_growth = growth_factor.nlargest(10)
    print(top_growth)
    
    print(f"\nBottom 10 Growth stocks (negative growth):")
    bottom_growth = growth_factor.nsmallest(10)
    print(bottom_growth)

Building Growth Factor...
✓ Using actual year-over-year growth data

Revenue Growth: 101 valid values
EPS Growth:     101 valid values
EBITDA Growth:  101 valid values

✓ Growth factor created for 101 stocks

Top 10 Growth stocks:
Instrument
UBER.N     3.637524
NVDA.OQ    3.246364
PLTR.OQ    2.315872
LLY.N      1.984353
MRK.N      1.974068
NOW.N      1.114455
META.OQ     1.10587
AVGO.OQ    1.065193
AMZN.OQ    1.009332
GS.N       0.797271
dtype: Float64

Bottom 10 Growth stocks (negative growth):
Instrument
BA.N      -2.010424
NKE.N     -1.532452
INTC.OQ   -1.508303
DE.N      -1.273953
TXN.OQ    -1.236553
NEE.N     -1.088301
CVS.N     -0.816692
MO.N      -0.775582
CVX.N     -0.744279
LOW.N     -0.728325
dtype: Float64


### Build Momentum Factor

In [14]:
# Momentum Factor = 12-month return (skipping last month)
print("Building Momentum Factor...")

# Check we have enough data
print(f"Total days of return data: {len(returns)}")

# We need at least 252 trading days for 12-month momentum
if len(returns) < 252:
    print(f"WARNING: Only {len(returns)} days available, using all available data")
    momentum_returns = returns.iloc[:-21] if len(returns) > 21 else returns
else:
    # Calculate 12-1 month momentum
    # Returns from 252 days ago to 21 days ago (skip last month)
    momentum_returns = returns.iloc[-252:-21]

print(f"Using {len(momentum_returns)} days for momentum calculation")

# Calculate cumulative return for each stock (compound returns)
momentum_factor = (1 + momentum_returns).prod() - 1

# Remove NaN values
momentum_factor = momentum_factor.dropna()

print(f"\nValid momentum values: {len(momentum_factor)}")

# Standardize
if len(momentum_factor) > 10:  # Need minimum stocks for standardization
    momentum_factor = standardize(winsorize(momentum_factor))
    
    print(f"\nMomentum Factor Statistics:")
    print(momentum_factor.describe())
    print(f"\nTop 10 Momentum stocks:")
    print(momentum_factor.nlargest(10))
else:
    print("WARNING: Not enough stocks with valid momentum data")

Building Momentum Factor...
Total days of return data: 501
Using 231 days for momentum calculation

Valid momentum values: 101

Momentum Factor Statistics:
count       101.0
mean         -0.0
std           1.0
min     -1.713032
25%     -0.643397
50%     -0.096852
75%      0.664776
max      2.819536
dtype: Float64

Top 10 Momentum stocks:
Price Close
PLTR.OQ     2.819536
AMD.OQ      2.819536
GE.N        2.693801
CVS.N       2.254137
INTC.OQ     2.159022
UBER.N      1.613706
RTX.N       1.501762
GOOGL.OQ     1.37032
CAT.N       1.361439
GOOG.OQ     1.341076
dtype: Float64


### Build Volatility Factor

In [15]:
# Volatility Factor = historical standard deviation
print("Building Volatility Factor...")

# Calculate 252-day (1 year) rolling volatility
volatility = returns.std() * np.sqrt(252)  # Annualized

# Standardize
volatility_factor = standardize(winsorize(volatility))

print(f"\nVolatility Factor Statistics:")
print(volatility_factor.describe())
print(f"\nTop 10 most volatile stocks:")
print(volatility_factor.nlargest(10))

Building Volatility Factor...

Volatility Factor Statistics:
count       101.0
mean          0.0
std           1.0
min     -1.224684
25%     -0.701005
50%     -0.177926
75%      0.340512
max       3.62258
dtype: Float64

Top 10 most volatile stocks:
Price Close
PLTR.OQ     3.62258
TSLA.OQ     3.62258
INTC.OQ    3.076968
AMD.OQ     2.700284
AVGO.OQ    2.580774
NVDA.OQ    2.330979
ORCL.N     2.113706
UBER.N      1.10867
UNH.N      1.060172
QCOM.OQ    1.024354
dtype: Float64


###  Build Size Factor

In [16]:
# Size Factor = log of Market Cap
print("Building Size Factor...")

market_cap = fundamentals['Company Market Cap']

# Remove NaN and zero values
market_cap = market_cap[market_cap > 0].dropna()

print(f"Valid market cap values: {len(market_cap)}")

# Log transform
log_mcap = np.log(market_cap)

# Standardize
size_factor = standardize(winsorize(log_mcap))

print(f"\nSize Factor Statistics:")
print(size_factor.describe())
print(f"\nTop 10 largest stocks:")
print(size_factor.nlargest(10))

Building Size Factor...
Valid market cap values: 101

Size Factor Statistics:
count       101.0
mean         -0.0
std           1.0
min     -1.567446
25%      -0.69417
50%     -0.215887
75%       0.33371
max      2.777669
Name: Company Market Cap, dtype: Float64

Top 10 largest stocks:
Instrument
AAPL.OQ     2.777669
NVDA.OQ     2.777669
GOOG.OQ     2.684924
GOOGL.OQ    2.684924
MSFT.OQ     2.668904
AMZN.OQ     2.284398
META.OQ     1.924126
TSLA.OQ      1.88425
AVGO.OQ       1.8536
BRKb.N      1.502378
Name: Company Market Cap, dtype: Float64


###  Build Quality Factor

In [17]:
# Quality Factor = combination of available profitability metrics
print("Building Quality Factor...")

# Check what quality metrics we have
available_quality = []

if 'ROE' in fundamentals.columns and fundamentals['ROE'].notna().sum() > 10:
    roe = fundamentals['ROE'].replace([np.inf, -np.inf], np.nan).dropna()
    roe_std = standardize(winsorize(roe))
    available_quality.append(('ROE', roe_std))
    print(f"✓ ROE: {len(roe)} valid values")

if 'ROA' in fundamentals.columns and fundamentals['ROA'].notna().sum() > 10:
    roa = fundamentals['ROA'].replace([np.inf, -np.inf], np.nan).dropna()
    roa_std = standardize(winsorize(roa))
    available_quality.append(('ROA', roa_std))
    print(f"✓ ROA: {len(roa)} valid values")

if 'Net Profit Margin, (%)' in fundamentals.columns and fundamentals['Net Profit Margin, (%)'].notna().sum() > 10:
    margin = fundamentals['Net Profit Margin, (%)'].replace([np.inf, -np.inf], np.nan).dropna()
    margin_std = standardize(winsorize(margin))
    available_quality.append(('Net Profit Margin, (%)', margin_std))
    print(f"✓ Profit Margin: {len(margin)} valid values")

if len(available_quality) == 0:
    print("\n⚠ WARNING: No quality metrics available")
    print("Creating quality factor from Value + Low Volatility (quality proxy)")
    
    # Quality stocks tend to be valuable and stable
    # So we can create a proxy from Value and inverse Volatility
    quality_factor = (value_factor - volatility_factor) / 2
else:
    # Combine available quality metrics
    print(f"\nCombining {len(available_quality)} quality metrics...")
    
    # Find common stocks
    common_index = available_quality[0][1].index
    for name, factor in available_quality[1:]:
        common_index = common_index.intersection(factor.index)
    
    # Average the standardized metrics
    quality_sum = sum(factor[common_index] for name, factor in available_quality)
    quality_factor = quality_sum / len(available_quality)

print(f"\nQuality Factor Statistics:")
print(quality_factor.describe())
if len(quality_factor) > 0:
    print(f"\nTop 10 Quality stocks:")
    print(quality_factor.nlargest(10))

Building Quality Factor...
✓ ROE: 91 valid values
✓ ROA: 91 valid values
✓ Profit Margin: 101 valid values

Combining 3 quality metrics...

Quality Factor Statistics:
count        91.0
mean     0.008003
std      0.710003
min     -2.091698
25%     -0.363894
50%     -0.150102
75%      0.352084
max      2.237123
dtype: Float64

Top 10 Quality stocks:
Instrument
NVDA.OQ     2.237123
MA.N        2.231716
CL.N        1.929493
AAPL.OQ     1.702994
V.N         1.424845
META.OQ     1.169177
ADBE.OQ     1.116045
GOOG.OQ     0.916297
GOOGL.OQ    0.916297
HD.N        0.885767
dtype: Float64


###  Build Leverage Factor

In [18]:
# Leverage Factor = Debt-to-Equity ratio
print("Building Leverage Factor...")

debt_to_equity = fundamentals['Debt_to_Equity']

# Remove NaN and infinite values
debt_to_equity = debt_to_equity.replace([np.inf, -np.inf], np.nan).dropna()

print(f"Valid leverage values: {len(debt_to_equity)}")

# Standardize
leverage_factor = standardize(winsorize(debt_to_equity))

print(f"\nLeverage Factor Statistics:")
print(leverage_factor.describe())
print(f"\nTop 10 most leveraged stocks:")
print(leverage_factor.nlargest(10))



Building Leverage Factor...
Valid leverage values: 101

Leverage Factor Statistics:
count       101.0
mean         -0.0
std           1.0
min     -3.027589
25%     -0.286055
50%     -0.181865
75%      0.094229
max      4.505346
Name: Debt_to_Equity, dtype: Float64

Top 10 most leveraged stocks:
Instrument
ABBV.N     4.505346
CL.N       4.505346
AMT.N      2.244993
AMGN.OQ    2.108865
GS.N       1.825856
SPG.N      1.633224
HD.N       1.583162
MS.N       1.379552
AXP.N       1.16363
SCHW.N     1.162353
Name: Debt_to_Equity, dtype: Float64


### Build Sectors Factor

In [19]:
# Industry Factors = dummy variables for sectors
print("Building Industry Factors...")

# Get sector for each stock
sectors = constituents.set_index('ticker')['sector']

# Create dummy variables
industry_factors = pd.get_dummies(sectors, prefix='Sector')

print(f"\nNumber of sectors: {industry_factors.shape[1]}")
print("\nSectors:")
print(industry_factors.columns.tolist())
print("\nSector distribution:")
print(industry_factors.sum().sort_values(ascending=False))

Building Industry Factors...

Number of sectors: 11

Sectors:
['Sector_Communication Services', 'Sector_Consumer Discretionary', 'Sector_Consumer Staples', 'Sector_Energy', 'Sector_Financials', 'Sector_Health Care', 'Sector_Industrials', 'Sector_Information Technology', 'Sector_Materials', 'Sector_Real Estate', 'Sector_Utilities']

Sector distribution:
Sector_Financials                18
Sector_Information Technology    17
Sector_Health Care               15
Sector_Industrials               14
Sector_Consumer Staples          10
Sector_Communication Services     9
Sector_Consumer Discretionary     9
Sector_Energy                     3
Sector_Utilities                  3
Sector_Real Estate                2
Sector_Materials                  1
dtype: Int64


### Combine All Factors

In [20]:
# Combine all factors into one DataFrame
print("Combining all factors...")

# Create a list of all factor series
all_factors = {
    'Value': value_factor,
    'Growth': growth_factor,
    'Momentum': momentum_factor,
    'Volatility': volatility_factor,
    'Size': size_factor,
    'Quality': quality_factor,
    'Leverage': leverage_factor
}

# Find common stocks across all factors
common_stocks = None
for name, factor in all_factors.items():
    print(f"{name}: {len(factor)} stocks")
    if common_stocks is None:
        common_stocks = set(factor.index)
    else:
        common_stocks = common_stocks.intersection(set(factor.index))

common_stocks = list(common_stocks)
print(f"\nCommon stocks across all factors: {len(common_stocks)}")

# Create factor exposures matrix with aligned stocks
factor_exposures = pd.DataFrame(index=common_stocks)

for name, factor in all_factors.items():
    factor_exposures[name] = factor.loc[common_stocks]

# Add industry factors (only for common stocks)
industry_factors_aligned = industry_factors.loc[
    industry_factors.index.intersection(common_stocks)
]

# Merge with industry factors
factor_exposures = pd.concat([factor_exposures, industry_factors_aligned], axis=1)

# Fill any remaining NaN with 0
factor_exposures = factor_exposures.fillna(0)

print(f"\nFinal Factor Exposures Shape: {factor_exposures.shape}")
print(f"Number of stocks: {factor_exposures.shape[0]}")
print(f"Number of factors: {factor_exposures.shape[1]}")
print("\nStyle factors:")
print(factor_exposures[['Value', 'Growth', 'Momentum', 'Volatility', 'Size', 'Quality', 'Leverage']].head())
print("\nFirst few industry factors:")
print(factor_exposures.filter(like='Sector').head())

Combining all factors...
Value: 91 stocks
Growth: 101 stocks
Momentum: 101 stocks
Volatility: 101 stocks
Size: 101 stocks
Quality: 91 stocks
Leverage: 101 stocks

Common stocks across all factors: 91

Final Factor Exposures Shape: (91, 18)
Number of stocks: 91
Number of factors: 18

Style factors:
            Value    Growth  Momentum  Volatility      Size   Quality  \
UNP.N    -0.30102 -0.341704 -0.566237   -0.799322 -0.500403  0.370833   
CL.N    -0.366956 -0.193732 -0.933902   -1.043235 -1.277053  1.929493   
QCOM.OQ  0.012607  0.333696 -0.187628    1.024354 -0.215887 -0.052061   
ORCL.N  -0.636942 -0.003336  0.755684    2.113706  0.779473  0.188037   
PG.N    -0.209443 -0.418885  -0.91388   -1.170543  0.370815  0.189767   

         Leverage  
UNP.N    0.094229  
CL.N     4.505346  
QCOM.OQ -0.181865  
ORCL.N   0.773234  
PG.N    -0.190273  

First few industry factors:
         Sector_Communication Services  Sector_Consumer Discretionary  \
UNP.N                            False  