In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import Timestamp

# Reading the data

In [3]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

In [None]:
# df_info = pd.read_csv('data_files/SignalDoc.csv')
# df_info

In [None]:
# df = pd.read_csv('data_files/crspm_and_predictors.csv')
# df

In [2]:
rf_rates = pd.read_csv('TB3MS.csv')
rf_rates

Unnamed: 0,observation_date,TB3MS
0,1934-01-01,0.72
1,1934-02-01,0.62
2,1934-03-01,0.24
3,1934-04-01,0.15
4,1934-05-01,0.16
...,...,...
1089,2024-10-01,4.51
1090,2024-11-01,4.42
1091,2024-12-01,4.27
1092,2025-01-01,4.21


In [3]:
# it is giving diff rows every time,
# every time rows are increasing, maybe due to some limits of colab
NUM_ROWS = 500000
df = pd.read_csv('crspm_and_predictors.csv', nrows=NUM_ROWS)
#df = pd.read_csv('sample.csv', low_memory=False, engine='c')
df.shape # (500000, 119)

(500000, 119)

In [9]:
rf_rates['observation_date'] = pd.to_datetime(rf_rates['observation_date'], format='%Y-%m-%d')
df['yyyymm'] = pd.to_datetime(df['yyyymm'], format='%Y%m') #df.loc[:, 'yyyymm']

# Sampling /skiping for now

In [10]:
# need to be changed according to data
filtered_tb3ms = rf_rates[(rf_rates['observation_date'] >= '1990-01-01') &
                          (rf_rates['observation_date'] <= '2000-12-01')]

filtered_tb3ms.reset_index(drop=True, inplace=True)
filtered_tb3ms

Unnamed: 0,observation_date,TB3MS
0,1990-01-01,7.64
1,1990-02-01,7.74
2,1990-03-01,7.90
3,1990-04-01,7.77
4,1990-05-01,7.74
...,...,...
127,2000-08-01,6.09
128,2000-09-01,6.00
129,2000-10-01,6.11
130,2000-11-01,6.17


In [6]:
df = df.drop(columns=['year']) # added in analyze_dataset()
df.reset_index(drop=True, inplace=True)

KeyError: "['year'] not found in axis"

In [None]:
# Function to analyze date ranges and predictor availability
def analyze_dataset(df):
    min_date = df['yyyymm'].min()
    max_date = df['yyyymm'].max()
    print(f"Date range: {min_date.strftime('%Y-%m')} to {max_date.strftime('%Y-%m')}")
    print(f"Spans {(max_date.year - min_date.year) * 12 + (max_date.month - min_date.month)} months")

    df['year'] = df['yyyymm'].dt.year

    # Count stocks per year
    stocks_per_year = df.groupby('year')['permno'].nunique()
    # print(f"\nNumber of unique stocks by year:")
    # print(stocks_per_year)

    # Get predictor columns
    predictor_columns = [col for col in df.columns if col not in
                        ['permno', 'yyyymm', 'prc', 'ret', 'Price', 'year']]

    print(f"\nTotal predictors: {len(predictor_columns)}")

    # Calculate predictor availability overall
    overall_availability = df[predictor_columns].notna().mean().sort_values(ascending=False) * 100

    # Calculate predictor availability by year
    availability_by_year = {}
    for year in sorted(df['year'].unique()):
        year_data = df[df['year'] == year]
        availability = year_data[predictor_columns].notna().mean() * 100
        availability_by_year[year] = availability

    # Find years with good predictor coverage
    good_threshold = 70  # Consider a predictor "good" if it has at least 70% non-null values
    good_predictors_by_year = {}

    for year, availability in availability_by_year.items():
        good_predictors = availability[availability >= good_threshold].index.tolist()
        good_predictors_by_year[year] = good_predictors

    # Find optimal consecutive period
    min_period = 10  # Minimum consecutive years to consider
    years = sorted(good_predictors_by_year.keys())

    best_start = None
    best_end = None
    max_predictors = 0

    for i in range(len(years) - min_period + 1):
        start_year = years[i]

        # Try windows of different lengths
        for j in range(i + min_period - 1, len(years)):
            end_year = years[j]
            window_years = years[i:j+1]

            # Find predictors available across the entire window
            common_predictors = set(good_predictors_by_year[window_years[0]])
            for year in window_years[1:]:
                common_predictors = common_predictors.intersection(set(good_predictors_by_year[year]))

            if len(common_predictors) > max_predictors:
                max_predictors = len(common_predictors)
                best_start = start_year
                best_end = end_year

    print(f"\nOptimal window: {best_start} to {best_end} ({best_end-best_start+1} years)")
    print(f"Number of predictors with good availability: {max_predictors}")

    # Get the list of common predictors
    if best_start is not None:
        common_predictors = set(good_predictors_by_year[best_start])
        for year in range(best_start+1, best_end+1):
            common_predictors = common_predictors.intersection(set(good_predictors_by_year[year]))

        # Return important information
        return {
            'date_range': (min_date, max_date),
            'stocks_per_year': stocks_per_year,
            'optimal_window': (best_start, best_end),
            'common_predictors': list(common_predictors),
            'overall_availability': overall_availability
        }

    return None


In [None]:
# result = analyze_dataset(df)
# res: Optimal window: 1988 to 1997 (10 years)


# Cleaning

Variables: The dataset includes a unique firm identifier (permno), date,
 stock price (prc), stock return in percentage units adjusted for delisting (ret), and a variable named Signed Price (Price). In addition, there are 114 predictor variables (signals).

In [11]:
# Merge the DataFrames based on the date columns
merged_df = pd.merge(df, filtered_tb3ms, left_on='yyyymm',
                        right_on='observation_date', how='left')

In [12]:
merged_df.drop(columns=['observation_date'], inplace=True)
merged_df.rename(columns={'TB3MS': 'RiskFree', 'yyyymm': 'date'}, inplace=True)

In [None]:
merged_df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,permno,date,prc,ret,Price,Size,STreversal,MaxRet,High52,RealizedVol,...,MomOffSeason06YrPlus,grcapx,EntMult,Investment,PctTotAcc,EarnSupBig,EarningsSurprise,BetaTailRisk,CBOperProf,RiskFree
count,500000.0,500000,469055.0,467677.0,469055.0,469055.0,481619.0,466475.0,462970.0,462383.0,...,247601.0,257715.0,263002.0,240087.0,216617.0,259804.0,243029.0,236936.0,242469.0,119127.0
mean,43610.59863,1994-05-27 18:50:57.868800128,17.544655,1.078451,-2.356802,inf,-1.047232,-0.071932,0.754237,-0.031089,...,-0.013254,-2.747557,-20.946259,-1.007701,-1.986602,43434710000.0,1100947000000.0,0.627944,0.095031,4.993146
min,10659.0,1960-01-01 00:00:00,-401.0,-100.0,-8.412943,-19.86091,-873.2674,-12.777778,0.002568,-2.873565,...,-1.375983,-20612.5,-79777.492,-153.85258,-2919.4285,-67936550000000.0,-1019048000000000.0,-3.190127,-4.634063,2.86
25%,18203.0,1982-12-01 00:00:00,1.1,-6.1409,-3.243568,-12.86593,-6.29015,-0.086004,0.599017,-0.038464,...,-0.023317,-0.896333,-12.62052,-1.210507,-1.202857,-0.4163806,-0.6777885,0.302062,0.033767,4.33
50%,37218.0,1992-11-01 00:00:00,9.9,0.0,-2.562639,-11.14692,0.0,-0.048334,0.795526,-0.023371,...,-0.012737,-0.121975,-7.775087,-0.937381,-0.535535,-0.08903723,0.0,0.556818,0.116796,5.0
75%,64988.0,2005-07-01 00:00:00,24.25,6.5893,-1.609438,-9.663198,5.8824,-0.027027,0.931296,-0.014213,...,-0.002845,0.379806,-4.941983,-0.670604,0.611309,0.2306155,0.6451272,0.869021,0.188293,5.65
max,86223.0,2024-12-01 00:00:00,4505.0,873.2674,4.158563,inf,100.0,0.534451,56.042801,0.0,...,3.649201,775.0,485.29169,41.532864,1433.0,358044000000000.0,8.86572e+16,5.201625,2.611111,7.9
std,27193.638807,,78.897353,17.310151,1.296351,,17.05872,0.102058,0.296595,0.031083,...,0.02664,154.024971,393.375653,0.773589,41.922132,4233875000000.0,311528400000000.0,0.497144,0.197483,1.247474


In [16]:
merged_df.shape

(500000, 120)

In [19]:
merged_df.isna().sum()

permno                   0
date                     0
prc                  30945
ret                  32323
Price                30945
                     ...  
EarnSupBig          240196
EarningsSurprise    256971
BetaTailRisk        263064
CBOperProf          257531
RiskFree            380873
Length: 120, dtype: int64

In [32]:
merged_df = merged_df.replace({-np.inf: np.nan, np.inf: np.nan})
non_nan_col = merged_df[['permno', 'date']].copy()
numeric_df = merged_df.drop(columns=['permno', 'date'])
print(numeric_df.shape)
features_df = merged_df.drop(columns=['permno', 'date', 'prc', 'Price', 'ret', 'RiskFree'])
print(features_df.shape)

mask_miss_features = np.isnan(features_df)
mask_missing = np.isnan(numeric_df)
numeric_df = numeric_df.fillna(0)
merged_df = pd.concat([non_nan_col, numeric_df], axis=1)
print(merged_df.shape)


(500000, 118)
(500000, 114)
(500000, 120)


In [30]:
merged_df.isna().sum()

permno              0
date                0
prc                 0
ret                 0
Price               0
                   ..
EarnSupBig          0
EarningsSurprise    0
BetaTailRisk        0
CBOperProf          0
RiskFree            0
Length: 120, dtype: int64

In [31]:
inf_counts = (df == np.inf).sum() + (df == -np.inf).sum()
print(inf_counts)


permno              0
yyyymm              0
prc                 0
ret                 0
Price               0
                   ..
PctTotAcc           0
EarnSupBig          0
EarningsSurprise    0
BetaTailRisk        0
CBOperProf          0
Length: 119, dtype: int64


# CAPM

CAPM:  E[return] = risk_free + betta*(E[market_ret] - risk_free)


E[market_ret] - risk_free is market risk premium

In [36]:
df_metrics = merged_df[['permno',	'date',	'prc',	'ret',	'Price','RiskFree']].copy()
df_metrics['ExcessReturn'] = df_metrics['ret'] - df_metrics['RiskFree']
df_metrics['MarketCap'] = df_metrics['prc'].abs()  # price as a proxy

market_returns = df_metrics.groupby('date').apply(
    lambda x: np.sum(x['ExcessReturn'] * x['MarketCap']) / np.sum(x['MarketCap'])
    if np.sum(x['MarketCap']) > 0 else np.nan
).to_dict()

# weighted_sum = df_metrics.groupby('date').apply(lambda x: (x['ExcessReturn'] * x['MarketCap']).sum())
# total_market_cap = df_metrics.groupby('date')['MarketCap'].sum()
# market_returns = (weighted_sum / total_market_cap).to_dict()

df_metrics['MarketReturn_VW'] = df_metrics['date'].map(market_returns)
print(df_metrics.shape)
print(df_metrics.columns)

# Sort by date for rolling window calculations
df_metrics = df_metrics.sort_values(['permno', 'date'])



(500000, 9)
Index(['permno', 'date', 'prc', 'ret', 'Price', 'RiskFree', 'ExcessReturn',
       'MarketCap', 'MarketReturn_VW'],
      dtype='object')


  market_returns = df_metrics.groupby('date').apply(


Unnamed: 0,permno,date,prc,ret,Price,RiskFree,ExcessReturn,MarketCap,MarketReturn_VW
0,10659,1991-02-01,-0.17188,-15.3846,1.760959,5.94,-21.3246,0.17188,5.475416
1,10659,1991-03-01,-0.10938,-36.3636,2.212927,5.91,-42.2736,0.10938,-0.094520
2,10659,1991-04-01,-0.21875,100.0000,1.519826,5.65,94.3500,0.21875,-2.971900
3,10659,1991-05-01,-0.21875,0.0000,1.519826,5.46,-5.4600,0.21875,-0.699092
4,10659,1991-06-01,-0.20313,-7.1429,1.593909,5.57,-12.7129,0.20313,-8.571086
...,...,...,...,...,...,...,...,...,...
499995,86223,2022-06-01,24.37000,-11.1233,-3.193353,0.00,-11.1233,24.37000,-5.366093
499996,86223,2022-07-01,26.73000,11.6331,-3.285787,0.00,11.6331,26.73000,8.384041
499997,86223,2022-08-01,26.32000,-1.5339,-3.270329,0.00,-1.5339,26.32000,-1.544232
499998,86223,2022-09-01,23.78000,-9.6505,-3.168845,0.00,-9.6505,23.78000,-8.184954


In [66]:
print(f'price < 0 {df_metrics['Price'].lt(0).sum()}')
print(f'prc < 0 {df_metrics['prc'].lt(0).sum()}')
print(f'ret < 0 {df_metrics['ret'].lt(0).sum()}')

filtered_df = df_metrics[df_metrics.groupby('permno')['permno'].transform('count') > 3]

df_metrics.groupby(by=['permno'], sort=True).count().sort_values('date', ascending=False)


group_counts = df_metrics.groupby('permno').size()
print(group_counts)
valid_permno = group_counts[group_counts > 10]
print(valid_permno)





price < 0 443766
prc < 0 104469
ret < 0 216553
permno
10659    304
10660     83
10661    460
10662     69
10664    279
        ... 
86218    298
86219     23
86221    222
86222    306
86223    298
Length: 3624, dtype: int64
permno
10659    304
10660     83
10661    460
10662     69
10664    279
        ... 
86218    298
86219     23
86221    222
86222    306
86223    298
Length: 3534, dtype: int64


In [67]:
df_metrics.reset_index(drop=True, inplace=True)
df_metrics

Unnamed: 0,permno,date,prc,ret,Price,RiskFree,ExcessReturn,MarketCap,MarketReturn_VW
0,10659,1991-02-01,-0.17188,-15.3846,1.760959,5.94,-21.3246,0.17188,5.475416
1,10659,1991-03-01,-0.10938,-36.3636,2.212927,5.91,-42.2736,0.10938,-0.094520
2,10659,1991-04-01,-0.21875,100.0000,1.519826,5.65,94.3500,0.21875,-2.971900
3,10659,1991-05-01,-0.21875,0.0000,1.519826,5.46,-5.4600,0.21875,-0.699092
4,10659,1991-06-01,-0.20313,-7.1429,1.593909,5.57,-12.7129,0.20313,-8.571086
...,...,...,...,...,...,...,...,...,...
499995,86223,2022-06-01,24.37000,-11.1233,-3.193353,0.00,-11.1233,24.37000,-5.366093
499996,86223,2022-07-01,26.73000,11.6331,-3.285787,0.00,11.6331,26.73000,8.384041
499997,86223,2022-08-01,26.32000,-1.5339,-3.270329,0.00,-1.5339,26.32000,-1.544232
499998,86223,2022-09-01,23.78000,-9.6505,-3.168845,0.00,-9.6505,23.78000,-8.184954


In [68]:
import statsmodels.api as sm
from tqdm import tqdm

# rolling betas with a 36-month window
window_size = 36 # for estimation of beta based on previous 3 years
predictions = {}
batch_size = 5000

# # dictionary of date-market_return for faster lookups
# market_returns_dict = df_metrics[['date', 'MarketReturn_VW']].drop_duplicates().set_index('date')['MarketReturn_VW'].to_dict()

# TODO optimize taking too long
def process_stock_group(stock_group):
    for permno, group in stock_group.groupby('permno'):
        if len(group) <= window_size:
            continue
        for i in range(window_size, len(group)):
            current_date = group.iloc[i]['date']
            historical_data = group.iloc[i-window_size:i]

            if len(historical_data) < window_size / 2 or historical_data[['ExcessReturn', 'MarketReturn_VW']].isnull().values.any():
                continue

            # Estimating beta using OLS regression
            X = historical_data[['MarketReturn_VW']]
            X = sm.add_constant(X)
            y = historical_data['ExcessReturn']

            try:
                model = sm.OLS(y, X).fit()
                alpha = model.params.iloc[0]
                beta = model.params.iloc[1]

                # prediction for curr date
                market_premium = group.iloc[i]['MarketReturn_VW']
                prediction = alpha + beta * market_premium

                predictions[(permno, current_date)] = prediction
            except:
                continue


In [None]:
unique_stocks = df_metrics['permno'].unique()
print(unique_stocks)
num_batches = (len(unique_stocks) + batch_size - 1) // batch_size

for batch_num in tqdm(range(num_batches), total=num_batches, desc="Processing Batches"):
    print(batch_num)
    start_idx = batch_num * batch_size
    end_idx = min((batch_num + 1) * batch_size, len(unique_stocks))
    batch_stocks = unique_stocks[start_idx:end_idx]
    batch_data = df_metrics[df_metrics['permno'].isin(batch_stocks)]
    process_stock_group(batch_data)

predictions_df = pd.DataFrame([
    {'permno': key[0], 'date': key[1], 'CAPM_Predicted': value}
    for key, value in predictions.items()
])

predictions_df['permno'] = predictions_df['permno'].astype(df_metrics['permno'].dtype)
predictions_df['date'] = predictions_df['date'].astype(df_metrics['date'].dtype)

df_metrics_with_predictions = df_metrics.merge(
    predictions_df,
    on=['permno', 'date'],
    how='left'
)


Processing Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0


In [None]:
prediction_count = df_metrics_with_predictions['CAPM_Predicted'].notna().sum()
print(f"Successfully merged {prediction_count} predictions with df_metrics")

# Calculate performance metrics for the CAPM model
valid_predictions = df_metrics_with_predictions.dropna(subset=['CAPM_Predicted', 'ExcessReturn'])

# Calculate R²
# check if this is correct
sse = np.sum((valid_predictions['ExcessReturn'] - valid_predictions['CAPM_Predicted'])**2)
sst = np.sum(valid_predictions['ExcessReturn']**2)
r2_oos = 1 - (sse / sst)

print(f"CAPM Benchmark Out-of-Sample R²: {r2_oos:.3f}")
print(f"Number of observations used for evaluation: {len(valid_predictions)}")


Successfully merged 451085 predictions with df_metrics
CAPM Benchmark Out-of-Sample R²: 0.032
Number of observations used for evaluation: 451085


# End of CAPM


In [None]:
merged_df['ExcessReturn'] = merged_df['ret'] - merged_df['RiskFree']

merged_df['date'] = pd.to_datetime(merged_df['date'].astype(str), format='%Y-%m-%d')


In [1]:
predictor_vars = [c for c in merged_df.columns if c not in ['permno',	'date',	'prc',	'ret',	'Price', 'RiskFree', 'ExcessReturn']]
len(predictor_vars)

NameError: name 'merged_df' is not defined

# Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1))

# our data is 1990-2000
train_end = '1995-12-31'
valid_end = '1997-12-31'

# Create masks for each period
train_mask = merged_df['date'] <= train_end
valid_mask = (merged_df['date'] > train_end) & (merged_df['date'] <= valid_end)
test_mask = merged_df['date'] > valid_end


In [None]:
# Extract features and target for each period
X_train = merged_df.loc[train_mask, predictor_vars]
X_valid = merged_df.loc[valid_mask, predictor_vars]
X_test = merged_df.loc[test_mask, predictor_vars]

y_train = merged_df.loc[train_mask, 'ExcessReturn']
y_valid = merged_df.loc[valid_mask, 'ExcessReturn']
y_test = merged_df.loc[test_mask, 'ExcessReturn']

In [None]:
X_train_scaled = scaler.fit_transform(X_train)

# Apply same transformation to validation and test
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Clip validation and test to [-1, 1] to handle outliers
X_valid_scaled = np.clip(X_valid_scaled, -1, 1)
X_test_scaled = np.clip(X_test_scaled, -1, 1)

# Convert back to DataFrames
X_train_processed = pd.DataFrame(X_train_scaled, columns=predictor_vars, index=X_train.index)
X_valid_processed = pd.DataFrame(X_valid_scaled, columns=predictor_vars, index=X_valid.index)
X_test_processed = pd.DataFrame(X_test_scaled, columns=predictor_vars, index=X_test.index)

# Step 5: Replace NaNs with 0 in the scaled space
# First, identify which positions were originally NaN
train_nan_mask = X_train.isna()
valid_nan_mask = X_valid.isna()
test_nan_mask = X_test.isna()

# Then set those positions to 0
for col in predictor_vars:
    X_train_processed.loc[train_nan_mask[col], col] = 0
    X_valid_processed.loc[valid_nan_mask[col], col] = 0
    X_test_processed.loc[test_nan_mask[col], col] = 0

# Check results
for dataset, name in [(X_train_processed, "Training"),
                      (X_valid_processed, "Validation"),
                      (X_test_processed, "Testing")]:
    print(f"\n{name} dataset:")
    print(f"Min values: {dataset.min().min():.4f}")
    print(f"Max values: {dataset.max().max():.4f}")
    print(f"NaN count: {dataset.isna().sum().sum()}")


Training dataset:
Min values: -1.0000
Max values: 1.0000
NaN count: 0

Validation dataset:
Min values: -1.0000
Max values: 1.0000
NaN count: 0

Testing dataset:
Min values: -1.0000
Max values: 1.0000
NaN count: 0


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import os

# Check for GPU availability
try:
    import torch
    gpu_available = torch.cuda.is_available()
    if gpu_available:
        gpu_name = torch.cuda.get_device_name(0)
except ImportError:
    gpu_available = False


In [None]:
# Function to calculate out-of-sample R² (as used in GKX paper)
def r2_oos(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)

def evaluate_model(model, model_name, X_train, y_train, X_valid, y_valid, X_test, y_test):
    print(f"\nTraining {model_name}...")
    start_time = time.time()

    model.fit(X_train, y_train)

    train_time = time.time() - start_time
    print(f"Training completed in {train_time:.2f} seconds")

    print("Generating predictions...")
    start_time = time.time()

    valid_pred = model.predict(X_valid)
    test_pred = model.predict(X_test)

    pred_time = time.time() - start_time
    print(f"Prediction completed in {pred_time:.2f} seconds")

    print("Calculating metrics...")
    results = {
        'valid_mse': mean_squared_error(y_valid, valid_pred),
        'valid_r2_oos': r2_oos(y_valid, valid_pred),
        'test_mse': mean_squared_error(y_test, test_pred),
        'test_r2_oos': r2_oos(y_test, test_pred),
        'train_time': train_time,
        'pred_time': pred_time
    }

    results['valid_pred'] = valid_pred
    results['test_pred'] = test_pred

    print(f"{model_name} - Valid R² OOS: {results['valid_r2_oos']:.6f}, Test R² OOS: {results['test_r2_oos']:.6f}")

    return results

In [None]:
model_results = {}

models = {
    'Linear Regression': LinearRegression()
    # 'Ridge': Ridge(alpha=1.0),
    # 'Lasso': Lasso(alpha=0.01, max_iter=10000),
    # 'Elastic Net': ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=10000),
    # 'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
    # 'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
}

print(f"\nStarting model training and evaluation...")
print(f"Dataset sizes - Training: {X_train_processed.shape}, Validation: {X_valid_processed.shape}, Testing: {X_test_processed.shape}")

for name in tqdm(list(models.keys()), desc="Models"):
    model = models[name]
    model_results[name] = evaluate_model(
        model, name,
        X_train_processed, y_train,
        X_valid_processed, y_valid,
        X_test_processed, y_test
    )




Starting model training and evaluation...
Dataset sizes - Training: (525397, 114), Validation: (213285, 114), Testing: (309275, 114)


Models:   0%|          | 0/1 [00:00<?, ?it/s]


Training Linear Regression...


Models: 100%|██████████| 1/1 [00:13<00:00, 13.27s/it]

Training completed in 13.07 seconds
Generating predictions...
Prediction completed in 0.18 seconds
Calculating metrics...
Linear Regression - Valid R² OOS: 0.998550, Test R² OOS: 0.998262





In [None]:
print("\nModel Comparison (Test R² OOS):")
for name, results in model_results.items():
    print(f"{name}: {results['test_r2_oos']:.6f} (Training: {results['train_time']:.2f}s, Prediction: {results['pred_time']:.2f}s)")



In [None]:
# Feature importance for tree-based models
if 'Random Forest' in model_results:
    rf_model = models['Random Forest']
    rf_importances = pd.DataFrame({
        'feature': predictor_vars,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Important Features (Random Forest):")
    print(rf_importances.head(10))

if 'Gradient Boosting' in model_results:
    gb_model = models['Gradient Boosting']
    gb_importances = pd.DataFrame({
        'feature': predictor_vars,
        'importance': gb_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Important Features (Gradient Boosting):")
    print(gb_importances.head(10))

# System resource information
print("\nSystem Resource Information:")
print(f"Memory usage of training data: {X_train_processed.memory_usage().sum() / (1024**2):.2f} MB")
print(f"Number of processors: {os.cpu_count()}")