In [2]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import time
from tqdm import tqdm
import datetime

from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import  mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV

# Seeded RandomState instance created once at module level; the seed is named
# so it can be found and changed in one place.
SEED = 0
random_state = np.random.RandomState(seed=SEED)

## Data loading

In [10]:
# Load the LASSO-selected feature table.
# skiprows=range(1, 4) skips the rows at 0-based file positions 1-3; the
# exported 'Unnamed: 0' column is dropped and rows with any missing value removed.
P_selected = (
    pd.read_excel(
        'Portfolio_features_extraction_using_full_LASSO.xlsx',
        skiprows=range(1, 4),
    )
    .drop(columns=['Unnamed: 0'])
    .dropna()
)
print(P_selected.shape)
P_selected.head()

(1955, 78)


Unnamed: 0,flow_55976,flow_27748,flow_75259,flow_40970,flow_62519,flow_21792,flow_54084,flow_10302,flow_27887,flow_75228,...,flow_22752,flow_27780,flow_59379,flow_27705,flow_54594,flow_48725,rsi,rsi_change,macd_rsi,stochastic
0,-6.975709,-1.129123,14.299749,1.426277,9.120669,-0.317257,-0.059609,-0.598776,-8.629334,0.066839,...,-2.927825,-5.331582,1.635919,-2.862576,-3.755393,1.674835,33.426083,0.0,0,0.0
1,-6.975709,-1.221991,-5.184144,-1.638974,0.808131,-0.463507,0.610452,2.164043,1.527991,2.329318,...,-2.069588,-1.625845,2.295149,5.847429,-1.580178,1.814051,33.426083,0.0,0,0.0
2,-6.975709,-1.436277,6.040777,-1.03432,19.670714,-5.074804,0.688991,0.275694,-4.907652,-3.326576,...,-5.260498,0.484742,12.213694,-4.991003,-1.221645,0.205041,33.426083,0.0,0,0.0
3,-3.351631,-3.149916,2.100766,-0.456173,29.96974,1.944966,0.508484,3.725222,-3.085524,-0.624239,...,-3.373292,-0.780234,1.779914,-2.804367,-4.79562,0.819203,33.426083,0.0,0,0.0
4,-2.878666,0.060249,0.323572,2.446547,15.992772,0.274019,1.166131,29.844291,-1.281088,6.362596,...,-4.528327,1.660976,-11.525256,-2.508979,1.846624,1.384477,33.426083,0.0,0,0.0


In [6]:
# Load the small portfolio table, discard the exported index column and the
# 'Portfolio_flows' column, then remove rows with any missing value.
P_small = (
    pd.read_csv('P_small.csv')
    .drop(columns=['Unnamed: 0', 'Portfolio_flows'])
    .dropna()
)
print(P_small.shape)
P_small.head()

(1955, 8)


Unnamed: 0,Dates,Portfolio_returns,Portfolio_flows_lag_1,Portfolio_flows_lag_2,Portfolio_flows_lag_3,Portfolio_returns_lag_1,Portfolio_returns_lag_2,Portfolio_returns_lag_3
0,19930406,-0.003112,-0.140873,-2.371056,-1.399531,0.003637,-0.015585,-0.001138
1,19930407,-0.001031,0.453289,-0.140873,-2.371056,-0.003112,0.003637,-0.015585
2,19930408,-0.000309,-0.559098,0.453289,-0.140873,-0.001031,-0.003112,0.003637
3,19930412,0.009771,0.160661,-0.559098,0.453289,-0.000309,-0.001031,-0.003112
4,19930413,0.00326,1.440657,0.160661,-0.559098,0.009771,-0.000309,-0.001031


In [16]:
# Put the first two columns of P_small in front of the selected features,
# pairing rows by index label on both sides.
P_select = pd.merge(
    left=P_small.iloc[:, :2],
    right=P_selected,
    left_index=True,
    right_index=True,
)
P_select.head()

Unnamed: 0,Dates,Portfolio_returns,flow_55976,flow_27748,flow_75259,flow_40970,flow_62519,flow_21792,flow_54084,flow_10302,...,flow_22752,flow_27780,flow_59379,flow_27705,flow_54594,flow_48725,rsi,rsi_change,macd_rsi,stochastic
0,19930406,-0.003112,-6.975709,-1.129123,14.299749,1.426277,9.120669,-0.317257,-0.059609,-0.598776,...,-2.927825,-5.331582,1.635919,-2.862576,-3.755393,1.674835,33.426083,0.0,0,0.0
1,19930407,-0.001031,-6.975709,-1.221991,-5.184144,-1.638974,0.808131,-0.463507,0.610452,2.164043,...,-2.069588,-1.625845,2.295149,5.847429,-1.580178,1.814051,33.426083,0.0,0,0.0
2,19930408,-0.000309,-6.975709,-1.436277,6.040777,-1.03432,19.670714,-5.074804,0.688991,0.275694,...,-5.260498,0.484742,12.213694,-4.991003,-1.221645,0.205041,33.426083,0.0,0,0.0
3,19930412,0.009771,-3.351631,-3.149916,2.100766,-0.456173,29.96974,1.944966,0.508484,3.725222,...,-3.373292,-0.780234,1.779914,-2.804367,-4.79562,0.819203,33.426083,0.0,0,0.0
4,19930413,0.00326,-2.878666,0.060249,0.323572,2.446547,15.992772,0.274019,1.166131,29.844291,...,-4.528327,1.660976,-11.525256,-2.508979,1.846624,1.384477,33.426083,0.0,0,0.0


## Regression with rolling window + GridSearchCV

In [33]:
def regress_rolling_p(df, start_index, window_size, grid):
    '''
    Rolling-window random-forest regression with a grid search in every window.

    For each row position from ``start_index`` to the end of ``df`` a window of
    ``window_size`` rows ending just before that position is taken. All rows of
    the window except the last are used for training; the last row of the
    window is held out and predicted.

    Parameters
    ----------
    df: dataframe
        Must have a ``Dates`` column. The column at position 1 is used as the
        target and every column from position 2 onwards as a feature.
    start_index: int. When you want to start predicting from.
        Must be >= ``window_size`` so that every window lies inside ``df``.
    window_size: int
        Number of rows per window (training rows plus the one held-out row).
    grid: dict for grid search
        Passed to ``GridSearchCV`` as ``param_grid``.

    Returns
    -------
    models: list
        The refitted best estimator of each window.
    result: dataframe
        One row per window with columns Date, MSE_train, R2_train,
        Historical_Mean (mean of the training target), Predicted_Return,
        True_Return and Best_parameter.
    imp: list
        ``feature_importances_`` of the best estimator of each window.

    Raises
    ------
    ValueError
        If ``window_size`` < 2 or ``start_index`` < ``window_size``.

    Notes
    -----
    The forest is seeded with the module-level ``random_state`` object.
    '''
    if window_size < 2:
        raise ValueError('window_size must be at least 2 (training rows plus one held-out row).')
    if start_index < window_size:
        # A negative slice start would silently select the wrong rows.
        raise ValueError('start_index must be >= window_size so every window lies inside df.')

    regressor = RandomForestRegressor(n_estimators=100, random_state=random_state, oob_score=True, max_features='sqrt')

    temp_df = df.reset_index(drop=True)
    temp_df = temp_df.replace([np.inf, -np.inf], 0) # remove any infinity values
    scaler = preprocessing.StandardScaler()
    looper = temp_df.Dates.iloc[start_index:].values # the dates to loop through

    models = []
    best_p = []
    dates = []
    train_MSE = []
    R2 = []
    mean_y = []
    Predicted_y = []
    True_y = []
    imp = []

    # rolling regression
    for offset, date in enumerate(tqdm(looper)):
        # The index was reset above, so the row of `date` is simply its position;
        # this avoids a full boolean scan of the Dates column per iteration.
        end = start_index + offset
        start = end - window_size

        window = temp_df.iloc[start:end]
        X = window.iloc[:, 2:]
        y = window.iloc[:, 1]

        # NOTE(review): the window excludes row `end`, so the held-out row is
        # position end-1 while the Date recorded below is the one at position
        # `end` -- confirm this one-row offset is intended.
        y_train = y.iloc[:-1]
        y_test = y.iloc[-1:]

        # Fit the scaler on the training rows only, then apply it to the
        # held-out row, so the test observation never influences preprocessing.
        X_train = scaler.fit_transform(X.iloc[:-1])
        X_test = scaler.transform(X.iloc[-1:])

        CV_rf = GridSearchCV(estimator=regressor, param_grid=grid, refit=True)
        CV_rf.fit(X_train, y_train.values.ravel())

        y_pred = CV_rf.predict(X_test)
        train_pred = CV_rf.predict(X_train) # computed once, reused for both metrics

        models.append(CV_rf.best_estimator_)
        best_p.append(CV_rf.best_params_)
        dates.append(date)
        train_MSE.append(mean_squared_error(y_train, train_pred))
        R2.append(r2_score(y_train, train_pred))
        mean_y.append(y_train.mean())
        Predicted_y.append(y_pred[0])
        True_y.append(y_test.values[0])
        imp.append(CV_rf.best_estimator_.feature_importances_)

    result = pd.DataFrame(list(zip(dates,train_MSE,R2,mean_y,Predicted_y,True_y,best_p)), columns=['Date','MSE_train','R2_train','Historical_Mean','Predicted_Return','True_Return','Best_parameter'])
    return models, result, imp

## Implementing regression

In [18]:
# Run the rolling regression over the full sample and report the elapsed
# time as measured by time.process_time().
start = time.process_time()
print("We now do regressions on equally-weighted portfolio.")

ws = 252
start_index = ws + 1
grid = {"min_samples_leaf": [1, 10, 20]}
# regress_rolling_p returns three objects (models, result frame, feature
# importances); unpacking only two raises
# "ValueError: too many values to unpack" on a fresh kernel.
models_p, result_p, importance_p = regress_rolling_p(P_select,start_index, ws, grid)
 
# Notify user
end = time.process_time()
print("Regression done!")
print('Running time: %s mins'%round((end-start)/60,2))

  0%|                                                                                         | 0/1702 [00:00<?, ?it/s]

We now do regressions on equally-weighted portfolio.


100%|████████████████████████████████████████████████████████████████████████████| 1702/1702 [1:37:45<00:00,  3.45s/it]

Regression done!
Running time: 97.42 mins





In [19]:
# Squared error of the historical-mean forecast minus squared error of the
# model forecast, per row, followed by its running total.
benchmark_sq_err = (result_p['Historical_Mean'] - result_p['True_Return'])**2
model_sq_err = (result_p['Predicted_Return'] - result_p['True_Return'])**2
result_p['diff_sse'] = benchmark_sq_err - model_sq_err
result_p['Cum_SSE_diff'] = np.cumsum(result_p['diff_sse'])
result_p.head()

Unnamed: 0,Date,MSE_train,R2_train,Historical_Mean,Predicted_Return,True_Return,diff_sse,Cum_SSE_diff
0,19940406,2e-06,0.941532,0.000467,0.000526,0.023475,2.700011e-06,3e-06
1,19940407,2e-06,0.938591,0.000565,0.001454,0.003069,3.662434e-06,6e-06
2,19940408,1e-06,0.949506,0.000578,0.003143,0.005718,1.978559e-05,2.6e-05
3,19940411,2e-06,0.936618,0.000562,-0.002053,-0.008421,4.01442e-05,6.6e-05
4,19940412,2e-06,0.940325,0.000516,0.002577,0.001664,4.85255e-07,6.7e-05


In [24]:
# Work on an independent copy (DataFrame.copy is deep by default) so that
# adding the fitted models does not touch result_p.
result_out = result_p.copy()
result_out['model'] = models_p
result_out.head()

Unnamed: 0,Date,MSE_train,R2_train,Historical_Mean,Predicted_Return,True_Return,diff_sse,Cum_SSE_diff,model
0,19940406,2e-06,0.941532,0.000467,0.000526,0.023475,2.700011e-06,3e-06,"(DecisionTreeRegressor(max_features='sqrt', ra..."
1,19940407,2e-06,0.938591,0.000565,0.001454,0.003069,3.662434e-06,6e-06,"(DecisionTreeRegressor(max_features='sqrt', ra..."
2,19940408,1e-06,0.949506,0.000578,0.003143,0.005718,1.978559e-05,2.6e-05,"(DecisionTreeRegressor(max_features='sqrt', ra..."
3,19940411,2e-06,0.936618,0.000562,-0.002053,-0.008421,4.01442e-05,6.6e-05,"(DecisionTreeRegressor(max_features='sqrt', ra..."
4,19940412,2e-06,0.940325,0.000516,0.002577,0.001664,4.85255e-07,6.7e-05,"(DecisionTreeRegressor(max_features='sqrt', ra..."


In [25]:
# Write the results table to disk.
# NOTE(review): the 'model' column holds estimator objects, which a CSV can
# only carry as text -- confirm that is acceptable for downstream use.
result_out.to_csv(path_or_buf='random_forest_result.csv')

In [34]:
# Second run of the rolling regression, starting much later in the sample,
# this time also keeping the per-window feature importances.
start = time.process_time()
print("We now do regressions on equally-weighted portfolio.")

ws = 252
start_index = int(7.5 * ws) + 1
grid = {"min_samples_leaf": [1, 10, 20]}
model_add, result_add, importance = regress_rolling_p(
    df=P_select,
    start_index=start_index,
    window_size=ws,
    grid=grid,
)

# Notify user
end = time.process_time()
elapsed_mins = round((end - start) / 60, 2)
print("Regression done!")
print('Running time: %s mins'%elapsed_mins)

  0%|                                                                                           | 0/64 [00:00<?, ?it/s]

We now do regressions on equally-weighted portfolio.


100%|██████████████████████████████████████████████████████████████████████████████████| 64/64 [03:21<00:00,  3.15s/it]

Regression done!
Running time: 3.36 mins





In [37]:
# Show the last five rows of the second run.
result_add.tail(5)

Unnamed: 0,Date,MSE_train,R2_train,Historical_Mean,Predicted_Return,True_Return,Best_parameter
59,20001222,5e-06,0.943144,0.000725,0.004494,0.009809,{'min_samples_leaf': 1}
60,20001226,5e-06,0.942207,0.000766,0.007727,0.020414,{'min_samples_leaf': 1}
61,20001227,6e-06,0.932983,0.000816,0.002846,0.004582,{'min_samples_leaf': 1}
62,20001228,6e-06,0.940537,0.000797,0.0073,0.023689,{'min_samples_leaf': 1}
63,20001229,6e-06,0.943926,0.000871,0.009045,0.019886,{'min_samples_leaf': 1}


In [45]:
# One row of importances per fitted window, with columns labelled by the
# column names of P_selected.
feature_name = list(P_selected.columns)
feature_imp = pd.DataFrame(data=importance, columns=feature_name)

feature_imp.head()

Unnamed: 0,flow_55976,flow_27748,flow_75259,flow_40970,flow_62519,flow_21792,flow_54084,flow_10302,flow_27887,flow_75228,...,flow_22752,flow_27780,flow_59379,flow_27705,flow_54594,flow_48725,rsi,rsi_change,macd_rsi,stochastic
0,0.018831,0.011375,0.010995,0.006012,0.006825,0.006685,0.002047,0.005529,0.023822,0.005209,...,0.017298,0.009261,0.018756,0.006621,0.002279,0.017186,0.012151,0.242724,0.001065,0.084289
1,0.022994,0.013244,0.012211,0.004791,0.008778,0.006752,0.002842,0.005488,0.022151,0.0072,...,0.023105,0.009553,0.021748,0.011809,0.004283,0.014043,0.012135,0.24197,0.000668,0.079632
2,0.018038,0.011277,0.005972,0.005121,0.009436,0.005649,0.003528,0.006256,0.02249,0.004567,...,0.015122,0.005597,0.026189,0.005703,0.005109,0.019563,0.015267,0.233219,0.000673,0.083368
3,0.020777,0.011317,0.012464,0.005584,0.007844,0.005011,0.005389,0.006072,0.028888,0.004839,...,0.019418,0.01343,0.015695,0.006758,0.004661,0.015566,0.015364,0.227676,0.001515,0.086797
4,0.015479,0.011052,0.010634,0.004465,0.009047,0.004962,0.001944,0.006099,0.022536,0.004777,...,0.018226,0.007647,0.020401,0.007613,0.005334,0.019795,0.016657,0.259131,0.000706,0.076497


In [58]:
# Average each feature's importance over all windows, largest first, and
# present it as a one-column table.
avg_by_feature = feature_imp.mean().sort_values(ascending=False)
imp_mean = pd.DataFrame(avg_by_feature, columns=['avg importance'])

In [59]:
# Ten features with the highest average importance.
imp_mean.head(n=10)

Unnamed: 0,avg importance
rsi_change,0.24805
stochastic,0.082239
flow_66181,0.022133
flow_27887,0.021974
flow_56232,0.020579
rsi,0.020404
flow_59379,0.019025
flow_55976,0.016837
flow_10147,0.016568
flow_25961,0.016221
