In [1]:
# sklearn imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

# other imports
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
#import plotly.express as px
#import plotly.graph_objects as go

In [2]:
# Load the sample Parquet file with pyarrow and convert it straight to a
# pandas DataFrame, so `df` is only ever bound to the DataFrame.
df = pq.read_table(r'CaoYouSample.parquet').to_pandas()

In [3]:
# Cleaning performed before the train/test split:
#   1. drop the columns listed in rem_cols (presumably identifiers and dates
#      that are not model inputs -- confirm against the data dictionary),
#   2. drop exact duplicate rows,
#   3. drop every row that still contains a missing value.
rem_cols = ['CONM', 'TIC', 'CUSIP','FiscalYearEnd', 'FYEND_plus_3mos','LPERMNO','FYR','SIC','DATADATE']
df_new = (
    df.drop(columns=rem_cols)   # returns a new frame; `df` itself is not modified
      .drop_duplicates()
      .dropna()
)

In [4]:
# The helpers below are intentionally disabled: they are wrapped in a
# triple-quoted string, so neither col_classification nor transform_X is
# defined or executed. Because the string is the cell's last expression, it is
# simply echoed back as the cell output.
#
# Why they are not needed: all columns are already scaled by common shares
# outstanding (CSHO) upstream -- see the SAS file "03 Create features.sas".
# Cao and You also scale by common shares outstanding, not by common equity
# (CEQ), so the CEQ-based normalisation in transform_X would be the wrong scaling.

'''
# helper function to return the column list under each column type
def col_classification(df):
    emp_dict = {}
    for types in df.dtypes.unique():
        if types in emp_dict:
            continue
        else:
            emp_dict[types] = []
    
    for col in df.columns:
        emp_dict[df[col].dtypes].append(col)
    
    return emp_dict

# helper function to transform the input data frame (This function normalized all the input variables w.r.t 'CEQ' column)
# CEQ: Common Equity
def transform_X(df):
    X_ = df.copy()
    for col in X_.columns:
        if col != 'CEQ':
            
            # creating a new column name (per equity)
            col_str = col + '_PER_EQ'
            X_.loc[:,col_str] = X_[col] / X_['CEQ']
            
            # dropping the redundant column
            X_.drop(col, axis = 1, inplace = True)
        else:
            continue
    
    # replacing Inifinities with nan and then truncating it to zero
    X_ = X_.replace([np.inf, -np.inf], np.nan)
    X_ = X_.fillna(0)
    
    # returning the dataframe as a matrix
    return X_.values
'''

"\n# helper function to return the column list under each column type\ndef col_classification(df):\n    emp_dict = {}\n    for types in df.dtypes.unique():\n        if types in emp_dict:\n            continue\n        else:\n            emp_dict[types] = []\n    \n    for col in df.columns:\n        emp_dict[df[col].dtypes].append(col)\n    \n    return emp_dict\n\n# helper function to transform the input data frame (This function normalized all the input variables w.r.t 'CEQ' column)\n# CEQ: Common Equity\ndef transform_X(df):\n    X_ = df.copy()\n    for col in X_.columns:\n        if col != 'CEQ':\n            \n            # creating a new column name (per equity)\n            col_str = col + '_PER_EQ'\n            X_.loc[:,col_str] = X_[col] / X_['CEQ']\n            \n            # dropping the redundant column\n            X_.drop(col, axis = 1, inplace = True)\n        else:\n            continue\n    \n    # replacing Inifinities with nan and then truncating it to zero\n    X_ 

In [4]:
# Feature / target selection and the train/test split.
#
# An earlier version derived the feature list from the column dtypes (through
# the col_classification helper, which is now disabled). To be safe the list is
# written out by hand instead, following Appendix 1 of Cao and You, pages
# 42 - 43, under the heading "Input features for machine learning models".
# Every base feature is paired with a "<name>_D1" companion column
# (presumably the first difference -- confirm against the feature-creation step).
X_IncomeStmt = ['SALE', 'COGS', 'XSGA', 'XAD', 'XRD', 'DP', 'XINT', 'NOPIO', 'TXT', 'XIDO', 'E', 'DVC']
X_IncomeStmt += [f'{feature}_D1' for feature in X_IncomeStmt]
X_BalanceSheet = ['CHE', 'INVT', 'RECT', 'ACT', 'PPENT', 'IVAO', 'INTAN', 'AT', 'AP', 'DLC', 'TXP', 'LCT', 'DLTT', 'LT', 'CEQ']
X_BalanceSheet += [f'{feature}_D1' for feature in X_BalanceSheet]
X_CashFlowStmt = ['CFO', 'CFO_D1']

X = X_IncomeStmt + X_BalanceSheet + X_CashFlowStmt
y = ['E_F1']

# Split by fiscal year: train on the TRAIN_WINDOW_YEARS fiscal years that
# precede `year`, test on `year` itself.
year = 2016
TRAIN_WINDOW_YEARS = 10

# Build each row mask exactly once so the feature and target frames are
# guaranteed to cover the same rows. `between` is inclusive on both ends,
# i.e. year-10 <= FYEAR <= year-1.
train_rows = df_new['FYEAR'].between(year - TRAIN_WINDOW_YEARS, year - 1)
test_rows = df_new['FYEAR'] == year

train_X = df_new.loc[train_rows, X]
train_y = df_new.loc[train_rows, y]

test_X = df_new.loc[test_rows, X]
test_y = df_new.loc[test_rows, y]

In [5]:
# Hyperparameter search space for the random forest.
# max_features=None makes every split consider all features, which is what the
# legacy 'auto' setting meant for regressors; the 'auto' spelling itself is
# rejected by recent scikit-learn releases, while None is accepted by all.
parameters = {'max_features':[None],'max_depth':[20,25,30,35],'min_samples_leaf':[15,20,25,50]}
# Single-point grid for a quick run:
#parameters = {'max_features':[None],'max_depth':[30],'min_samples_leaf':[25]}

# Random forest regressor. The split criterion is deliberately left at its
# default (mean squared error): the legacy name 'mse' was removed from newer
# scikit-learn and the newer name is unknown to older releases, so omitting the
# argument is the only spelling that works everywhere and keeps the same model.
rf_mod = RandomForestRegressor(n_estimators=500, oob_score=True, n_jobs=-1, random_state=10)

# 5-fold cross-validated grid search, scored by negative mean squared error.
grid_search = GridSearchCV(rf_mod, parameters, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# train_y is selected as a one-column frame; flatten it to 1-D so scikit-learn
# does not emit a DataConversionWarning on every fit (np.ravel is also a no-op
# reshape if train_y is already one-dimensional).
grid_search.fit(train_X, np.ravel(train_y))

  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=500, n_jobs=-1,
                                             oob_score=True, random_state=10,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=-1

In [6]:
# Predict for the test rows with the fitted grid search. The features are fed
# in as-is: the transform_X preprocessing step is disabled, so the call below
# stays commented out and is kept only for reference.
#test_X_prepared = transform_X(test_X)
pred = grid_search.predict(test_X)

In [7]:
# Report the test-set RMSE. The label is built from the hyperparameters the
# grid search actually selected instead of a hard-coded pair, so the printed
# line cannot disagree with the fitted model.
best = grid_search.best_params_
rmse = np.sqrt(mean_squared_error(test_y, pred))
print(f"RMSE(max_depth:{best['max_depth']}, min_samples_leaf:{best['min_samples_leaf']}): {rmse}\n")

RMSE(max_depth:30, min_samples_leaf:25): 13.263330502778093

