In [2]:
# sklearn imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

# other imports
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [3]:
## Load the Parquet sample and materialise it directly as a pandas DataFrame
df = pq.read_table('CaoYouSample.parquet').to_pandas()

In [4]:
## Data cleaning ahead of the train/test split and feature transformations:
## remove the columns listed in rem_cols, then exact duplicate rows, then any
## row that still contains a missing value. `df` itself is left untouched.
rem_cols = ['CONM', 'TIC', 'CUSIP','FiscalYearEnd', 'FYEND_plus_3mos','LPERMNO','FYR','SIC','DATADATE']
df_new = (
    df.drop(columns=rem_cols)
      .drop_duplicates()
      .dropna()
)

In [8]:
# helper: group the column names of a data frame by their dtype
def col_classification(df):
    """Return a dict mapping each dtype found in ``df`` to its column names.

    Keys appear in the order the dtypes are first encountered in ``df.dtypes``;
    each value lists the matching columns in their original column order.
    """
    # one empty bucket per distinct dtype (unique() already de-duplicates)
    columns_by_dtype = {dtype: [] for dtype in df.dtypes.unique()}

    # file every column under the bucket of its own dtype
    for column in df.columns:
        columns_by_dtype[df[column].dtypes].append(column)

    return columns_by_dtype

# helper function to transform the input data frame: every input variable is
# normalized w.r.t. the 'CEQ' column (CEQ: Common Equity)
def transform_X(df, base_col='CEQ'):
    """Express every column of ``df`` as a ratio to ``base_col``.

    Each column other than ``base_col`` is divided row-wise by ``base_col`` and
    renamed ``<col>_PER_EQ``; ``base_col`` itself is kept unscaled. Ratios that
    come out infinite (division by zero) or NaN (0/0 or missing input) are set
    to 0. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns, including ``base_col``.
    base_col : str, default 'CEQ'
        Name of the column used as the denominator.

    Returns
    -------
    numpy.ndarray
        2-D array whose first column is ``base_col`` followed by the ratio
        columns in their original relative order.

    Raises
    ------
    KeyError
        If ``base_col`` is not a column of ``df``.
    """
    if base_col not in df.columns:
        raise KeyError(f"transform_X needs a {base_col!r} column to normalize by")

    # One vectorised row-wise division instead of inserting/dropping a column
    # per feature, which repeatedly mutates (and fragments) the frame.
    ratios = (
        df.drop(columns=base_col)
          .div(df[base_col], axis=0)
          .add_suffix('_PER_EQ')
    )

    # Base column first, ratios after it: the same layout the column-by-column
    # version produced, so fitted models see features in the same order.
    X_ = pd.concat([df[[base_col]], ratios], axis=1)

    # replacing infinities with NaN and then setting every NaN to zero
    X_ = X_.replace([np.inf, -np.inf], np.nan).fillna(0)

    # returning the data frame as a matrix
    return X_.values

In [10]:
# Group the cleaned columns by dtype
dtype_dict = col_classification(df_new)

# Candidate model columns are those of the *second* dtype found in df_new.
# NOTE(review): this positional lookup depends on column order; presumably the
# second dtype is the numeric one holding the features -- confirm against the data.
model_cols = dtype_dict[list(dtype_dict)[1]]

# Feature columns (everything except the target and the fiscal year) and target column
X = [name for name in model_cols if name not in ('E_F1', 'FYEAR')]
y = [name for name in model_cols if name == 'E_F1']

# Test on fiscal year `year`; train on the ten fiscal years immediately before it
year = 2016

fiscal_year = df_new['FYEAR']
in_train_window = fiscal_year.between(year - 10, year - 1)  # inclusive on both ends
in_test_year = fiscal_year == year

train_X = df_new.loc[in_train_window, X]
train_y = df_new.loc[in_train_window, y]

test_X = df_new.loc[in_test_year, X]
test_y = df_new.loc[in_test_year, y]

In [11]:
# Build the training feature matrix: transform_X divides every feature column
# by 'CEQ', zeroes out infinite/NaN ratios and returns a NumPy array.
X_prepared = transform_X(train_X)

In [None]:
# Setting up the hyperparameter search space for the RF Model.
# max_features=1.0 (consider every feature at each split) is what the legacy
# 'auto' value meant for regressors; 'auto' was removed in scikit-learn 1.3.
# Wider grid, kept for reference:
#parameters = {'max_features':[1.0],'max_depth':[20,25,30,35],'min_samples_leaf':[15,20,25,50]}
parameters = {'max_features': [1.0], 'max_depth': [30], 'min_samples_leaf': [25]}

# Setting up the Random forest model.
# criterion='squared_error' is the current name of the former 'mse' option,
# which was removed in scikit-learn 1.2.
rf_mod = RandomForestRegressor(
    n_estimators=500,
    criterion='squared_error',
    oob_score=True,
    n_jobs=-1,
    random_state=10,
)

# Setting up the Grid Search object (5-fold CV, scored on negative MSE)
grid_search = GridSearchCV(rf_mod, parameters, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Fitting the model. train_y is a single-column frame; flatten it to 1-D so
# scikit-learn does not emit its column-vector DataConversionWarning.
grid_search.fit(X_prepared, np.ravel(train_y))

In [12]:
# Apply the same per-equity transformation to the test-year features, then
# predict with the fitted grid-search object.
test_X_prepared = transform_X(test_X)
pred = grid_search.predict(test_X_prepared)

In [14]:
# Report the test-set RMSE, labelled with the hyperparameters the grid search
# actually selected, so the label cannot drift from the search space above.
best_params = grid_search.best_params_
rmse = np.sqrt(mean_squared_error(test_y, pred))
print(f"RMSE(max_depth:{best_params['max_depth']}, min_samples_leaf:{best_params['min_samples_leaf']}): {rmse}\n")

RMSE(max_depth:30, min_samples_leaf:25): 16.08369316805577

