In [1]:
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
# Load the preprocessed dataset from disk.
# The context manager guarantees the file handle is closed even if
# unpickling raises part-way through.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'data_clipped.pkl' if it comes from a trusted source.
with open('data_clipped.pkl', 'rb') as pkl_file:
    data_transformed = pickle.load(pkl_file)

In [3]:
# Pull the train/test splits out of the loaded dictionary, grouped by split
# (features first, then target).
X_train, Y_train = data_transformed['X_train'], data_transformed['Y_train']
X_test, Y_test = data_transformed['X_test'], data_transformed['Y_test']

In [4]:
# For every feature column, record [name, |corr(target, feature)|].
# Assumes Y_train is a pandas Series, so .corr uses its default method
# (Pearson) -- TODO confirm against the data-preparation notebook.
Corr_ = [
    [col, abs(Y_train.corr(X_train[col]))]
    for col in X_train.columns.values
]

In [5]:
# Tabulate the (feature, absolute correlation) pairs collected above.
Corr_df = pd.DataFrame(data=Corr_, columns=['column', 'correlation'])

# Display the ten features with the largest absolute correlation to the target.
Corr_df.sort_values(by='correlation', ascending=False).iloc[:10]

Unnamed: 0,column,correlation
87,technical_20,0.012239
95,technical_30,0.010319
1,timestamp,0.008216
59,fundamental_53,0.005512
105,technical_40,0.005433
17,fundamental_11,0.00541
76,technical_7,0.005225
54,fundamental_48,0.005034
62,fundamental_56,0.00493
61,fundamental_55,0.004783


In [6]:
# Univariate selection: score every feature against the target with
# f_regression and keep the 10 best. fit() returns the fitted selector itself.
kbest_selector = SelectKBest(score_func=f_regression, k=10)
KBest_select = kbest_selector.fit(X_train, Y_train)

In [7]:
# Feature names, in the same order as the columns the selectors were fitted on.
names = X_train.columns.values

# Pair each feature with its f_regression score, highest score first.
kbest_scores = (
    pd.DataFrame({'feature': names, 'score': KBest_select.scores_})
    .sort_values(by='score', ascending=False)
)

# Display the ten highest-scoring features.
kbest_scores.iloc[:10]

Unnamed: 0,feature,score
87,technical_20,192.22594
95,technical_30,136.643252
1,timestamp,86.62445
59,fundamental_53,38.98027
105,technical_40,37.872596
17,fundamental_11,37.558779
76,technical_7,35.023978
54,fundamental_48,32.510046
62,fundamental_56,31.181914
61,fundamental_55,29.353802


In [8]:
# Tree-based selection: fit a small extra-trees ensemble (10 trees, fixed seed,
# 8 parallel jobs) so its feature importances can be used as a ranking.
# fit() returns the fitted estimator itself.
extra_trees = ExtraTreesRegressor(n_estimators=10, random_state=0, n_jobs=8)
Extrees_select = extra_trees.fit(X_train, Y_train)

In [9]:
# Pair each feature name (`names`, defined in an earlier cell) with its
# importance from the fitted extra-trees model, most important first.
Extrees_scores = (
    pd.DataFrame({'feature': names, 'score': Extrees_select.feature_importances_})
    .sort_values(by='score', ascending=False)
)

# Display the ten most important features.
Extrees_scores.iloc[:10]

Unnamed: 0,feature,score
98,technical_33,0.040908
95,technical_30,0.039512
106,technical_41,0.036451
1,timestamp,0.035902
90,technical_24,0.033384
73,technical_3,0.031328
87,technical_20,0.030382
71,technical_1,0.029881
74,technical_5,0.028558
96,technical_31,0.026377


In [10]:
# Collect the names of the top-10 features under each of the three rankings.

# Corr_df is stored unsorted, so rank it by absolute correlation first.
corr_ranked = Corr_df.sort_values(by='correlation', ascending=False)
selected_corr = list(corr_ranked['column'].iloc[:10])

# kbest_scores and Extrees_scores were already sorted when they were built.
selected_kbest = list(kbest_scores['feature'].iloc[:10])
selected_trees = list(Extrees_scores['feature'].iloc[:10])

In [11]:
# Bundle the three top-10 feature lists under descriptive keys and persist
# them for later use.
selected_features = {
    'selected_corr': selected_corr,
    'selected_kbest': selected_kbest,
    'selected_trees': selected_trees,
}

# The context manager flushes and closes the file even if pickling raises,
# so no handle is left open.
with open('selected_features_clip.pkl', 'wb') as output:
    pickle.dump(selected_features, output)