In [1]:
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
# Load the pickled dataset bundle produced by an earlier step.
# The context manager guarantees the file handle is closed even if
# unpickling raises, which the manual open()/close() pair did not.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'data_reduced.pkl' when it comes from a trusted source.
with open('data_reduced.pkl', 'rb') as pkl_file:
    data_transformed = pickle.load(pkl_file)

In [3]:
# Pull the train/test targets and feature tables out of the loaded bundle
# in a single unpacking step (same keys, same lookup order as before).
Y_train, X_train, Y_test, X_test = (
    data_transformed[key]
    for key in ('Y_train', 'X_train', 'Y_test', 'X_test')
)

In [4]:
# Absolute correlation between the target and every feature column,
# collected as [column_name, |correlation|] pairs in column order.
Corr_ = [
    [feature, abs(Y_train.corr(X_train[feature]))]
    for feature in X_train.columns.values
]

In [5]:
# Tabulate the per-feature absolute correlations, then display the ten
# rows with the largest values.
Corr_df = pd.DataFrame(data=Corr_, columns=['column', 'correlation'])
Corr_df.sort_values(by='correlation', ascending=False).iloc[:10]

Unnamed: 0,column,correlation
87,technical_20,0.012476
95,technical_30,0.011095
1,timestamp,0.00798
61,fundamental_55,0.005466
59,fundamental_53,0.004931
86,technical_19,0.004389
14,fundamental_8,0.00433
62,fundamental_56,0.00421
66,fundamental_60,0.00408
17,fundamental_11,0.003935


In [6]:
# Fit a univariate selector that scores each feature against the target
# with f_regression; the fitted scores_ are ranked in the next cell.
KBest_select = (
    SelectKBest(f_regression, k=10)
    .fit(X=X_train, y=Y_train)
)

In [7]:
# Pair every feature name with its SelectKBest score, order the table from
# highest to lowest score, and display the top ten.
names = X_train.columns.values
kbest_scores = (
    pd.DataFrame({'feature': names, 'score': KBest_select.scores_})
    .sort_values(by='score', ascending=False)
)
kbest_scores.head(n=10)

Unnamed: 0,feature,score
87,technical_20,153.282764
95,technical_30,121.21774
1,timestamp,62.705865
61,fundamental_55,29.421808
59,fundamental_53,23.93942
86,technical_19,18.966139
14,fundamental_8,18.457531
62,fundamental_56,17.455104
66,fundamental_60,16.389499
17,fundamental_11,15.247439


In [8]:
# Fit a small extra-trees ensemble so its feature_importances_ can be used
# as a third ranking; random_state is pinned to 0 for repeatable results.
# NOTE(review): n_jobs=8 is hard-coded — confirm it suits the target machine.
Extrees_select = ExtraTreesRegressor(
    n_estimators=10,
    random_state=0,
    n_jobs=8,
).fit(X=X_train, y=Y_train)

In [9]:
# Pair every feature name with its extra-trees importance, order the table
# from most to least important, and display the top ten.
Extrees_scores = (
    pd.DataFrame({'feature': names, 'score': Extrees_select.feature_importances_})
    .sort_values(by='score', ascending=False)
)
Extrees_scores.head(n=10)

Unnamed: 0,feature,score
95,technical_30,0.038413
98,technical_33,0.036902
106,technical_41,0.032754
90,technical_24,0.030203
87,technical_20,0.028957
73,technical_3,0.028397
1,timestamp,0.028028
71,technical_1,0.026742
74,technical_5,0.025932
81,technical_13,0.02589


In [10]:
# Keep the ten best-ranked feature names from each of the three methods.
# kbest_scores and Extrees_scores are already sorted best-first; the
# correlation table is sorted here before slicing.
selected_corr = (
    Corr_df.sort_values(by='correlation', ascending=False)['column'].tolist()[:10]
)
selected_kbest = kbest_scores['feature'].tolist()[:10]
selected_trees = Extrees_scores['feature'].tolist()[:10]

In [11]:
# Bundle the three top-10 feature lists under one dict and persist it.
selected_features = {
    'selected_corr': selected_corr,
    'selected_kbest': selected_kbest,
    'selected_trees': selected_trees,
}
# The context manager closes (and therefore flushes) the output file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open('selected_features_red.pkl', 'wb') as output:
    pickle.dump(selected_features, output)