In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import ExtraTreesRegressor
%matplotlib inline

In [3]:
# Load the "train" table from the HDF5 file. read_hdf opens the store
# read-only for the duration of the call and closes it afterwards, so no
# file handle is left open in the kernel.
data = pd.read_hdf("train.h5", key="train", mode="r")

In [4]:
# Impute missing values column by column with that column's mean.
# NOTE(review): the means are taken over the whole frame, i.e. before the
# train/test split in the next cell, so held-out rows contribute to the
# imputation values -- confirm this is acceptable.
_mean = data.mean(axis=0)
data = data.fillna(value=_mean)

In [5]:
# Hold out 25% of the rows for evaluation. shuffle=False means the rows are
# not shuffled before splitting.
train, test = train_test_split(
    data,
    test_size=0.25,
    random_state=8,
    shuffle=False,
)

# Separate the target column 'y' from the remaining feature columns.
y_train = train['y']
X_train = train.drop(columns='y')
y_test = test['y']
X_test = test.drop(columns='y')

## R score for Evaluation

In [6]:
def r_score(y_true, y_pred):
    """Return the signed square root of the R^2 score.

    Computes ``r2_score(y_true, y_pred)`` and returns
    ``sign(R^2) * sqrt(|R^2|)``, so a negative R^2 yields a negative score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.
    """
    r_squared = r2_score(y_true, y_pred)
    magnitude = np.sqrt(np.abs(r_squared))
    return np.sign(r_squared) * magnitude

## Several benchmark models

In [8]:
# Naive baselines scored on the held-out rows: three constant predictions
# (train mean, train median, zero) and the per-id mean of y from the train split.
n_test = y_test.size
print('Using mean y of train set:{:.6f}'.format(r_score(y_test, np.full(n_test, y_train.mean(), dtype=float))))
print('Using median y of train set:{:.6f}'.format(r_score(y_test, np.full(n_test, y_train.median(), dtype=float))))
print('Using zeros:{:.6f}'.format(r_score(y_test, np.zeros(n_test))))

id_mean = train.groupby('id')['y'].mean()
all_mean = y_train.mean()
# Vectorised lookup instead of a per-row Python loop: ids that do not occur in
# the train split map to NaN and then fall back to the overall train mean.
y_test_calc = X_test['id'].map(id_mean).fillna(all_mean)
print('Using id based mean:{:.6f}'.format(r_score(y_test, y_test_calc)))

Using mean y of train set:-0.012877
Using median y of train set:-0.023757
Using zeros:-0.017466
Using id based mean:-0.060893


## Pick the first five features from each feature selection method in Initial_EDA.

In [17]:
# Top five features by correlation.
best_corr = [
    'technical_20',
    'technical_30',
    'fundamental_11',
    'technical_27',
    'technical_19',
]

# Top five features from SelectKBest.
best_k = [
    'technical_20',
    'technical_30',
    'fundamental_11',
    'fundamental_51',
    'fundamental_53',
]

# Top five features from ExtraTrees.
best_trees = [
    'technical_30',
    'technical_33',
    'technical_20',
    'technical_41',
    'technical_24',
]

## Scatter plots with y

In [18]:
# Union of the three selections without duplicates. Sorting makes the order
# deterministic; the iteration order of a set of strings can change between
# interpreter runs, which would reorder the per-feature results below.
best_all = sorted(set(best_corr) | set(best_k) | set(best_trees))

## Fit simple linear regression models for each selected feature

In [34]:
# Fit a one-feature ordinary least squares model per selected feature and
# report its r_score on the held-out rows.
clf_lin = linear_model.LinearRegression()
for feature in best_all:
    # scikit-learn expects 2-D inputs, so each column becomes an (n, 1) array.
    train_column = X_train[feature].values[:, np.newaxis]
    test_column = X_test[feature].values[:, np.newaxis]
    y_test_pred = clf_lin.fit(train_column, y_train).predict(test_column)
    score = r_score(y_test, y_test_pred)
    print('Using LinearRegression with {}: {:.6f}'.format(feature, score))

Using LinearRegression with fundamental_11: -0.012102
Using LinearRegression with fundamental_53: -0.033865
Using LinearRegression with technical_30: 0.011037
Using LinearRegression with technical_33: -0.012903
Using LinearRegression with technical_19: -0.008601
Using LinearRegression with fundamental_51: -0.012891
Using LinearRegression with technical_24: -0.012806
Using LinearRegression with technical_41: -0.012893
Using LinearRegression with technical_20: 0.013850
Using LinearRegression with technical_27: -0.012945


In [35]:
# Same one-feature-at-a-time evaluation as the LinearRegression cell, but with
# a Ridge model (alpha=1).
clf_ridge = linear_model.Ridge(alpha=1)
for feature in best_all:
    # scikit-learn expects 2-D inputs, so each column is reshaped to (n, 1).
    clf_ridge.fit(X_train[feature].values.reshape(-1, 1), y_train)
    y_test_pred = clf_ridge.predict(X_test[feature].values.reshape(-1, 1))
    # The label names the model actually fitted here (was "LinearRegression").
    print('Using Ridge with {}: {:.6f}'.format(feature, r_score(y_test, y_test_pred)))

Using LinearRegression with fundamental_11: -0.012102
Using LinearRegression with fundamental_53: -0.033865
Using LinearRegression with technical_30: 0.010207
Using LinearRegression with technical_33: -0.012903
Using LinearRegression with technical_19: -0.008601
Using LinearRegression with fundamental_51: -0.012891
Using LinearRegression with technical_24: -0.012806
Using LinearRegression with technical_41: -0.012893
Using LinearRegression with technical_20: 0.013231
Using LinearRegression with technical_27: -0.012945
