# Feature extraction with RBM

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('../data/ML/E0_ML_n3_part1.csv')
data.head()

Unnamed: 0,h_nb_victories,h_nb_draws,h_nb_defeats,h_nb_points,h_nb_goals_scored,h_nb_goals_conceded,h_nb_goals_diff,h_nb_games,h_nb_games_home,h_nb_victories_home,...,diff_nb_defeats,diff_nb_points,diff_nb_goals_diff,diff_season_wages,Month,Week,distance_km,capacity_home_stadium,home_win,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,41.0,8,33,7839.955337,42785.0,0,E0_2009_2010_0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-85.5,8,33,2384.244176,31154.0,0,E0_2009_2010_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-8.1,8,33,8738.1094,28100.0,0,E0_2009_2010_2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,136.4,8,33,14384.656594,42449.0,1,E0_2009_2010_3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-56.3,8,33,8491.070855,40157.0,0,E0_2009_2010_4


In [3]:
LABEL = 'home_win'
y=data[LABEL].values
X=data.drop([LABEL,'id','Month','Week'],1).values
features=data.drop([LABEL,'id','Month','Week'],1).columns.values

In [9]:
# normalisation -> [0,1]
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import BernoulliRBM

mm_scaler = MinMaxScaler(feature_range=(0,1))
X_mm = mm_scaler.fit_transform(X)

# new features with RBM
rbm = BernoulliRBM(n_components=25)
X_rbm=rbm.fit_transform(X_mm)
print(rbm.intercept_hidden_)
print(rbm.intercept_visible_)
print(rbm.components_)
len(X_rbm[0])

[-0.37850279 -0.36256577 -0.36857289 -0.36924064 -0.36728588 -0.36937077
 -0.37403561 -0.3672552  -0.36868193 -0.36488134 -0.36549952 -0.36589695
 -0.36996067 -0.37670666 -0.36327156 -0.37639226 -0.36998755 -0.3598193
 -0.37192746 -0.36867492 -0.36933267 -0.36048763 -0.3743176  -0.36726407
 -0.37492855]
[-0.84       -1.22       -1.13333333 -0.84111111 -1.135      -0.8
 -0.94222222  0.32333333  0.95       -0.71       -1.3        -0.76       -0.51
 -1.41666667 -1.4        -0.55363636 -0.39       -0.87       -1.26       -0.75
 -1.30666667 -1.53       -0.61648069 -0.62       -1.36666667 -0.71
 -0.47571429 -1.06       -0.92       -0.36777778  0.27       -0.34       -0.63
 -1.28       -0.86       -0.38666667 -1.39333333 -1.27       -0.14545455
 -0.35666667 -0.83       -1.11       -0.95       -1.43666667 -1.43166667
 -0.82227468 -0.47       -0.39       -0.78       -0.48166667 -0.7625
 -0.37084337 -0.67455808 -0.81674404]
[[-0.21769934 -0.23813515 -0.2159755  ..., -0.06652807 -0.07054375
  -0.

25

In [5]:
# standardisation
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
X_std = std_scaler.fit_transform(X)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer

param = {'estimator__C': [10**i for i in range(-5, 3)]}
param1 = {'C': [10**i for i in range(-5, 3)]}
aucroc_score = make_scorer(roc_auc_score)

log_reg = LogisticRegression()

selector = RFECV(log_reg, step=1, cv=5)

grid_search = GridSearchCV(selector, param_grid=param,
                                   scoring=aucroc_score, cv=5,
                                   verbose=0, n_jobs=-1)
grid_search.fit(X_std, y)

print('Logistic regression with RFECV feature selection')
print('Best score: %f' % grid_search.best_score_)
print(grid_search.best_params_)
print("Optimal number of features : %d" % grid_search.best_estimator_.n_features_)
best_features = [features[i] for i in range(len(grid_search.best_estimator_.ranking_)) if grid_search.best_estimator_.ranking_[i] == 1]
print(best_features)

Logistic regression with RFECV feature selection
Best score: 0.666501
{'estimator__C': 0.1}
Optimal number of features : 1
['diff_season_wages']


In [10]:
grid_search = GridSearchCV(log_reg, param_grid=param1,
                                   scoring=aucroc_score, cv=5,
                                   verbose=0, n_jobs=-1)
grid_search.fit(X_rbm, y)

print('Logistic regression with RBM feature extraction without feature selection')
print('Best score: %f' % grid_search.best_score_)
print(grid_search.best_params_)

Logistic regression with RBM feature extraction without feature selection
Best score: 0.500000
{'C': 1e-05}
