# PATH TOWARDS STACKING

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

In [76]:
CLASSIFIERS = {'rdmf': RandomForestClassifier(n_estimators=100, n_jobs=-1),
               'logreg': LogisticRegression(n_jobs=-1),
               'xgboost': XGBClassifier(n_estimators=24, learning_rate=0.05, max_depth=3,
                                        min_child_weight=1, gamma=0,
                                        scale_pos_weight=1, nthread=-1, seed=27)}

FEATURES_TO_KEEP = {'rdmf': ['h_season_points', 'h_mean_nb_goals_scored_home',
                             'h_mean_nb_goals_conceded_home', 'h_season_wages',
                             'a_mean_nb_goals_scored_away',
                             'a_mean_nb_goals_conceded_away',
                             'a_season_wages', 'distance_km'],
                    'logreg': ['h_nb_games_home', 'h_nb_victories', 'h_season_points',
                               'h_nb_games_total', 'h_nb_goals_scored_home',
                               'h_season_wages', 'a_nb_games_away', 'a_season_points',
                               'a_nb_games_total', 'a_season_wages'],
                    'xgboost': ['h_nb_victories', 'h_season_points',
                                'h_nb_games_total', 'h_nb_goals_scored_home',
                                'h_mean_nb_goals_scored_home', 'h_nb_goals_conceded_home',
                                'h_mean_nb_goals_conceded_home', 'h_season_wages',
                                'a_season_points', 'a_nb_goals_scored_away',
                                'a_mean_nb_goals_scored_away', 'a_mean_nb_goals_conceded_away',
                                'a_season_wages', 'capacity_home_stadium']}

In [77]:
FILEPATH = '../data/ML/E0_ML.csv'
data = pd.read_csv(FILEPATH)
data = data.drop('id', 1)
stacking_data = data.copy()

n_train = 2*len(stacking_data)/3
n_test = len(stacking_data) - n_train

stacking_data_train = stacking_data[0:n_train]
stacking_data_train=stacking_data_train.reset_index()

stacking_data_test = stacking_data[n_train:n_train+n_test]
stacking_data_test=stacking_data_test.reset_index()

In [78]:
y = data['home_win'].values

feat_data = data.drop('home_win', 1)


X = feat_data.values
standardizer = StandardScaler()
X = standardizer.fit_transform(X)

X_train = X[0:n_train]
y_train = y[0:n_train]

X_test = X[n_train:n_train+n_test]
y_test = y[n_train:n_train+n_test]

## Step 1: Metadata_train: Create k folds and associate the examples to one fold

In [79]:
stacking_data_train['fold_index']=np.nan
from sklearn.model_selection import StratifiedKFold
k=5
skf = StratifiedKFold(n_splits=k,random_state=17)
i=0
for train_index, test_index in skf.split(X_train, y_train):
    stacking_data_train.loc[test_index,'fold_index'] = i
    i+=1

## Step 2: Metada_train: For all models, train on k-1 folds, predict on the remaining fold

In [80]:
for algo_name in CLASSIFIERS:
    stacking_data_train[algo_name]=np.nan
    classifier = CLASSIFIERS[algo_name]
    for train_index, test_index in skf.split(X_train, y_train):
        classifier.fit(X_train[train_index],y_train[train_index])
        predictions = classifier.predict(X_train[test_index])
        
        for i in test_index:
            stacking_data_train.loc[i][algo_name]=predictions[i]
            
stacking_data_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


IndexError: index 394 is out of bounds for axis 0 with size 394