In [181]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss
from sklearn import cross_validation
from sklearn import grid_search
from sklearn.feature_extraction import DictVectorizer as DV
import xgboost as xgboost
import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful

In [182]:
train = pd.read_csv('train2.csv')
test = pd.read_csv('test2.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [183]:
train.head(3)

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True
1,2998,28,61,29,301,259,True
2,2998,28,110,141,359,267,True


In [184]:
print train.shape

(101609, 7)


In [185]:
test.head(3)

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161
1,1,3021,286,2
2,2,3020,232,52


In [186]:
print test.shape

(125207, 4)


In [187]:
dum_df = pd.concat([train[['team1', 'team2']], test[['team1','team2']]], axis=0)
print dum_df.shape

(226816, 2)


In [188]:
dum_df.head(2)

Unnamed: 0,team1,team2
0,317,131
1,61,29


In [189]:
dummies_team1 = pd.get_dummies(dum_df.team1)

In [190]:
dummies_team2 = pd.get_dummies(dum_df.team2)

In [191]:
dummies_team1.shape

(226816, 356)

In [192]:
dummies_team2.shape

(226816, 357)

In [193]:
dummies_team1.columns

Int64Index([  2,   3,   4,   5,   6,   7,   8,  10,  11,  12,
            ...
            355, 356, 357, 358, 359, 360, 361, 362, 363, 364],
           dtype='int64', length=356)

In [194]:
dummies_team2.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,  10,  11,
            ...
            355, 356, 357, 358, 359, 360, 361, 362, 363, 364],
           dtype='int64', length=357)

In [195]:
dummies_team1.columns & dummies_team2.columns

Int64Index([  2,   3,   4,   5,   6,   7,   8,  10,  11,  12,
            ...
            355, 356, 357, 358, 359, 360, 361, 362, 363, 364],
           dtype='int64', length=356)

In [196]:
cat = pd.concat([dummies_team1, dummies_team2], axis=1)

In [197]:
cat.shape

(226816, 713)

In [198]:
cat_train = cat[:101609]

In [199]:
cat_test = cat[101609:]

In [200]:
cat_train.shape

(101609, 713)

In [201]:
cat_test.shape

(125207, 713)

In [202]:
train.drop(['team1','team2','score1','score2','day'], axis=1, inplace=True)

In [203]:
train.head(3)

Unnamed: 0,year,target
0,2998,True
1,2998,True
2,2998,True


In [204]:
train.shape

(101609, 2)

In [205]:
X_train = pd.concat([train, cat_train], axis=1)

In [206]:
X_train.shape

(101609, 715)

In [207]:
X_train.head(3)

Unnamed: 0,year,target,2,3,4,5,6,7,8,10,...,355,356,357,358,359,360,361,362,363,364
0,2998,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2998,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2998,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [208]:
X_test = pd.concat([test, cat_test], axis=1)

In [209]:
X_test.head(3)

Unnamed: 0,Id,year,team1,team2,2,3,4,5,6,7,...,355,356,357,358,359,360,361,362,363,364
0,0,3021,363,161,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,3021,286,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,3020,232,52,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [210]:
print X_train.shape
print X_test.shape

(101609, 715)
(125207, 717)


In [211]:
y_train = X_train[['target']]
X_train.drop(['target'], inplace=True, axis=1)

In [212]:
X_train.head(3)

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,355,356,357,358,359,360,361,362,363,364
0,2998,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2998,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2998,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [213]:
X_train.shape

(101609, 714)

In [214]:
X_test.drop(['id','team1','team2'], inplace=True, axis=1)

In [215]:
X_test.head(3)

Unnamed: 0,Id,year,2,3,4,5,6,7,8,10,...,355,356,357,358,359,360,361,362,363,364
0,0,3021,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,3021,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,3020,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [216]:
X_test.shape

(125207, 715)

## Имеем
X_train - нормальные данные для обучения и оценки качества взятые с train.csv
X_test - надо предсказать, но осталась колонка id

In [217]:
cv_train, cv_test, y_cv_train, y_cv_test = cross_validation.train_test_split(X_train, y_train, test_size=0.2)

In [218]:
y_train.head(3)

Unnamed: 0,target
0,True
1,True
2,True


In [219]:
y_train.shape

(101609, 1)

In [220]:
cv_train.shape

(81287, 714)

In [221]:
cv_test.shape

(20322, 714)

In [222]:
y_cv_train.shape

(81287, 1)

In [223]:
cv_train.head(3)

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,355,356,357,358,359,360,361,362,363,364
77422,3015,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63882,3012,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5438,2999,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [225]:
cv_test.head(3)

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,355,356,357,358,359,360,361,362,363,364
53677,3010,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35029,3006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14461,3001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [304]:
xgb_cl = xgboost.XGBClassifier(max_depth=8, learning_rate=0.7,n_estimators=130)

In [305]:
xgb_cl.fit(cv_train, y_cv_train)

XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.7,
       max_delta_step=0, max_depth=8, min_child_weight=1, n_estimators=130,
       nthread=-1, objective='binary:logistic', seed=0, silent=True,
       subsample=1)

In [290]:
param_grid = {'max_depth': np.linspace(2,7,5, dtype=int), 
              'learning_rate': np.linspace(0.75,1.25,3, dtype=int),
              'n_estimators': np.linspace(80,120,3, dtype=int)}

In [292]:
cv_train.head(2)

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,355,356,357,358,359,360,361,362,363,364
77422,3015,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63882,3012,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [295]:
grid_cv = grid_search.GridSearchCV(estimator = xgb_cl, param_grid=param_grid)

In [296]:
cv_train.shape

(81287, 714)

In [297]:
y_cv_train.shape

(81287, 1)

In [298]:
grid_cv.fit(cv_train, y_cv_train)

IndexError: too many indices for array

In [289]:
cv_test.head(3)

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,355,356,357,358,359,360,361,362,363,364
53677,3010,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35029,3006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14461,3001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [306]:
y_cv_pred = xgb_cl.predict_proba(cv_test)[:,1]

In [307]:
log_loss(y_cv_test,y_cv_pred)

0.603119000746698

In [308]:
X_test.head(3)

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,355,356,357,358,359,360,361,362,363,364
0,3021,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3021,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3020,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [309]:
X_test.shape

(125207, 714)

In [310]:
X_test.drop(['Id'], inplace=True, axis=1)

In [311]:
X_test.head(3)

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,355,356,357,358,359,360,361,362,363,364
0,3021,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3021,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3020,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [312]:
y_pred = xgb_cl.predict_proba(X_test)[:,1]

In [313]:
ss = sample_submission.copy()


In [314]:
ss.shape

(125207, 2)

In [315]:
y_pred.shape

(125207,)

In [316]:
ss.target = y_pred
ss.to_csv('ans.csv', index=False)