# Import libraries

In [1]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss
from sklearn import cross_validation
from sklearn import grid_search
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer as DV
import xgboost as xgboost
import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful



# Load data

In [2]:
# Read the training table, the test table and the submission template.
# Paths are relative, so they resolve against the kernel's working directory.
train = pd.read_csv('train2.csv')
test = pd.read_csv('test2.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Data

In [3]:
train.head(3)

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True
1,2998,28,61,29,301,259,True
2,2998,28,110,141,359,267,True


In [75]:
train.shape

(101609, 7)

In [4]:
test.head(3)

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161
1,1,3021,286,2
2,2,3020,232,52


In [5]:
sample_submission.head(3)

Unnamed: 0,Id,target
0,0,0.5
1,1,0.5
2,2,0.5


In [6]:
print 'train.shape = ', train.shape
print 'test.shape = ', test.shape
print 'sample_submission.shape = ', sample_submission.shape

train.shape =  (101609, 7)
test.shape =  (125207, 4)
sample_submission.shape =  (125207, 2)


# Work with categorical features

# Make dataset for cross-validation using One-Hot-Encoding

In [36]:
# Dense one-hot encoder for team ids. It is fitted once (on train.team1) and
# then reused for team2 and for the test set, so ids it was not fitted on must
# not abort the transform: handle_unknown='ignore' encodes them as all zeros.
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [37]:
# Fit the encoder on the team1 ids of the training set and one-hot encode them.
train_team1 = pd.DataFrame(enc.fit_transform(train[['team1']]))

In [38]:
print 'train_team1.shape = ', train_team1.shape

train_team1.shape =  (101609, 353)


In [39]:
# Encode team2 with the encoder already fitted on team1 (transform only, no refit),
# so both one-hot blocks share the same column layout.
train_team2 = pd.DataFrame(enc.transform(train[['team2']]))

In [40]:
print 'train_team2.shape = ', train_team2.shape

train_team2.shape =  (101609, 353)


In [41]:
# Put the team1 and team2 one-hot blocks side by side. Both blocks use the same
# integer column labels, so the labels repeat in the combined frame.
train_teams = pd.concat([train_team1, train_team2], axis=1)

In [42]:
print 'train_team1 + train_team2 = ', train_teams.shape

train_team1 + train_team2 =  (101609, 706)


In [43]:
# Encode the test-set team1 ids with the encoder fitted on the training set.
test_team1 = pd.DataFrame(enc.transform(test[['team1']]))

In [44]:
print 'test_team1 = ', test_team1.shape

test_team1 =  (125207, 353)


In [45]:
# Encode the test-set team2 ids with the same fitted encoder.
test_team2 = pd.DataFrame(enc.transform(test[['team2']]))

In [46]:
print 'test_team2 = ', test_team2.shape

test_team2 =  (125207, 353)


In [47]:
# Combine the two test one-hot blocks column-wise, mirroring train_teams.
test_teams = pd.concat([test_team1, test_team2], axis=1)

In [48]:
print 'test_teams.shape = ', test_teams.shape

test_teams.shape =  (125207, 706)


# Feature engineering

### Добавили среднее число очков: для команды team1 – среднее количество очков, набранных дома; для команды team2 – среднее количество очков, набранных в гостях

In [53]:
# Working names for the feature-engineering cells. Plain assignment does not
# copy: each name refers to the same DataFrame object as the original.
df_train = train
df_test = test

In [54]:
# Per-team average points, keyed by every team id that occurs as team1 in the
# training set:
#   means_score[team][0] -> mean score1 over the games where the team is team1
#   means_score[team][1] -> mean score2 over the games where the team is team2
# Both averages are truncated to int.
# The group means are computed once instead of re-filtering the frame per team.
home_score_mean = df_train.groupby('team1')['score1'].mean()
away_score_mean = df_train.groupby('team2')['score2'].mean()
# Fallback for a team that never appears as team2 in the training set
# (its away mean would otherwise be NaN and int() would raise).
overall_away_mean = df_train.score2.mean()

means_score = {}
for index_team in df_train.team1.unique():
    away_mean = away_score_mean.get(index_team, overall_away_mean)
    means_score[index_team] = [int(home_score_mean[index_team]), int(away_mean)]

In [55]:
# Team ids that occur in the test set (as team1 or team2) but never as team1 in
# the training set. The per-team lookup tables are keyed by train.team1, so
# these ids need fallback statistics.
# The known ids are collected once into a set for O(1) membership tests, and
# each missing id is recorded only once (first-seen order is kept).
known_teams = set(df_train.team1.unique())
teams_not_train = []
for team_column in ('team1', 'team2'):
    for index_team in df_test[team_column].unique():
        if index_team not in known_teams and index_team not in teams_not_train:
            teams_not_train.append(index_team)

In [56]:
# Teams without training history get the global averages of score1 and score2.
overall_score1_mean = df_train.score1.mean()
overall_score2_mean = df_train.score2.mean()
means_score.update(
    (unseen_team, [overall_score1_mean, overall_score2_mean])
    for unseen_team in teams_not_train
)

In [57]:
# For every row, look up the first average of its team1 and the second average
# of its team2 in means_score.
# Columns are addressed by name, so the result does not depend on column order
# or column count, and no throwaway `_`, `__`, `___` names are bound (IPython
# uses those names for recent cell outputs).
features_to_concat_train = [
    [means_score[team1][0], means_score[team2][1]]
    for team1, team2 in zip(df_train['team1'], df_train['team2'])
]
features_to_concat_test = [
    [means_score[team1][0], means_score[team2][1]]
    for team1, team2 in zip(df_test['team1'], df_test['team2'])
]

In [58]:
# Two-column frame of the looked-up score averages, one row per training game.
df_feature_to_concat_train = pd.DataFrame(
    features_to_concat_train,
    columns=['mean_home_score', 'mean_away_score'],
)

In [59]:
# Two-column frame of the looked-up score averages, one row per test game.
df_feature_to_concat_test = pd.DataFrame(
    features_to_concat_test,
    columns=['mean_home_score', 'mean_away_score'],
)

### Добавляем процент побед: для team1 – процент побед дома; для team2 – процент побед в гостях

In [64]:
# Per-team rates derived from `target`, keyed by every team id that occurs as
# team1 in the training set:
#   percentage_wins[team][0] -> mean of target over the games where the team is team1
#   percentage_wins[team][1] -> 1 - mean of target over the games where the team is team2
# The group means are computed once instead of re-filtering the frame per team.
target_as_float = df_train['target'].astype(float)
home_target_mean = target_as_float.groupby(df_train['team1']).mean()
away_target_mean = target_as_float.groupby(df_train['team2']).mean()
# Fallback for a team that never appears as team2 in the training set
# (its away rate would otherwise be NaN and propagate into the features).
overall_target_mean = df_train.target.mean()

percentage_wins = {}
for index_team in df_train.team1.unique():
    percentage_wins[index_team] = [
        home_target_mean[index_team],
        1 - away_target_mean.get(index_team, overall_target_mean),
    ]

In [65]:
# Teams without training history get the global target mean and its complement.
overall_target_rate = df_train.target.mean()
percentage_wins.update(
    (unseen_team, [overall_target_rate, 1 - overall_target_rate])
    for unseen_team in teams_not_train
)

In [66]:
# For every row, look up the first rate of its team1 and the second rate of its
# team2 in percentage_wins.
# Columns are addressed by name, so the result does not depend on column order
# or column count, and no throwaway `_`, `__`, `___` names are bound (IPython
# uses those names for recent cell outputs).
percentage_of_wins_train = [
    [percentage_wins[team1][0], percentage_wins[team2][1]]
    for team1, team2 in zip(df_train['team1'], df_train['team2'])
]
percentage_of_wins_test = [
    [percentage_wins[team1][0], percentage_wins[team2][1]]
    for team1, team2 in zip(df_test['team1'], df_test['team2'])
]

In [67]:
# Two-column frame of the looked-up win rates, one row per training game.
df_percentage_of_wins_train = pd.DataFrame(
    percentage_of_wins_train,
    columns=['percentage_wins_home', 'percentage_wins_away'],
)

In [68]:
# Two-column frame of the looked-up win rates, one row per test game.
df_percentage_of_wins_test = pd.DataFrame(
    percentage_of_wins_test,
    columns=['percentage_wins_home', 'percentage_wins_away'],
)

   # Make X_train, X_test, y_train, y_test

### Делаем X_train

In [87]:
# Append the engineered columns to the raw training table. concat(axis=1) aligns
# rows on the index, so this relies on all three frames sharing the same index.
X_train_all_feat = pd.concat([train, df_feature_to_concat_train, df_percentage_of_wins_train], axis=1)

In [88]:
print 'X_train_all_feat.shape = ', X_train_all_feat.shape

X_train_all_feat.shape =  (101609, 11)


In [89]:
X_train_all_feat.head(2)

Unnamed: 0,year,day,team1,team2,score1,score2,target,mean_home_score,mean_away_score,percentage_wins_home,percentage_wins_away
0,2998,19,317,131,336,278,True,289,285,0.688073,0.640118
1,2998,28,61,29,301,259,True,273,276,0.503333,0.556962


In [90]:
# Labels as a single-column DataFrame (double brackets), not a Series; later
# cells index the split labels with ['target'].
y_train =  X_train_all_feat[['target']]

In [91]:
# Remove the label, the raw team ids, and the columns that the test table does
# not have (day, score1, score2); year and the engineered features remain.
X_train_not_scaled = X_train_all_feat.drop(['day', 'team1', 'team2','score1', 'score2', 'target'], axis=1)

In [92]:
X_train_not_scaled.head(2)

Unnamed: 0,year,mean_home_score,mean_away_score,percentage_wins_home,percentage_wins_away
0,2998,289,285,0.688073,0.640118
1,2998,273,276,0.503333,0.556962


In [93]:
print 'X_train_not_scaled.shape = ', X_train_not_scaled.shape

X_train_not_scaled.shape =  (101609, 5)


In [97]:
print 'y_train.shape = ', y_train.shape

y_train.shape =  (101609, 1)


In [98]:
# Append the engineered columns to the raw test table (rows are aligned on the index).
X_test_all_feat = pd.concat([test, df_feature_to_concat_test, df_percentage_of_wins_test], axis=1)

In [99]:
X_test_all_feat.shape

(125207, 8)

In [100]:
X_test_all_feat.head(2)

Unnamed: 0,Id,year,team1,team2,mean_home_score,mean_away_score,percentage_wins_home,percentage_wins_away
0,0,3021,363,161,250.0,278.0,0.428571,0.535826
1,1,3021,286,2,271.0,242.0,0.593567,0.394464


In [101]:
# Remove the row id and the raw team ids so the columns match X_train_not_scaled.
X_test_not_scaled = X_test_all_feat.drop(['Id', 'team1', 'team2'], axis=1)

In [102]:
X_test_not_scaled.head(2)

Unnamed: 0,year,mean_home_score,mean_away_score,percentage_wins_home,percentage_wins_away
0,3021,250.0,278.0,0.428571,0.535826
1,3021,271.0,242.0,0.593567,0.394464


## X_train_not_scaled – преобразованная train.csv, но не нормированный year
## y_train – target from train.csv
## X_test_not_scaled – test.csv, но не нормированный year

# Нормировка 

In [103]:
# Standardise the numeric features. The scaler is fitted on the training
# features only; the same fitted scaler is then applied to both train and test.
scaler = StandardScaler()
scaler.fit(X_train_not_scaled)

train_columns = X_train_not_scaled.columns.tolist()
test_columns = X_test_not_scaled.columns.tolist()
train_scaled = pd.DataFrame(scaler.transform(X_train_not_scaled), columns=train_columns)
test_scaled = pd.DataFrame(scaler.transform(X_test_not_scaled), columns=test_columns)


In [104]:
train_scaled.head(2)

Unnamed: 0,year,mean_home_score,mean_away_score,percentage_wins_home,percentage_wins_away
0,-1.752835,1.411135,1.134001,1.573942,1.261859
1,-1.752835,0.236316,0.468392,0.019886,0.518128


In [105]:
test_scaled.head(2)

Unnamed: 0,year,mean_home_score,mean_away_score,percentage_wins_home,percentage_wins_away
0,1.873146,-1.452486,0.616305,-0.609021,0.329087
1,1.873146,0.089464,-2.046131,0.778944,-0.935227


In [106]:
# Final training matrix: the scaled numeric features followed by the one-hot team blocks.
X_train = pd.concat([train_scaled, train_teams], axis=1)

In [107]:
print 'X_train.shape = ', X_train.shape

X_train.shape =  (101609, 711)


In [108]:
X_train.head(3)

Unnamed: 0,year,mean_home_score,mean_away_score,percentage_wins_home,percentage_wins_away,0,1,2,3,4,...,343,344,345,346,347,348,349,350,351,352
0,-1.752835,1.411135,1.134001,1.573942,1.261859,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.752835,0.236316,0.468392,0.019886,0.518128,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.752835,1.264283,-0.197217,0.21717,-0.392998,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
# Final test matrix, built the same way as X_train.
X_test = pd.concat([test_scaled, test_teams], axis=1)

In [110]:
print 'X_test.shape = ', X_test.shape

X_test.shape =  (125207, 711)


In [111]:
X_test.head(3)

Unnamed: 0,year,mean_home_score,mean_away_score,percentage_wins_home,percentage_wins_away,0,1,2,3,4,...,343,344,345,346,347,348,349,350,351,352
0,1.873146,-1.452486,0.616305,-0.609021,0.329087,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.873146,0.089464,-2.046131,0.778944,-0.935227,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.715495,1.264283,-0.862826,0.415097,-2.705856,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# cross_validation

In [113]:
# Hold out 20% of the training rows for validation, stratified on the target so
# both parts keep the same class balance. random_state pins the shuffle so the
# split (and every score computed on it) is reproducible across kernel restarts.
train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

In [114]:
print np.sum(test_labels['target'] == True) / np.sum(test_labels['target'] == False)

1


In [115]:
print np.sum(train_labels['target'] == True) / np.sum(train_labels['target'] == False)

1


In [116]:
print 'train_data.shape = ', train_data.shape
print 'test_data.shape = ', test_data.shape
print 'train_labels.shape = ', train_labels.shape
print 'test_labels.shape = ', test_labels.shape

train_data.shape =  (81287, 711)
test_data.shape =  (20322, 711)
train_labels.shape =  (81287, 1)
test_labels.shape =  (20322, 1)


# XGB

## Первое приближение

In [237]:
# NOTE(review): X_train_hot_encod is not defined in any visible cell, so this
# line raises NameError on a fresh kernel -- confirm which frame was intended.
# It also rebinds train_data, discarding the training part of the split made above.
train_data = X_train_hot_encod

In [238]:
# Rebinds train_labels to the full label frame, replacing the split's training labels.
train_labels = y_train

In [191]:
# Hyper-parameter grid: three evenly spaced candidate values per parameter.
params = dict(
    max_depth=np.linspace(2, 12, 3, dtype=int),
    max_delta_step=np.linspace(0.03, 0.5, 3),
    n_estimators=np.linspace(80, 160, 3, dtype=int),
)

In [192]:
# XGBoost classifier constructed with its default settings.
xgb_classifier = xgboost.XGBClassifier()

In [195]:
# Flatten the labels to a 1-D array with one entry per row of train_data
# (the reshape fails if the two lengths differ); the last line displays the shape.
y_t = np.array(train_labels).reshape(train_data.shape[0])
y_t.shape

(101609,)

# Linear_Classifier

In [117]:
# Logistic regression with C=0.8; every other setting is left at its default.
lin_cl = linear_model.LogisticRegression(C=0.8)

In [118]:
# Fit on the current train_data / train_labels. The labels are held in a
# single-column DataFrame; .values.ravel() flattens them to the 1-D shape
# scikit-learn expects for y (a column vector triggers a conversion warning).
lin_cl.fit(train_data, train_labels.values.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [126]:
X_train.head(2)

Unnamed: 0,year,mean_home_score,mean_away_score,percentage_wins_home,percentage_wins_away,0,1,2,3,4,...,343,344,345,346,347,348,349,350,351,352
0,-1.752835,1.411135,1.134001,1.573942,1.261859,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.752835,0.236316,0.468392,0.019886,0.518128,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
lin_cl.coef_

array([[-0.00668974,  0.06378819, -0.05108153,  0.93504147, -0.91278323,
         0.72079846, -0.07815502,  0.43281329, -0.91981884, -1.14024575,
        -0.30502664, -0.68693202, -0.29918212, -0.30760465,  0.23331415,
         0.7965367 , -0.27939164, -0.39654278,  0.3669798 , -0.13714863,
         0.35024333,  0.78196997, -0.39085981, -0.07295957,  0.79807257,
        -1.30009247, -0.72655426, -0.11331618, -1.16255339, -0.20008637,
         0.51941386, -0.56700546,  0.07290809,  0.44662152,  0.12461971,
        -0.09822127, -0.11893335, -0.71696021,  0.25988168, -0.43666564,
        -0.16045751,  0.57614053,  0.28665246,  0.60660384, -0.2942108 ,
        -0.08527007,  0.19131387,  0.12743037, -0.62561027, -0.2033644 ,
         0.34728898, -0.40006455,  0.31609836,  0.12105912,  0.26564679,
         0.89148609,  0.37833594, -0.54639192, -1.06001436, -0.51449038,
         0.79246044,  0.28183858,  0.13540458,  0.20046986, -0.93388469,
        -0.73279018, -0.14079125,  0.2881775 ,  0.1

In [119]:
# Log-loss on the hold-out part, using the second column of predict_proba as the predicted probability.
log_loss(test_labels, lin_cl.predict_proba(test_data)[:,1])

0.60089956576544035

In [120]:
# Predicted probabilities (second column of predict_proba) for every row of the test matrix.
y_pred = lin_cl.predict_proba(X_test)[:,1]

# Вывод ответа

In [121]:
# Fill a copy of the submission template with the predictions and write it to disk.
ss = sample_submission.copy()
ss['target'] = y_pred
ss.to_csv('ans.csv', index=False)