In [1]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss
from sklearn import cross_validation
from sklearn import grid_search
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer as DV
import xgboost as xgboost
import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful



# Ridge Classification and Logistic Regression

In [2]:
train = pd.read_csv('train2.csv')
test = pd.read_csv('test2.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
train.head(2)

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True
1,2998,28,61,29,301,259,True


In [4]:
test.head(2)

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161
1,1,3021,286,2


In [5]:
enc = OneHotEncoder(sparse=False)
train_team1 = pd.DataFrame(enc.fit_transform(train[['team1']]))
train_team2 = pd.DataFrame(enc.transform(train[['team2']]))
train_teams = pd.concat([train_team1, train_team2], axis=1)

test_team1 = pd.DataFrame(enc.transform(test[['team1']]))
test_team2 = pd.DataFrame(enc.transform(test[['team2']]))
test_teams = pd.concat([test_team1, test_team2], axis=1)

In [6]:
test_teams.shape

(125207, 706)

In [7]:
X = pd.concat([train[['year']], train_teams], axis = 1)

In [8]:
X_test_submission = pd.concat([test[['year']], test_teams], axis = 1)

In [9]:
print X_test_submission.shape
for c in test.team2.unique():
    if(c not in train.team1.unique()):
        print c

(125207, 707)
162
1
113
130


In [10]:
X_train = X[X['year'] <= 3014]
y_train = train[train['year'] <= 3014].target
X_test = X[X['year'] > 3014]
y_test = train[train['year'] > 3014].target

In [11]:
alg = linear_model.RidgeClassifier()

In [12]:
alg.fit(X_train, y_train)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001)

In [13]:
y_pred = alg.predict(X_test)

In [14]:
log_loss(y_test, y_pred = y_pred)

13.193379599483086

In [15]:
alg.coef_[0, :3]

array([-0.00099407, -0.05548319, -0.06027442])

Коэффициент при years мал

# Взвешенная сумма алгоритмов

In [38]:
alg = linear_model.LogisticRegression(C = 0.8)

param = {}
param['max_depth'] = 25
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['eta'] = 0.05

numround = 700

In [39]:
train_data, test_data, train_labels, test_labels = sklearn.cross_validation.train_test_split(X, train.target,  
                                                                                     test_size = 0.3)

In [40]:
Xdatatrain = xgboost.DMatrix(data = X, label = train.target)
Xdatatest = xgboost.DMatrix(data = test_data, label = test_labels)

plst = list(param.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgboost.train(plst, Xdatatrain, numround, evals = watchlist)

[0]	train-logloss:0.691845	eval-logloss:0.691622
[1]	train-logloss:0.690467	eval-logloss:0.690226
[2]	train-logloss:0.689007	eval-logloss:0.688918
[3]	train-logloss:0.687932	eval-logloss:0.687740
[4]	train-logloss:0.686644	eval-logloss:0.686574
[5]	train-logloss:0.685556	eval-logloss:0.685707
[6]	train-logloss:0.684524	eval-logloss:0.684485
[7]	train-logloss:0.683499	eval-logloss:0.683419
[8]	train-logloss:0.682707	eval-logloss:0.682561
[9]	train-logloss:0.681612	eval-logloss:0.681675
[10]	train-logloss:0.680879	eval-logloss:0.680798
[11]	train-logloss:0.679833	eval-logloss:0.679895
[12]	train-logloss:0.679083	eval-logloss:0.679049
[13]	train-logloss:0.678238	eval-logloss:0.678253
[14]	train-logloss:0.676721	eval-logloss:0.677633
[15]	train-logloss:0.676671	eval-logloss:0.676903
[16]	train-logloss:0.675824	eval-logloss:0.676018
[17]	train-logloss:0.675001	eval-logloss:0.675170
[18]	train-logloss:0.674251	eval-logloss:0.674458
[19]	train-logloss:0.673367	eval-logloss:0.673734
[20]	train

In [41]:
alg.fit(X, train.target)
y_pred_alg = alg.predict_proba(X_test)[:, 1]
print "Log_loss alg = ", log_loss(y_test, y_pred_alg)
y_pred_bst = bst.predict(xgboost.DMatrix(X_test))
print "Log_loss bst = ", log_loss(y_test, y_pred_bst)

Log_loss alg =  0.591960325205
Log_loss bst =  0.489412404029


In [70]:
def losses_func(w):
    return log_loss(test_labels, (y_pred_alg*w[0] + y_pred_bst*w[1]))

In [71]:
import scipy.optimize as opt

In [72]:
x = opt.minimize(losses_func, method='Nelder-Mead', x0=[-1, 2])

In [73]:
print x.fun

0.582819318456


In [42]:
y_test_alg_submission = alg.predict_proba(X_test_submission)[:,1]
y_test_bst_submission = bst.predict(xgboost.DMatrix(X_test_submission))
y_test_submission = y_test_alg_submission*0.58 + y_test_bst_submission*(1-0.58)

In [43]:
ss = sample_submission.copy()

for i in range(len(y_test_submission)):
    c = y_test_submission[i]
    if(c <= 0.02):
        y_test_submission[i] = 0.0
    if(c >= 0.98):
        y_test_submission[i] = 1.0
        
for c in y_test_submission:
    if(c < 0 or c>1):
        print c
        
ss.target = y_test_submission
ss.to_csv('Double-alg-xgboost-lin_part1.csv', index = False)