In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import cross_val_predict
import seaborn as sns

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
# загрузка данных
trd = pd.read_csv('train.csv')
tsd = pd.read_csv('test.csv')
td = pd.concat([trd, tsd], ignore_index=True, sort = False, axis=0)

In [4]:
# количество нулей в данных
td.isnull().sum()

tour                          0
team_home                     0
team_away                     0
time                          0
champ                         0
team_home_substitution    19756
team_home_reserved        21886
team_home_yellow_cards    20398
team_home_squad           17076
team_home_red_cards       23261
team_away_substitution    19760
team_away_reserved        21871
team_away_yellow_cards    20187
team_away_squad           17077
team_away_red_cards       23114
team_home_score            7211
team_away_score            7211
target                     7211
dtype: int64

In [5]:
td.head(5)

Unnamed: 0,tour,team_home,team_away,time,champ,team_home_substitution,team_home_reserved,team_home_yellow_cards,team_home_squad,team_home_red_cards,team_away_substitution,team_away_reserved,team_away_yellow_cards,team_away_squad,team_away_red_cards,team_home_score,team_away_score,target
0,3,team140,team326,2015-08-22,champ8,,,,,,,,,,,1.0,6.0,2.0
1,19,team57,team179,2014-12-21,champ3,,,,,,,,,,,0.0,5.0,2.0
2,34,team169,team289,2011-05-01,champ13,"player123494, player192579, player169076",,"player67349, player123494","player120363, player124028, player195525, play...",,"player23933, player77864, player78867",,"player126840, player23933","player194866, player27147, player194129, playe...",,0.0,0.0,0.0
3,26,team16,team130,2017-02-12,champ1,,,,,,,,,,,4.0,2.0,1.0
4,35,team42,team249,2012-04-21,champ13,"player159174, player136476, player181390","player139979, player118277, player139541, play...","player30439, player110391","player32937, player143540, player50802, player...",,"player46210, player74713, player68501","player2823, player109113, player7299, player73659","player137575, player129771, player46210","player79128, player137575, player161921, playe...",,1.0,2.0,2.0


In [6]:
# перекодировка категориальных переменных из текстовых в числовые
from sklearn import preprocessing
td['team_home'] = preprocessing.LabelEncoder().fit_transform(td['team_home'])
td['team_away'] = preprocessing.LabelEncoder().fit_transform(td['team_away'])
td['champ'] = preprocessing.LabelEncoder().fit_transform(td['champ'])

In [8]:
# новые переменные - средняя результативность и средний результат команд
td["team_home_mean_target"] = td.team_home.map(td.groupby("team_home").target.mean())
td["team_away_mean_target"] = td.team_away.map(td.groupby("team_away").target.mean())
td["team_home_mean_score"] = td.team_home.map(td.groupby("team_home").team_home_score.mean())
td["team_away_mean_score"] = td.team_away.map(td.groupby("team_away").team_away_score.mean())

In [9]:
# новые переменные - год месяц
td['time_match'] = pd.to_datetime(td['time'])
td['year'] = td.time_match.apply(lambda x : x.year)
td['month'] = td.time_match.apply(lambda x : x.month)

In [11]:
# разложение категориальных перменных, которые могут быть полезны при прогнозировании результата
td = pd.concat([td, pd.get_dummies(td.tour, prefix="tour", drop_first = True)], axis=1)
td = pd.concat([td, pd.get_dummies(td.team_home, prefix="H", drop_first = True)], axis=1)
td = pd.concat([td, pd.get_dummies(td.team_away, prefix="A", drop_first = True)], axis=1)
td = pd.concat([td, pd.get_dummies(td.champ, prefix="ch", drop_first = True)], axis=1)
td = pd.concat([td, pd.get_dummies(td.month, prefix="m", drop_first = True)], axis=1)
td = pd.concat([td, pd.get_dummies(td.year, prefix="y", drop_first = True)], axis=1)

In [12]:
# удаление неиспользуемых переменных и разложенных
td.drop(['tour', 'team_home', 'team_away', 'champ', 'team_home_score', 'team_away_score'], axis=1, inplace=True)
td.drop(['time'], axis=1, inplace=True)
td.drop(['team_home_red_cards', 'team_away_red_cards'], axis=1, inplace=True)
td.drop(['month', 'year'], axis=1, inplace=True)
td.drop(['team_home_substitution', 'team_home_reserved', 'team_home_yellow_cards', 'team_home_squad', 
         'team_away_substitution', 'team_away_reserved', 'team_away_yellow_cards', 'team_away_squad'], axis=1, inplace=True)

In [13]:
td.tail(5)

Unnamed: 0,target,team_home_mean_target,team_away_mean_target,team_home_mean_score,team_away_mean_score,time_match,tour_2,tour_3,tour_4,tour_5,...,m_12,y_2010,y_2011,y_2012,y_2013,y_2014,y_2015,y_2016,y_2017,y_2018
23662,,1.051724,0.973684,1.448276,1.096491,2016-01-16,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
23663,,1.107143,1.55102,1.392857,2.346939,2017-05-07,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
23664,,,0.947826,,0.852174,2016-10-23,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
23665,,0.949367,1.092308,1.468354,1.338462,2017-10-01,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
23666,,,0.991071,,1.133929,2017-05-14,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
td.drop(['time_match'], axis=1, inplace=True)
td.head(5)

Unnamed: 0,target,team_home_mean_target,team_away_mean_target,team_home_mean_score,team_away_mean_score,tour_2,tour_3,tour_4,tour_5,tour_6,...,m_12,y_2010,y_2011,y_2012,y_2013,y_2014,y_2015,y_2016,y_2017,y_2018
0,2.0,1.25,1.108696,1.125,1.195652,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2.0,0.880342,1.08547,1.410256,1.452991,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0.0,1.094737,0.871795,1.294737,0.871795,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1.0,0.714286,1.136364,2.02381,1.5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2.0,0.991304,1.512821,3.217391,2.34188,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [16]:
td.shape

(23667, 789)

In [15]:
# разделение на имеющиеся данные и прогнозируемые
X_train = td[~td.target.isnull()]
X_test = td[td.target.isnull()]
y_train = td[~td.target.isnull()].target
y_test = td[td.target.isnull()].target

In [16]:
X_test.drop(['target'], axis=1, inplace=True)
X_train.drop(['target'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [56]:
from sklearn.model_selection import train_test_split

In [57]:
# разделение на тренировочную и тестовые выборки (без рандомизации, так как предсказываем будущее по прошлому)
X_train1, X_test1, y_train1, y_test1 = train_test_split(
                          X_train, y_train, random_state=42, test_size=.25, shuffle=False)

In [64]:
# обучение модели случайного леса и оценка модели по логлосс)
model = RandomForestClassifier(n_estimators=700,
max_features=0.2,
random_state=1,
max_depth=5,
n_jobs=-1)
model.fit(X_train1, y_train1)
predictions = model1.predict_proba(X_test1)
print("Score is {}".format(log_loss(y_test1, predictions)))

Score is 0.9929752874795208


In [65]:
submit = pd.DataFrame(model.predict_proba(X_test.fillna(0)), columns=["draw", "win", "lose"])
submit.to_csv("reshenie.csv")