In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv('./src/final_dataset.csv')
df.columns

Index(['date', 'season', 'country_name', 'league_name', 'match_api_id',
       'team_long_name_home', 'team_short_name_home', 'team_long_name_away',
       'team_short_name_away', 'match_result', 'home_players_average_rating',
       'away_players_average_rating', 'home_players_average_score',
       'away_players_average_score', 'home_team_score', 'away_team_score',
       'avg_bet_home', 'avg_bet_away'],
      dtype='object')

Для прогнозных данных нужны только самые актуальные показатели

In [3]:
df1_home = df[['date', 'country_name', 'league_name', 'team_long_name_home', 'team_short_name_home', 'home_players_average_rating', 'home_players_average_score', 'home_team_score']]
df1_home=df1_home.sort_values('date').drop_duplicates(['country_name', 'league_name',
       'team_long_name_home', 'team_short_name_home'], keep='last')
df1_home.drop('date', axis= 1 , inplace= True )
df1_away = df[['date', 'country_name', 'league_name', 'team_long_name_away', 'team_short_name_away', 'away_players_average_rating', 'away_players_average_score', 'away_team_score']]
df1_away=df1_away.sort_values('date').drop_duplicates(['country_name', 'league_name',
       'team_long_name_away', 'team_short_name_away'], keep='last')
df1_away.drop('date', axis= 1 , inplace= True )

Проверим, каждая ли команда играла и на своем поле, и на поле противника.

In [4]:
print(len(df1_home))
print(len(df1_away))

241
242


Определим недостающую команду, заполним недостающие данные для нее и добавим в массив

In [5]:
missed_team=df1_away[ ~ df1_away['team_long_name_away']. isin (df1_home['team_long_name_home'])]
missed_team.rename(columns = {'team_long_name_away':'team_long_name_home', 'team_short_name_away':'team_short_name_home', 'away_players_average_rating':'home_players_average_rating', 'away_players_average_score':'home_players_average_score', 'away_team_score':'home_team_score'}, inplace = True )
df1_home=pd.concat([df1_home, missed_team], ignore_index=True)
len(df1_home)

242

Соберем все возможные комбинации команд внутри лиг

In [6]:
cross_joined_df = pd.concat(
    [
        sub_df.merge(sub_df, on=['country_name','league_name']).query('team_long_name_home_x != team_long_name_home_y')
        for _, sub_df in df1_home.groupby('league_name')
    ])
cross_joined_df.rename(columns = {'team_long_name_home_x':'team_long_name_home', 'team_short_name_home_x':'team_short_name_home',
                                  'home_players_average_rating_x':'home_players_average_rating', 'home_players_average_score_x':'home_players_average_score', 'home_team_score_x':'home_team_score',
                                 'team_long_name_home_y':'team_long_name_away', 'team_short_name_home_y':'team_short_name_away',
                                 'home_players_average_rating_y':'away_players_average_rating', 'home_players_average_score_y':'away_players_average_score', 'home_team_score_y':'away_team_score'}, inplace = True )
cross_joined_df

Unnamed: 0,country_name,league_name,team_long_name_home,team_short_name_home,home_players_average_rating,home_players_average_score,home_team_score,team_long_name_away,team_short_name_away,away_players_average_rating,away_players_average_score,away_team_score
1,Belgium,Belgium Jupiler League,KAS Eupen,EUP,64.0,58.94,52.50,Sint-Truidense VV,STT,64.00,58.08,46.75
2,Belgium,Belgium Jupiler League,KAS Eupen,EUP,64.0,58.94,52.50,Oud-Heverlee Leuven,O-H,66.36,60.13,49.50
3,Belgium,Belgium Jupiler League,KAS Eupen,EUP,64.0,58.94,52.50,RAEC Mons,MON,64.91,59.76,50.75
4,Belgium,Belgium Jupiler League,KAS Eupen,EUP,64.0,58.94,52.50,Beerschot AC,BAC,62.91,57.84,49.38
5,Belgium,Belgium Jupiler League,KAS Eupen,EUP,64.0,58.94,52.50,Lierse SK,LIE,63.27,56.77,56.00
...,...,...,...,...,...,...,...,...,...,...,...,...
955,Spain,Spain LIGA BBVA,Villarreal CF,VIL,75.0,67.13,45.75,Sevilla FC,SEV,76.45,68.31,52.50
956,Spain,Spain LIGA BBVA,Villarreal CF,VIL,75.0,67.13,45.75,FC Barcelona,BAR,84.27,74.21,52.38
957,Spain,Spain LIGA BBVA,Villarreal CF,VIL,75.0,67.13,45.75,Real Madrid CF,REA,83.73,74.17,54.50
958,Spain,Spain LIGA BBVA,Villarreal CF,VIL,75.0,67.13,45.75,Getafe CF,GET,74.00,66.86,43.62


Добавим переменные, использованные в модели

In [7]:
cross_joined_df['rating_diff']=cross_joined_df['home_players_average_rating']-cross_joined_df['away_players_average_rating']
cross_joined_df['players_score_diff']=cross_joined_df['home_players_average_score']-cross_joined_df['away_players_average_score']
cross_joined_df['team_diff']=cross_joined_df['home_team_score']-cross_joined_df['away_team_score']
#cross_joined_df['bet_diff']=cross_joined_df['avg_bet_home']-cross_joined_df['avg_bet_away']

Приведем прогноз к формату исторических данных, использованных в основной модели проекта

In [8]:
homedf1=cross_joined_df[['country_name', 'league_name', 'team_long_name_home', 'team_short_name_home',
    'rating_diff', 'players_score_diff', 'team_diff']]
homedf1.rename(columns = {'team_long_name_home':'team_long_name', 'team_short_name_home':'team_short_name'}, inplace = True )
homedf1['homestage']=1
awaydf1=cross_joined_df[['country_name', 'league_name', 'team_long_name_away', 'team_short_name_away',
    'rating_diff', 'players_score_diff', 'team_diff']]
awaydf1.rename(columns = {'team_long_name_away':'team_long_name', 'team_short_name_away':'team_short_name'}, inplace = True )
awaydf1['homestage']=0
awaydf1['rating_diff'] = -1*awaydf1['rating_diff']
awaydf1['team_diff'] = -1*awaydf1['team_diff']
awaydf1['players_score_diff'] = -1*awaydf1['players_score_diff']
cross_joined_df = pd.concat([homedf1, awaydf1], ignore_index=True)
cross_joined_df.columns

Index(['country_name', 'league_name', 'team_long_name', 'team_short_name',
       'rating_diff', 'players_score_diff', 'team_diff', 'homestage'],
      dtype='object')

Вернемся к историческим данным и спрогнозируем bet_diff, avg_bet_home, avg_bet_away

In [9]:
df['rating_diff']=df['home_players_average_rating']-df['away_players_average_rating']
df['players_score_diff']=df['home_players_average_score']-df['away_players_average_score']
df['team_diff']=df['home_team_score']-df['away_team_score']
df['bet_diff']=df['avg_bet_home']-df['avg_bet_away']

In [10]:
homedf=df[['date', 'season', 'country_name', 'league_name', 'match_api_id', 'team_long_name_home', 'team_short_name_home',
    'match_result', 'rating_diff', 'players_score_diff', 'team_diff', 'bet_diff']]
homedf.rename(columns = {'team_long_name_home':'team_long_name', 'team_short_name_home':'team_short_name'}, inplace = True )
homedf['homestage']=1
homedf['match_result'] = homedf['match_result'].apply(lambda x: 1 if x == 1 else 0)
awaydf=df[['date', 'season', 'country_name', 'league_name', 'match_api_id', 'team_long_name_away', 'team_short_name_away',
    'match_result', 'rating_diff', 'players_score_diff', 'team_diff', 'bet_diff']]
awaydf.rename(columns = {'team_long_name_away':'team_long_name', 'team_short_name_away':'team_short_name'}, inplace = True )

awaydf['homestage']=0
awaydf['match_result'] = awaydf['match_result'].apply(lambda x: 1 if x == -1 else 0)
awaydf['rating_diff'] = -1*awaydf['rating_diff']
awaydf['team_diff'] = -1*awaydf['team_diff']
awaydf['players_score_diff'] = -1*awaydf['players_score_diff']
awaydf['bet_diff'] = -1*awaydf['bet_diff']
df = pd.concat([homedf, awaydf], ignore_index=True)
df.shape

(29832, 13)

Подберем возможные варианты коэффициентов букмекеров для будущих игр

In [13]:
X = df[['rating_diff', 'players_score_diff', 'team_diff', 'homestage']]
Y = df['bet_diff']

In [14]:
test_split_index = int(X.shape[0]*0.7)
X_train = X[:test_split_index]
X_test = X[test_split_index:]
Y_train = Y[:test_split_index]
Y_test = Y[test_split_index:]

In [15]:
model = LinearRegression()

In [16]:
model.fit(X_train, Y_train)

In [17]:
Y_predicted = model.predict(X_test)
Y_predicted

array([-0.7082649 , -0.21095404, -0.19052418, ..., -0.62591264,
       -0.17479556, -0.30736153])

In [18]:
print('MSE: %.2f' % mean_squared_error(Y_test, Y_predicted))
print('MAE: %.2f' % mean_absolute_error(Y_test, Y_predicted))
print('R2: %.2f' % r2_score(Y_test, Y_predicted))

MSE: 0.03
MAE: 0.13
R2: 0.80


Вероятность очень высокая, теперь, используя эту модель, мы можем заполнить bet_diff для прогноза 

In [19]:
predicted_bets=model.predict(cross_joined_df[['rating_diff', 'players_score_diff', 'team_diff', 'homestage']])
cross_joined_df['bet_diff']=predicted_bets

In [20]:
cross_joined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13064 entries, 0 to 13063
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_name        13064 non-null  object 
 1   league_name         13064 non-null  object 
 2   team_long_name      13064 non-null  object 
 3   team_short_name     13064 non-null  object 
 4   rating_diff         13064 non-null  float64
 5   players_score_diff  13064 non-null  float64
 6   team_diff           13064 non-null  float64
 7   homestage           13064 non-null  int64  
 8   bet_diff            13064 non-null  float64
dtypes: float64(4), int64(1), object(4)
memory usage: 918.7+ KB


In [22]:
cross_joined_df[['country_name', 'league_name',
       'team_long_name', 'team_short_name', 'rating_diff', 'players_score_diff', 'team_diff', 'bet_diff', 'homestage']].to_csv(
    './src/new_variables.csv', index=False)