In [35]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression

In [36]:
df = pd.read_csv('./src/final_dataset.csv')

In [37]:
df.columns

Index(['date', 'season', 'country_name', 'league_name', 'match_api_id',
       'team_long_name_home', 'team_short_name_home', 'team_long_name_away',
       'team_short_name_away', 'match_result', 'home_players_average_rating',
       'away_players_average_rating', 'home_players_average_score',
       'away_players_average_score', 'home_team_score', 'away_team_score',
       'avg_bet_home', 'avg_bet_away'],
      dtype='object')

Разделим каждую строчку на 2: одну для home_team, одну для away_team

In [38]:
df['rating_diff']=df['home_players_average_rating']-df['away_players_average_rating']
df['players_score_diff']=df['home_players_average_score']-df['away_players_average_score']
df['team_diff']=df['home_team_score']-df['away_team_score']
df['bet_diff']=df['avg_bet_home']-df['avg_bet_away']

In [39]:
homedf=df[['date', 'season', 'country_name', 'league_name', 'match_api_id', 'team_long_name_home', 'team_short_name_home',
    'match_result', 'rating_diff', 'players_score_diff', 'team_diff', 'bet_diff']]
homedf.rename(columns = {'team_long_name_home':'team_long_name', 'team_short_name_home':'team_short_name'}, inplace = True )
homedf['homestage']=1
homedf['match_result'] = homedf['match_result'].apply(lambda x: 1 if x == 1 else 0)
awaydf=df[['date', 'season', 'country_name', 'league_name', 'match_api_id', 'team_long_name_away', 'team_short_name_away',
    'match_result', 'rating_diff', 'players_score_diff', 'team_diff', 'bet_diff']]
awaydf.rename(columns = {'team_long_name_away':'team_long_name', 'team_short_name_away':'team_short_name'}, inplace = True )

awaydf['homestage']=0
awaydf['match_result'] = awaydf['match_result'].apply(lambda x: 1 if x == -1 else 0)
awaydf['rating_diff'] = -1*awaydf['rating_diff']
awaydf['team_diff'] = -1*awaydf['team_diff']
awaydf['players_score_diff'] = -1*awaydf['players_score_diff']
awaydf['bet_diff'] = -1*awaydf['bet_diff']
df = pd.concat([homedf, awaydf], ignore_index=True)

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29832 entries, 0 to 29831
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   date                29832 non-null  object 
 1   season              29832 non-null  object 
 2   country_name        29832 non-null  object 
 3   league_name         29832 non-null  object 
 4   match_api_id        29832 non-null  int64  
 5   team_long_name      29832 non-null  object 
 6   team_short_name     29832 non-null  object 
 7   match_result        29832 non-null  int64  
 8   rating_diff         29832 non-null  float64
 9   players_score_diff  29832 non-null  float64
 10  team_diff           29832 non-null  float64
 11  bet_diff            29832 non-null  float64
 12  homestage           29832 non-null  int64  
dtypes: float64(4), int64(3), object(6)
memory usage: 3.0+ MB


Параметры для обучения

In [40]:
log_reg=LogisticRegression()
X = df[['rating_diff', 'players_score_diff', 'team_diff', 'bet_diff',
       'homestage']]
Y = df['match_result']

In [41]:
test_split_index=int(X.shape[0]*0.75)
X_train=X[:test_split_index]
X_test=X[test_split_index:]
Y_train=Y[:test_split_index]
Y_test=Y[test_split_index:]

In [42]:
log_reg.fit(X_train, Y_train)
Y_predicted=log_reg.predict(X_test)

In [43]:
print('Overall accuracy:', accuracy_score(Y_test, Y_predicted))
print('Overall precision:', precision_score(Y_test, Y_predicted))
print('Overall recall:', recall_score(Y_test, Y_predicted))

Overall accuracy: 0.7459104317511397
Overall precision: 0.6376376376376376
Overall recall: 0.29354838709677417


In [49]:
forecast_df = pd.read_csv('./src/new_variables.csv')
forecast_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13064 entries, 0 to 13063
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_name        13064 non-null  object 
 1   league_name         13064 non-null  object 
 2   team_long_name      13064 non-null  object 
 3   team_short_name     13064 non-null  object 
 4   rating_diff         13064 non-null  float64
 5   players_score_diff  13064 non-null  float64
 6   team_diff           13064 non-null  float64
 7   bet_diff            13064 non-null  float64
 8   homestage           13064 non-null  int64  
dtypes: float64(4), int64(1), object(4)
memory usage: 918.7+ KB


In [51]:
predicted_match_result=log_reg.predict(forecast_df[['rating_diff', 'players_score_diff', 'team_diff', 'bet_diff',
       'homestage']])
forecast_df['match_result']=predicted_match_result
forecast_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13064 entries, 0 to 13063
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_name        13064 non-null  object 
 1   league_name         13064 non-null  object 
 2   team_long_name      13064 non-null  object 
 3   team_short_name     13064 non-null  object 
 4   rating_diff         13064 non-null  float64
 5   players_score_diff  13064 non-null  float64
 6   team_diff           13064 non-null  float64
 7   bet_diff            13064 non-null  float64
 8   homestage           13064 non-null  int64  
 9   match_result        13064 non-null  int64  
dtypes: float64(4), int64(2), object(4)
memory usage: 1020.8+ KB


In [52]:
forecast_df.to_csv(
    './src/final_forecast.csv', index=False)