In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Загрузка данных
df = pd.read_csv("Match.csv")
teams = pd.read_csv("Team_Attributes.csv")
df

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,...,4.00,1.65,3.40,4.50,1.78,3.25,4.00,1.73,3.40,4.20
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,...,3.80,2.00,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.60
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,...,2.50,2.35,3.25,2.65,2.50,3.20,2.50,2.30,3.20,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,...,7.50,1.45,3.75,6.50,1.50,3.75,5.50,1.44,3.75,6.50
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,...,1.73,4.50,3.40,1.65,4.50,3.50,1.65,4.75,3.30,1.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25974,25975,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992091,10190,10191,1,...,,,,,,,,,,
25975,25976,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992092,9824,10199,1,...,,,,,,,,,,
25976,25977,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992093,9956,10179,2,...,,,,,,,,,,
25977,25978,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992094,7896,10243,0,...,,,,,,,,,,


In [2]:
# Преобразование дат в datetime
df['date'] = pd.to_datetime(df['date'])
teams['date'] = pd.to_datetime(teams['date'])
df['date']

0       2008-08-17
1       2008-08-16
2       2008-08-16
3       2008-08-17
4       2008-08-16
           ...    
25974   2015-09-22
25975   2015-09-23
25976   2015-09-23
25977   2015-09-22
25978   2015-09-23
Name: date, Length: 25979, dtype: datetime64[ns]

In [3]:
# Сколько очков за матч получат команды
df['result'] = 1
df.loc[df['home_team_goal'] > df['away_team_goal'], 'result'] = 2
df.loc[df['home_team_goal'] < df['away_team_goal'], 'result'] = 0

In [5]:
# Создаем копию, где оставим только то что нам нужно
teams_clean = teams.dropna(subset=['team_api_id', 'date', 'buildUpPlaySpeed', 'buildUpPlayPassing', 'buildUpPlayDribbling']).copy()
teams_clean

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
1,2,434,9930,2014-09-19,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
7,8,77,8485,2014-09-19,58,Balanced,64.0,Normal,62,Mixed,...,57,Normal,Organised,41,Medium,42,Press,60,Normal,Cover
8,9,77,8485,2015-09-10,59,Balanced,64.0,Normal,53,Mixed,...,63,Normal,Free Form,49,Medium,45,Press,63,Normal,Cover
13,14,614,8576,2014-09-19,59,Balanced,57.0,Normal,52,Mixed,...,52,Normal,Organised,38,Medium,47,Press,53,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1449,1450,244,8394,2015-09-10,52,Balanced,52.0,Normal,50,Mixed,...,57,Normal,Organised,48,Medium,43,Press,49,Normal,Cover
1450,1451,112512,8027,2014-09-19,54,Balanced,51.0,Normal,40,Mixed,...,52,Normal,Organised,44,Medium,47,Press,52,Normal,Cover
1451,1452,112512,8027,2015-09-10,54,Balanced,51.0,Normal,40,Mixed,...,52,Normal,Organised,44,Medium,47,Press,52,Normal,Cover
1456,1457,15005,10000,2014-09-19,54,Balanced,42.0,Normal,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover


In [6]:
# Добавляем атрибуты домашней команды
df_with_attrs = pd.merge_asof(
    df.sort_values('date'),
    teams_clean.sort_values('date'),
    left_on='date',
    right_on='date',
    left_by='home_team_api_id',
    right_by='team_api_id',
    direction='backward',
)
df_with_attrs = df_with_attrs.rename(columns={
    'buildUpPlaySpeed': 'buildUpPlaySpeed_home',
    'buildUpPlayPassing': 'buildUpPlayPassing_home', 
    'buildUpPlayDribbling': 'buildUpPlayDribbling_home'
})
df_with_attrs['buildUpPlaySpeed_home']

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
25974    61.0
25975     NaN
25976    63.0
25977    58.0
25978    62.0
Name: buildUpPlaySpeed_home, Length: 25979, dtype: float64

In [7]:
df_with_attrs = pd.merge_asof(
    df_with_attrs.sort_values('date'),
    teams_clean.sort_values('date'),
    left_on='date',
    right_on='date',
    left_by='away_team_api_id',
    right_by='team_api_id',
    direction='backward',
)
df_with_attrs = df_with_attrs.rename(columns={
    'buildUpPlaySpeed': 'buildUpPlaySpeed_away',
    'buildUpPlayPassing': 'buildUpPlayPassing_away',
    'buildUpPlayDribbling': 'buildUpPlayDribbling_away'
})

In [8]:
# Заполняем пропущенные значения
df_with_attrs[['buildUpPlaySpeed_home', 'buildUpPlayPassing_home', 'buildUpPlayDribbling_home']] = \
    df_with_attrs[['buildUpPlaySpeed_home', 'buildUpPlayPassing_home', 'buildUpPlayDribbling_home']].fillna(0)
df_with_attrs[['buildUpPlaySpeed_away', 'buildUpPlayPassing_away', 'buildUpPlayDribbling_away']] = \
    df_with_attrs[['buildUpPlaySpeed_away', 'buildUpPlayPassing_away', 'buildUpPlayDribbling_away']].fillna(0)

In [9]:
# Создаем признаки разницы между командами
df_with_attrs['speed_diff'] = df_with_attrs['buildUpPlaySpeed_home'] - df_with_attrs['buildUpPlaySpeed_away']
df_with_attrs['passing_diff'] = df_with_attrs['buildUpPlayPassing_home'] - df_with_attrs['buildUpPlayPassing_away']
df_with_attrs['dribbling_diff'] = df_with_attrs['buildUpPlayDribbling_home'] - df_with_attrs['buildUpPlayDribbling_away']

In [10]:
df = df.sort_values('date')
# Подготовка данных для модели
X = df_with_attrs[[
    'home_team_api_id', 'away_team_api_id', 'date','speed_diff', 'passing_diff', 'dribbling_diff', 'B365H', 'B365A', 'B365D', 'BWH', 'BWD', 'BWA'
    # 'home_team_api_id', 'away_team_api_id', 'B365H', 'B365A', 'B365D', 'BWH', 'BWD', 'BWA'
]]
y = df_with_attrs['result']
X

Unnamed: 0,home_team_api_id,away_team_api_id,date,speed_diff,passing_diff,dribbling_diff,B365H,B365A,B365D,BWH,BWD,BWA
0,10192,9931,2008-07-18,0.0,0.0,0.0,,,,,,
1,9930,10179,2008-07-19,0.0,0.0,0.0,,,,,,
2,10199,9824,2008-07-20,0.0,0.0,0.0,,,,,,
3,7955,10243,2008-07-20,0.0,0.0,0.0,,,,,,
4,6493,7955,2008-07-23,0.0,0.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
25974,10191,10192,2016-05-25,6.0,-29.0,3.0,,,,,,
25975,9931,9956,2016-05-25,11.0,-8.0,3.0,,,,,,
25976,7896,10190,2016-05-25,-52.0,-48.0,-46.0,,,,,,
25977,10199,10179,2016-05-25,2.0,14.0,-6.0,,,,,,


In [11]:
# Разделение данных и обучение модели
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)
model = CatBoostClassifier(random_state=42, verbose=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Оценка точности
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Поражение дома', 'Ничья', 'Победа дома']))

Accuracy: 0.4834

Classification Report:
                precision    recall  f1-score   support

Поражение дома       0.45      0.49      0.47      1561
         Ничья       0.30      0.19      0.23      1330
   Победа дома       0.56      0.65      0.61      2305

      accuracy                           0.48      5196
     macro avg       0.44      0.44      0.43      5196
  weighted avg       0.46      0.48      0.47      5196



In [15]:
# Удаляем столбец 'date'
X_numeric = X.drop('date', axis=1)

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, shuffle=False, random_state=42)

# Создание и обучение модели
model = xgb.XGBClassifier(random_state=42, verbosity=0, use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

# Оценка модели
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Поражение дома', 'Ничья', 'Победа дома']))

Accuracy: 0.4871

Classification Report:
                precision    recall  f1-score   support

Поражение дома       0.48      0.43      0.45      1561
         Ничья       0.31      0.20      0.25      1330
   Победа дома       0.54      0.69      0.61      2305

      accuracy                           0.49      5196
     macro avg       0.44      0.44      0.44      5196
  weighted avg       0.46      0.49      0.47      5196



In [16]:
def advanced_rolling_validation_df(X, y, initial_train_ratio=0.7):
    X_numeric = X.select_dtypes(include=[np.number])
    total_samples = len(X_numeric)
    initial_train_size = int(total_samples * initial_train_ratio)
    val_size = total_samples - initial_train_size
    
    results = []
    
    for i in tqdm(range(300)):
        train_end = initial_train_size + i
        test_index = initial_train_size + i
        
        X_train = X_numeric.iloc[:train_end]
        y_train = y.iloc[:train_end]
        X_test = X_numeric.iloc[test_index:test_index+1]
        y_test = y.iloc[test_index:test_index+1].values[0]
        
        model = xgb.XGBClassifier(random_state=42, verbosity=0, use_label_encoder=False, eval_metric='mlogloss')
        model.fit(X_train, y_train)
        
        y_proba = model.predict_proba(X_test)[0]
        y_pred = model.predict(X_test)[0]
        
        results.append({
            'step': i + 1,
            'true_value': y_test,
            'predicted_class': y_pred,
            'prob_class_0': y_proba[0],
            'prob_class_1': y_proba[1], 
            'prob_class_2': y_proba[2],
            'max_probability': np.max(y_proba),
            'is_correct': y_test == y_pred,
            'training_samples': train_end
        })
    
    # Создаем DataFrame
    df_results = pd.DataFrame(results)
    
    # Выводим статистику
    accuracy = df_results['is_correct'].mean()
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"\nСредняя уверенность: {df_results['max_probability'].mean():.3f}")
    
    return df_results

# Запускаем и получаем красивый DataFrame
results_df = advanced_rolling_validation_df(X, y, initial_train_ratio=0.7)

# Выводим первые несколько строк
print("\nРезультаты прогнозов:")
print(results_df.head(10))

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [10:32<00:00,  2.11s/it]


Accuracy: 0.5300

Средняя уверенность: 0.550

Результаты прогнозов:
   step  true_value  predicted_class  prob_class_0  prob_class_1  \
0     1           2                2      0.164355      0.246383   
1     2           1                0      0.609333      0.329447   
2     3           2                2      0.239059      0.344982   
3     4           0                2      0.210185      0.276335   
4     5           1                0      0.355119      0.302250   
5     6           0                0      0.600851      0.291101   
6     7           0                0      0.579409      0.235098   
7     8           1                2      0.246306      0.285873   
8     9           1                0      0.604394      0.269280   
9    10           2                2      0.160302      0.156024   

   prob_class_2  max_probability  is_correct  training_samples  
0      0.589262         0.589262        True             18185  
1      0.061220         0.609333       False        


