## 1. Importing libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from feature_engine.datetime import DatetimeFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

## 2. Display settings

In [5]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns",None)

In [7]:
# Keep each transformer's default output container instead of forcing pandas output.
sklearn.set_config(transform_output="default")


## 3. Getting the data

In [10]:
# Load matches.csv via a relative path, so the notebook must run from the directory containing it.
df=pd.read_csv('matches.csv')

In [12]:
# Inspect 10 random rows; no random_state is passed, so the rows shown differ on every run.
df.sample(10)

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
3508,122,2020-12-16,20:00 (22:00),Premier League,Matchweek 13,Wed,Away,L,1,2,Liverpool,1.1,1.0,25,2000.0,Hugo Lloris,4-4-2,Anthony Taylor,Match Report,,8,2,19.2,1,0,0,2021,Tottenham Hotspur
2957,86,2022-04-20,19:45 (20:45),Premier League,Matchweek 30,Wed,Away,L,0,1,Newcastle Utd,0.5,0.9,63,51938.0,Marc Guéhi,4-2-3-1,Tony Harrington,Match Report,,10,2,15.4,0,0,0,2022,Crystal Palace
1128,42,2024-03-02,17:30 (19:30),Premier League,Matchweek 27,Sat,Away,W,3,2,Luton Town,1.9,1.8,54,11594.0,John McGinn,4-2-3-1,Michael Oliver,Match Report,,12,8,12.5,0,0,0,2024,Aston Villa
2848,90,2022-05-15,14:00 (15:00),Premier League,Matchweek 37,Sun,Away,D,1,1,Leeds United,2.2,1.6,50,36638.0,Lewis Dunk,3-4-3,Mike Dean,Match Report,,15,6,14.3,0,0,0,2022,Brighton and Hove Albion
843,10,2019-10-06,16:30 (17:30),Premier League,Matchweek 8,Sun,Away,L,0,1,Newcastle Utd,0.9,0.8,68,51198.0,Ashley Young,4-2-3-1,Mike Dean,Match Report,,12,3,17.9,1,0,0,2023,Manchester United
3129,65,2021-11-30,20:15 (22:15),Premier League,Matchweek 14,Tue,Home,W,1,0,Crystal Palace,1.4,1.1,61,35558.0,Liam Cooper,4-1-4-1,Kevin Friend,Match Report,,14,2,20.1,0,1,1,2022,Leeds United
320,17,2021-01-02,12:30 (14:30),Premier League,Matchweek 17,Sat,Away,L,0,3,Tottenham,1.1,2.6,63,,Luke Ayling,4-1-4-1,David Coote,Match Report,,18,5,18.5,0,0,0,2024,Leeds United
1057,45,2024-04-14,16:30 (17:30),Premier League,Matchweek 33,Sun,Home,L,0,2,Aston Villa,1.6,0.9,52,60350.0,Martin Ødegaard,4-3-3,David Coote,Match Report,,18,4,17.2,1,0,0,2024,Arsenal
1757,135,2022-10-16,16:30 (17:30),Premier League,Matchweek 11,Sun,Away,L,0,1,Liverpool,1.0,2.1,63,53286.0,İlkay Gündoğan,3-4-3,Anthony Taylor,Match Report,,16,6,18.4,0,0,0,2023,Manchester City
2985,75,2022-01-19,20:00 (22:00),Premier League,Matchweek 17,Wed,Home,L,1,3,Manchester Utd,2.3,2.3,44,17094.0,Pontus Jansson,3-5-2,Andre Marriner,Match Report,,18,9,16.7,0,0,0,2022,Brentford


In [14]:
# (rows, columns) of the raw frame.
df.shape

(4788, 28)

## 4. Dropping unnecessary columns

In [17]:
# Remove the columns that are not used in the rest of the notebook.
df = df.drop(
    columns=[
        'Unnamed: 0',
        'date',
        'result',
        'xga',
        'gf',
        'ga',
        'formation',
        'notes',
        'dist',
        'attendance',
    ]
)

In [19]:
# Check which columns remain after the drop.
df.head()


Unnamed: 0,time,comp,round,day,venue,opponent,xg,poss,captain,referee,match report,sh,sot,fk,pk,pkatt,season,team
0,20:15 (21:15),Premier League,Matchweek 2,Mon,Away,Wolves,1.9,65,Fernandinho,Andre Marriner,Match Report,13,8,2,1,1,2024,Manchester City
1,16:30 (17:30),Premier League,Matchweek 3,Sun,Home,Leicester City,0.9,72,Fernandinho,Michael Oliver,Match Report,16,5,1,0,0,2024,Manchester City
2,17:30 (18:30),Premier League,Matchweek 4,Sat,Away,Leeds United,1.2,49,Kevin De Bruyne,Mike Dean,Match Report,23,1,1,0,0,2024,Manchester City
3,17:30 (18:30),Premier League,Matchweek 5,Sat,Home,Arsenal,1.3,58,Raheem Sterling,Chris Kavanagh,Match Report,13,5,0,0,0,2024,Manchester City
4,12:30 (13:30),Premier League,Matchweek 6,Sat,Away,West Ham,1.0,69,Raheem Sterling,Anthony Taylor,Match Report,14,7,1,0,0,2024,Manchester City


## 5. Feature construction

In [None]:


def calculate_rolling_averages(df, cols, window):
    """Add per-team rolling means of ``cols`` that exclude the current row.

    For every column in ``cols`` a new column named ``<col>_roll_<window>`` is
    added. Each value is the mean of up to ``window`` consecutive rows of the
    same ``team`` ending at the previous row: the rolling mean is shifted by
    one row, so a row never sees its own value. The first row of every team
    therefore gets NaN.

    Rows are consumed in their existing order; the function does not sort.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``team`` column and every column listed in ``cols``.
    cols : list of str
        Names of the columns to average.
    window : int
        Maximum number of previous rows included in each mean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the rolling columns appended. The input frame
        is left untouched, so re-running the calling cell is safe.
    """
    new_col_suffix = f'_roll_{window}'

    # Work on a copy so the caller's frame is never modified in place.
    result = df.copy()
    per_team = df.groupby('team')

    for col in cols:
        # min_periods=1 lets early rows average over fewer than `window` values;
        # shift(1) moves each mean down one row so it only reflects earlier rows.
        result[col + new_col_suffix] = per_team[col].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean().shift(1)
        )
    return result

# Columns that get rolling-average features.
stats_cols = ["xg", "poss", "sh"]

# Two look-back windows: the previous 5 and the previous 10 rows per team.
df = calculate_rolling_averages(df, stats_cols, window=5)
df = calculate_rolling_averages(df, stats_cols, window=10)

# Drop the raw `poss` and `sh` columns (raw `xg` stays in the frame), then
# discard every row that still has a missing value in any column.
df = df.drop(columns=["poss", "sh"]).dropna()

# Keep only the text before the first space, e.g. '20:15 (21:15)' -> '20:15'.
df["time"] = df["time"].str.split(" ").str[0]

## 6. Splitting data into x, y and training and testing datasets

In [23]:
def split_data(data):
    """Separate the ``xg`` target from the remaining feature columns.

    Returns a ``(features, target)`` tuple: ``features`` is ``data`` without
    the ``xg`` column and ``target`` is the ``xg`` column itself. ``data`` is
    not modified.
    """
    features = data.drop(columns=['xg'])
    target = data['xg']
    return features, target

x, y = split_data(df)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## 6. Data preprocessing

In [25]:
# Look at the training features to decide how each column should be preprocessed.
x_train.head()

Unnamed: 0,time,comp,round,day,venue,opponent,captain,referee,match report,sot,fk,pk,pkatt,season,team,xg_roll_5,poss_roll_5,sh_roll_5,xg_roll_10,poss_roll_10,sh_roll_10
2787,19:30,Premier League,Matchweek 14,Wed,Away,Southampton,Kasper Schmeichel,Robert Jones,Match Report,6,0,0,0,2022,Leicester City,1.2,49.8,11.4,1.42,50.6,13.0
2571,19:45,Premier League,Matchweek 19,Wed,Home,Leeds United,Virgil van Dijk,Michael Oliver,Match Report,12,0,2,2,2022,Liverpool,2.34,64.8,20.6,2.13,62.5,19.3
641,20:15,Premier League,Matchweek 34,Mon,Home,West Ham,Ben Mee,Anthony Taylor,Match Report,1,0,1,1,2024,Burnley,1.44,42.2,13.6,1.22,44.2,11.3
1801,17:30,Premier League,Matchweek 18,Sat,Away,Brighton,Martin Ødegaard,Anthony Taylor,Match Report,7,0,0,0,2023,Arsenal,1.92,62.2,16.0,1.8,59.4,15.0
2712,14:00,Premier League,Matchweek 15,Sun,Home,Crystal Palace,Harry Maguire,Craig Pawson,Match Report,3,2,0,0,2022,Manchester United,1.32,43.0,8.0,1.46,49.5,12.7


In [27]:
# Column groups; each group is routed to its own preprocessing pipeline.
time_cols = ['time']
num_cols = [
    'sot', 'fk', 'pk', 'pkatt', 'season',
    'xg_roll_5', 'poss_roll_5', 'sh_roll_5',
    'xg_roll_10', 'poss_roll_10', 'sh_roll_10',
]
cate_cols = [
    'comp', 'round', 'day', 'venue', 'opponent',
    'captain', 'referee', 'match report', 'team',
]

In [29]:
# (rows, columns) of the training features.
x_train.shape


(3809, 21)

In [37]:


# Numeric columns: fill missing values with the median, then standardize.
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaling', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode into a dense array. handle_unknown='ignore' keeps transform
# from raising on categories that were not present during fit.
cate_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('scaling', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]
)

# Time column: fill missing values, extract hour and minute, then standardize.
time_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('extractor', DatetimeFeatures(features_to_extract=['hour', 'minute'], format='mixed')),
        ('scaling', StandardScaler()),
    ]
)




In [39]:
# Send each column group through its own pipeline and concatenate the results
# in the order listed here (time, numeric, categorical).
preprocessor = ColumnTransformer(
    transformers=[
        ('time', time_transformer, time_cols),
        ('num', num_transformer, num_cols),
        ('cate', cate_transformer, cate_cols),
    ]
)

In [41]:
# Smoke-test the preprocessor on the training features; the transformed array is only displayed, not stored.
preprocessor.fit_transform(x_train)

array([[ 1.07589886,  1.2829509 ,  0.70257196, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.07589886,  2.30900632,  3.14612386, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.47369529,  0.25689547, -1.33372129, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.70867611,  1.2829509 , -0.51920399, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.07589886, -0.76915995,  0.70257196, ...,  0.        ,
         0.        ,  0.        ],
       [-0.51528684, -0.76915995,  1.92434791, ...,  0.        ,
         0.        ,  0.        ]])

## 7. Model selection


In [43]:

# Candidate regressors, keyed by the display name used in the result tables.
# The seeds of the stochastic models are pinned so a rerun reproduces the same
# scores and the same ranking.
algorithms = {
    'Linear Regression': LinearRegression(),
    'Suport Vector Machine': SVR(),
    'Random Forest': RandomForestRegressor(n_estimators=10, random_state=42),
    'XG Boost': XGBRegressor(n_estimators=10, random_state=42)
}

In [45]:
def evaluate_regression_models(X_train, X_test, y_train, y_test, algorithms, preprocessor):
    """Fit one preprocessing + model pipeline per algorithm and score it on both splits.

    Parameters
    ----------
    X_train, X_test : feature sets used for fitting / held-out scoring.
    y_train, y_test : targets matching ``X_train`` / ``X_test``.
    algorithms : dict
        Maps a display name to an unfitted estimator.
    preprocessor : transformer
        Preprocessing step placed in front of every estimator.

    Returns
    -------
    results : dict
        ``name -> dict`` with the fitted ``'pipeline'``, MAE / RMSE / R² for
        the train and test splits, and the train and test predictions.
    fitted_models : dict
        ``name -> fitted pipeline`` (the same objects stored under
        ``results[name]['pipeline']``).

    Notes
    -----
    ``preprocessor`` and each estimator are cloned before fitting, so every
    returned pipeline owns its fitted steps and the objects passed in by the
    caller are not fitted or modified.
    """
    # Local import keeps this function self-contained.
    from sklearn.base import clone

    results = {}
    fitted_models = {}

    for name, model in algorithms.items():
        print(f" Training {name}...")

        # Fresh copies per algorithm: without cloning, every pipeline would
        # share (and refit) the same preprocessor instance.
        full_pipeline = Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('model', clone(model))
        ])

        full_pipeline.fit(X_train, y_train)

        y_pred_train = full_pipeline.predict(X_train)
        y_pred_test = full_pipeline.predict(X_test)

        train_mae = mean_absolute_error(y_train, y_pred_train)
        test_mae = mean_absolute_error(y_test, y_pred_test)
        # RMSE is the square root of the mean squared error.
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        train_r2 = r2_score(y_train, y_pred_train)
        test_r2 = r2_score(y_test, y_pred_test)

        results[name] = {
            'pipeline': full_pipeline,
            'train_mae': train_mae,
            'test_mae': test_mae,
            'train_rmse': train_rmse,
            'test_rmse': test_rmse,
            'train_r2': train_r2,
            'test_r2': test_r2,
            'predictions_train': y_pred_train,
            'predictions_test': y_pred_test
        }

        fitted_models[name] = full_pipeline

        # The last value is the test RMSE, so it is labelled as such.
        print(f"    {name} - Test R²: {test_r2:.4f}, Test MAE: {test_mae:.4f}, Test RMSE: {test_rmse:.4f}")

    return results, fitted_models

# Train and score every candidate on the same train/test split.
results, fitted_models = evaluate_regression_models(
    X_train=x_train,
    X_test=x_test,
    y_train=y_train,
    y_test=y_test,
    algorithms=algorithms,
    preprocessor=preprocessor,
)

 Training Linear Regression...
    Linear Regression - Test R²: -61533082835771604992.0000, Test MAE: 395488431.8004, Test: 6634547464.1979
 Training Suport Vector Machine...
    Suport Vector Machine - Test R²: 0.5859, Test MAE: 0.4069, Test: 0.5443
 Training Random Forest...
    Random Forest - Test R²: 0.6101, Test MAE: 0.3879, Test: 0.5281
 Training XG Boost...
    XG Boost - Test R²: 0.5840, Test MAE: 0.4191, Test: 0.5455


In [47]:
def compare_models(results):
    """Rank the evaluated models and pick the best one.

    ``results`` maps a model name to a metrics dict containing ``train_r2``,
    ``test_r2``, ``train_mae``, ``test_mae``, ``train_rmse`` and ``test_rmse``.

    Returns ``(comparison_df, best_model_name, best_model_metrics)``:
    a table with one row per model sorted by test R² (descending) and then
    test MAE (ascending), the name in its first row, and that model's entry
    from ``results``.
    """
    rows = [
        {
            'Model': model_name,
            'Train R²': scores['train_r2'],
            'Test R²': scores['test_r2'],
            'Train MAE': scores['train_mae'],
            'Test MAE': scores['test_mae'],
            'Train RMSE': scores['train_rmse'],
            'Test RMSE': scores['test_rmse'],
            # Gap between train and test R²; lower is better.
            'Overfitting Score': scores['train_r2'] - scores['test_r2'],
        }
        for model_name, scores in results.items()
    ]

    ranked = pd.DataFrame(rows).sort_values(
        by=['Test R²', 'Test MAE'], ascending=[False, True]
    )

    # After sorting, the first row is the winner.
    winner = ranked.iloc[0]['Model']
    return ranked, winner, results[winner]

comparison_df, best_model_name, best_model_metrics = compare_models(results)

# Print the ranked table between two rulers, followed by the winner's test metrics.
print(" MODEL COMPARISON RESULTS:")
print("=" * 80)
print(comparison_df.round(4))
print("=" * 80)
print(
    f" BEST MODEL: {best_model_name}",
    f" Test R²: {best_model_metrics['test_r2']:.4f}",
    f" Test MAE: {best_model_metrics['test_mae']:.4f}",
    f" Test RMSE: {best_model_metrics['test_rmse']:.4f}",
    sep="\n",
)

 MODEL COMPARISON RESULTS:
                   Model  Train R²       Test R²  Train MAE      Test MAE  \
2          Random Forest    0.9276  6.101000e-01     0.1507  3.879000e-01   
1  Suport Vector Machine    0.8044  5.859000e-01     0.2359  4.069000e-01   
3               XG Boost    0.6964  5.840000e-01     0.3583  4.191000e-01   
0      Linear Regression    0.6194 -6.153308e+19     0.3935  3.954884e+08   

   Train RMSE     Test RMSE  Overfitting Score  
2      0.2221  5.281000e-01       3.175000e-01  
1      0.3652  5.443000e-01       2.185000e-01  
3      0.4549  5.455000e-01       1.124000e-01  
0      0.5094  6.634547e+09       6.153308e+19  
 BEST MODEL: Random Forest
 Test R²: 0.6101
 Test MAE: 0.3879
 Test RMSE: 0.5281


## 8. Model training

In [49]:
# Final pipeline: preprocessing followed by a random forest.
# random_state is pinned so refitting reproduces the same model and scores;
# without it every run grows different trees.
model = Pipeline(steps =[
    ('preprocess',preprocessor),
    ('Random Forest',RandomForestRegressor(n_estimators=10, random_state=42))
])

In [51]:
# Fit the preprocessing steps and the regressor on the training split.
model.fit(x_train,y_train)

## 9. Model evaluation

In [53]:
def evaluate_model(x,y):
    """Return the R² score of the notebook-level ``model`` on features ``x`` against targets ``y``."""
    return r2_score(y, model.predict(x))

In [55]:
# Report R² on both splits; a large gap between the two points to overfitting.
print(
    f'R2 score on training data is : {evaluate_model(x_train,y_train)}',
    f'R2 score on test data is : {evaluate_model(x_test,y_test)}',
    sep='\n',
)

R2 score on training data is : 0.9258799728357796
R2 score on test data is : 0.629747060693046


## 10. Model persistence (saving the model)

In [58]:
# Serialize the fitted pipeline to model.joblib in the working directory.
joblib.dump(model,'model.joblib')

['model.joblib']

## 11. Loading the trained model and making a prediction

In [66]:
# Reload the pipeline from disk to check that the saved file is usable on its own.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
match_model=joblib.load('model.joblib')

In [68]:
# Display the loaded pipeline to confirm its steps.
match_model

In [70]:
# Show two training rows as a template for the hand-written input further down.
x_train.sample(2)

Unnamed: 0,time,comp,round,day,venue,opponent,captain,referee,match report,sot,fk,pk,pkatt,season,team,xg_roll_5,poss_roll_5,sh_roll_5,xg_roll_10,poss_roll_10,sh_roll_10
2283,15:00,Premier League,Matchweek 4,Sat,Away,Liverpool,Adam Smith,Stuart Attwell,Match Report,2,0,0,0,2023,Bournemouth,0.78,39.6,10.2,1.16,39.8,12.7
3160,15:00,Premier League,Matchweek 7,Sat,Home,Norwich City,James Tarkowski,Kevin Friend,Match Report,4,1,0,0,2022,Burnley,0.88,39.8,11.8,1.09,44.4,13.2


In [72]:
# Feature names in the same order as the training features.
column = ['time', 'comp', 'round', 'day', 'venue', 'opponent','captain', 'referee', 'match report', 'sot', 'fk', 'pk', 'pkatt','season', 'team', 'xg_roll_5', 'poss_roll_5', 'sh_roll_5','xg_roll_10', 'poss_roll_10', 'sh_roll_10']
# One hand-written match. Named `sample_row` rather than `input` so the
# built-in input() is not shadowed for the rest of the kernel session.
sample_row=[['13:30','Premier League','Matchweek 14','Sat','Home','Southampton','Kasper Schmeichel','Peter Bankes','Match Report',4,0,0,0,2023,'Leicester City',1.20,49.8, 11.4,1.42,50.6,13.0]]

data=pd.DataFrame(sample_row,columns=column)
# Predicted xg for the single row above.
match_model.predict(data)

array([1.56])