## 1. importing libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from feature_engine.datetime import DatetimeFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

## 2. Display setting

In [158]:
pd.set_option("display.max_columns",None)

In [160]:
sklearn.set_config(transform_output="pandas")


## 3. getting the data

In [163]:
df=pd.read_csv('matches.csv')

In [165]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,0,2020-09-21,20:15 (21:15),Premier League,Matchweek 2,Mon,Away,W,3,1,Wolves,1.9,0.6,65,,Fernandinho,4-2-3-1,Andre Marriner,Match Report,,13,8,21.1,2,1,1,2024,Manchester City
1,2,2020-09-27,16:30 (17:30),Premier League,Matchweek 3,Sun,Home,L,2,5,Leicester City,0.9,2.9,72,,Fernandinho,4-2-3-1,Michael Oliver,Match Report,,16,5,19.8,1,0,0,2024,Manchester City
2,4,2020-10-03,17:30 (18:30),Premier League,Matchweek 4,Sat,Away,D,1,1,Leeds United,1.2,2.4,49,,Kevin De Bruyne,4-3-3,Mike Dean,Match Report,,23,1,18.2,1,0,0,2024,Manchester City


In [167]:
df.shape

(4788, 28)

## 4 droping unnecessary colummns

In [170]:
df.drop(columns=['Unnamed: 0','date','result','xga','gf','ga','formation','notes','dist','attendance'],inplace=True)

In [88]:
print(df.shape[0])


4788


## 5 constracting new features

In [172]:


def calculate_rolling_averages(df, cols, window):
   
    new_col_suffix = f'_roll_{window}'

    
    df_temp = df[['team'] + cols]
    
    for col in cols:
        new_col_name = col + new_col_suffix
        
        df[new_col_name] = df_temp.groupby('team')[col].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean().shift(1)
        )
    return df

stats_cols = ['xg', 'poss', 'sh']

df = calculate_rolling_averages(df, stats_cols, window=5)

df = calculate_rolling_averages(df, stats_cols, window=10)


df.to_csv('matches_with_lagged_features.csv', index=False)

df.drop(columns=['poss','sh'],inplace=True)

df= df.dropna()

df['time'] = df['time'].str.split(' ').str[0]

## 6 spliting data into x,y and training and testing datasets

In [174]:
def split_data(data):
    x=data.drop(columns=['xg'])
    y=data['xg']
    return (x,y)

x,y=split_data(df)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

## 6 data preprocessing

In [176]:
time_cols=['time']
num_cols=['sot','fk','pk','pkatt','season','xg_roll_5','poss_roll_5','sh_roll_5','xg_roll_10','poss_roll_10','sh_roll_10']
cate_cols=['comp','round','day','venue','opponent','captain','referee','match report','team']

In [204]:
x_train.head()

Unnamed: 0,time,comp,round,day,venue,opponent,captain,referee,match report,sot,fk,pk,pkatt,season,team,xg_roll_5,poss_roll_5,sh_roll_5,xg_roll_10,poss_roll_10,sh_roll_10
2787,19:30,Premier League,Matchweek 14,Wed,Away,Southampton,Kasper Schmeichel,Robert Jones,Match Report,6,0,0,0,2022,Leicester City,1.2,49.8,11.4,1.42,50.6,13.0
2571,19:45,Premier League,Matchweek 19,Wed,Home,Leeds United,Virgil van Dijk,Michael Oliver,Match Report,12,0,2,2,2022,Liverpool,2.34,64.8,20.6,2.13,62.5,19.3
641,20:15,Premier League,Matchweek 34,Mon,Home,West Ham,Ben Mee,Anthony Taylor,Match Report,1,0,1,1,2024,Burnley,1.44,42.2,13.6,1.22,44.2,11.3
1801,17:30,Premier League,Matchweek 18,Sat,Away,Brighton,Martin Ødegaard,Anthony Taylor,Match Report,7,0,0,0,2023,Arsenal,1.92,62.2,16.0,1.8,59.4,15.0
2712,14:00,Premier League,Matchweek 15,Sun,Home,Crystal Palace,Harry Maguire,Craig Pawson,Match Report,3,2,0,0,2022,Manchester United,1.32,43.0,8.0,1.46,49.5,12.7


In [178]:


num_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaling',StandardScaler())
])

cate_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('scaling',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
    ])

time_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('extractor',DatetimeFeatures(features_to_extract=['hour','minute'],format='mixed')),
    ('scaling',StandardScaler())
     ])




In [180]:
preprocessor = ColumnTransformer(transformers = [
    ('num',num_transformer,num_cols),
    ('cate',cate_transformer,cate_cols),
    ('time',time_transformer,time_cols)
])

In [230]:
preprocessor.fit_transform(x_train).head()

Unnamed: 0,num__sot,num__fk,num__pk,num__pkatt,num__season,num__xg_roll_5,num__poss_roll_5,num__sh_roll_5,num__xg_roll_10,num__poss_roll_10,num__sh_roll_10,cate__comp_Premier League,cate__round_Matchweek 1,cate__round_Matchweek 10,cate__round_Matchweek 11,cate__round_Matchweek 12,cate__round_Matchweek 13,cate__round_Matchweek 14,cate__round_Matchweek 15,cate__round_Matchweek 16,cate__round_Matchweek 17,cate__round_Matchweek 18,cate__round_Matchweek 19,cate__round_Matchweek 2,cate__round_Matchweek 20,cate__round_Matchweek 21,cate__round_Matchweek 22,cate__round_Matchweek 23,cate__round_Matchweek 24,cate__round_Matchweek 25,cate__round_Matchweek 26,cate__round_Matchweek 27,cate__round_Matchweek 28,cate__round_Matchweek 29,cate__round_Matchweek 3,cate__round_Matchweek 30,cate__round_Matchweek 31,cate__round_Matchweek 32,cate__round_Matchweek 33,cate__round_Matchweek 34,cate__round_Matchweek 35,cate__round_Matchweek 36,cate__round_Matchweek 37,cate__round_Matchweek 38,cate__round_Matchweek 4,cate__round_Matchweek 5,cate__round_Matchweek 6,cate__round_Matchweek 7,cate__round_Matchweek 8,cate__round_Matchweek 9,cate__day_Fri,cate__day_Mon,cate__day_Sat,cate__day_Sun,cate__day_Thu,cate__day_Tue,cate__day_Wed,cate__venue_Away,cate__venue_Home,cate__opponent_Arsenal,cate__opponent_Aston Villa,cate__opponent_Bournemouth,cate__opponent_Brentford,cate__opponent_Brighton,cate__opponent_Burnley,cate__opponent_Chelsea,cate__opponent_Crystal Palace,cate__opponent_Everton,cate__opponent_Fulham,cate__opponent_Leeds United,cate__opponent_Leicester City,cate__opponent_Liverpool,cate__opponent_Luton Town,cate__opponent_Manchester City,cate__opponent_Manchester Utd,cate__opponent_Newcastle Utd,cate__opponent_Norwich City,cate__opponent_Nott'ham Forest,cate__opponent_Sheffield Utd,cate__opponent_Southampton,cate__opponent_Tottenham,cate__opponent_Watford,cate__opponent_West Brom,cate__opponent_West Ham,cate__opponent_Wolves,cate__captain_Aaron Cresswell,cate__captain_Adam Lallana,cate__captain_Adam Smith,cate__captain_Adam Webster,cate__captain_Adrian Mariappa,cate__captain_Aleksandar Mitrović,cate__captain_Alexander Tettey,cate__captain_Alexandre Lacazette,cate__captain_Amari'i Bell,cate__captain_Andrew Robertson,cate__captain_Andy Carroll,cate__captain_Anel Ahmedhodžić,cate__captain_Angelo Ogbonna,cate__captain_Ashley Westwood,cate__captain_Ashley Young,cate__captain_Ben Chilwell,cate__captain_Ben Davies,cate__captain_Ben Foster,cate__captain_Ben Gibson,cate__captain_Ben Godfrey,cate__captain_Ben Mee,cate__captain_Bernd Leno,cate__captain_Billy Sharp,cate__captain_Bobby Reid,cate__captain_Branislav Ivanović,cate__captain_Bruno Fernandes,cate__captain_Bukayo Saka,cate__captain_Callum Wilson,cate__captain_Carlton Morris,cate__captain_Chris Basham,cate__captain_Christian Nørgaard,cate__captain_Christoph Zimmermann,cate__captain_Conor Coady,cate__captain_Conor Gallagher,cate__captain_Craig Cathcart,cate__captain_Cristian Romero,cate__captain_Cristiano Ronaldo,cate__captain_César Azpilicueta,cate__captain_Dale Stephens,cate__captain_Dan Burn,cate__captain_David McGoldrick,cate__captain_David Raya,cate__captain_David Silva,cate__captain_David de Gea,cate__captain_Declan Rice,cate__captain_Douglas Luiz,cate__captain_Emiliano Martínez,cate__captain_Ezri Konsa,cate__captain_Fabian Schär,cate__captain_Federico Fernández,cate__captain_Felipe,cate__captain_Fernandinho,cate__captain_Gary Cahill,cate__captain_Georginio Wijnaldum,cate__captain_Granit Xhaka,cate__captain_Grant Hanley,cate__captain_Gylfi Sigurðsson,cate__captain_Harrison Reed,cate__captain_Harry Kane,cate__captain_Harry Maguire,cate__captain_Harry Winks,cate__captain_Hugo Lloris,cate__captain_Héctor Bellerín,cate__captain_Ivan Toney,cate__captain_Jack Cork,cate__captain_Jack Grealish,cate__captain_Jack Robinson,cate__captain_Jake Livermore,cate__captain_Jamaal Lascelles,cate__captain_James Maddison,cate__captain_James McArthur,cate__captain_James Milner,cate__captain_James Tarkowski,cate__captain_James Tomkins,cate__captain_James Ward-Prowse,cate__captain_Jan Vertonghen,cate__captain_Joachim Andersen,cate__captain_Joe Worrall,cate__captain_Joel Ward,cate__captain_John Egan,cate__captain_John McGinn,cate__captain_Jonjo Shelvey,cate__captain_Jonny Evans,cate__captain_Jordan Henderson,cate__captain_Jordan Pickford,cate__captain_Jorginho,cate__captain_Josh Brownhill,cate__captain_Josh Cullen,cate__captain_José Sá,cate__captain_João Moutinho,cate__captain_Kasper Schmeichel,cate__captain_Kepa Arrizabalaga,cate__captain_Kevin De Bruyne,cate__captain_Kieran Trippier,cate__captain_Kurt Zouma,cate__captain_Kyle Bartley,cate__captain_Kyle Walker,cate__captain_Leighton Baines,cate__captain_Lewis Dunk,cate__captain_Liam Cooper,cate__captain_Lloyd Kelly,cate__captain_Lucas Digne,cate__captain_Luka Milivojević,cate__captain_Luke Ayling,cate__captain_Marc Albrighton,cate__captain_Marc Guéhi,cate__captain_Marcos Alonso,cate__captain_Mark Noble,cate__captain_Martin Ødegaard,cate__captain_Mason Holgate,cate__captain_Mateo Kovačić,cate__captain_Mathias Jensen,cate__captain_Matt Phillips,cate__captain_Max Kilman,cate__captain_Michael Keane,cate__captain_Morgan Gibbs-White,cate__captain_Moussa Sissoko,cate__captain_N'Golo Kanté,cate__captain_Nacho Monreal,cate__captain_Nathan Aké,cate__captain_Nemanja Matić,cate__captain_Neto,cate__captain_Oleksandr Zinchenko,cate__captain_Oliver Arblaster,cate__captain_Oliver McBurnie,cate__captain_Oliver Norwood,cate__captain_Oriol Romeu,cate__captain_Pascal Groß,cate__captain_Patrick Bamford,cate__captain_Pelly Ruddock Mpanzu,cate__captain_Pierre Højbjerg,cate__captain_Pierre-Emerick Aubameyang,cate__captain_Pontus Jansson,cate__captain_Raheem Sterling,cate__captain_Reece James,cate__captain_Remo Freuler,cate__captain_Rob Holding,cate__captain_Roberto Firmino,cate__captain_Rodrigo,cate__captain_Ryan Yates,cate__captain_Rúben Dias,cate__captain_Rúben Neves,cate__captain_Scott Dann,cate__captain_Scott McTominay,cate__captain_Serge Aurier,cate__captain_Sergio Agüero,cate__captain_Shane Duffy,cate__captain_Simon Francis,cate__captain_Son Heung-min,cate__captain_Steve Cook,cate__captain_Séamus Coleman,cate__captain_Thiago Silva,cate__captain_Tim Ream,cate__captain_Timm Klose,cate__captain_Toby Alderweireld,cate__captain_Tom Cairney,cate__captain_Tom Cleverley,cate__captain_Tom Lockyer,cate__captain_Tomáš Souček,cate__captain_Troy Deeney,cate__captain_Tyrone Mings,cate__captain_Virgil van Dijk,cate__captain_Wes Morgan,cate__captain_Wilfred Ndidi,cate__captain_Wilfried Zaha,cate__captain_William Troost-Ekong,cate__captain_Willian,cate__captain_Yerry Mina,cate__captain_Youri Tielemans,cate__captain_Étienne Capoue,cate__captain_İlkay Gündoğan,cate__captain_Łukasz Fabiański,cate__referee_Andre Marriner,cate__referee_Andy Madley,cate__referee_Anthony Taylor,cate__referee_Chris Kavanagh,cate__referee_Craig Pawson,cate__referee_Darren Bond,cate__referee_Darren England,cate__referee_David Coote,cate__referee_Graham Scott,cate__referee_Jarred Gillett,cate__referee_John Brooks,cate__referee_Jonathan Moss,cate__referee_Joshua Smith,cate__referee_Kevin Friend,cate__referee_Lee Mason,cate__referee_Lewis Smith,cate__referee_Martin Atkinson,cate__referee_Matt Donohue,cate__referee_Michael Oliver,cate__referee_Michael Salisbury,cate__referee_Mike Dean,cate__referee_Oliver Langford,cate__referee_Paul Tierney,cate__referee_Peter Bankes,cate__referee_Rebecca Welch,cate__referee_Robert Jones,cate__referee_Robert Madley,cate__referee_Samuel Allison,cate__referee_Samuel Barrott,cate__referee_Simon Hooper,cate__referee_Stuart Attwell,cate__referee_Sunny Singh,cate__referee_Thomas Bramall,cate__referee_Tim Robinson,cate__referee_Tony Harrington,cate__match report_Match Report,cate__team_Arsenal,cate__team_Aston Villa,cate__team_Bournemouth,cate__team_Brentford,cate__team_Brighton and Hove Albion,cate__team_Burnley,cate__team_Chelsea,cate__team_Crystal Palace,cate__team_Everton,cate__team_Fulham,cate__team_Leeds United,cate__team_Leicester City,cate__team_Liverpool,cate__team_Luton Town,cate__team_Manchester City,cate__team_Manchester United,cate__team_Newcastle United,cate__team_Norwich City,cate__team_Nottingham Forest,cate__team_Sheffield United,cate__team_Southampton,cate__team_Tottenham Hotspur,cate__team_Watford,cate__team_West Bromwich Albion,cate__team_West Ham United,cate__team_Wolverhampton Wanderers,time__time_hour,time__time_minute
2787,0.702572,-0.680759,-0.341594,-0.381774,-0.240073,-0.409654,-0.0638,-0.374306,0.065531,0.031292,0.149667,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.075899,1.282951
2571,3.146124,-0.680759,5.599653,4.954651,-0.240073,2.004855,1.625372,2.535707,1.783773,1.470129,2.409266,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.075899,2.309006
641,-1.333721,-0.680759,2.629029,2.286439,1.135022,0.098663,-0.919648,0.321566,-0.418481,-0.742536,-0.460066,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.473695,0.256895
1801,1.109831,-0.680759,-0.341594,-0.381774,0.447474,1.115299,1.332582,1.0807,0.985154,1.095306,0.867,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.280306,1.282951
2712,-0.519204,2.299713,-0.341594,-0.381774,-0.240073,-0.155495,-0.829558,-1.449746,0.162334,-0.10171,0.042067,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.913083,-0.76916


## 7 Model Selection


In [182]:

algorithms = {
    'Linear Regression': LinearRegression(),
    'Suport Vector Machine': SVR(),
    'Random Forest': RandomForestRegressor(n_estimators=10),
    'XG Boost': XGBRegressor(n_estimators=10)
}

In [184]:
def evaluate_regression_models(X_train, X_test, y_train, y_test, algorithms, preprocessor):
    results = {}
    fitted_models = {}
    
    for name, model in algorithms.items():
        print(f" Training {name}...")
        
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        
        full_pipeline.fit(X_train, y_train)
        
        y_pred_train = full_pipeline.predict(X_train)
        y_pred_test = full_pipeline.predict(X_test)
        
        train_mae = mean_absolute_error(y_train, y_pred_train)
        test_mae = mean_absolute_error(y_test, y_pred_test)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        train_r2 = r2_score(y_train, y_pred_train)
        test_r2 = r2_score(y_test, y_pred_test)
        
        results[name] = {
            'pipeline': full_pipeline,
            'train_mae': train_mae,
            'test_mae': test_mae,
            'train_rmse': train_rmse,
            'test_rmse': test_rmse,
            'train_r2': train_r2,
            'test_r2': test_r2,
            'predictions_train': y_pred_train,
            'predictions_test': y_pred_test
        }
        
        fitted_models[name] = full_pipeline
        
        print(f"    {name} - Test R²: {test_r2:.4f}, Test MAE: {test_mae:.4f}, Test: {test_rmse:.4f}")
    
    return results, fitted_models

results, fitted_models = evaluate_regression_models(
    x_train, x_test, y_train, y_test, algorithms, preprocessor
)

 Training Linear Regression...
    Linear Regression - Test R²: -38296934820311900160.0000, Test MAE: 382002289.9612, Test: 5234063367.7241
 Training Suport Vector Machine...
    Suport Vector Machine - Test R²: 0.5859, Test MAE: 0.4069, Test: 0.5443
 Training Random Forest...
    Random Forest - Test R²: 0.6194, Test MAE: 0.3885, Test: 0.5218
 Training XG Boost...
    XG Boost - Test R²: 0.5820, Test MAE: 0.4188, Test: 0.5469


In [186]:
def compare_models(results):
    comparison_data = []
    for name, metrics in results.items():
        comparison_data.append({
            'Model': name,
            'Train R²': metrics['train_r2'],
            'Test R²': metrics['test_r2'],
            'Train MAE': metrics['train_mae'],
            'Test MAE': metrics['test_mae'],
            'Train RMSE': metrics['train_rmse'],
            'Test RMSE': metrics['test_rmse'],
            'Overfitting Score': metrics['train_r2'] - metrics['test_r2']  # Lower is better
        })
    
    comparison_df = pd.DataFrame(comparison_data)
    
    comparison_df = comparison_df.sort_values(['Test R²', 'Test MAE'], ascending=[False, True])
    
    best_model_name = comparison_df.iloc[0]['Model']
    best_model_metrics = results[best_model_name]
    
    return comparison_df, best_model_name, best_model_metrics

comparison_df, best_model_name, best_model_metrics = compare_models(results)

print(" MODEL COMPARISON RESULTS:")
print("="*80)
print(comparison_df.round(4))
print("="*80)
print(f" BEST MODEL: {best_model_name}")
print(f" Test R²: {best_model_metrics['test_r2']:.4f}")
print(f" Test MAE: {best_model_metrics['test_mae']:.4f}")
print(f" Test RMSE: {best_model_metrics['test_rmse']:.4f}")

 MODEL COMPARISON RESULTS:
                   Model  Train R²       Test R²  Train MAE      Test MAE  \
2          Random Forest    0.9243  6.194000e-01     0.1529  3.885000e-01   
1  Suport Vector Machine    0.8044  5.859000e-01     0.2359  4.069000e-01   
3               XG Boost    0.6938  5.820000e-01     0.3596  4.188000e-01   
0      Linear Regression    0.6194 -3.829693e+19     0.3935  3.820023e+08   

   Train RMSE     Test RMSE  Overfitting Score  
2      0.2271  5.218000e-01       3.049000e-01  
1      0.3652  5.443000e-01       2.185000e-01  
3      0.4568  5.469000e-01       1.119000e-01  
0      0.5094  5.234063e+09       3.829693e+19  
 BEST MODEL: Random Forest
 Test R²: 0.6194
 Test MAE: 0.3885
 Test RMSE: 0.5218


## 8 model training

In [188]:
model = Pipeline(steps =[
    ('preprocess',preprocessor),
    ('Random Forest',RandomForestRegressor(n_estimators=10))
])

In [190]:
model.fit(x_train,y_train)

## 9 model evaluation

In [192]:
def evaluate_model(x,y):
    y_pred=model.predict(x)
    return r2_score(y,y_pred)

In [194]:
print(f'R2 score on training data is : {evaluate_model(x_train,y_train)}')
print(f'R2 score on test data is : {evaluate_model(x_test,y_test)}')

R2 score on training data is : 0.9289635892288354
R2 score on test data is : 0.6212186483519373


## 10 model perdidtence( saving model)

In [197]:
joblib.dump(model,'model.joblib')

['model.joblib']

## 11 loading the trained model and trying to predict

In [200]:
match_model=joblib.load('model.joblib')

In [202]:
match_model

In [238]:
x_train.sample(2)

Unnamed: 0,time,comp,round,day,venue,opponent,captain,referee,match report,sot,fk,pk,pkatt,season,team,xg_roll_5,poss_roll_5,sh_roll_5,xg_roll_10,poss_roll_10,sh_roll_10
1682,15:00,Premier League,Matchweek 11,Sat,Home,Crystal Palace,Josh Brownhill,Peter Bankes,Match Report,5,0,0,0,2024,Burnley,0.86,48.2,8.8,0.82,49.5,9.7
1784,20:00,Premier League,Matchweek 32,Wed,Away,Brighton,İlkay Gündoğan,Simon Hooper,Match Report,4,0,0,0,2023,Manchester City,1.88,67.6,13.6,2.14,66.8,13.5


In [240]:
column = ['time', 'comp', 'round', 'day', 'venue', 'opponent','captain', 'referee', 'match report', 'sot', 'fk', 'pk', 'pkatt','season', 'team', 'xg_roll_5', 'poss_roll_5', 'sh_roll_5','xg_roll_10', 'poss_roll_10', 'sh_roll_10']
input=[['13:30','Premier League','Matchweek 14','Sat','Home','Southampton','Kasper Schmeichel','Peter Bankes','Match Report',4,0,0,0,2023,'Leicester City',1.20,49.8, 11.4,1.42,50.6,13.0]]

data=pd.DataFrame(input,columns=column)
match_model.predict(data)

array([1.43])