## Data Preparation and Feature Engineering

In [1]:
import pandas as pd
data = pd.read_csv('/content/data_with_weather_summary.csv')
data.head()


Unnamed: 0,League,Team,Season,Player,Position,Appearances,Goals,Started,Started As A Sub,Came On,...,latitude,longitude,game_date,tavg,tmin,tmax,prcp,snow,wspd,weather_summary
0,Bundesliga,1. FC Koln,2014/15,Jonas Hector,Defender,33.0,2.0,33.0,0.0,0.0,...,50.9375,6.9603,10-08-2014,14.1,11.2,16.7,18.4,0.0,13.0,Rainy
1,Bundesliga,1. FC Koln,2014/15,Timo Horn,Goalkeeper,33.0,0.0,33.0,1.0,0.0,...,50.9375,6.9603,10-08-2014,14.1,11.2,16.7,18.4,0.0,13.0,Rainy
2,Bundesliga,1. FC Koln,2014/15,Matthias Lehmann,Midfielder,32.0,5.0,32.0,0.0,0.0,...,50.9375,6.9603,10-08-2014,14.1,11.2,16.7,18.4,0.0,13.0,Rainy
3,Bundesliga,1. FC Koln,2014/15,Kevin Wimmer,Defender,32.0,0.0,32.0,1.0,0.0,...,50.9375,6.9603,10-08-2014,14.1,11.2,16.7,18.4,0.0,13.0,Rainy
4,Bundesliga,1. FC Koln,2014/15,Kevin Vogt,Midfielder,32.0,1.0,30.0,3.0,2.0,...,50.9375,6.9603,10-08-2014,14.1,11.2,16.7,18.4,0.0,13.0,Rainy


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# dropping the snow column because it's useless
data = data.drop('snow', axis=1)
# dropping datapoints with unknown weather_summary because it's useless
unknown_weather = data[data['weather_summary'] == 'Unknown']
print(unknown_weather)
print(f"Number of rows with unknown weather: {len(unknown_weather)}")
data = data[
    data['weather_summary'] != 'Unknown'
]

# Print the updated dataframe and number of rows
print(data)
print(f"Number of rows after dropping 'Unknown' weather: {len(data)}")


            League                 Team   Season                   Player  \
11682   Bundesliga  TSG 1899 Hoffenheim  2014/15           Oliver Baumann   
11683   Bundesliga  TSG 1899 Hoffenheim  2014/15          Roberto Firmino   
11684   Bundesliga  TSG 1899 Hoffenheim  2014/15             Andreas Beck   
11685   Bundesliga  TSG 1899 Hoffenheim  2014/15            Kevin Volland   
11686   Bundesliga  TSG 1899 Hoffenheim  2014/15           Eugen Polanski   
...            ...                  ...      ...                      ...   
104016     Serie A              Udinese  2016/17            Marco Faraoni   
104017     Serie A              Udinese  2016/17                  Ewandro   
104018     Serie A              Udinese  2016/17            Andrija Balic   
104019     Serie A              Udinese  2016/17  Assane Demoya Gnoukouri   
104020     Serie A              Udinese  2016/17             Pablo Armero   

          Position  Appearances  Goals  Started  Started As A Sub  Came On 

In [3]:
d1 = data.copy()


## Feature engineering form score

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer

def engineer_form_score(df):
    # Position-specific features mapping
    position_features = {
        'Midfielder': [
            'Goals', 'Assists', 'Minutes played',
            'Open Play', 'Free Kick', 'Penalty',
            'Yellow Cards', 'Straight Red Card'
        ],
        'Attacker': [
            'Goals', 'Assists', 'Min/goal',
            'Open Play', 'Cross', 'Penalty',
            'First Scorer', 'Last Scorer'
        ],
        'Defender': [
            'CS', 'Goals Against', 'GA Per Match',
            'Minutes played', 'Yellow Cards',
            'Straight Red Card'
        ],
        'Goalkeeper': [
            'CS', 'Goals Against', 'GA Per Match',
            'Minutes played'
        ]
    }

    def calculate_position_score(group):
        # Select features based on position
        pos_features = position_features.get(group['Position'].iloc[0],
                                             position_features['Midfielder'])

        # Filter valid features
        valid_features = [f for f in pos_features if f in group.columns]

        # Subset data
        subset = group[valid_features]

        # Impute missing values
        imputer = SimpleImputer(strategy='mean')
        imputed_data = imputer.fit_transform(subset)

        # Standardize features
        scaler = StandardScaler()
        normalized = pd.DataFrame(
            scaler.fit_transform(imputed_data),
            columns=subset.columns,
            index=subset.index
        )

        # Calculate weighted performance score
        position_weights = {
            'Midfielder': {
                'Goals': 0.25,
                'Assists': 0.25,
                'Minutes played': 0.2,
                'Open Play': 0.1,
                'Yellow Cards': -0.1
            },
            'Attacker': {
                'Goals': 0.35,
                'Assists': 0.25,
                'Min/goal': 0.2,
                'Open Play': 0.1,
                'First Scorer': 0.1
            },
            'Defender': {
                'CS': 0.3,
                'Goals Against': -0.2,
                'GA Per Match': -0.2,
                'Minutes played': 0.2,
                'Yellow Cards': -0.1
            },
            'Goalkeeper': {
                'CS': 0.4,
                'Goals Against': -0.3,
                'Minutes played': 0.3
            }
        }

        # Get weights for the position
        weights = position_weights.get(group['Position'].iloc[0],
                                       position_weights['Midfielder'])

        # Calculate weighted score
        weighted_score = sum(
            normalized[col] * weights.get(col, 0)
            for col in normalized.columns
            if col in weights
        )

        return weighted_score

    # Calculate scores by season and position
    df['Normalized Performance'] = df.groupby(['Season', 'Position']).apply(
        calculate_position_score
    ).reset_index(level=[0,1], drop=True)

    # Normalize to form score between 0.5 and 1
    min_max_scaler = MinMaxScaler(feature_range=(0.5, 1))
    df['Form Score'] = min_max_scaler.fit_transform(
        df['Normalized Performance'].values.reshape(-1, 1)
    )

    return df

# Usage
d1_with_form_score = engineer_form_score(d1)

In [5]:
def top_20_form_scores_by_season(df):
    # Group by Season and sort within each season
    top_players = df.groupby('Season').apply(
        lambda x: x.nlargest(20, 'Form Score')[['Season', 'Player', 'Position', 'Form Score']]
    ).reset_index(drop=True)

    # Print top 20 for each season
    for season in df['Season'].unique():
        print(f"\nTop 20 Players for Season {season}:")
        season_top = top_players[top_players['Season'] == season]
        print(season_top.to_string(index=False))

# Usage
top_20_form_scores_by_season(d1_with_form_score)


Top 20 Players for Season 2014/15:
 Season            Player   Position  Form Score
2014/15      Lionel Messi    Forward    0.806194
2014/15      Lionel Messi    Forward    0.806194
2014/15      Lionel Messi    Forward    0.806194
2014/15 Cristiano Ronaldo    Forward    0.786379
2014/15 Cristiano Ronaldo    Forward    0.786379
2014/15 Cristiano Ronaldo    Forward    0.786379
2014/15   Kevin De Bruyne Midfielder    0.760586
2014/15   Kevin De Bruyne Midfielder    0.760586
2014/15   Kevin De Bruyne Midfielder    0.760586
2014/15     Dimitri Payet Midfielder    0.724700
2014/15     Dimitri Payet Midfielder    0.724700
2014/15     Dimitri Payet Midfielder    0.724700
2014/15   James Rodriguez Midfielder    0.724345
2014/15   James Rodriguez Midfielder    0.724345
2014/15   James Rodriguez Midfielder    0.724345
2014/15       Eden Hazard Midfielder    0.721145
2014/15       Eden Hazard Midfielder    0.721145
2014/15       Eden Hazard Midfielder    0.721145
2014/15      Arjen Robben Midfiel

In [6]:
# Remove duplicates based on 'Player', 'Season', and 'Form Score'
d1_with_form_score = d1_with_form_score.drop_duplicates(subset=['Player', 'Season', 'Form Score'])

def top_20_form_scores_by_season(df):
    # Group by Season and sort within each season
    top_players = df.groupby('Season').apply(
        lambda x: x.nlargest(20, 'Form Score')[['Season', 'Player', 'Position', 'Form Score']]
    ).reset_index(drop=True)

    # Print top 20 for each season
    for season in df['Season'].unique():
        print(f"\nTop 20 Players for Season {season}:")
        season_top = top_players[top_players['Season'] == season]
        print(season_top.to_string(index=False))

top_20_form_scores_by_season(d1_with_form_score)


Top 20 Players for Season 2014/15:
 Season            Player   Position  Form Score
2014/15      Lionel Messi    Forward    0.806194
2014/15 Cristiano Ronaldo    Forward    0.786379
2014/15   Kevin De Bruyne Midfielder    0.760586
2014/15     Dimitri Payet Midfielder    0.724700
2014/15   James Rodriguez Midfielder    0.724345
2014/15       Eden Hazard Midfielder    0.721145
2014/15      Arjen Robben Midfielder    0.718769
2014/15       Nabil Fekir Midfielder    0.711560
2014/15      Ali Crawford Midfielder    0.709660
2014/15       Gareth Bale Midfielder    0.703623
2014/15    Franco Vasquez Midfielder    0.703322
2014/15     Jason Denayer   Defender    0.698835
2014/15       David Silva Midfielder    0.691149
2014/15   Karim Bellarabi Midfielder    0.691062
2014/15     Memphis Depay Midfielder    0.689858
2014/15   Alexander Meier Midfielder    0.687705
2014/15     Sergio Aguero    Forward    0.685651
2014/15   Anthony Mounier Midfielder    0.685093
2014/15      Marek Hamsik Midfiel

In [7]:
#convert game_date to date time format

import pandas as pd

# Convert 'game_date' to datetime format
d1_with_form_score['game_date'] = pd.to_datetime(d1_with_form_score['game_date'])

In [8]:
# Check for missing values in 'd1_with_form_score'
print(d1_with_form_score.isnull().sum())

# Handle missing values (example: imputation with the mean)
for col in d1_with_form_score.select_dtypes(include=np.number).columns:
    if d1_with_form_score[col].isnull().any():
        imputer = SimpleImputer(strategy='mean')  # You can choose other strategies
        d1_with_form_score[col] = imputer.fit_transform(d1_with_form_score[[col]])

League                       0
Team                         0
Season                       0
Player                       0
Position                     0
                          ... 
prcp                      8102
wspd                       989
weather_summary              0
Normalized Performance       0
Form Score                   0
Length: 78, dtype: int64


In [9]:
# Check for missing values again after imputation
print(d1_with_form_score.isnull().sum())

League                    0
Team                      0
Season                    0
Player                    0
Position                  0
                         ..
prcp                      0
wspd                      0
weather_summary           0
Normalized Performance    0
Form Score                0
Length: 78, dtype: int64


In [10]:
d1_with_form_score.columns

Index(['League', 'Team', 'Season', 'Player', 'Position', 'Appearances',
       'Goals', 'Started', 'Started As A Sub', 'Came On', 'Taken Off',
       'Own Goals', 'First Half', 'Second Half', 'First Scorer', 'Last Scorer',
       'Home', 'Away', 'Right Foot', 'Left Foot', 'Header', 'Other Method',
       'Open Play', 'Cross', 'Free Kick', 'Direct Free Kick', 'Throw In',
       'Penalty', 'Corner', 'Other Type Of Play', 'Minutes played', 'Min/goal',
       'Assists', '% Assists', 'Recipient Goals', 'CS', '% Clean Sheets',
       'Hat Tricks', 'Yellow Cards', 'Yellow - First Half',
       'Yellow - Second Half', 'Yellow - Home', 'Yellow - Away',
       'Yelow - Minutes Played', 'Minutes per Yellow Cards', 'Sent Off',
       'Straight Red Card', 'GA Home', 'GA Away', 'GA First Half',
       'GA Second Half', 'GA First 15 mins', 'GA Last 10 mins',
       'GA Home Matches Scored In', 'GA Away Matches Scored In',
       'GA Per Match', 'GF Home', 'GF Away', 'GF First Half', 'GF Second Half',

In [11]:
d2 = d1_with_form_score.copy()

In [12]:
d3 = d1_with_form_score.copy()

In [13]:
d2.shape

(29867, 78)

## Rolling Feature Creation




The first critical step in our analysis involves creating rolling features to capture temporal performance dynamics. We implemented a feature engineering strategy that generates rolling statistical measures for the player's form score:

Rolling Mean: Calculates the average form score over sliding windows of 3, 5, and 7 matches
Rolling Variance: Measures the performance consistency across these windows

It helps the model to:
Captures short-term performance trends
Provides insights into player consistency
Allows the model to learn from recent performance patterns

In [14]:
# Define rolling window sizes
window_sizes = [3]

# Add rolling mean and variance of "Form Score" for each window size
for window in window_sizes:
    d3[f'Form_Score_Rolling_Mean_{window}'] = d3['Form Score'].rolling(window=window).mean()
    d3[f'Form_Score_Rolling_Variance_{window}'] = d3['Form Score'].rolling(window=window).var()

# Display the first few rows to verify
print(d3.head())
d3.columns


       League        Team   Season            Player    Position  Appearances  \
0  Bundesliga  1. FC Koln  2014/15      Jonas Hector    Defender         33.0   
1  Bundesliga  1. FC Koln  2014/15         Timo Horn  Goalkeeper         33.0   
2  Bundesliga  1. FC Koln  2014/15  Matthias Lehmann  Midfielder         32.0   
3  Bundesliga  1. FC Koln  2014/15      Kevin Wimmer    Defender         32.0   
4  Bundesliga  1. FC Koln  2014/15        Kevin Vogt  Midfielder         32.0   

   Goals  Started  Started As A Sub  Came On  ...  tavg  tmin  tmax  prcp  \
0    2.0     33.0               0.0      0.0  ...  14.1  11.2  16.7  18.4   
1    0.0     33.0               1.0      0.0  ...  14.1  11.2  16.7  18.4   
2    5.0     32.0               0.0      0.0  ...  14.1  11.2  16.7  18.4   
3    0.0     32.0               1.0      0.0  ...  14.1  11.2  16.7  18.4   
4    1.0     30.0               3.0      2.0  ...  14.1  11.2  16.7  18.4   

   wspd  weather_summary  Normalized Performance  

Index(['League', 'Team', 'Season', 'Player', 'Position', 'Appearances',
       'Goals', 'Started', 'Started As A Sub', 'Came On', 'Taken Off',
       'Own Goals', 'First Half', 'Second Half', 'First Scorer', 'Last Scorer',
       'Home', 'Away', 'Right Foot', 'Left Foot', 'Header', 'Other Method',
       'Open Play', 'Cross', 'Free Kick', 'Direct Free Kick', 'Throw In',
       'Penalty', 'Corner', 'Other Type Of Play', 'Minutes played', 'Min/goal',
       'Assists', '% Assists', 'Recipient Goals', 'CS', '% Clean Sheets',
       'Hat Tricks', 'Yellow Cards', 'Yellow - First Half',
       'Yellow - Second Half', 'Yellow - Home', 'Yellow - Away',
       'Yelow - Minutes Played', 'Minutes per Yellow Cards', 'Sent Off',
       'Straight Red Card', 'GA Home', 'GA Away', 'GA First Half',
       'GA Second Half', 'GA First 15 mins', 'GA Last 10 mins',
       'GA Home Matches Scored In', 'GA Away Matches Scored In',
       'GA Per Match', 'GF Home', 'GF Away', 'GF First Half', 'GF Second Half',

In [15]:
d3.isnull().sum()

Unnamed: 0,0
League,0
Team,0
Season,0
Player,0
Position,0
...,...
weather_summary,0
Normalized Performance,0
Form Score,0
Form_Score_Rolling_Mean_3,2


In [16]:
import pandas as pd
import numpy as np

# Assuming 'd3' is your DataFrame (as defined in your provided code)
# Replace with your actual DataFrame if needed.

# Find rows with any missing values
rows_with_missing_values = d3[d3.isnull().any(axis=1)]

# Print the rows with missing values
rows_with_missing_values

Unnamed: 0,League,Team,Season,Player,Position,Appearances,Goals,Started,Started As A Sub,Came On,...,tavg,tmin,tmax,prcp,wspd,weather_summary,Normalized Performance,Form Score,Form_Score_Rolling_Mean_3,Form_Score_Rolling_Variance_3
0,Bundesliga,1. FC Koln,2014/15,Jonas Hector,Defender,33.0,2.0,33.0,0.0,0.0,...,14.1,11.2,16.7,18.4,13.0,Rainy,0.21766,0.572202,,
1,Bundesliga,1. FC Koln,2014/15,Timo Horn,Goalkeeper,33.0,0.0,33.0,1.0,0.0,...,14.1,11.2,16.7,18.4,13.0,Rainy,1.01784,0.607331,,


Imputing the missing values

In [17]:
d3['Form_Score_Rolling_Mean_3'].fillna(d3['Form_Score_Rolling_Mean_3'].mean(), inplace=True)
d3['Form_Score_Rolling_Variance_3'].fillna(d3['Form_Score_Rolling_Variance_3'].mean(), inplace=True)


In [18]:
d3.isnull().sum()

Unnamed: 0,0
League,0
Team,0
Season,0
Player,0
Position,0
...,...
weather_summary,0
Normalized Performance,0
Form Score,0
Form_Score_Rolling_Mean_3,0


## Preprocessing pipeline

In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Step 1: Categorize features
categorical_features = [
    'League', 'Team', 'Season', 'Player', 'Position',
    'First Scorer', 'Last Scorer', 'weather_summary'
]

numerical_features = [
    # Performance Metrics
    'Appearances', 'Goals', 'Started', 'Started As A Sub',
    'Came On', 'Taken Off', 'Own Goals',
    'First Half', 'Second Half',

    # Goal-related Features
    'Right Foot', 'Left Foot', 'Header', 'Other Method',
    'Open Play', 'Cross', 'Free Kick', 'Direct Free Kick',
    'Throw In', 'Penalty', 'Corner', 'Other Type Of Play',

    # Time and Performance Metrics
    'Minutes played', 'Min/goal', 'Assists', '% Assists',
    'Recipient Goals', 'Hat Tricks',

    # Card-related Features
    'Yellow Cards', 'Yellow - First Half', 'Yellow - Second Half',
    'Yellow - Home', 'Yellow - Away',
    'Minutes per Yellow Cards',
    'Sent Off', 'Straight Red Card',

    # Goal-related Aggregate Features
    'GA Home', 'GA Away', 'GA First Half', 'GA Second Half',
    'GA First 15 mins', 'GA Last 10 mins',
    'GF Home', 'GF Away', 'GF First Half', 'GF Second Half',
    'GF First 15 mins', 'GF Last 10 mins',

    # Weather and Performance Features
    'tavg', 'tmin', 'tmax', 'prcp', 'wspd',
    'Normalized Performance',

    # Rolling Features
    'Form_Score_Rolling_Mean_3', 'Form_Score_Rolling_Variance_3'
]

drop_columns = ['game_date', 'latitude', 'longitude']

# Step 2: Drop unnecessary columns
df = d3.drop(columns=drop_columns, errors='ignore')

# Step 3: Handle high-cardinality categorical features (Target Encoding for 'Player')
print("Performing Target Encoding for 'Player'...")
player_mean = df.groupby('Player')['Form Score'].mean()
df['Player_encoded'] = df['Player'].map(player_mean)
df = df.drop(columns=['Player'], errors='ignore')

# Step 4: Create preprocessing pipelines
# Update categorical features after dropping 'Player'
categorical_features = ['League', 'Team', 'Season', 'Position',
                        'First Scorer', 'Last Scorer', 'weather_summary']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Step 5: Separate features and target
target_column = 'Form Score'
X = df.drop(columns=[target_column])
y = df[target_column]

# Step 6: Fit and transform the features
X_transformed = preprocessor.fit_transform(X)

# Step 7: Retrieve feature names after preprocessing
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_features).tolist()
feature_names = numerical_features + ['Player_encoded'] + cat_feature_names

# Step 8: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Step 9: Output processed data and additional information
processed_data = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
    'feature_names': feature_names,
    'preprocessor': preprocessor
}

# Print the number of features after preprocessing
print(f"Number of features after preprocessing: {len(feature_names)}")
print(f"Processed feature names: {feature_names[:10]}...")  # Display first 10 feature names


Performing Target Encoding for 'Player'...
Number of features after preprocessing: 286
Processed feature names: ['Appearances', 'Goals', 'Started', 'Started As A Sub', 'Came On', 'Taken Off', 'Own Goals', 'First Half', 'Second Half', 'Right Foot']...


### Linear Regression

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Assuming X and y are your feature matrix and target vector
# Step 1: Identify categorical and numerical columns
categorical_features = ['weather_summary','League','Team','Season','Position']  # Add any additional categorical columns here
numerical_features = [col for col in X.columns if col not in categorical_features]

# Step 2: Preprocessing - Encode categorical features and scale numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ]
)

X_processed = preprocessor.fit_transform(X)

# Step 3: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Step 4: Initialize the Linear Regression model
lr_model = LinearRegression()

# Step 5: Train the Linear Regression model
print("Training the Linear Regression model...")
lr_model.fit(X_train, y_train)
print("Model training completed.")

# Step 6: Evaluate the model on the test set
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Training the Linear Regression model...
Model training completed.
Mean Squared Error: 2.892627285582581e-13
R^2 Score: 0.9999999995933359


### Random Forest Regressor

In [23]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize the Random Forest Regressor model
model = RandomForestRegressor(
    n_estimators=100,          # Number of trees
    max_depth=12,              # Maximum depth of the tree
    max_features='sqrt',       # Square root of features considered per split
    min_samples_split=5,       # Minimum samples required to split an internal node
    min_samples_leaf=3,        # Minimum samples required in a leaf node
    random_state=42,
    n_jobs=-1                  # Use all CPU cores
)

# Step 1: Train the model on the training set
print("Training the model on the training set...")
model.fit(X_train, y_train)
print("Training completed.")

# Step 2: Predict on the test set
print("Predicting on the test set...")
y_pred = model.predict(X_test)

# Step 3: Evaluate the model
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print("\nTest Set Evaluation:")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test R²: {test_r2:.4f}")


Training the model on the training set...
Training completed.
Predicting on the test set...

Test Set Evaluation:
Test RMSE: 0.0085
Test MAE: 0.0044
Test R²: 0.8983


In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

# Step 1: Categorize features
categorical_features = [
    'League', 'Team', 'Season', 'Player', 'Position',
    'First Scorer', 'Last Scorer', 'weather_summary'
]

numerical_features = [
    # Performance Metrics
    'Appearances', 'Goals', 'Started', 'Started As A Sub',
    'Came On', 'Taken Off', 'Own Goals',
    'First Half', 'Second Half',

    # Goal-related Features
    'Right Foot', 'Left Foot', 'Header', 'Other Method',
    'Open Play', 'Cross', 'Free Kick', 'Direct Free Kick',
    'Throw In', 'Penalty', 'Corner', 'Other Type Of Play',

    # Time and Performance Metrics
    'Minutes played', 'Min/goal', 'Assists', '% Assists',
    'Recipient Goals', 'Hat Tricks',

    # Card-related Features
    'Yellow Cards', 'Yellow - First Half', 'Yellow - Second Half',
    'Yellow - Home', 'Yellow - Away',
    'Minutes per Yellow Cards',
    'Sent Off', 'Straight Red Card',

    # Goal-related Aggregate Features
    'GA Home', 'GA Away', 'GA First Half', 'GA Second Half',
    'GA First 15 mins', 'GA Last 10 mins',
    'GF Home', 'GF Away', 'GF First Half', 'GF Second Half',
    'GF First 15 mins', 'GF Last 10 mins',

    # Weather and Performance Features
    'tavg', 'tmin', 'tmax', 'prcp', 'wspd',
    'Normalized Performance',

    # Rolling Features
    'Form_Score_Rolling_Mean_3', 'Form_Score_Rolling_Variance_3'
]

drop_columns = ['game_date', 'latitude', 'longitude']

# Step 2: Drop unnecessary columns
df = d3.drop(columns=drop_columns, errors='ignore')

# Step 3: Handle high-cardinality categorical features (Target Encoding for 'Player')
print("Performing Target Encoding for 'Player'...")
player_mean = df.groupby('Player')['Form Score'].mean()
df['Player_encoded'] = df['Player'].map(player_mean)
df = df.drop(columns=['Player'], errors='ignore')

# Step 4: Create preprocessing pipelines
# Update categorical features after dropping 'Player'
categorical_features = ['League', 'Team', 'Season', 'Position',
                        'First Scorer', 'Last Scorer', 'weather_summary']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Step 5: Separate features and target
target_column = 'Form Score'
X = df.drop(columns=[target_column])
y = df[target_column]

# Step 6: Fit and transform the features
X_transformed = preprocessor.fit_transform(X)

# Step 7: Retrieve feature names after preprocessing
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_features).tolist()
feature_names = numerical_features + ['Player_encoded'] + cat_feature_names

# Step 8: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Step 9: Initialize the Random Forest model with optimized parameters
model = RandomForestRegressor(
    n_estimators=100,          # Increase the number of trees for better accuracy
    max_depth=12,              # Slightly deeper trees for better learning
    max_features='sqrt',       # Use square root of features per split
    min_samples_split=5,       # Reduce the minimum samples required to split a node
    min_samples_leaf=3,        # Reduce the minimum samples per leaf node
    random_state=42,
    n_jobs=-1                  # Use all CPU cores
)

# Step 10: Define cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Custom RMSE scorer for cross-validation
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False)

print("Performing cross-validation...")
cv_rmse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=rmse_scorer)
cv_r2_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

# Cross-validation results
print("\nCross-Validation Results:")
print(f"Cross-validated RMSE: {-cv_rmse_scores.mean():.4f} ± {cv_rmse_scores.std():.4f}")
print(f"Cross-validated R²: {cv_r2_scores.mean():.4f} ± {cv_r2_scores.std():.4f}")

# Step 11: Train the model on the training set
print("Training the model...")
model.fit(X_train, y_train)
print("Model training completed.")

# Step 12: Predict on the training and test sets
print("Evaluating on the training and test sets...")
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Step 13: Evaluate the model
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)

test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# Step 14: Output results
print("\nTraining Set Evaluation:")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Train R²: {train_r2:.4f}")

print("\nTest Set Evaluation:")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R²: {test_r2:.4f}")


Performing Target Encoding for 'Player'...
Performing cross-validation...

Cross-Validation Results:
Cross-validated RMSE: 0.0071 ± 0.0008
Cross-validated R²: 0.9239 ± 0.0176
Training the model...
Model training completed.
Evaluating on the training and test sets...

Training Set Evaluation:
Train RMSE: 0.0058
Train R²: 0.9491

Test Set Evaluation:
Test RMSE: 0.0085
Test R²: 0.8983


#Predicting Form score of Marcus Rashford

In [27]:
import pandas as pd

# Step 1: Define the minimal future data
future_data = pd.DataFrame({
    'League': ['Premier League'],        # Future league
    'Team': ['Manchester United'],      # Future team
    'Season': ['2024/25'],              # Future season
    'Player': ['Marcus Rashford'],      # Future player
    'Position': ['Forward'],            # Player's position
    'weather_summary': ['Normal'],       # Example weather
})

# Step 2: Add missing columns with default values
required_columns = processed_data['preprocessor'].transformers[1][2]  # Categorical feature list
for col in required_columns:
    if col not in future_data.columns:
        future_data[col] = 'Unknown'  # Default value for missing categorical columns

numerical_columns = processed_data['preprocessor'].transformers[0][2]  # Numerical feature list
for col in numerical_columns:
    if col not in future_data.columns:
        future_data[col] = 0  # Default value for missing numerical columns

# Step 3: Preprocess the future data
print("Preprocessing the future data...")
future_data_transformed = processed_data['preprocessor'].transform(future_data)

# Step 4: Predict the Form Score
print("Predicting the Form Score...")
predicted_form_scores = model.predict(future_data_transformed)

# Step 5: Add predictions to the future data
future_data['Predicted Form Score'] = predicted_form_scores

# Step 6: Output the predictions
print("Future Predictions:")
print(future_data[['Player', 'Team', 'Season', 'Predicted Form Score']])


Preprocessing the future data...
Predicting the Form Score...
Future Predictions:
            Player               Team   Season  Predicted Form Score
0  Marcus Rashford  Manchester United  2024/25              0.558159


In [28]:
d3['weather_summary'].unique()
d3.columns

Index(['League', 'Team', 'Season', 'Player', 'Position', 'Appearances',
       'Goals', 'Started', 'Started As A Sub', 'Came On', 'Taken Off',
       'Own Goals', 'First Half', 'Second Half', 'First Scorer', 'Last Scorer',
       'Home', 'Away', 'Right Foot', 'Left Foot', 'Header', 'Other Method',
       'Open Play', 'Cross', 'Free Kick', 'Direct Free Kick', 'Throw In',
       'Penalty', 'Corner', 'Other Type Of Play', 'Minutes played', 'Min/goal',
       'Assists', '% Assists', 'Recipient Goals', 'CS', '% Clean Sheets',
       'Hat Tricks', 'Yellow Cards', 'Yellow - First Half',
       'Yellow - Second Half', 'Yellow - Home', 'Yellow - Away',
       'Yelow - Minutes Played', 'Minutes per Yellow Cards', 'Sent Off',
       'Straight Red Card', 'GA Home', 'GA Away', 'GA First Half',
       'GA Second Half', 'GA First 15 mins', 'GA Last 10 mins',
       'GA Home Matches Scored In', 'GA Away Matches Scored In',
       'GA Per Match', 'GF Home', 'GF Away', 'GF First Half', 'GF Second Half',

### Looking at Impact of Weather

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Step 1: Initialize the Random Forest Regressor
model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Step 2: Define a narrower hyperparameter grid
param_grid = {
    'n_estimators': [50, 100],           # Fewer trees to reduce computation
    'max_depth': [5, 10],               # Narrower range for tree depth
    'max_features': ['sqrt'],           # Focus on square root for feature splits
    'min_samples_split': [5, 10],       # Minimum samples to split a node
    'min_samples_leaf': [2, 4],         # Minimum samples per leaf
    'bootstrap': [True],                # Use bootstrap sampling
}

# Step 3: Perform Randomized Search
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=5,                  # Fewer combinations to try
    cv=3,                      # 3-fold cross-validation
    scoring='r2',              # Optimize for R^2 score
    verbose=1,                 # Minimal verbosity
    n_jobs=-1,                 # Use all available cores
    random_state=42
)

# Step 4: Fit RandomizedSearchCV on the training data
print("Tuning hyperparameters with RandomizedSearchCV...")
random_search.fit(X_train, y_train)

# Step 5: Retrieve the best model
best_model = random_search.best_estimator_
print(f"Best Parameters: {random_search.best_params_}")

# Step 6: Train the best model
print("Training the best model...")
best_model.fit(X_train, y_train)
print("Model training completed.")

# Step 7: Evaluate the model on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Step 9: Test the model's sensitivity to `weather_summary`
print("\nTesting sensitivity to weather_summary:")
for weather in ['Hot', 'Cold', 'Windy', 'Snowy']:
    future_data['weather_summary'] = weather
    future_data_transformed = processed_data['preprocessor'].transform(future_data)
    predicted_form_score = best_model.predict(future_data_transformed)
    print(f"Weather: {weather}, Predicted Form Score: {predicted_form_score[0]}")


Tuning hyperparameters with RandomizedSearchCV...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}
Training the best model...
Model training completed.
Mean Squared Error: 8.739884731629014e-05
R^2 Score: 0.8771290817007901

Testing sensitivity to weather_summary:
Weather: Hot, Predicted Form Score: 0.5585883522299104
Weather: Cold, Predicted Form Score: 0.558512026476826
Weather: Windy, Predicted Form Score: 0.558512026476826
Weather: Snowy, Predicted Form Score: 0.558512026476826


### Modelling using XGB Regressor

In [26]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Assuming X and y are your feature matrix and target vector
categorical_features = ['weather_summary', 'League', 'Team', 'Season', 'Position']  # Add any additional categorical columns here
numerical_features = [col for col in X.columns if col not in categorical_features]

# Ensure all categorical columns exist in X and are strings
for col in categorical_features:
    if col in X.columns:
        X[col] = X[col].astype(str)

# Step 1: Preprocess the data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ]
)

# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

# Split the processed data
X_train_processed, X_test_processed, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Step 2: Initialize the XGBoost Regressor model
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42, enable_categorical=True)

# Step 3: Define Cross-Validation Strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Custom RMSE scorer for cross-validation
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False)

print("Performing cross-validation...")
cv_rmse_scores = cross_val_score(xgb_model, X_train_processed, y_train, cv=cv, scoring=rmse_scorer)
cv_r2_scores = cross_val_score(xgb_model, X_train_processed, y_train, cv=cv, scoring='r2')

# Cross-validation results
print("\nCross-Validation Results:")
print(f"Cross-validated RMSE: {-cv_rmse_scores.mean():.4f} ± {cv_rmse_scores.std():.4f}")
print(f"Cross-validated R²: {cv_r2_scores.mean():.4f} ± {cv_r2_scores.std():.4f}")

# Step 4: Train the XGBoost model on the full training data
print("Training the XGBoost Regressor model...")
xgb_model.fit(X_train_processed, y_train)
print("Model training completed.")

# Step 5: Predict on training and test sets
print("Evaluating on the training and test sets...")
y_train_pred = xgb_model.predict(X_train_processed)
y_test_pred = xgb_model.predict(X_test_processed)

# Step 6: Evaluate performance
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)

test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# Step 7: Output results
print("\nTraining Set Evaluation:")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Train R²: {train_r2:.4f}")

print("\nTest Set Evaluation:")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R²: {test_r2:.4f}")


Performing cross-validation...

Cross-Validation Results:
Cross-validated RMSE: 0.0017 ± 0.0009
Cross-validated R²: 0.9945 ± 0.0065
Training the XGBoost Regressor model...
Model training completed.
Evaluating on the training and test sets...

Training Set Evaluation:
Train RMSE: 0.0003
Train R²: 0.9999

Test Set Evaluation:
Test RMSE: 0.0034
Test R²: 0.9834
