In [1]:
import pandas as pd

df = pd.read_csv('../../data/players_data.csv')

df = df[(df["Name"] == "Will Keane")].reset_index(drop=True)

df.head()

Unnamed: 0,Name,Age,Rating,League,Season,Appearances,Mins,Position,Alt_Pos,Height,...,Key_Passes,Pass_Percentage,Dribbles,Tackles,Interceptions,Fouls,Clearances,Blocks,Ground_Duels_Won,Aerial_Duels_Won
0,Will Keane,32,6.11,Championship,24/25,27,1122,ST,,1.88,...,0.39,81.44,0.63,0.47,0.16,1.73,1.26,0.08,2.36,1.02
1,Will Keane,31,6.93,Championship,23/24,38,2630,ST,,1.88,...,0.48,77.93,0.99,0.99,0.34,1.57,0.82,0.14,3.42,1.27
2,Will Keane,30,6.82,Championship,22/23,43,3288,ST,,1.88,...,0.65,81.05,1.53,1.16,0.28,1.19,0.94,0.17,4.26,1.51
3,Will Keane,29,7.09,League one,21/22,44,3745,ST,,1.88,...,0.76,81.83,0.76,1.01,0.38,1.43,1.08,0.08,4.18,1.96
4,Will Keane,28,6.84,League one,20/21,32,2160,ST,,1.88,...,0.88,81.2,0.96,1.21,0.42,2.08,1.08,0.04,3.79,1.33


In [2]:
# Convert Season format: "22/23" → 2022
df['Season'] = df['Season'].apply(lambda x: int("20" + x[:2]))

# Target columns to forecast
target_columns = [
    'Rating', 'Goals', 'Assists', 'Avg_Passes', 'Key_Passes',
    'Pass_Percentage', 'Tackles', 'Interceptions',
    'Fouls', 'Clearances', 'Blocks',
    'Ground_Duels_Won', 'Aerial_duals_won'
]

# Forecast storage
all_predictions = []

In [3]:
from sklearn.linear_model import LinearRegression

# Loop over each player
for player in df['Name'].unique():
    player_data = df[df['Name'] == player].sort_values('Season')

    # Forecast only if we have at least 3 seasons of data
    if len(player_data) >= 3:
        X = player_data[['Season']].values
        row = {'Name': player, 'Season': 2025}

        for col in target_columns:
            if col in player_data.columns:
                y = player_data[col].values

                # Fit linear regression model
                model = LinearRegression()
                model.fit(X, y)

                # Forecast for season 2025
                forecast = model.predict([[2025]])[0]
                row[col] = round(forecast, 2)
            else:
                row[col] = None  # Handle missing columns

        all_predictions.append(row)

In [4]:
# Convert to DataFrame
forecast_df = pd.DataFrame(all_predictions)

In [5]:
from IPython.display import display
display(forecast_df)

Unnamed: 0,Name,Season,Rating,Goals,Assists,Avg_Passes,Key_Passes,Pass_Percentage,Tackles,Interceptions,Fouls,Clearances,Blocks,Ground_Duels_Won,Aerial_duals_won
0,Will Keane,2025,6.27,4.7,-0.7,18.34,0.25,79.66,0.52,0.15,1.43,1.07,0.14,2.52,


In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from IPython.display import display

FULL_SEASON_MINS = 4140  # minutes in a full season

df = pd.read_csv('../../data/players_data.csv')

df = df[(df["Name"] == "Will Keane")].reset_index(drop=True)

# --- Normalize Goals & Assists to a full season (4140 mins) ---
# Try to find a minutes column, else compute from Games * Mins_per_Game if available.
minute_col = None
for cand in ['Minutes', 'Mins', 'Min', 'Minutes_Played', 'Mins_Played']:
    if cand in df.columns:
        minute_col = cand
        break

temp_minutes_col = None
if minute_col is None:
    if 'Games' in df.columns and 'Mins_per_Game' in df.columns:
        temp_minutes_col = '__minutes_tmp__'
        df[temp_minutes_col] = df['Games'].astype(float) * df['Mins_per_Game'].astype(float)
        minute_col = temp_minutes_col
    else:
        raise ValueError(
            "Couldn't find a minutes column. Please add one (e.g., 'Minutes'/'Mins') "
            "or provide both 'Games' and 'Mins_per_Game'."
        )

mins = df[minute_col].astype(float)
# Avoid division by zero or negative values; set invalid scales to NaN
scale = FULL_SEASON_MINS / mins.replace(0, np.nan)
scale[~np.isfinite(scale)] = np.nan

for stat in ['Goals', 'Assists']:
    if stat in df.columns:
        df[stat] = (df[stat].astype(float) * scale).round(2)

# Clean up temp column if we created it
if temp_minutes_col is not None and temp_minutes_col in df.columns:
    df.drop(columns=[temp_minutes_col], inplace=True)

# --- Convert Season format: "22/23" → 2022 ---
df['Season'] = df['Season'].apply(lambda x: int("20" + str(x)[:2]))

# --- Target columns to forecast ---
target_columns = [
    'Rating', 'Goals', 'Assists', 'Avg_Passes', 'Key_Passes',
    'Pass_Percentage', 'Tackles', 'Interceptions',
    'Fouls', 'Clearances', 'Blocks',
    'Ground_Duels_Won', 'Aerial_duals_won'
]

# --- Forecast storage ---
all_predictions = []

# --- Loop over each player ---
for player in df['Name'].unique():
    player_data = df[df['Name'] == player].sort_values('Season')

    # Forecast only if we have at least 3 seasons of data
    if len(player_data) >= 3:
        X = player_data[['Season']].values
        row = {'Name': player, 'Season': 2025}

        for col in target_columns:
            if col in player_data.columns:
                y = player_data[col].values.astype(float)

                model = LinearRegression()
                model.fit(X, y)

                forecast = model.predict([[2025]])[0]
                row[col] = round(float(forecast), 2)
            else:
                row[col] = None  # Handle missing columns

        all_predictions.append(row)

# --- Convert to DataFrame and display ---
forecast_df = pd.DataFrame(all_predictions)
display(forecast_df)


Unnamed: 0,Name,Season,Rating,Goals,Assists,Avg_Passes,Key_Passes,Pass_Percentage,Tackles,Interceptions,Fouls,Clearances,Blocks,Ground_Duels_Won,Aerial_duals_won
0,Will Keane,2025,6.27,11.57,-1.63,18.34,0.25,79.66,0.52,0.15,1.43,1.07,0.14,2.52,
