In [11]:
import pandas as pd

df = pd.read_csv('../../data/players_data.csv')

df = df[(df["Name"] == "Jack Marriott")].reset_index(drop=True)

df.head()

Unnamed: 0,Name,Age,Rating,League,Season,Appearances,Mins,Position,Alt_Pos,Height,...,Key_Passes,Pass_Percentage,Dribbles,Tackles,Interceptions,Fouls,Clearances,Blocks,Ground_Duels_Won,Aerial_Duels_Won
0,Jack Marriott,30,6.17,League one,24/25,26,796,ST,,1.73,...,0.11,72.73,2.71,0.79,0.23,1.81,0.57,0.11,3.96,1.02
1,Jack Marriott,29,6.5,League one,23/24,24,1614,ST,,1.73,...,0.71,72.52,1.48,1.12,0.18,1.42,0.24,0.06,2.95,0.35
2,Jack Marriott,28,6.59,League one,22/23,40,2230,ST,,1.73,...,0.71,68.48,1.09,0.88,0.38,1.72,0.13,0.04,2.61,0.59


In [12]:
# Convert Season format: "22/23" → 2022
df['Season'] = df['Season'].apply(lambda x: int("20" + x[:2]))

# Target columns to forecast
target_columns = [
    'Rating', 'Goals', 'Assists', 'Avg_Passes', 'Key_Passes',
    'Pass_Percentage', 'Tackles', 'Interceptions',
    'Fouls', 'Clearances', 'Blocks',
    'Ground_Duels_Won', 'Aerial_duals_won'
]

# Forecast storage
all_predictions = []

In [13]:
from sklearn.linear_model import LinearRegression

# Loop over each player
for player in df['Name'].unique():
    player_data = df[df['Name'] == player].sort_values('Season')

    # Forecast only if we have at least 3 seasons of data
    if len(player_data) >= 3:
        X = player_data[['Season']].values
        row = {'Name': player, 'Season': 2025}

        for col in target_columns:
            if col in player_data.columns:
                y = player_data[col].values

                # Fit linear regression model
                model = LinearRegression()
                model.fit(X, y)

                # Forecast for season 2025
                forecast = model.predict([[2025]])[0]
                row[col] = round(forecast, 2)
            else:
                row[col] = None  # Handle missing columns

        all_predictions.append(row)

In [14]:
# Convert to DataFrame
forecast_df = pd.DataFrame(all_predictions)

In [15]:
from IPython.display import display
display(forecast_df)

Unnamed: 0,Name,Season,Rating,Goals,Assists,Avg_Passes,Key_Passes,Pass_Percentage,Tackles,Interceptions,Fouls,Clearances,Blocks,Ground_Duels_Won,Aerial_duals_won
0,Jack Marriott,2025,6.0,1.67,-2.67,13.61,-0.09,75.49,0.84,0.11,1.74,0.75,0.14,4.52,


In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from IPython.display import display

FULL_SEASON_MINS = 4140  # minutes in a full season

df = pd.read_csv('../../data/players_data.csv')

df = df[(df["Name"] == "Jack Marriott")].reset_index(drop=True)

# --- Normalize Goals & Assists to a full season (4140 mins) ---
# Try to find a minutes column, else compute from Games * Mins_per_Game if available.
minute_col = None
for cand in ['Minutes', 'Mins', 'Min', 'Minutes_Played', 'Mins_Played']:
    if cand in df.columns:
        minute_col = cand
        break

temp_minutes_col = None
if minute_col is None:
    if 'Games' in df.columns and 'Mins_per_Game' in df.columns:
        temp_minutes_col = '__minutes_tmp__'
        df[temp_minutes_col] = df['Games'].astype(float) * df['Mins_per_Game'].astype(float)
        minute_col = temp_minutes_col
    else:
        raise ValueError(
            "Couldn't find a minutes column. Please add one (e.g., 'Minutes'/'Mins') "
            "or provide both 'Games' and 'Mins_per_Game'."
        )

mins = df[minute_col].astype(float)
# Avoid division by zero or negative values; set invalid scales to NaN
scale = FULL_SEASON_MINS / mins.replace(0, np.nan)
scale[~np.isfinite(scale)] = np.nan

for stat in ['Goals', 'Assists']:
    if stat in df.columns:
        df[stat] = (df[stat].astype(float) * scale).round(2)

# Clean up temp column if we created it
if temp_minutes_col is not None and temp_minutes_col in df.columns:
    df.drop(columns=[temp_minutes_col], inplace=True)

# --- Convert Season format: "22/23" → 2022 ---
df['Season'] = df['Season'].apply(lambda x: int("20" + str(x)[:2]))

# --- Target columns to forecast ---
target_columns = [
    'Rating', 'Goals', 'Assists', 'Avg_Passes', 'Key_Passes',
    'Pass_Percentage', 'Tackles', 'Interceptions',
    'Fouls', 'Clearances', 'Blocks',
    'Ground_Duels_Won', 'Aerial_duals_won'
]

# --- Forecast storage ---
all_predictions = []

# --- Loop over each player ---
for player in df['Name'].unique():
    player_data = df[df['Name'] == player].sort_values('Season')

    # Forecast only if we have at least 3 seasons of data
    if len(player_data) >= 3:
        X = player_data[['Season']].values
        row = {'Name': player, 'Season': 2025}

        for col in target_columns:
            if col in player_data.columns:
                y = player_data[col].values.astype(float)

                model = LinearRegression()
                model.fit(X, y)

                forecast = model.predict([[2025]])[0]
                row[col] = round(float(forecast), 2)
            else:
                row[col] = None  # Handle missing columns

        all_predictions.append(row)

# --- Convert to DataFrame and display ---
forecast_df = pd.DataFrame(all_predictions)
display(forecast_df)


Unnamed: 0,Name,Season,Rating,Goals,Assists,Avg_Passes,Key_Passes,Pass_Percentage,Tackles,Interceptions,Fouls,Clearances,Blocks,Ground_Duels_Won,Aerial_duals_won
0,Jack Marriott,2025,6.0,31.04,-4.95,13.61,-0.09,75.49,0.84,0.11,1.74,0.75,0.14,4.52,
