In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:

# --- Run configuration ------------------------------------------------------
CSV_PATH = "../../data/midfielder_performance_forecast.csv"
PREDICT_YEAR = 2025  # start year of the season to forecast: 2025 → "25/26"
MIN_POINTS_PER_STAT = 3  # fewest usable (year, value) pairs needed to fit a stat

# Every stat that gets its own per-player forecast.
target_columns = [
    "Rating", "Appearances", "Mins", "Goals", "Assists",
    "Shots/90", "xG/90",
    "Passes/90", "Key_Passes/90", "Pass_Percentage",
    "Dribbles/90", "Dribble_Success_Rate",
    "Fouls/90", "Tackles/90", "Interceptions/90", "Ground_Duels_Won/90",
]

# Heuristic Championship → League One corrections applied to the raw forecasts.
# A bare number is a multiplicative factor; an ("add", x) tuple is an additive shift.
# "Appearances" and "Mins" are deliberately absent (fixture variation / team usage
# dependent), so they pass through unadjusted.
league_adjustments = {
    "Rating": 1.05,
    "Goals": 1.20,
    "Assists": 1.15,
    "Shots/90": 1.10,
    "xG/90": 1.10,
    "Passes/90": 1.05,
    "Key_Passes/90": 1.10,
    "Pass_Percentage": ("add", 2.0),
    "Dribbles/90": 1.10,
    "Dribble_Success_Rate": 1.10,
    "Fouls/90": 0.95,
    "Tackles/90": 0.90,
    "Interceptions/90": 0.90,
    "Ground_Duels_Won/90": 1.05,
}

# Columns clipped to the 0–100 range after adjustment.
percentage_cols = ["Pass_Percentage", "Dribble_Success_Rate"]

# Columns clipped at zero after adjustment: every target except "Rating" and the
# percentage columns, kept in target_columns order.
non_negative_cols = [
    stat
    for stat in target_columns
    if stat != "Rating" and stat not in percentage_cols
]

In [None]:
def season_to_year(season_val):
    """
    Convert a season label to the year in which that season starts.

    Examples: '20/21' → 2020, '2021/22' → 2021, '99/00' → 1999,
    ' 2020/21 ' → 2020, '2021.0' → 2021, 2021.0 → 2021.

    Parameters
    ----------
    season_val : str | int | float | None
        A 'YY/YY' or 'YYYY/YY' label, or something already numeric-like.

    Returns
    -------
    int
        The starting year. Two-digit years >= 90 are read as 19xx, all other
        two-digit years as 20xx.
    float
        np.nan when the value is missing (None, NaN or a blank string), so a
        row without a season is dropped by finite-value masks downstream
        instead of aborting the whole conversion.

    Raises
    ------
    ValueError
        If a non-blank string cannot be read as a whole-number year.
    """
    # Missing values: propagate NaN rather than crashing inside int().
    if season_val is None:
        return np.nan
    if isinstance(season_val, (float, np.floating)) and np.isnan(season_val):
        return np.nan

    if isinstance(season_val, str):
        text = season_val.strip()
        if not text:
            return np.nan

        if "/" in text:
            # Strip before measuring the length: a padded ' 2020' would
            # otherwise miss the four-digit check and be treated as a
            # two-digit year (1900 + 2020).
            first = text.split("/")[0].strip()
            if len(first) == 4:
                return int(first)
            yr = int(first)
            return 1900 + yr if yr >= 90 else 2000 + yr

        try:
            return int(text)
        except ValueError:
            # Numeric-like strings such as '2021.0' (e.g. a float column that
            # was written back out as text).
            number = float(text)
            if not number.is_integer():
                raise ValueError(f"Season is not a whole year: {season_val!r}")
            return int(number)

    return int(season_val)

def year_to_season_str(year):
    """Format a season's starting year as a 'YY/YY' label, e.g. 2025 → '25/26'."""
    # Both halves are reduced modulo 100 so a century rollover reads '99/00'.
    start_two, end_two = year % 100, (year + 1) % 100
    return "{:02d}/{:02d}".format(start_two, end_two)

def fit_and_predict_year(x_years, y_values, predict_year):
    """
    Extrapolate one stat to ``predict_year`` with a straight-line fit.

    Pairs in which either the year or the value is non-finite are discarded.
    If fewer than MIN_POINTS_PER_STAT pairs survive, np.nan is returned;
    otherwise a LinearRegression is fitted on the survivors and its prediction
    for ``predict_year`` is returned as a plain float.
    """
    usable = np.isfinite(x_years) & np.isfinite(y_values)
    features = x_years[usable].reshape(-1, 1)  # sklearn wants a 2-D feature matrix
    targets = y_values[usable]

    if len(targets) < MIN_POINTS_PER_STAT:
        return np.nan

    regressor = LinearRegression().fit(features, targets)
    prediction = regressor.predict([[predict_year]])
    return float(prediction[0])

In [None]:
# Load the table, trim stray whitespace from the header names, and derive the
# numeric training year from the Season label.
df = (
    pd.read_csv(CSV_PATH)
    .rename(columns=str.strip)
    .assign(Season_year=lambda frame: frame["Season"].apply(season_to_year))
)

# Force every target stat that is present to a numeric dtype; values that
# cannot be parsed become NaN. Targets missing from the file are skipped.
for col in target_columns:
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [None]:
# Build one forecast row per player: identity fields plus a linear-trend
# prediction for every target stat.
all_predictions = []
forecast_season = year_to_season_str(PREDICT_YEAR)  # identical for every row

# groupby splits the frame once instead of re-filtering it for each player.
# sort=False keeps players in order of first appearance, and rows with a
# missing Name are excluded by groupby's default dropna behaviour.
for player, player_data in df.groupby("Name", sort=False):
    # sort by numeric season
    player_data = player_data.sort_values("Season_year")

    # Prepare row with identity fields
    row = {
        "Name": player,
        "Season": forecast_season,
        "Season_year": PREDICT_YEAR,
        "League": "League One",  # target league
    }

    # Train per stat
    x_years = player_data["Season_year"].to_numpy(dtype=float)
    for col in target_columns:
        if col not in player_data.columns:
            # Stat absent from the input file: keep the column, leave it empty.
            row[col] = np.nan
            continue

        y_vals = player_data[col].to_numpy(dtype=float)
        row[col] = fit_and_predict_year(x_years, y_vals, PREDICT_YEAR)

    all_predictions.append(row)

# Naming the columns explicitly keeps the schema stable even when the input
# contains no players at all (an empty list would otherwise give a frame with
# no columns).
forecast_df = pd.DataFrame(
    all_predictions,
    columns=["Name", "Season", "Season_year", "League", *target_columns],
)

In [None]:
# Apply Championship to League One adjustments
#
# NOTE: this cell rebinds `forecast_df` to the adjusted table, so running it a
# second time would scale the already-adjusted numbers again. Re-run the
# forecasting cell first if this cell needs to be repeated.

def _apply_league_adjustment(values, adj):
    """
    Apply a single `league_adjustments` entry to a column of forecasts.

    adj is either a plain multiplicative factor or an ("add", amount) tuple.
    Any other tuple raises ValueError rather than being multiplied into the
    data, which would fail obscurely or corrupt the column.
    """
    if isinstance(adj, tuple):
        if len(adj) < 2 or adj[0] != "add":
            raise ValueError(f"Unsupported league adjustment: {adj!r}")
        return values + adj[1]
    return values * adj


# Work on a copy and rebind `forecast_df` only at the very end, so an error
# part-way through cannot leave the table half-adjusted.
adjusted_df = forecast_df.copy()

for col, adj in league_adjustments.items():
    if col in adjusted_df.columns:
        adjusted_df[col] = _apply_league_adjustment(adjusted_df[col], adj)

# Clip percentages and non-negatives
for col in percentage_cols:
    if col in adjusted_df.columns:
        adjusted_df[col] = adjusted_df[col].clip(lower=0, upper=100)

for col in non_negative_cols:
    if col in adjusted_df.columns:
        adjusted_df[col] = adjusted_df[col].clip(lower=0)

# Round numeric columns nicely: percentages to 1 dp, everything else to 2 dp.
# The text identity columns are skipped.
round_map = {
    col: (1 if col in percentage_cols else 2)
    for col in adjusted_df.columns
    if col not in ["Name", "Season", "League"]
}
if round_map:
    adjusted_df = adjusted_df.round(round_map)

# Order columns: identity fields first, then the stats in target_columns order.
front_cols = ["Name", "Season", "Season_year", "League"]
rest_cols = [c for c in target_columns if c in adjusted_df.columns]
forecast_df = adjusted_df[[c for c in front_cols if c in adjusted_df.columns] + rest_cols]