### Machine Learning Comparison

A tool designed to find the factors which relate to a player scoring a try.

In [None]:
import sys
sys.path.append("..")
import ENVIRONMENT_VARIABLES as EV 

import json
import pandas as pd

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier


In [None]:

# Install the third-party packages via pip from inside the notebook.
# NOTE(review): this cell sits after the cell that imports xgboost, so on a
# fresh environment it has to be executed before that import cell.
!python -m pip install imblearn
!python -m pip install xgboost
!python -m pip install catboost

## Select Round and Team

In [None]:
# Notebook configuration: edit these values before running the cells below.

# Competition key and season; both are interpolated into the statistics file path.
SELECTION, YEAR = "NRL", 2024

# Number of round slots that are iterated / turned into per-round columns.
ROUNDS = 31

# Team and round used by the prediction cells at the end of the notebook.
selected_team, prediction_round = "Knights", 22

# Gates the data-loading and training cells. Original guidance: switch this off
# once the model has been trained, so the variables above can be changed.
TRAIN_MODEL = True

## Importing the Player Data

In [None]:

# Team-level column names (not referenced by the other visible cells).
variables = [
    "Year",
    "Win",
    "Defense",
    "Attack",
    "Margin",
    "Home",
    "Versus",
    "Round",
]

# Per-player fields read from each player record. The trailing "Versus" entry is
# not read from the record: the round loop slices it off and appends the
# opponent's index in TEAMS instead.
player_variables = [
    "Name",
    "Position",
    "Points",
    "Tries",
    "All Run Metres",
    "Tackle Breaks",
    "Tackle Efficiency",
    "Kicking Metres",
    "Offloads",
    "All Runs",
    "Line Breaks",
    "Post Contact Metres",
    "Dummy Half Runs",
    "Passes",
    "Receipts",
    "Errors",
    "Sin Bins",
    "Versus",
]

In [None]:
# Team names come from the shared ENVIRONMENT_VARIABLES module. Their order
# matters: TEAMS.index(...) is later used to encode the opposing team as an int.
TEAMS = EV.TEAMS

In [None]:
if TRAIN_MODEL:
    # Load the season's player statistics into years_arr, keyed by year.
    years_arr = {}
    stats_path = f'../data/{SELECTION}/{YEAR}/{SELECTION}_player_statistics_{YEAR}.json'

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding (player names may contain non-ASCII characters).
    with open(stats_path, 'r', encoding='utf-8') as file:
        data = json.load(file)['PlayerStats']

    # The first 'PlayerStats' entry is indexed by the year as a string.
    years_arr[YEAR] = data[0][str(YEAR)]
    

In [None]:
# Empty frame with one "<team> <stat>" column per team/stat pairing
# (all stats for the first team, then all stats for the next, and so on).
df = pd.DataFrame(
    columns=[
        f"{team_name} {stat_name}"
        for team_name in TEAMS
        for stat_name in player_variables
    ]
)

In [None]:
if TRAIN_MODEL:

    def custom_sort(column_name):
        """Turn a "<year>-<round>" column label into an (int, int) sort key."""
        year_part, round_part = column_name.split('-')
        return (int(year_part), int(round_part))

    # One empty frame per team with a "<YEAR>-<round>" column for rounds 1..ROUNDS,
    # columns ordered numerically by (year, round).
    p_dfs = {}
    round_labels = [f"{YEAR}-{round_no}" for round_no in range(1, ROUNDS + 1)]

    for team in TEAMS:
        empty_frame = pd.DataFrame(columns=round_labels)
        ordered_labels = sorted(empty_frame.columns, key=custom_sort)
        p_dfs[team] = empty_frame[ordered_labels]
        

In [None]:
if TRAIN_MODEL:
    # Walk every round of the season: de-duplicate each game's player records,
    # derive team scores from player points, maintain a running ladder, and store
    # each team's per-player stat rows for the round in p_dfs.
    team_players = {}  # NOTE(review): not used by any other visible cell
    ladder = {}  # Running totals per team: competition points, points for/against, games

    for i in range(0, ROUNDS):
        try:
            # The i-th entry of the season list is indexed by str(i) to get the round's games.
            round_data = years_arr[YEAR][i]
            round_data = round_data[str(i)]
            # Ladder positions are rebuilt per round (the ladder totals themselves persist).
            round_ladder_snapshot = {}

            for round_game in round_data:
                for game in round_game:
                    # `game` is the mapping key; presumably shaped like
                    # "<year>-<round>-<home words>-v-<away words>" - TODO confirm.
                    game_split = game.split("-")
                    game_year = game_split[0]
                    game_round = game_split[1]

                    # NOTE(review): split("v") splits on every letter "v", so a team
                    # name containing "v" would be parsed incorrectly.
                    game_split = game.split("v")
                    home_team = " ".join(game_split[0].split("-")[2:]).replace("-", " ").strip()
                    away_team = " ".join(game_split[-1:]).replace("-", " ").strip()

                    players = round_game[game]

                    # Remove exact duplicate player records, keeping first occurrences in order
                    seen = set()
                    unique_dicts = []
                    for d in players:
                        items = tuple(sorted(d.items()))
                        if items not in seen:
                            seen.add(items)
                            unique_dicts.append(d)

                    players = unique_dicts

                    # Team scores are the summed player points; every '-' in a Points
                    # string is replaced by '0' before conversion. The first 18 records
                    # are treated as the home side and the remainder as the away side
                    # (assumes the feed always lists exactly 18 home players first - TODO confirm).
                    home_team_points = sum([(int(x['Points'].replace('-', '0'))) for x in players[:18]])
                    away_team_points = sum([(int(x['Points'].replace('-', '0'))) for x in players[18:]])

                    # Update running for/against/games totals for both sides
                    for team, points_for, points_against in [
                        (home_team, home_team_points, away_team_points),
                        (away_team, away_team_points, home_team_points)
                    ]:
                        if team not in ladder:
                            ladder[team] = {
                                'points': 0,
                                'for': 0,
                                'against': 0,
                                'games': 0
                            }

                        ladder[team]['for'] += points_for
                        ladder[team]['against'] += points_against
                        ladder[team]['games'] += 1

                    # Assign match result points (Win = 2, Draw = 1)
                    if home_team_points > away_team_points:
                        ladder[home_team]['points'] += 2
                    elif away_team_points > home_team_points:
                        ladder[away_team]['points'] += 2
                    else:
                        ladder[home_team]['points'] += 1
                        ladder[away_team]['points'] += 1

                    # Rank teams by competition points, then by total points scored ('for').
                    # NOTE(review): 'against' is accumulated above but is not part of this
                    # sort key - confirm whether a points-differential tie-break was intended.
                    # NOTE(review): the snapshot is taken after this game's result has been
                    # applied, so the positions used below already include the current game.
                    ladder_snapshot = sorted(
                        ladder.items(),
                        key=lambda x: (x[1]['points'], x[1]['for']),
                        reverse=True
                    )

                    for pos, (team, _) in enumerate(ladder_snapshot, start=1):
                        round_ladder_snapshot[team] = pos  # 1-based position for every team seen so far

                    # Log the game once the positions for both sides are known
                    print(
                        f"> {home_team} (Ladder Pos: {round_ladder_snapshot[home_team]}): {home_team_points} "
                        f"v {away_team} (Ladder Pos: {round_ladder_snapshot[away_team]}): {away_team_points} "
                        f"- {game_year} - Round {game_round}"
                    )

                    # Build one stat list per player, keyed by player name.
                    # NOTE(review): two records with the same name in one game overwrite
                    # each other here, which would also shift the 18/rest split below.
                    player_round_stats = {}

                    for idx, player in enumerate(players):
                        # Same positional rule as above: index 18 onwards is the away side
                        versus = home_team if idx >= 18 else away_team
                        team = away_team if idx >= 18 else home_team

                        # All fields except the trailing "Versus" are read from the record
                        vals = [player[val] for val in player_variables[:-1]]
                        vals.append(TEAMS.index(versus))  # versus team index
                        vals.append(round_ladder_snapshot[team])      # team ladder pos
                        vals.append(round_ladder_snapshot[versus])    # opponent ladder pos

                        # vals[0] is the player's name; the rest become the stored stats
                        player_round_stats[vals[0]] = vals[1:]

                    # Store (name, stats) pairs in the "<YEAR>-<round>" column of each team.
                    # NOTE(review): pandas generally requires an assigned list to match the
                    # frame's existing length; a differing player count would raise and be
                    # caught by the handler below - confirm.
                    player_round_stats = list(player_round_stats.items())
                    player_round_stats_home, player_round_stats_away = player_round_stats[:18], player_round_stats[18:]
                    p_dfs[home_team][f"{YEAR}-{i+1}"] = player_round_stats_home
                    p_dfs[away_team][f"{YEAR}-{i+1}"] = player_round_stats_away

        # The try wraps the whole round, so an error in one game skips the rest of
        # that round's games; processing continues with the next round.
        except Exception as ex:
            print(f"Error in round {i}: {ex}")


## Prepare and Clean Data from All Teams

In [None]:
if TRAIN_MODEL:
    # Reshape each team's wide frame (one column per round) into long form
    # (one row per player appearance) and expand the stored stats into columns.
    all_long_dfs = []

    for team, df in p_dfs.items():
        long_df = df.melt(ignore_index=False, var_name="Round", value_name="PlayerStats").dropna()

        if long_df.empty:
            # A team with no recorded games would yield a 0-column frame from the
            # expansion below; assigning that to two columns raises ValueError,
            # so such teams are skipped.
            continue

        # Each PlayerStats cell is a (name, stats-list) pair
        long_df[['Name', 'Stats']] = pd.DataFrame(long_df['PlayerStats'].tolist(), index=long_df.index)

        # Order must match the stats list built in the round loop. Note that
        # 'Home Pos' holds the player's own team ladder position and 'Away Pos'
        # the opponent's, whichever side was actually playing at home.
        stat_cols = ["Position", "Points", "Tries", "All Run Metres", "Tackle Breaks", "Tackle Efficiency", "Kicking Metres", "Offloads", 'All Runs', 'Line Breaks', 'Post Contact Metres','Dummy Half Runs', 'Passes','Receipts', 'Errors', 'Sin Bins','Versus', 'Home Pos', 'Away Pos']
        long_df[stat_cols] = pd.DataFrame(long_df['Stats'].tolist(), index=long_df.index)

        def clean_stat(val):
            """Convert a raw stat value to a float, or NaN when it is not numeric.

            Strings are stripped first; an empty string or a lone '-' placeholder
            becomes NaN, and any '%' characters are removed before conversion
            (so "85.5%" becomes 85.5). Values that cannot be converted - including
            malformed percentages, None and other non-numeric objects - yield NaN
            instead of raising.
            """
            if isinstance(val, str):
                val = val.strip()
                if val in ('', '-'):
                    return np.nan
                val = val.replace('%', '')
            try:
                return float(val)
            except (TypeError, ValueError, OverflowError):
                # Only conversion failures are treated as missing data; anything
                # else (e.g. KeyboardInterrupt) is allowed to propagate.
                return np.nan

        for col in stat_cols:
            if col != 'Position':
                long_df[col] = long_df[col].apply(clean_stat)

        long_df['Round'] = long_df['Round'].str.extract(r'(\d+)$').astype(int)
        long_df['Team'] = team
        all_long_dfs.append(long_df)

    # Combine all team DataFrames
    long_df_all = pd.concat(all_long_dfs).reset_index(drop=True)


In [None]:
# Spot-check: first five rows recorded for one named player.
long_df_all.loc[long_df_all['Name'] == 'Kalyn Ponga'].head(5)

## Fill in Missing Values

In [None]:
if TRAIN_MODEL:
    # Stats with no recorded value in these columns are treated as zero.
    fill_cols = [
        'Tries', 'Points', 'Kicking Metres', 'Offloads', 'All Runs', 'Line Breaks',
        'Post Contact Metres', 'Dummy Half Runs', 'Passes', 'Receipts', 'Errors', 'Sin Bins',
    ]
    long_df_all[fill_cols] = long_df_all[fill_cols].fillna(0)

    # Training target: 1 when the player scored at least one try, otherwise 0.
    long_df_all['DidScoreTry'] = long_df_all['Tries'].gt(0).astype('int64')


In [None]:

# Map each distinct position label to an integer code; the fitted encoder is
# kept in `position_encoder`.
position_encoder = LabelEncoder()
position_encoder.fit(long_df_all['Position'])
long_df_all['Position'] = position_encoder.transform(long_df_all['Position'])


## Prepare Training Data

In [None]:
if TRAIN_MODEL:
    # Columns fed to the model.
    features = [
        'Home Pos', 'Away Pos', "All Run Metres", "Tackle Breaks", "Position",
        "Kicking Metres", "Offloads", 'All Runs', 'Line Breaks', 'Post Contact Metres',
        'Dummy Half Runs', 'Passes', 'Receipts', 'Errors',
    ]

    # Train only on rounds strictly before the round being predicted.
    is_earlier_round = long_df_all['Round'] < prediction_round
    train_df = long_df_all.loc[is_earlier_round].copy()
    train_df['DidScoreTry'] = train_df['Tries'].gt(0).astype('int64')

    # Drop rows missing any feature or the label, then split into inputs and target.
    train_data = train_df[[*features, 'DidScoreTry']].dropna()
    X, y = train_data[features], train_data['DidScoreTry']



In [None]:
# Display the training feature matrix for inspection.
X

## Train Model

In [None]:
from catboost import CatBoostClassifier

if TRAIN_MODEL:
    # Stratified hold-out split with a fixed seed (split size left at the library default).
    X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=42)

    # Hyper-parameters are collected in one place and unpacked into the classifier.
    # NOTE(review): early stopping is requested here (100) and again in fit() (50);
    # confirm which value is intended.
    catboost_params = dict(
        iterations=1000,
        learning_rate=0.01,      # small step size
        depth=8,                 # tree depth
        l2_leaf_reg=3,
        loss_function='Logloss',
        eval_metric='AUC',
        class_weights=[1, 4],    # the try-scoring class is weighted 4x
        random_seed=42,
        verbose=100,
        early_stopping_rounds=100,
    )
    model = CatBoostClassifier(**catboost_params)

    # The validation split is used as the evaluation set during fitting.
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)


## Evaluate on Validation Set

In [None]:
from sklearn.metrics import (
    accuracy_score, roc_auc_score, roc_curve,
    confusion_matrix, ConfusionMatrixDisplay,
    precision_recall_curve, PrecisionRecallDisplay
)
import matplotlib.pyplot as plt

# Validation-set outputs: positive-class probability and hard class labels.
y_val_prob = model.predict_proba(X_val)[:, 1]
y_val_pred = model.predict(X_val)

# Headline metrics: accuracy from the labels, AUC from the probabilities.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_auc = roc_auc_score(y_val, y_val_prob)

for report_line in (
    f"🔍 Model Evaluation on Validation Set (All Teams)",
    f"Accuracy: {val_accuracy:.2f}",
    f"AUC: {val_auc:.2f}",
):
    print(report_line)


## Model Tuning

In [None]:
from sklearn.metrics import f1_score, precision_recall_curve

# Precision and recall at every candidate probability cut-off on the validation set.
precision, recall, thresholds = precision_recall_curve(y_val, y_val_prob)

# F1 per cut-off; the small epsilon keeps the division defined when both terms are zero.
f1_numerator = 2 * (precision * recall)
f1_denominator = precision + recall + 1e-8
f1 = f1_numerator / f1_denominator

# Cut-off with the highest F1.
# NOTE(review): later visible cells call model.predict rather than applying this threshold.
best_idx = np.argmax(f1)
best_threshold = thresholds[best_idx]

print(f"Best Threshold: {best_threshold:.2f}")
print(f"F1 Score at Best Threshold: {f1[best_idx]:.2f}")


In [None]:
# Score every team's players for the prediction round in a single pass.
in_prediction_round = long_df_all['Round'] == prediction_round
all_teams_round_df = long_df_all.loc[in_prediction_round].copy()
all_teams_round_df['Tries'] = all_teams_round_df['Tries'].fillna(0)
all_teams_round_df = all_teams_round_df.dropna(subset=features)

if all_teams_round_df.empty:
    print(f"\n❌ Overall - Round {prediction_round}: No valid data")
else:
    X_test_all = all_teams_round_df[features]
    all_teams_round_df['Try_Prob'] = model.predict_proba(X_test_all)[:, 1]
    all_teams_round_df['DidScoreTry'] = all_teams_round_df['Tries'].gt(0).astype('int64')
    y_pred_all = model.predict(X_test_all)

    try:
        accuracy_all = accuracy_score(all_teams_round_df['DidScoreTry'], y_pred_all)
        auc_all = roc_auc_score(all_teams_round_df['DidScoreTry'], all_teams_round_df['Try_Prob'])
        print(f"\n🔎 Overall - Round {prediction_round} Accuracy: {accuracy_all:.2f}, AUC: {auc_all:.2f}")
    except ValueError:
        # Reported when the metric calls reject the labels
        print(f"\n⚠️ Overall - Round {prediction_round}: Not enough class variety to calculate AUC")


In [None]:
print(f"\n📊 Per-Team Evaluation:")

for selected_team_ in TEAMS:
    # Rows for this team in the prediction round
    is_team_round = (long_df_all['Team'] == selected_team_) & (long_df_all['Round'] == prediction_round)
    team_round_df = long_df_all.loc[is_team_round].copy()

    # Missing try counts count as zero; rows lacking any feature are dropped
    team_round_df['Tries'] = team_round_df['Tries'].fillna(0)
    team_round_df = team_round_df.dropna(subset=features)

    # Nothing to score for this team: report it and move on
    if team_round_df.empty:
        print(f"❌ {selected_team_} - Round {prediction_round}: No data (possibly a bye or all rows invalid)")
        continue

    X_test = team_round_df[features]
    team_round_df['Try_Prob'] = model.predict_proba(X_test)[:, 1]
    team_round_df['DidScoreTry'] = team_round_df['Tries'].gt(0).astype('int64')

    y_pred = model.predict(X_test)

    try:
        accuracy = accuracy_score(team_round_df['DidScoreTry'], y_pred)
        auc = roc_auc_score(team_round_df['DidScoreTry'], team_round_df['Try_Prob'])
        print(f"🟢 {selected_team_} - Round {prediction_round} Accuracy: {accuracy:.2f}, AUC: {auc:.2f}")
    except ValueError:
        print(f"⚠️ {selected_team_} - Round {prediction_round}: Not enough class variety to calculate AUC")


In [None]:
# ROC curve on the validation set, with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_val, y_val_prob)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"AUC = {val_auc:.2f}")
ax.plot([0, 1], [0, 1], 'k--', alpha=0.6)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Precision-recall curve on the validation set. This rebinds the module-level
# `precision` and `recall` names; the thresholds are discarded here.
precision, recall, _ = precision_recall_curve(y_val, y_val_prob)
pr_disp = PrecisionRecallDisplay(precision=precision, recall=recall)
pr_disp.plot()
# Title and layout are applied to the current figure after the display has been plotted
plt.title('Precision-Recall Curve')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the model's importance score for each training column.
feature_names = X.columns
feature_importances = model.get_feature_importance()

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_names, feature_importances)
ax.set_title("Feature Importance")
ax.invert_yaxis()  # first feature at the top
plt.show()


## Predict

In [None]:
# Rows for the configured team in the configured round.
is_selected = (long_df_all['Team'] == selected_team) & (long_df_all['Round'] == prediction_round)
team_round_df = long_df_all.loc[is_selected].copy()

# Missing try counts count as zero; rows lacking any model feature are dropped.
team_round_df['Tries'] = team_round_df['Tries'].fillna(0)
team_round_df = team_round_df.dropna(subset=features)

# Attach the predicted try probability and the actual outcome flag.
X_test = team_round_df[features]
team_round_df['Try_Prob'] = model.predict_proba(X_test)[:, 1]
team_round_df['DidScoreTry'] = team_round_df['Tries'].gt(0).astype('int64')


In [None]:
# Show the first row to check the columns of the prediction frame.
team_round_df.head(1)

In [None]:
# Players of the selected team ranked from most to least likely to score a try.
team_round_df.sort_values(by='Try_Prob', ascending=False)[['Name', 'Try_Prob', 'DidScoreTry']]


In [None]:
# Display the full prediction frame for the selected team and round.
team_round_df

## Evaluate

In [None]:

# Score the selected team's predictions for the chosen round.
# Metrics are printed only when they were actually computed in this cell, so a
# failed calculation can no longer fall through to a NameError or to stale
# `accuracy` / `auc` values left behind by the per-team evaluation loop.
if team_round_df.empty:
    # No rows survived filtering for this team and round, so there is nothing to score.
    print('The team you are trying to predict for likely had a bye')
else:
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(team_round_df['DidScoreTry'], y_pred)
    print(f"{selected_team} - Round {prediction_round} Accuracy: {accuracy:.2f}")

    try:
        auc = roc_auc_score(team_round_df['DidScoreTry'], team_round_df['Try_Prob'])
    except ValueError:
        # AUC cannot be computed when the true labels contain a single class
        # (e.g. nobody in the team scored); accuracy above is still valid.
        print(f"{selected_team} - Round {prediction_round} AUC: undefined (only one outcome class present)")
    else:
        print(f"{selected_team} - Round {prediction_round} AUC: {auc:.2f}")
