In [43]:
import pandas as pd
import numpy as np
import re
import os
import glob

In [44]:
# ---------------------------------------------------------------------
# 1. Base rating per division
#    Keys are lower-case division labels; each value is the rating that
#    the initial-rating code uses as the baseline for that division.
#    Adjust the numbers however you like.
# ---------------------------------------------------------------------
division_base_ratings = {
    # --- Main league, numbered divisions ---
    "2": 8_000,
    "3": 6_000,
    "4": 4_000,
    "5": 3_000,
    "6": 2_000,
    "7": 1_800,
    "8": 1_600,
    "9": 1_400,
    "10": 1_200,
    "11": 1_000,
    "12": 800,
    "13": 600,
    "14": 400,
    "15": 200,

    # --- Named divisions ---
    "premier main": 18_000,     # top of the main league ("Div 1")
    "premier masters": 6_000,   # same base as Div 3
    "m2": 2_000,                # same base as Div 6
    "m3": 1_600,                # same base as Div 8
    "m4": 1_000,                # same base as Div 11
    "premier ladies": 6_000,    # same base as Div 3
    "l2": 2_000,                # same base as Div 6
    "l3": 1_200,                # same base as Div 10
    "l4": 800,                  # same base as Div 12
}

In [45]:
# Classify a division label as belonging to the "main", "masters" or
# "ladies" league.
def get_league_type(division_str_lower):
    """
    Return the league a division label belongs to.

    division_str_lower is expected to be the lower-case label (e.g. "7",
    "m2", "premier ladies"); membership is an exact, case-sensitive match.

    Returns one of 'main', 'masters', 'ladies', or 'unknown' when the label
    is not listed in any league.
    """
    leagues = (
        ('main', frozenset(
            ['premier main'] + [str(number) for number in range(2, 16)]
        )),
        ('masters', frozenset(['premier masters', 'm2', 'm3', 'm4'])),
        ('ladies', frozenset(['premier ladies', 'l2', 'l3', 'l4'])),
    )

    for league_name, members in leagues:
        if division_str_lower in members:
            return league_name
    return 'unknown'

In [46]:
def parse_division_from_filename(filepath):
    """
    Derive the lower-case division label from a players CSV path.

    The directory and extension are dropped, the "_players_df" marker is
    removed, and the remainder is stripped and lower-cased. When the label
    starts with digits, only that leading number is returned, so a
    sub-division suffix is discarded ("7A_players_df.csv" -> "7",
    "15b_players_df.csv" -> "15"). Labels without a leading digit are
    returned whole ("M2_players_df.csv" -> "m2").
    """
    stem = os.path.splitext(os.path.basename(filepath))[0]
    label = stem.replace("_players_df", "").strip().lower()

    leading_number = re.match(r'^(\d+)', label)
    return leading_number.group(1) if leading_number else label

In [47]:
# Helper to look up the base rating for a division label
# (numeric or named divisions).
def get_base_rating(division_str, ratings=None):
    """
    Return the base rating for a division label.

    Parameters
    ----------
    division_str : the division label, e.g. "7", "7a", "M2", "premier ladies".
        It is converted to a string, stripped and lower-cased before lookup.
    ratings : optional mapping of lower-case division label -> base rating.
        Defaults to the module-level ``division_base_ratings``.

    Returns
    -------
    The rating of the exact label if present; otherwise the rating of the
    longest known label that appears in ``division_str`` as a whole token;
    otherwise the fallback value 200.

    Notes
    -----
    The token match requires that the label is not preceded by a letter or
    digit and not followed by a digit. A plain substring test is not enough:
    "12a" contains "2" and "m2a" contains "2", so both would be given the
    Division 2 rating instead of the Division 12 / M2 rating.
    """
    if ratings is None:
        ratings = division_base_ratings

    label = str(division_str).strip().lower()

    # Exact match first.
    if label in ratings:
        return ratings[label]

    # Longest labels first so "12" wins over "2" and "premier ladies" wins
    # over any shorter label it might contain.
    for known in sorted(ratings, key=len, reverse=True):
        if not known:
            continue
        token = r'(?<![a-z0-9])' + re.escape(known) + r'(?![0-9])'
        if re.search(token, label):
            return ratings[known]

    # Default if nothing found
    return 200  # Some minimal fallback value

In [None]:
def compute_linear_median_rating(base_rating, rank_i, total_in_team, alpha=0.15):
    """
    Initial rating for the player ranked rank_i in a team of total_in_team.

    The rating is base_rating scaled linearly by rank so that:
      * the median rank, (total_in_team + 1) / 2, gets exactly base_rating;
      * rank 1 (top/best player) gets base_rating * (1 + alpha);
      * rank total_in_team (bottom player) gets base_rating * (1 - alpha).

    alpha is a fraction, not a percentage: the default 0.15 means +/-15% at
    the extremes, and alpha=0.2 means +/-20%.

    A team of one player (or fewer) has no spread, so base_rating is
    returned unchanged.
    """
    if total_in_team > 1:
        # Fractional when the team size is even.
        median_rank = (total_in_team + 1) / 2.0

        # +1 at rank 1, 0 at the median, -1 at the last rank.
        relative_position = (median_rank - rank_i) / (median_rank - 1)
        return base_rating * (1.0 + alpha * relative_position)

    return base_rating

In [49]:
def process_division_file(filepath, player_ratings, name_to_hks_set, hks_to_name_set):
    """
    Read one division CSV and assign initial ratings to players that do not
    yet have an entry in player_ratings.

    Parameters
    ----------
    filepath : path to a "<division>_players_df.csv" file. The division label
        is taken from the file name. The code reads the columns "Team",
        "Order", "Player" and "HKS_No" (a column named "HKS No." is renamed
        to "HKS_No").
    player_ratings : dict keyed by (player_name, hks_no); updated in place.
        Existing entries are never overwritten, so the first file in which a
        player appears decides that player's initial rating.
    name_to_hks_set : dict of player name -> set of HKS numbers seen;
        updated in place.
    hks_to_name_set : dict of HKS number -> set of player names seen;
        updated in place.

    Returns None. Prints a warning whenever a name is associated with more
    than one HKS number, or an HKS number with more than one name.

    Rows whose player name is missing (NaN/None) or blank are skipped. If a
    row's "Order" is missing, the row's 1-based position within its team
    (after sorting by "Order") is used as the rank instead.
    """
    division_str = parse_division_from_filename(filepath)
    df_div = pd.read_csv(filepath)

    # If empty, nothing to do
    if df_div.empty:
        return

    # "HKS No." is not a valid attribute name on the itertuples() rows below.
    if "HKS No." in df_div.columns:
        df_div = df_div.rename(columns={"HKS No.": "HKS_No"})

    base = get_base_rating(division_str)

    for team_name, group in df_div.groupby("Team", as_index=False):
        N = len(group)
        group_sorted = group.sort_values("Order", ascending=True)

        for position, row in enumerate(group_sorted.itertuples(index=False), start=1):
            # A missing name is read by pandas as NaN (a truthy float), so it
            # has to be tested explicitly before any string method is used.
            raw_name = row.Player
            if pd.isna(raw_name):
                continue
            player_name = str(raw_name).strip()
            if not player_name:
                continue

            hks_no = row.HKS_No

            # Track name <-> HKS number associations for conflict checking.
            name_to_hks_set.setdefault(player_name, set()).add(hks_no)
            hks_to_name_set.setdefault(hks_no, set()).add(player_name)

            # If there's more than one hks_no for the same name, print a warning
            if len(name_to_hks_set[player_name]) > 1:
                print(f"Warning: Multiple HKS numbers for player '{player_name}': {name_to_hks_set[player_name]}")
            # Similarly if there's more than one name for the same hks_no
            if len(hks_to_name_set[hks_no]) > 1:
                print(f"Warning: Multiple names for HKS number '{hks_no}': {hks_to_name_set[hks_no]}")

            player_key = (player_name, hks_no)
            if player_key in player_ratings:
                # Already rated (by an earlier file or row), skip
                continue

            # Not assigned yet -> compute initial rating. Rows with a missing
            # Order sort last, so their position is a usable stand-in rank.
            rank_i = position if pd.isna(row.Order) else int(row.Order)
            init_rating = compute_linear_median_rating(base, rank_i, N, alpha=0.2)

            player_ratings[player_key] = {
                "Player": player_name,
                "HKS_No": hks_no,
                "Division": division_str,
                "Team": team_name,
                "Initial Rating": init_rating
            }


In [51]:

folder = r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025\players_df\week_1"
pattern = os.path.join(folder, "*_players_df.csv")

# glob.glob() returns matches in a filesystem-dependent order. Because a
# player keeps the rating from the FIRST file they appear in, the result
# would otherwise depend on that order, so sort to make the run reproducible.
# NOTE(review): this is a plain lexicographic sort ("10..." before "2...");
# confirm whether a player listed in several divisions should instead be
# rated from the strongest one.
filepaths = sorted(glob.glob(pattern))
if not filepaths:
    print(f"Warning: no files matched '{pattern}'.")

# Separate files by league type
main_files = []
masters_files = []
ladies_files = []

for fp in filepaths:
    d_str = parse_division_from_filename(fp)  # e.g. "3", "m2", etc.
    league_type = get_league_type(d_str)      # "main", "masters", "ladies" or "unknown"

    if league_type == "main":
        main_files.append(fp)
    elif league_type == "masters":
        masters_files.append(fp)
    elif league_type == "ladies":
        ladies_files.append(fp)
    else:
        # Report instead of dropping the file silently, so a misnamed CSV
        # does not quietly remove a whole division from the output.
        print(f"Warning: skipping '{fp}' - unrecognised division '{d_str}'.")

# We store ratings in a dict keyed by (player_name, hks_no)
player_ratings = {}

# To detect conflicts:
name_to_hks_set = {}  # name -> set of HKS no.
hks_to_name_set = {}  # HKS no. -> set of names

# Process Main first, then Masters, then Ladies. Players already present in
# player_ratings are skipped, so a rating from the main league takes
# precedence over one from the masters or ladies leagues.
for fp in main_files + masters_files + ladies_files:
    process_division_file(fp, player_ratings, name_to_hks_set, hks_to_name_set)

# Build final DataFrame
rows = [
    [player_name, hks_no, info["Division"], info["Team"], info["Initial Rating"]]
    for (player_name, hks_no), info in player_ratings.items()
]

final_df = pd.DataFrame(rows, columns=["Player", "HKS_No", "Division", "Team", "Initial Rating"])
final_df = final_df.sort_values(
    by=["Initial Rating", "Division"], ascending=[False, True], ignore_index=True
)

# Save
outpath = os.path.join(r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025", "all_initial_ratings.csv")
final_df.to_csv(outpath, index=False)
print(f"Done. {len(final_df)} players in all_initial_ratings.csv.")


Done. 1682 players in all_initial_ratings.csv.


In [None]:
# ---------------------------------------------------------------------
# 3. Margin-of-victory scoring
#    Maps the winner's score line to (winner_actual_score,
#    loser_actual_score). The two shares of each entry sum to 1.
#    Adjust the splits as you see fit.
# ---------------------------------------------------------------------
MARGIN_SCORES = {
    "3-0": (1.0, 0.0),     # clean sweep: winner takes everything
    "3-1": (0.85, 0.15),
    "3-2": (0.7, 0.3),     # closest result: loser keeps the largest share
}

In [None]:
# ---------------------------------------------------------------------
# 4. Elo expected score function
# ---------------------------------------------------------------------
def expected_score(rating_player, rating_opponent):
    """
    Expected score of rating_player against rating_opponent.

    Uses the logistic Elo curve with base 10 and a scale of 400 rating
    points: equal ratings give 0.5, and the result approaches 1 as the
    player's rating advantage grows.
    """
    rating_gap = rating_opponent - rating_player
    odds_against = 10 ** (rating_gap / 400.0)
    return 1.0 / (1.0 + odds_against)

In [None]:
# ---------------------------------------------------------------------
# 5. One-match Elo update for winner & loser
# ---------------------------------------------------------------------
def update_elo(rating_w, rating_l, actual_w, actual_l, K=32):
    """
    Apply a single-match Elo update.

    rating_w / rating_l : current ratings of the winner and the loser.
    actual_w / actual_l : actual scores credited to the winner and the loser.
    K                   : step size applied to (actual - expected).

    Returns (new_rating_w, new_rating_l).
    """
    # The loser's expectation is the complement of the winner's.
    winner_expected = expected_score(rating_w, rating_l)
    loser_expected = 1.0 - winner_expected

    return (
        rating_w + K * (actual_w - winner_expected),
        rating_l + K * (actual_l - loser_expected),
    )

In [None]:
# ---------------------------------------------------------------------
# 6. Main script to read CSV, process matches, and compute final Elo
# ---------------------------------------------------------------------
def main():
    """
    Compute Elo ratings from the combined match-results CSV.

    Steps:
      1. Read the results CSV and drop rows whose Score is "CR" or "WO".
      2. Keep only rows with Result == "Win", so each match is processed
         once from the winner's perspective.
      3. Process matches sorted by "Match Date". A player seen for the first
         time starts at the base rating of the division of that match.
      4. Print, and write to "elo_results.csv", the players with at least
         5 processed matches, sorted by rating (highest first).

    Rows whose Score is not a key of MARGIN_SCORES are skipped.
    """
    # Read your CSV file
    # Update the filename/path as needed
    df = pd.read_csv(r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025\combined_player_results_df.csv")

    # Date parsing was left disabled while investigating a problem:
    # df['Match Date'] = pd.to_datetime(df['Match Date'], dayfirst=True, errors='coerce')
    # NOTE(review): with parsing disabled, the sort below orders "Match Date"
    # by whatever dtype read_csv produced. If the dates are day-first strings
    # that order is not chronological - confirm the CSV's date format.

    # Remove rows where Score is "CR" or "WO" => means walkover or no-show
    df = df[~df['Score'].isin(['CR', 'WO'])]

    # We only want to process each match once,
    # so let's filter for the row where the player "Result" == "Win".
    # That ensures we treat that row's "Player Name" as the winner, Opponent as the loser
    # and skip the mirrored "Loss" row.
    df = df[df['Result'] == 'Win'].copy()

    # Sort by date so we process matches in order
    df = df.sort_values(by="Match Date")

    # Player data: {player_name: {'rating', 'matches_played', 'clubs', 'divisions'}}
    player_data = {}

    def get_player_rating(player_name, division_str):
        """Return the player's current rating, creating the record if new."""
        if player_name not in player_data:
            # Reduce labels such as "7A" or "Premier Masters" to the
            # lower-case keys used by the base-rating table ("7",
            # "premier masters") before looking them up.
            division_key = division_str.strip().lower()
            leading_number = re.match(r'\d+', division_key)
            if leading_number:
                division_key = leading_number.group(0)

            # get_base_rating is the division -> rating helper defined in
            # this notebook; the previously referenced
            # get_initial_rating_for_division does not exist here.
            base_rating = get_base_rating(division_key)
            player_data[player_name] = {
                'rating': base_rating,
                'matches_played': 0,
                'clubs': set(),
                'divisions': set()
            }
        return player_data[player_name]['rating']

    # Now loop over each row, do the Elo update
    for _, row in df.iterrows():
        winner = row['Player Name']
        loser = row['Opponent Name']
        division_str = str(row['Division']).strip()  # e.g. "10", "7A", "Premier Masters", etc.
        score_str = row['Score']
        # Player's club from that row
        winner_club = str(row['Team']).strip()
        loser_club = str(row['Opponent Team']).strip()

        # We only expect "3-0", "3-1", "3-2" from the winner's perspective;
        # anything else is skipped.
        if score_str not in MARGIN_SCORES:
            continue

        (actual_w, actual_l) = MARGIN_SCORES[score_str]

        # Get current ratings (assign initial if not yet present)
        w_rating = get_player_rating(winner, division_str)
        l_rating = get_player_rating(loser, division_str)

        # Elo update
        new_w_rating, new_l_rating = update_elo(w_rating, l_rating, actual_w, actual_l, K=32)

        # Store back
        player_data[winner]['rating'] = new_w_rating
        player_data[loser]['rating'] = new_l_rating

        # Increment matches_played
        player_data[winner]['matches_played'] += 1
        player_data[loser]['matches_played'] += 1

        # Add clubs to sets
        player_data[winner]['clubs'].add(winner_club)
        player_data[loser]['clubs'].add(loser_club)

        # Add divisions to sets
        player_data[winner]['divisions'].add(division_str)
        player_data[loser]['divisions'].add(division_str)

    # -----------------------------------------------------------------
    # After processing all rows, generate a final results table
    # but only include players with >=5 matches
    # -----------------------------------------------------------------
    results = []
    for player, info in player_data.items():
        if info['matches_played'] >= 5:  # "official" threshold
            # Convert sets to comma-separated strings
            clubs_str = ", ".join(sorted(info['clubs']))
            divisions_str = ", ".join(sorted(info['divisions']))
            results.append([player, clubs_str, divisions_str, info['rating'], info['matches_played']])

    # Convert to DataFrame and sort by rating descending
    results_df = pd.DataFrame(results, columns=['Player', 'Clubs', 'Divisions', 'Rating', 'Matches Played'])
    results_df = results_df.sort_values(by='Rating', ascending=False, ignore_index=True)

    # Print or save
    print(results_df)
    results_df.to_csv("elo_results.csv", index=False)

if __name__ == "__main__":
    main()