In [15]:
# Import all required libraries

import pandas as pd
import numpy as np
from collections import defaultdict
from rapidfuzz import process, fuzz
import matplotlib.pyplot as plt
import os
from fpdf import FPDF

In [13]:
# Roster workbook of the students to report on.
# The path is built with os.path.join instead of a hard-coded backslash so the
# cell also works on macOS/Linux, where '\' is not a path separator.
wu_names_path = os.path.join('Data', 'cleaned_student_data.xlsx')

# The reporting functions further down read the 'names' column of this frame.
wu_names = pd.read_excel(wu_names_path)

In [2]:
# Load every sheet of the tournament workbook into `dfs` (sheet name -> DataFrame).

# os.path.join keeps the path valid on non-Windows systems as well.
file_path = os.path.join('Data', 'new data.xlsx')

# Open the workbook in a context manager so the underlying file handle is
# released as soon as all sheets have been parsed.
with pd.ExcelFile(file_path) as excel_data:
    dfs = {
        sheet_name: pd.read_excel(excel_data, sheet_name=sheet_name, header=0)
        for sheet_name in excel_data.sheet_names
    }

# Remove the column named exactly 'Rk' wherever a sheet has one. Columns that
# merely start with 'Rk' are kept: later cells read them as per-round placings.
# errors='ignore' makes the drop a no-op for sheets without that column.
dfs = {
    sheet_name: df.drop(columns=['Rk'], errors='ignore')
    for sheet_name, df in dfs.items()
}

In [3]:
# Build the master list of participant names, folding near-duplicate spellings
# (fuzzy match score >= 85) into a single canonical spelling.

unique_names = set()

for df in dfs.values():
    if 'name' in df.columns:
        unique_names.update(df['name'].dropna().unique())

# Iterate in sorted order: the canonical spelling is "whichever variant is seen
# first", and raw set order for strings changes between kernel sessions, which
# made the resulting mapping non-reproducible.
unique_names_list = sorted(unique_names, key=str)

name_mapping = {}      # every observed spelling -> its canonical spelling
standard_names = []    # canonical spellings, in the order they were accepted

for name in unique_names_list:
    canonical = name
    if standard_names:
        match, score, _ = process.extractOne(name, standard_names, scorer=fuzz.WRatio)
        if score >= 85:
            canonical = match
    if canonical is name:
        # No sufficiently similar canonical name yet: this spelling becomes one.
        standard_names.append(name)
    name_mapping[name] = canonical

# One row per canonical name. Building the frame from `standard_names` avoids
# the duplicate rows that appeared when several spellings collapsed to one name.
master_df = pd.DataFrame(standard_names, columns=['name'])

# Rewrite the names in every sheet so later cells can match on exact equality.
# Values without a mapping entry (e.g. NaN) are passed through unchanged.
for key, df in dfs.items():
    if 'name' in df.columns:
        df['name'] = df['name'].map(lambda raw: name_mapping.get(raw, raw))

In [4]:
# Tally, per participant, how many times each placing ('1st'..'4th') occurs
# in the 'Rk*' columns of every sheet, then add one column per placing to master_df.

placings = ['1st', '2nd', '3rd', '4th']
result_counts = defaultdict(lambda: defaultdict(int))

for sheet in dfs.values():
    sheet.columns = sheet.columns.str.strip()

    # First column whose header contains 'name' (case-insensitive), if any.
    name_candidates = [header for header in sheet.columns if 'name' in header.lower()]
    if not name_candidates:
        continue
    name_column = name_candidates[0]

    rank_columns = [header for header in sheet.columns if header.startswith('Rk')]

    for _, record in sheet.iterrows():
        person = record[name_column]
        for rank_col in rank_columns:
            placing = record[rank_col]
            if placing in placings:
                result_counts[person][placing] += 1

master_names = master_df['name'].tolist()
filtered_results = {}
for person in master_names:
    filtered_results[person] = result_counts[person]


def _placing_count(person, placing):
    """Return how often `person` achieved `placing` (0 when never)."""
    return filtered_results.get(person, {}).get(placing, 0)


for rank in placings:
    master_df[rank] = master_df['name'].map(lambda name: _placing_count(name, rank))

In [5]:
# Aggregate round scores (R1..R5) per participant across all sheets:
#   'Average R#'            - mean of that round's scores over the rows that have one
#   'Total Accumulated Score' - sum of all available round scores
#   'Overall Average Score' - accumulated total divided by the number of sheet rows
#                             the participant appears in
score_columns = ["R1", "R2", "R3", "R4", "R5"]

# Accumulators start as floats: scores are added as floats, and adding a float
# into an int64 column triggers pandas' incompatible-dtype warning.
master_df['Total Accumulated Score'] = 0.0
master_df['Overall Average Score'] = 0.0
master_df['Count Appearances in DataFrames'] = 0

for col in score_columns:
    master_df[f'Total {col} Score'] = 0.0
    master_df[f'Count {col} Appearances'] = 0
    master_df[f'Average {col}'] = 0.0

for key, df in dfs.items():
    df.columns = df.columns.str.strip()

    name_column = next((col for col in df.columns if 'name' in col.lower()), None)
    if not name_column:
        continue

    available_columns = [col for col in score_columns if col in df.columns]
    for col in available_columns:
        # Non-numeric cells become NaN and are skipped below.
        df[col] = pd.to_numeric(df[col], errors='coerce')

    for _, row in df.iterrows():
        person_rows = master_df['name'] == row[name_column]
        if not person_rows.any():
            continue

        master_df.loc[person_rows, 'Count Appearances in DataFrames'] += 1

        # Series.sum() skips NaN, so missing rounds simply contribute nothing.
        master_df.loc[person_rows, 'Total Accumulated Score'] += float(row[available_columns].sum())

        for col in available_columns:
            score = row[col]
            if pd.isna(score):
                # A missing score must neither poison the running total with NaN
                # nor be counted as an appearance in that round.
                continue
            master_df.loc[person_rows, f'Total {col} Score'] += score
            master_df.loc[person_rows, f'Count {col} Appearances'] += 1

# 0/0 yields NaN for participants without any score in a round; report that as 0.
for col in score_columns:
    master_df[f'Average {col}'] = (
        master_df[f'Total {col} Score'] / master_df[f'Count {col} Appearances']
    ).fillna(0).round(2)

master_df['Overall Average Score'] = (
    master_df['Total Accumulated Score'] / master_df['Count Appearances in DataFrames']
).fillna(0).round(2)

# The per-round totals and counts were only needed to compute the averages.
helper_columns = (
    [f'Total {col} Score' for col in score_columns]
    + [f'Count {col} Appearances' for col in score_columns]
    + ['Count Appearances in DataFrames']
)
master_df = master_df.drop(columns=helper_columns)

In [6]:
# Count, per participant, how often they spoke in each of the four positions.
positions = ["Opening Government", "Opening Opposition", "Closing Government", "Closing Opposition"]
for position_column in positions:
    master_df[position_column] = 0

for sheet_name, sheet in dfs.items():
    side_columns = [header for header in sheet.columns if header.startswith("side")]

    # Only sheets with a 'name' column and at least one 'side*' column contribute.
    if "name" not in sheet.columns or not side_columns:
        continue

    for _, record in sheet.iterrows():
        person_rows = master_df["name"] == record["name"]

        for side_col in side_columns:
            if side_col not in record:
                continue
            spoken_side = record[side_col]
            if spoken_side not in positions:
                continue
            if person_rows.any():
                master_df.loc[person_rows, spoken_side] += 1

In [7]:
# Build `position_score`: participant name -> DataFrame with one row per side the
# participant spoke on and the columns Side, Total, 1st, 2nd, 3rd, 4th.
#
# Counts are accumulated in plain dicts and turned into DataFrames once at the
# end. Growing a DataFrame with pd.concat for every new side is quadratic, relies
# on concatenating onto an empty frame, and leaves the counters as object dtype.

sides = ["Opening Government", "Opening Opposition", "Closing Government", "Closing Opposition"]
ranks = ["1st", "2nd", "3rd", "4th"]

# name -> {side -> {"Total": n, "1st": n, "2nd": n, "3rd": n, "4th": n}}
# dicts preserve insertion order, so sides keep their first-seen order per person.
position_tallies = {}

for sheet_name, df in dfs.items():
    print(f"Processing sheet: {sheet_name}")

    if "name" not in df.columns:
        print(f"Skipping sheet {sheet_name}: No 'name' column")
        continue

    side_columns = [col for col in df.columns if col.startswith("side")]
    rank_columns = [col for col in df.columns if col.startswith("Rk")]

    if not side_columns or not rank_columns:
        print(f"Skipping sheet {sheet_name}: No 'side' or 'Rk' columns")
        continue

    # Side and rank columns are paired by position: first 'side*' with first 'Rk*', etc.
    for side_col, rank_col in zip(side_columns, rank_columns):
        for name, side, rank in zip(df["name"], df[side_col], df[rank_col]):
            # Rows without a name cannot be attributed to anyone.
            if pd.isna(name) or pd.isna(side) or pd.isna(rank) or side not in sides:
                continue

            per_side = position_tallies.setdefault(name, {})
            if side not in per_side:
                per_side[side] = {"Total": 0, "1st": 0, "2nd": 0, "3rd": 0, "4th": 0}

            tally = per_side[side]
            tally["Total"] += 1
            if rank in ranks:
                tally[rank] += 1

position_score = {
    name: pd.DataFrame(
        [{"Side": side, **tally} for side, tally in per_side.items()],
        columns=["Side", "Total", "1st", "2nd", "3rd", "4th"],
    )
    for name, per_side in position_tallies.items()
}

Processing sheet: Linfield
Processing sheet: YODL 2
Processing sheet: UCLA IV
Processing sheet: YODL 1
Processing sheet: Princeton
Processing sheet: Links
Skipping sheet Links: No 'name' column


In [8]:
# Convert each participant's placings per side into points
# (1st = 3, 2nd = 2, 3rd = 1, 4th = 0) and store them in '<side> Score' columns.
rank_to_points = {"1st": 3, "2nd": 2, "3rd": 1, "4th": 0}

score_columns = ["Opening Government Score", "Opening Opposition Score", "Closing Government Score", "Closing Opposition Score"]
for score_column in score_columns:
    master_df[score_column] = 0

for name, side_table in position_score.items():
    person_rows = master_df["name"] == name
    if not person_rows.any():
        print(f"Skipping {name}: not found in master_df")
        continue

    for side in side_table["Side"].unique():
        # Use the first row recorded for this side.
        side_data = side_table.loc[side_table["Side"] == side].iloc[0]

        total_score = sum(
            side_data[placing] * points
            for placing, points in rank_to_points.items()
        )

        score_col = f"{side} Score"
        if score_col not in master_df.columns:
            continue
        master_df.loc[person_rows, score_col] = total_score

In [9]:
# Average points per appearance on each side: '<side> Score' divided by the
# number of times the participant spoke on that side.
average_columns = [
    "Opening Government Average",
    "Opening Opposition Average",
    "Closing Government Average",
    "Closing Opposition Average",
]
for average_column in average_columns:
    master_df[average_column] = 0.0

for name, side_table in position_score.items():
    person_rows = master_df["name"] == name
    if not person_rows.any():
        print(f"Skipping {name}: not found in master_df")
        continue

    for side in side_table["Side"].unique():
        side_data = side_table.loc[side_table["Side"] == side].iloc[0]
        points = master_df.loc[person_rows, f"{side} Score"].values[0]
        appearances = side_data["Total"]

        # Guard against division by zero for sides with no recorded appearance.
        if appearances > 0:
            average_score = points / appearances
        else:
            average_score = 0

        average_col = f"{side} Average"
        if average_col not in master_df.columns:
            continue
        master_df.loc[person_rows, average_col] = average_score

In [10]:
# Total appearances per participant: for every sheet that has a 'name' column,
# add the number of rows carrying that participant's name.

master_df["Total Appearances"] = 0

for sheet_name, sheet in dfs.items():
    if "name" not in sheet.columns:
        print(f"Skipping sheet '{sheet_name}': No 'name' column")
        continue

    # value_counts() ignores missing names; participants absent from the sheet add 0.
    rows_per_name = sheet["name"].value_counts()
    sheet_counts = master_df["name"].map(rows_per_name).fillna(0).astype(int)
    master_df["Total Appearances"] = master_df["Total Appearances"] + sheet_counts

Skipping sheet 'Links': No 'name' column


In [11]:
# Highest and lowest single round score per participant across all sheets.
# Participants without any numeric score keep NaN in both columns.

master_df["Highest Score"] = np.nan
master_df["Lowest Score"] = np.nan

for sheet_name, df in dfs.items():
    # Round-score columns start with 'R'. The 'Rk*' columns start with 'R' too but
    # hold placings, not scores, so they are excluded explicitly.
    score_columns = [
        col for col in df.columns
        if col.startswith("R") and not col.startswith("Rk")
    ]

    if not score_columns:
        print(f"Skipping sheet '{sheet_name}': No R# columns")
        continue

    # Without a 'name' column the scores cannot be attributed to anyone.
    if "name" not in df.columns:
        continue

    # Non-numeric cells become NaN and are ignored by max()/min().
    numeric_scores = df[score_columns].apply(pd.to_numeric, errors="coerce")
    sheet_high = numeric_scores.max(axis=1).groupby(df["name"]).max()
    sheet_low = numeric_scores.min(axis=1).groupby(df["name"]).min()

    # np.fmax / np.fmin return the non-NaN operand when only one side is NaN, so
    # a participant's first real score replaces the NaN placeholder and sheets
    # without scores for that participant leave the running value untouched.
    master_df["Highest Score"] = np.fmax(
        master_df["Highest Score"].to_numpy(dtype=float),
        master_df["name"].map(sheet_high).to_numpy(dtype=float),
    )
    master_df["Lowest Score"] = np.fmin(
        master_df["Lowest Score"].to_numpy(dtype=float),
        master_df["name"].map(sheet_low).to_numpy(dtype=float),
    )

Skipping sheet 'Links': No R# columns


In [18]:
def _normalize_names(names):
    """Return a copy of the name Series with whitespace stripped and text lower-cased."""
    return names.str.strip().str.lower()


def prepare_wu_data(master_df, wu_names):
    """Select the rows of `master_df` whose name appears in the roster `wu_names`.

    Names are compared after stripping whitespace and lower-casing. Neither input
    frame is modified: normalising `master_df["name"]` in place would stop earlier
    cells, which match against the original spelling, from working when re-run.

    Parameters:
        master_df: DataFrame with a 'name' column.
        wu_names: DataFrame with a 'names' column (the roster).

    Returns:
        A new DataFrame (fresh 0..n-1 index) holding the matching rows, with the
        'name' column in normalised form.
    """
    roster = _normalize_names(wu_names["names"]).dropna()
    normalized = _normalize_names(master_df["name"])

    # assign() returns a new frame, leaving the caller's master_df untouched.
    wu_data = master_df.assign(name=normalized)
    return wu_data[normalized.isin(roster)].reset_index(drop=True)


def log_missing_data(wu_names, wu_data):
    """Print and return the roster names that have no row in `wu_data`.

    Both sides are normalised here, so the result does not depend on the roster
    having been normalised by an earlier call. Missing (NaN) entries are ignored.

    Returns:
        set of normalised roster names without data (empty set if none).
    """
    roster = set(_normalize_names(wu_names["names"]).dropna())
    present = set(_normalize_names(wu_data["name"]).dropna())

    missing_names = roster - present
    if missing_names:
        print(f"The following names have no data and will be skipped: {missing_names}")
    return missing_names

def create_bar_chart(data, name, output_dir):
    """Save a bar chart of one participant's average points per position.

    Parameters:
        data: DataFrame whose first row holds the four '<position> Average' values.
        name: participant name; used in the chart title and the file name.
        output_dir: directory for the PNG; created if it does not exist.

    Returns:
        Path of the written PNG file.
    """
    positions = ["Opening Government", "Opening Opposition", "Closing Government", "Closing Opposition"]
    average_columns = ["Opening Government Average", "Opening Opposition Average", "Closing Government Average", "Closing Opposition Average"]

    averages = data[average_columns].iloc[0].values

    # Characters that are not allowed in file names on Windows (plus control
    # characters) are replaced so that names containing e.g. quotes or slashes
    # cannot make savefig fail or write outside output_dir.
    safe_name = "".join(
        "_" if (ch in '<>:"/\\|?*' or ord(ch) < 32) else ch
        for ch in name
    )
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f"{safe_name}_average_by_position.png")

    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        bars = ax.bar(positions, averages, color=["#4CAF50", "#2196F3", "#FFC107", "#E91E63"])

        # Print each value just above its bar.
        for bar, avg in zip(bars, averages):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.05, f"{avg:.2f}", ha='center', fontsize=10)

        ax.set_ylim(0, 3)
        ax.set_xlabel("Position", fontsize=12)
        ax.set_ylabel("Average Score", fontsize=12)
        ax.set_title(f"Average by Position for {name.title()}", fontsize=14)
        fig.tight_layout()

        fig.savefig(file_path)
    finally:
        # Always release the figure, even if drawing or saving fails; this
        # function is called once per participant.
        plt.close(fig)
    return file_path

class PDFReport(FPDF):
    """PDF document with one page per participant, each showing that person's chart."""

    def header(self):
        """Write the centred report title in bold 12pt Arial."""
        report_title = 'Individual Performance Report'
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, report_title, align='C', ln=True)

    def add_individual_chart(self, name, chart_path):
        """Start a new page for `name`, write a caption and place the chart image.

        Parameters:
            name: participant name, shown title-cased in the caption.
            chart_path: path of the chart image to embed.
        """
        caption = f"Performance Overview: {name.title()}"

        self.add_page()
        self.set_font('Arial', '', 12)
        self.cell(0, 10, caption, ln=True)
        self.ln(10)
        # Fixed placement: 10 from the left, 30 from the top, 190 wide.
        self.image(chart_path, x=10, y=30, w=190)

    def save(self, output_path):
        """Write the document to `output_path`."""
        self.output(output_path)

def generate_reports(master_df, wu_names, output_dir="reports", chart_dir="charts"):
    """Create one chart per roster participant and bundle them into a single PDF.

    Parameters:
        master_df: DataFrame of per-participant statistics with a 'name' column.
        wu_names: roster DataFrame with a 'names' column.
        output_dir: directory that receives 'Performance_Report.pdf'.
        chart_dir: directory that receives the individual chart images.
    """
    wu_data = prepare_wu_data(master_df, wu_names)
    log_missing_data(wu_names, wu_data)

    pdf = PDFReport()
    for person in wu_data["name"].unique():
        person_rows = wu_data.loc[wu_data["name"] == person]
        if person_rows.empty:
            continue
        # The chart is rendered first, then placed on its own page.
        pdf.add_individual_chart(person, create_bar_chart(person_rows, person, chart_dir))

    report_path = os.path.join(output_dir, "Performance_Report.pdf")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pdf.save(report_path)
    print(f"Report saved to {report_path}")

generate_reports(master_df, wu_names)

The following names have no data and will be skipped: {'alex van nieuwenhuysen', 'jacob mary', 'finn forman', 'cassidy ureda', 'brian sung', 'ciandramarie choun', 'isabel huh', 'jillian layton', 'kyle sessions', 'ryder mckernon', 'lucas da veiga', '"jay shiflett (jay)"', 'inez nieves', 'gage brock', 'eliza gonzalez', 'rocco cesario', 'sal chapell', 'mitch septoff', 'morgan shaw', 'william laduca', 'ella johanknecht', 'violeta basenko', 'dhriti sriram', 'jack crone', 'miriam schwabel', "echo o'leary", 'sarah jane early', 'sacha alialani des pres'}
Report saved to reports\Performance_Report.pdf
