In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
from rapidfuzz import process, fuzz

In [3]:
file_path = r'Data\new data.xlsx'

excel_data = pd.ExcelFile(file_path)

dfs = {}

for sheet_name in excel_data.sheet_names:
    dfs[sheet_name] = pd.read_excel(excel_data, sheet_name=sheet_name, header=0)

for df_name, df in dfs.items():
    if 'Rk' in df.columns:
        df.drop(columns=['Rk'], inplace=True)

for sheet, df in dfs.items():
    print(f"Sheet name: {sheet}")
    print(df.head())

Sheet name: Linfield
               name category     team institution  Rk1  R1  Rk2  R2  Rk3  R3  \
0  Catherine Dudley     Open  CC SADU          CC  1st  77  1st  79  1st  78   
1       June LePage     Open  CC LEMC          CC  1st  80  2nd  77  2nd  77   
2  Spencer McDonald     Open  CC LEMC          CC  1st  79  2nd  78  2nd  76   
3      Charles Said     Open  CC SADU          CC  1st  77  1st  78  1st  77   
4    Brad Tomasovic     Open  CC THTO          CC  1st  77  2nd  76  1st  80   

   Rk4  R4  Total   Avg  Stdev  
0  3rd  80    314  78.5   1.12  
1  2nd  80    314  78.5   1.50  
2  2nd  81    314  78.5   1.80  
3  3rd  80    312  78.0   1.22  
4  4th  79    312  78.0   1.58  
Sheet name: YODL 2
             team            name      category institution  Rk1  R1  Rk2  R2  \
0  Air Force GoHo     Nathan Good          Open   Air Force  1st  79  1st  78   
1  Air Force GoHo     Grace Hoyte          Open   Air Force  1st  78  1st  77   
2  Air Force HaMa     Noah Marino  Ope

In [4]:
unique_names = set()

for df in dfs.values():
    if 'name' in df.columns:
        unique_names.update(df['name'].dropna().unique())

unique_names_list = list(unique_names)
master_df = pd.DataFrame(unique_names_list, columns=['name'])

name_mapping = {}
standard_names = []

for name in master_df['name']:
    if standard_names:
        match, score, _ = process.extractOne(name, standard_names, scorer=fuzz.WRatio)
        if score >= 85:
            name_mapping[name] = match
        else:
            standard_names.append(name)
            name_mapping[name] = name
    else:
        standard_names.append(name)
        name_mapping[name] = name

master_df['name'] = master_df['name'].replace(name_mapping)

for key, df in dfs.items():
    if 'name' in df.columns:
        df['name'] = df['name'].replace(name_mapping)

In [None]:
result_counts = defaultdict(lambda: defaultdict(int))

for key, df in dfs.items():
    rank_columns = [col for col in df.columns if col.startswith('Rk')]
    
    for _, row in df.iterrows():
        name = row['name']
        for col in rank_columns:
            if row[col] in ['1st', '2nd', '3rd', '4th']:
                result_counts[name][row[col]] += 1

master_names = master_df['name'].tolist()
filtered_results = {name: result_counts[name] for name in master_names}

for rank in ['1st', '2nd', '3rd', '4th']:
    master_df[rank] = master_df['name'].map(lambda name: filtered_results.get(name, {}).get(rank, 0))

In [None]:
score_columns = ["R1", "R2", "R3", "R4", "R5"]

master_df['Total Accumulated Score'] = 0
master_df['Overall Average Score'] = 0.0

for col in score_columns:
    master_df[f'Total {col} Score'] = 0.0
    master_df[f'Count {col} Appearances'] = 0
    master_df[f'Average {col}'] = 0.0

for key, df in dfs.items():
    available_columns = [col for col in score_columns if col in df.columns]

    for col in available_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    for _, row in df.iterrows():
        name = row['name']
        if name in master_df['name'].values:
            total_score = row[available_columns].sum()
            average_score = row[available_columns].mean()

            master_df.loc[master_df['name'] == name, 'Total Accumulated Score'] += total_score
            master_df.loc[master_df['name'] == name, 'Overall Average Score'] += average_score / len(dfs)

            for col in available_columns:
                master_df.loc[master_df['name'] == name, f'Total {col} Score'] += row[col]
                master_df.loc[master_df['name'] == name, f'Count {col} Appearances'] += 1

for col in score_columns:
    master_df[f'Average {col}'] = (
        master_df[f'Total {col} Score'] / master_df[f'Count {col} Appearances']
    ).fillna(0).round(2)

master_df['Overall Average Score'] = master_df['Overall Average Score'].round(2)

master_df.drop(
    [f'Total {col} Score' for col in score_columns]
    + [f'Count {col} Appearances' for col in score_columns],
    axis=1,
    inplace=True
)

Updated master_df with statistics:
                       name  Total Accumulated Score  Overall Average Score  \
0                  Ryan Lee                      835                  28.53   
1                 Ava Digre                     1277                  37.58   
2                 Alex Chan                      773                    NaN   
3         Echo O&#x27;Leary                     1274                  37.48   
4              Vu Hoang Anh                      302                   9.44   
..                      ...                      ...                    ...   
621       Michelle Mangione                      374                   9.35   
622        Nguyen Ngoc Diem                      291                   9.09   
623  Laura Serafine Pilmark                      398                   9.95   
624            William Chen                      380                   9.50   
625                Ryan Lee                      835                  28.53   

     Average R1 