In [9]:
import pandas as pd
import numpy as np
from collections import defaultdict
from rapidfuzz import process, fuzz

In [2]:
file_path = r'Data\new data.xlsx'

excel_data = pd.ExcelFile(file_path)

dfs = {}

for sheet_name in excel_data.sheet_names:
    dfs[sheet_name] = pd.read_excel(excel_data, sheet_name=sheet_name, header=0)

for df_name, df in dfs.items():
    if 'Rk' in df.columns:
        df.drop(columns=['Rk'], inplace=True)

for sheet, df in dfs.items():
    print(f"Sheet name: {sheet}")
    print(df.head())

Sheet name: Linfield
               name category     team institution  Rk1  R1  Rk2  R2  Rk3  R3  \
0  Catherine Dudley     Open  CC SADU          CC  1st  77  1st  79  1st  78   
1       June LePage     Open  CC LEMC          CC  1st  80  2nd  77  2nd  77   
2  Spencer McDonald     Open  CC LEMC          CC  1st  79  2nd  78  2nd  76   
3      Charles Said     Open  CC SADU          CC  1st  77  1st  78  1st  77   
4    Brad Tomasovic     Open  CC THTO          CC  1st  77  2nd  76  1st  80   

   Rk4  R4  Total   Avg  Stdev  
0  3rd  80    314  78.5   1.12  
1  2nd  80    314  78.5   1.50  
2  2nd  81    314  78.5   1.80  
3  3rd  80    312  78.0   1.22  
4  4th  79    312  78.0   1.58  
Sheet name: YODL 2
             team            name      category institution  Rk1  R1  Rk2  R2  \
0  Air Force GoHo     Nathan Good          Open   Air Force  1st  79  1st  78   
1  Air Force GoHo     Grace Hoyte          Open   Air Force  1st  78  1st  77   
2  Air Force HaMa     Noah Marino  Ope

In [11]:
unique_names = set()

for df in dfs.values():
    if 'name' in df.columns:
        unique_names.update(df['name'].dropna().unique())

unique_names_list = list(unique_names)
master_df = pd.DataFrame(unique_names_list, columns=['name'])

name_mapping = {}
standard_names = []

for name in master_df['name']:
    if standard_names:
        match, score, _ = process.extractOne(name, standard_names, scorer=fuzz.WRatio)
        if score >= 85:
            name_mapping[name] = match
        else:
            standard_names.append(name)
            name_mapping[name] = name
    else:
        standard_names.append(name)
        name_mapping[name] = name

master_df['name'] = master_df['name'].replace(name_mapping)

for key, df in dfs.items():
    if 'name' in df.columns:
        df['name'] = df['name'].replace(name_mapping)

In [None]:
result_counts = defaultdict(lambda: defaultdict(int))

for key, df in dfs.items():
    rank_columns = [col for col in df.columns if col.startswith('Rk')]
    
    for _, row in df.iterrows():
        name = row['name']
        for col in rank_columns:
            if row[col] in ['1st', '2nd', '3rd', '4th']:
                result_counts[name][row[col]] += 1

master_names = master_df['name'].tolist()
filtered_results = {name: result_counts[name] for name in master_names}

result_df = pd.DataFrame(filtered_results).fillna(0).T
result_df.columns = ['1st', '2nd', '3rd', '4th']

result_df = result_df.reindex(master_names, fill_value=0)

                     1st  2nd  3rd  4th
Alexandr Susic       4.0  1.0  0.0  0.0
Samira Jamaale       2.0  3.0  6.0  1.0
Yaniv Regev          1.0  2.0  1.0  0.0
Isaac Martinez       0.0  2.0  2.0  0.0
Altynay Zamanbekova  0.0  0.0  3.0  2.0
...                  ...  ...  ...  ...
Ahmad Bin Tahir      1.0  1.0  2.0  1.0
Jenna Hammond        0.0  2.0  2.0  4.0
PUCV My Friend       0.0  0.0  0.0  0.0
Joseph Gellman       1.0  6.0  4.0  1.0
Zara Ammar           1.0  0.0  2.0  1.0

[571 rows x 4 columns]


In [17]:
import pandas as pd

# Function to filter valid numeric round columns
def get_score_columns(df):
    return [col for col in df.columns if col.startswith('R') and not col.startswith('Rk')]

# Step 1: Initialize master_df with required columns
sample_df = next(iter(dfs.values()))  # Use any dataframe to extract column structure
all_round_columns = list(
    set(col for df in dfs.values() for col in get_score_columns(df))
)  # Combine all unique round columns across dataframes

# Ensure master_df columns are initialized with compatible types
master_df['Total Accumulated Score'] = 0
master_df['Overall Average Score'] = 0.0  # Float for compatibility with averages

for col in all_round_columns:
    master_df[f'Average {col}'] = 0.0  # Float for compatibility

# Step 2: Process each dataframe in dfs
for key, df in dfs.items():
    score_columns = get_score_columns(df)  # Identify valid score columns in this dataframe

    # Ensure score columns are numeric
    for col in score_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert non-numeric to NaN

    # Compute statistics per person
    for _, row in df.iterrows():
        name = row['name']
        if name in master_df['name'].values:
            total_score = row[score_columns].sum()
            average_score = row[score_columns].mean()

            # Update master_df
            master_df.loc[master_df['name'] == name, 'Total Accumulated Score'] += total_score
            master_df.loc[master_df['name'] == name, 'Overall Average Score'] += average_score / len(dfs)

            # Update per-round averages only for existing columns
            for col in score_columns:
                master_df.loc[master_df['name'] == name, f'Average {col}'] += row[col] / len(dfs)

# Finalize overall average score computation
master_df['Overall Average Score'] = master_df['Overall Average Score'].round(2)

# Step 3: Output master_df
print("Updated master_df with statistics:")
print(master_df)

Updated master_df with statistics:
                    name  Total Score  Average Score  Total Accumulated Score  \
0         Alexandr Susic            0              0                      399   
1         Samira Jamaale            0              0                      889   
2            Yaniv Regev            0              0                      295   
3         Isaac Martinez            0              0                      290   
4    Altynay Zamanbekova            0              0                      389   
..                   ...          ...            ...                      ...   
566      Ahmad Bin Tahir            0              0                      386   
567        Jenna Hammond            0              0                      591   
568       PUCV My Friend            0              0                        0   
569       Joseph Gellman            0              0                      875   
570           Zara Ammar            0              0                      