# Data Cleaning — Leo CM Cup

This notebook loads the raw Leo CM Cup survey data, performs basic cleaning, and exports
a cleaned dataset for downstream analysis and feature engineering.


In [6]:
# Core libraries
import pandas as pd
import numpy as np
from pathlib import Path

# Paths
RAW_PATH = Path("/workspaces/moomooleo/data/raw/CM Data Collection - Leo Cup (Responses).xlsx")
PROCESSED_DIR = Path("/workspaces/moomooleo/data/cleaned")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
CLEANED_PATH = PROCESSED_DIR / "leo_cm_cleaned.csv"

RAW_PATH, CLEANED_PATH

(PosixPath('/workspaces/moomooleo/data/raw/CM Data Collection - Leo Cup (Responses).xlsx'),
 PosixPath('/workspaces/moomooleo/data/cleaned/leo_cm_cleaned.csv'))

In [7]:
# Load raw Excel responses
df = pd.read_excel(RAW_PATH)
print(f"Raw shape: {df.shape}")
df.head()

Raw shape: (1490, 64)


Unnamed: 0,Timestamp,Player IGN,CM Group,Kitasan Black LB in Account (Non-borrow),Super Creek LB in Account (Non-borrow),Select a round to fill data,R1D1 - Uma 1,R1D1 - Uma 1 Role,R1D1 - Uma 1 Running Style,R1D1 - Uma 2,...,FINALS - Uma 1 Running Style,FINALS - Uma 2,FINALS - Uma 2 Role,FINALS - Uma 2 Running Style,FINALS - Uma 3,FINALS - Uma 3 Role,FINALS - Uma 3 Running Style,FINALS RESULTS,How much have you spent on the game so far? (EUR/USD),Column 63
0,2025-10-30 05:42:28.475,Pharaday,Graded (No Limit),MLB,2LB,End Survey,,,,,...,,,,,,,,,$1000++,
1,2025-10-29 21:55:35.958,Ryan,Graded (No Limit),1LB,,End Survey,Seiun Sky,"Ace (Winner, Tanks, Recycled Aces)",Front Runner,Agnes Tachyon,...,Front Runner,Agnes Tachyon,"Ace (Winner, Tanks, Recycled Aces)",Pace Chaser,Mejiro Ryan,Aoharu Made Ace,Late Surger,2ND,F2P,
2,2025-10-28 21:03:08.629,Ramen,Graded (No Limit),3LB,,End Survey,Maruzensky (Summer),"Ace (Winner, Tanks, Recycled Aces)",Front Runner,Grass Wonder,...,Front Runner,Grass Wonder,Hybrid,Late Surger,Gold Ship,"Ace (Winner, Tanks, Recycled Aces)",End Closer,1ST,$1-$100,
3,2025-10-28 17:32:17.479,Jackenstein,Graded (No Limit),,0LB,End Survey,Gold Ship,"Ace (Winner, Tanks, Recycled Aces)",End Closer,Agnes Tachyon,...,End Closer,Agnes Tachyon,"Ace (Winner, Tanks, Recycled Aces)",Pace Chaser,Grass Wonder,Debuffer,End Closer,1ST,F2P,
4,2025-11-09 12:23:05.233,Cien,Graded (No Limit),3LB,0LB,End Survey,Oguri Cap,"Ace (Winner, Tanks, Recycled Aces)",Pace Chaser,Grass Wonder,...,Pace Chaser,Symboli Rudolf,Debuffer,End Closer,Grass Wonder,Debuffer,End Closer,1ST,$101-$500,


In [8]:
# Drop unnamed / empty columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.columns = df.columns.str.strip()

# Drop fully empty columns or rows that are missing key identifiers
df = df.dropna(axis=1, how='all')
df = df.dropna(subset=['Select a round to fill data', 'CM Group'], how='any')

# Standardize text for key categoricals
df['Select a round to fill data'] = df['Select a round to fill data'].astype(str).str.strip().str.title()
df['CM Group'] = df['CM Group'].astype(str).str.strip().str.title()

if 'FINALS RESULTS' in df.columns:
    df['FINALS RESULTS'] = df['FINALS RESULTS'].astype(str).str.upper().fillna('N/A')

print("After cleaning:", df.shape)
df.head()

After cleaning: (1490, 63)


Unnamed: 0,Timestamp,Player IGN,CM Group,Kitasan Black LB in Account (Non-borrow),Super Creek LB in Account (Non-borrow),Select a round to fill data,R1D1 - Uma 1,R1D1 - Uma 1 Role,R1D1 - Uma 1 Running Style,R1D1 - Uma 2,...,FINALS - Uma 1 Role,FINALS - Uma 1 Running Style,FINALS - Uma 2,FINALS - Uma 2 Role,FINALS - Uma 2 Running Style,FINALS - Uma 3,FINALS - Uma 3 Role,FINALS - Uma 3 Running Style,FINALS RESULTS,How much have you spent on the game so far? (EUR/USD)
0,2025-10-30 05:42:28.475,Pharaday,Graded (No Limit),MLB,2LB,End Survey,,,,,...,,,,,,,,,NAN,$1000++
1,2025-10-29 21:55:35.958,Ryan,Graded (No Limit),1LB,,End Survey,Seiun Sky,"Ace (Winner, Tanks, Recycled Aces)",Front Runner,Agnes Tachyon,...,"Ace (Winner, Tanks, Recycled Aces)",Front Runner,Agnes Tachyon,"Ace (Winner, Tanks, Recycled Aces)",Pace Chaser,Mejiro Ryan,Aoharu Made Ace,Late Surger,2ND,F2P
2,2025-10-28 21:03:08.629,Ramen,Graded (No Limit),3LB,,End Survey,Maruzensky (Summer),"Ace (Winner, Tanks, Recycled Aces)",Front Runner,Grass Wonder,...,"Ace (Winner, Tanks, Recycled Aces)",Front Runner,Grass Wonder,Hybrid,Late Surger,Gold Ship,"Ace (Winner, Tanks, Recycled Aces)",End Closer,1ST,$1-$100
3,2025-10-28 17:32:17.479,Jackenstein,Graded (No Limit),,0LB,End Survey,Gold Ship,"Ace (Winner, Tanks, Recycled Aces)",End Closer,Agnes Tachyon,...,"Ace (Winner, Tanks, Recycled Aces)",End Closer,Agnes Tachyon,"Ace (Winner, Tanks, Recycled Aces)",Pace Chaser,Grass Wonder,Debuffer,End Closer,1ST,F2P
4,2025-11-09 12:23:05.233,Cien,Graded (No Limit),3LB,0LB,End Survey,Oguri Cap,"Ace (Winner, Tanks, Recycled Aces)",Pace Chaser,Grass Wonder,...,"Ace (Winner, Tanks, Recycled Aces)",Pace Chaser,Symboli Rudolf,Debuffer,End Closer,Grass Wonder,Debuffer,End Closer,1ST,$101-$500


In [9]:
# Export cleaned dataset
df.to_csv(CLEANED_PATH, index=False)
print(f"Cleaned data saved to: {CLEANED_PATH.resolve()}")

Cleaned data saved to: /workspaces/moomooleo/data/cleaned/leo_cm_cleaned.csv
