In [1]:
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

PROJECT_ROOT, DATA_RAW, DATA_PROCESSED


(PosixPath('/Users/arnavjain/manutd-2007-08-style-profile'),
 PosixPath('/Users/arnavjain/manutd-2007-08-style-profile/data/raw'),
 PosixPath('/Users/arnavjain/manutd-2007-08-style-profile/data/processed'))

In [2]:
standard = pd.read_csv(DATA_RAW / "manutd_2007_08_standard_stats_sample.csv")
shooting = pd.read_csv(DATA_RAW / "manutd_2007_08_shooting.csv")
goalkeeping = pd.read_csv(DATA_RAW / "manutd_2007_08_goalkeeping.csv")
playing_time = pd.read_csv(DATA_RAW / "manutd_2007_08_playing_time.csv")
misc = pd.read_csv(DATA_RAW / "manutd_2007_08_misc.csv")


In [3]:
for name, df in {
    "standard": standard,
    "shooting": shooting,
    "goalkeeping": goalkeeping,
    "playing_time": playing_time,
    "misc": misc,
}.items():
    print(f"\n=== {name.upper()} ===")
    print(df.shape)
    print(df.columns.tolist())



=== STANDARD ===
(7, 4)
['Playing Time', 'Performance', 'Per 90 Minutes', 'Unnamed: 3']

=== SHOOTING ===
(25, 17)
['Player', 'Nation', 'Pos', 'Age', '90s', 'Gls', 'Sh', 'SoT', 'SoT%', 'Sh/90', 'SoT/90', 'G/Sh', 'G/SoT', 'Dist', 'PK', 'PKatt', 'Matches']

=== GOALKEEPING ===
(3, 24)
['Player', 'Nation', 'Pos', 'Age', 'MP', 'Starts', 'Min', '90s', 'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'W', 'D', 'L', 'CS', 'CS%', 'PKatt', 'PKA', 'PKsv', 'PKm', 'Save%.1', 'Matches']

=== PLAYING_TIME ===
(25, 22)
['Player', 'Nation', 'Pos', 'Age', 'MP', 'Min', 'Mn/MP', 'Min%', '90s', 'Starts', 'Mn/Start', 'Compl', 'Subs', 'Mn/Sub', 'unSub', 'PPM', 'onG', 'onGA', '+/-', '+/-90', 'On-Off', 'Matches']

=== MISC ===
(25, 18)
['Player', 'Nation', 'Pos', 'Age', '90s', 'CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off', 'Crs', 'Int', 'TklW', 'PKwon', 'PKcon', 'OG', 'Matches']


In [4]:
def normalize_player_table(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    
    # Clean raw column names
    df.columns = df.columns.str.strip()
    
    # If "Player" isn't present, assume the first 4 columns are Player, Nation, Pos, Age
    if "Player" not in df.columns:
        key_cols = df.columns[:4]
        rename_map = {
            key_cols[0]: "Player",
            key_cols[1]: "Nation",
            key_cols[2]: "Pos",
            key_cols[3]: "Age",
        }
        df = df.rename(columns=rename_map)
    
    # Now enforce types / trimming on the key columns
    df["Player"] = df["Player"].astype(str).str.strip()
    df["Nation"] = df["Nation"].astype(str).str.strip()
    df["Pos"] = df["Pos"].astype(str).str.strip()
    df["Age"] = pd.to_numeric(df["Age"], errors="coerce")
    
    return df


In [5]:
standard = normalize_player_table(standard)
shooting = normalize_player_table(shooting)
goalkeeping = normalize_player_table(goalkeeping)
playing_time = normalize_player_table(playing_time)
misc = normalize_player_table(misc)


In [6]:
from functools import reduce

dfs = [standard, shooting, goalkeeping, playing_time, misc]

master = reduce(
    lambda left, right: pd.merge(
        left,
        right,
        on=["Player", "Nation", "Pos", "Age"],
        how="outer",
        suffixes=("", "_dup"),
    ),
    dfs,
)

master.shape


(32, 69)

In [7]:
# Save merged master dataset
OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "manutd_2007_08_master.csv"
master.to_csv(OUTPUT_PATH, index=False)

OUTPUT_PATH


PosixPath('/Users/arnavjain/manutd-2007-08-style-profile/data/processed/manutd_2007_08_master.csv')

In [8]:
master.head()
master.info()
master.describe(include='all')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 69 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Player       32 non-null     object 
 1   Nation       32 non-null     object 
 2   Pos          32 non-null     object 
 3   Age          25 non-null     float64
 4   90s          25 non-null     float64
 5   Gls          25 non-null     float64
 6   Sh           0 non-null      float64
 7   SoT          25 non-null     float64
 8   SoT%         0 non-null      float64
 9   Sh/90        25 non-null     float64
 10  SoT/90       0 non-null      float64
 11  G/Sh         20 non-null     float64
 12  G/SoT        0 non-null      float64
 13  Dist         25 non-null     float64
 14  PK           25 non-null     float64
 15  PKatt        25 non-null     object 
 16  Matches      0 non-null      float64
 17  MP           3 non-null      float64
 18  Starts       3 non-null      float64
 19  Min       

Unnamed: 0,Player,Nation,Pos,Age,90s,Gls,Sh,SoT,SoT%,Sh/90,...,Fls,Fld,Off,Crs,Int,TklW,PKwon,PKcon,OG,Matches_dup
count,32.0,32,32,25.0,25.0,25.0,0.0,25.0,0.0,25.0,...,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25
unique,31.0,19,12,,,,,,,,...,,,,,,,,,,1
top,0.09,eng ENG,DF,,,,,,,,...,,,,,,,,,,Matches
freq,2.0,10,7,,,,,,,,...,,,,,,,,,,25
mean,,,,25.0,16.692,3.12,,12.04,,0.6336,...,16.24,,,,,,,,,
std,,,,4.444097,11.760595,6.821046,,21.450486,,0.77885,...,15.441503,,,,,,,,,
min,,,,19.0,0.2,0.0,,0.0,,0.0,...,0.0,,,,,,,,,
25%,,,,21.0,7.0,0.0,,2.0,,0.12,...,2.0,,,,,,,,,
50%,,,,25.0,17.0,1.0,,4.0,,0.29,...,12.0,,,,,,,,,
75%,,,,27.0,28.5,2.0,,8.0,,1.09,...,28.0,,,,,,,,,
