### This notebook demonstrates the data cleaning decisions appied to the data scraped from the World Athletics website. 

In [None]:
import pandas as pd
import os
import sqlite3
from pathlib import Path
import random
import re
import json
import time
import requests
from tqdm import tqdm
import country_converter as coco
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import pycountry
from dotenv import load_dotenv


In [None]:
df = pd.read_csv('../data/raw/m_high_jump/U20_m_high_jump.csv')

print(df.shape)
df.head()

(368, 11)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,2.37,,Dragutin TOPIĆ,12 MAR 1971,YUG,1,,"Deveti Septemvri Stadium, Plovdiv (BUL)",12 AUG 1990,1242
1,1,2.37,,Steve SMITH,29 MAR 1973,GBR,1,,"Olympic Stadium, Seoul (KOR)",20 SEP 1992,1242
2,3,2.36,,Javier SOTOMAYOR,13 OCT 1967,CUB,1,,Santiago de Cuba (CUB),23 FEB 1986,1233
3,4,2.35,,Vladimir YASHCHENKO,12 JAN 1959,URS,1,,"Palazzo dello Sport, Milano (ITA) (i)",12 MAR 1978,1224
4,4,2.35,,Dietmar MÖGENBURG,15 AUG 1961,FRG,1,,Rehlingen (GER),26 MAY 1980,1224


#### Looking at .isnull is this df I can see columns to remove are 'WIND' (not recorded in vertical jumps i.e. high jump). With this df having 368 null rows in 'WIND' and Unnamed: 7 I am taking notes that these columns are unnecessary for this analysis. 

In [None]:
df.isnull().sum()

Rank               0
Mark               0
WIND             368
Competitor         0
DOB                0
Unnamed: 5         0
Pos                1
Unnamed: 7       368
Venue              0
Date               0
Results Score      0
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rank           368 non-null    int64  
 1   Mark           368 non-null    float64
 2   WIND           0 non-null      float64
 3   Competitor     368 non-null    object 
 4   DOB            368 non-null    object 
 5   Unnamed: 5     368 non-null    object 
 6   Pos            367 non-null    object 
 7   Unnamed: 7     0 non-null      float64
 8   Venue          368 non-null    object 
 9   Date           368 non-null    object 
 10  Results Score  368 non-null    int64  
dtypes: float64(3), int64(2), object(6)
memory usage: 31.8+ KB


### I will only explore one horizontal jumps event to get a general idea of what to expect in the data before combining

In [None]:
df = pd.read_csv("../data/raw/m_triple_jump/senior_m_triple_jump.csv")

print(df.shape)
df.head()

(8480, 11)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,18.43,2.4,Jonathan EDWARDS,10 MAY 1966,GBR,1,,Villeneuve d'Ascq (FRA),25 JUN 1995,1304
1,2,18.29,1.3,Jonathan EDWARDS,10 MAY 1966,GBR,1,,"Ullevi Stadium, Göteborg (SWE)",07 AUG 1995,1303
2,3,18.21,0.2,Christian TAYLOR,18 JUN 1990,USA,1,,"National Stadium, Beijing (CHN)",27 AUG 2015,1294
3,4,18.2,5.2,Willie BANKS,11 MAR 1956,USA,1,,"Indianapolis, IN (USA)",16 JUL 1988,1262
4,5,18.18,-0.3,Jordan A. DÍAZ FORTUN,23 FEB 2001,ESP,1,,"Stadio Olimpico, Roma (ITA)",11 JUN 2024,1292


In [None]:
df.isnull().sum()

Rank                0
Mark                0
WIND             2109
Competitor          0
DOB                 1
Unnamed: 5          0
Pos               220
Unnamed: 7       8480
Venue               0
Date                0
Results Score       0
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8480 entries, 0 to 8479
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rank           8480 non-null   int64  
 1   Mark           8480 non-null   float64
 2   WIND           6371 non-null   object 
 3   Competitor     8480 non-null   object 
 4   DOB            8479 non-null   object 
 5   Unnamed: 5     8480 non-null   object 
 6   Pos            8260 non-null   object 
 7   Unnamed: 7     0 non-null      float64
 8   Venue          8480 non-null   object 
 9   Date           8480 non-null   object 
 10  Results Score  8480 non-null   int64  
dtypes: float64(2), int64(2), object(7)
memory usage: 728.9+ KB


#### The combining script handles consistency across events (adding the event and age category). 

In [None]:
# Combining multiple .csv files by event and gender in a list of dictionaries
EVENTS = [
    {
        "input_dir": Path("../data/raw/m_high_jump"),
        "output_file": "m_high_jump.csv",
        "event_name": "high jump",
    },
    {
        "input_dir": Path("../data/raw/m_long_jump"),
        "output_file": "m_long_jump.csv",
        "event_name": "long jump",
    },
    {
        "input_dir": Path("../data/raw/m_triple_jump"),
        "output_file": "m_triple_jump.csv",
        "event_name": "triple jump",
    },
    {
        "input_dir": Path("../data/raw/w_high_jump"),
        "output_file": "w_high_jump.csv",
        "event_name": "high jump",
    },
    {
        "input_dir": Path("../data/raw/w_long_jump"),
        "output_file": "w_long_jump.csv",
        "event_name": "long jump",
    },
    {
        "input_dir": Path("../data/raw/w_triple_jump"),
        "output_file": "w_triple_jump.csv",
        "event_name": "triple jump",
    },
]

output_dir = Path("../data/interim/combined")
output_dir.mkdir(parents=True, exist_ok=True)

# Loop 1: iterates over EVENTS reading from each folder verifiying the event
for event in EVENTS:
    input_dir = event["input_dir"]
    event_name = event["event_name"]
    output_path = output_dir / event["output_file"]

    print(f"\n Processing {input_dir.name}")
    print(f"Looking for .csv files in {input_dir.resolve()}")
    print(f"Saving combined files to {output_path.resolve()}")

    csv_files = list(input_dir.glob("*.csv"))
    print(f".csv files found: {len(csv_files)}")

    dfs = []

# Loop 2: iterates over the .csv files in the selected event folder
    for csv_file in csv_files:
        print(f"Working on: {csv_file.name}")
        df = pd.read_csv(csv_file)

        # Add Age Category to the .csv files
        age_category = csv_file.stem.split("_")[0].lower()
        df["Age_category"] = age_category

        # Add event name
        df["Event"] = event_name
        dfs.append(df)

    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)
        combined_df.to_csv(output_path, index=False)
        print(f"Saved combined file as: {output_path.name}")
    else:
        print("No .csv files found. Skipped")



 Processing m_high_jump
Looking for .csv files in C:\Users\Britt\Documents\Projects\capstone\data\raw\m_high_jump
Saving combined files to C:\Users\Britt\Documents\Projects\capstone\data\interim\combined\m_high_jump.csv
.csv files found: 3
Working on: senior_m_high_jump.csv
Working on: u18_m_high_jump.csv
Working on: u20_m_high_jump.csv
Saved combined file as: m_high_jump.csv

 Processing m_long_jump
Looking for .csv files in C:\Users\Britt\Documents\Projects\capstone\data\raw\m_long_jump
Saving combined files to C:\Users\Britt\Documents\Projects\capstone\data\interim\combined\m_long_jump.csv
.csv files found: 3
Working on: senior_m_long_jump.csv
Working on: u18_m_long_jump.csv
Working on: u20_m_long_jump.csv
Saved combined file as: m_long_jump.csv

 Processing m_triple_jump
Looking for .csv files in C:\Users\Britt\Documents\Projects\capstone\data\raw\m_triple_jump
Saving combined files to C:\Users\Britt\Documents\Projects\capstone\data\interim\combined\m_triple_jump.csv
.csv files fo

#### Unique venues are separated into a new .csv file to find elevation data. This data will introduce variables that can be used in the future for elevation and or weather affects on athletic performance. 

In [None]:
root_dir = Path("../data/raw")

csv_files = list(root_dir.rglob("*.csv"))
print(f"Found {len(csv_files)} CSV files")

all_dfs = []

for file in csv_files:
    try:
        df = pd.read_csv(file)

        if df.empty or df.shape[1] == 0:
            print(f"Skipping: {file} empty")
            continue
        df = df[["Venue"]]

        all_dfs.append(df)

    except pd.errors.EmptyDataError:
        print(f"Skipping: {file} completely empty")
        continue

combined_df = pd.concat(all_dfs, ignore_index=True)

venues_df = combined_df.drop_duplicates(subset=["Venue"])

output_path = Path("../interim/venues.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
venues_df.to_csv(output_path, index=False)

print(f"\nSaved unique venues to: {output_path}")
print(f"Unique venues count: {len(venues_df)}")


Found 31 CSV files

Saved unique venues to: ..\interim\venues.csv
Unique venues count: 2955


#### Geocoding the venues were more difficult than I anticipated. My initial attempt was manual, looking up each venue and documenting the city, country, latitude, longitude and elevation. Many hours were lost in this process and I resorted to using chatGPT to fill in the data for geocoded_venues.csv. 

In [None]:
venues_df = pd.read_csv("../data/interim/venues/geocoded_venues.csv")

print(venues_df.shape)
venues_df.head()

(2955, 5)


Unnamed: 0,Venue,latitude,longitude,elevation,geocode_result
0,"Stade Charléty, Paris (FRA)",,,,NOT FOUND
1,"Stadio Olimpico, Roma (ITA)",41.89332,12.482932,50.0,"Roma, Roma Capitale, Lazio, Italia"
2,Sofia (BUL),14.892394,120.794209,6.0,"Sofia, Metropolis North Subdivision, Sergio Ba..."
3,Arnstadt (GER) (i),50.83491,10.946148,286.0,"Arnstadt, Ilm-Kreis, Thüringen, 99310, Deutsch..."
4,Zagreb (CRO),45.81366,15.977115,135.0,"Cro.K, 1, Pod zidom, Kaptol, Mjesni odbor ""Aug..."


In [None]:
venues_df.isnull().sum()

Venue              0
latitude          23
longitude         23
elevation         23
geocode_result     0
dtype: int64

#### Instead of deleteting the venues with null geocoding values, I have opted to leave them in and exclude them in the analysis phase if used. Although there is no elevation, latitude, or longitude they would only affect weather or elevation based analysis which is not the focus of this capstone. 

In [None]:
null_latitude_rows = venues_df[venues_df["latitude"].isna()]
print(null_latitude_rows)

                                                  Venue  latitude  longitude  \
0                           Stade Charléty, Paris (FRA)       NaN        NaN   
179                                     Roodeport (RSA)       NaN        NaN   
561                                     Paris (FRA) (i)       NaN        NaN   
592                      Filothei Stadium, Athina (GRE)       NaN        NaN   
661                  Stadion ŠC Sloboda, Varaždin (YUG)       NaN        NaN   
756                                        Berane (MNE)       NaN        NaN   
1123               Olimpiyets Arena, Mogilyov (BLR) (i)       NaN        NaN   
1144                                     Tiraspol (MDA)       NaN        NaN   
1195                                      Bambous (MRI)       NaN        NaN   
1256              Atıcılar Atletizm Sahası, Bursa (TUR)       NaN        NaN   
1556                                    Ashkhabad (TKM)       NaN        NaN   
1861           Estadio Luguelín Santos, 

## High Jump
<img src="https://i.makeagif.com/media/5-30-2014/MOdzwa.gif" align="left"/>  


#### Exploring Men's High Jump DataFrames

In [None]:
df = pd.read_csv('../data/interim/combined/m_high_jump.csv')

print(df.shape)
df.head()

(9890, 13)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score,Age_category,Event
0,1,2.45,,Javier SOTOMAYOR,13 OCT 1967,CUB,1,,Salamanca (ESP),27 JUL 1993,1314,senior,high jump
1,2,2.44,,Javier SOTOMAYOR,13 OCT 1967,CUB,1,,San Juan (PUR),29 JUL 1989,1305,senior,high jump
2,3,2.43,,Javier SOTOMAYOR,13 OCT 1967,CUB,1,,Salamanca (ESP),08 SEP 1988,1296,senior,high jump
3,3,2.43,,Javier SOTOMAYOR,13 OCT 1967,CUB,1,,"Budapest Sportcsarnok, Budapest (HUN) (i)",04 MAR 1989,1296,senior,high jump
4,3,2.43,,Mutaz Essa BARSHIM,24 JUN 1991,QAT,1,,"Boudewijnstadion, Bruxelles (BEL)",05 SEP 2014,1296,senior,high jump


##### Checking for the number of null fields in each column

In [None]:
df.isnull().sum()

Rank                0
Mark                0
WIND             9890
Competitor          0
DOB                 0
Unnamed: 5          0
Pos                45
Unnamed: 7       9890
Venue               0
Date                0
Results Score       0
Age_category        0
Event               0
dtype: int64

In [None]:
null_DOB_rows = df[df["DOB"].isna()]
print(null_DOB_rows)

Empty DataFrame
Columns: [Rank, Mark, WIND, Competitor, DOB, Unnamed: 5, Pos, Unnamed: 7, Venue, Date, Results Score, Age_category, Event]
Index: []


#### Exploring Women's High Jump DataFrames

In [None]:
df = pd.read_csv("../data/interim/combined/w_high_jump.csv")

print(df.shape)
df.head()

(3876, 12)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score,Event
0,1,2.1,,Yaroslava MAHUCHIKH,19 SEP 2001,UKR,1,,"Stade Charléty, Paris (FRA)",07 JUL 2024,1319,high jump
1,2,2.09,,Stefka KOSTADINOVA,25 MAR 1965,BUL,1,,"Stadio Olimpico, Roma (ITA)",30 AUG 1987,1309,high jump
2,3,2.08,,Stefka KOSTADINOVA,25 MAR 1965,BUL,1,,Sofia (BUL),31 MAY 1986,1299,high jump
3,3,2.08,,Kajsa BERGQVIST,12 OCT 1976,SWE,1,,Arnstadt (GER) (i),04 FEB 2006,1299,high jump
4,3,2.08,,Blanka VLAŠIĆ,08 NOV 1983,CRO,1,,Zagreb (CRO),31 AUG 2009,1299,high jump


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3876 entries, 0 to 3875
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rank           3876 non-null   int64  
 1   Mark           3876 non-null   float64
 2   WIND           0 non-null      float64
 3   Competitor     3876 non-null   object 
 4   DOB            3875 non-null   object 
 5   Unnamed: 5     3876 non-null   object 
 6   Pos            3857 non-null   object 
 7   Unnamed: 7     0 non-null      float64
 8   Venue          3876 non-null   object 
 9   Date           3876 non-null   object 
 10  Results Score  3876 non-null   int64  
 11  Event          3876 non-null   object 
dtypes: float64(3), int64(2), object(7)
memory usage: 363.5+ KB


In [None]:
df.dtypes

Rank               int64
Mark             float64
WIND             float64
Competitor        object
DOB               object
Unnamed: 5        object
Pos               object
Unnamed: 7       float64
Venue             object
Date              object
Results Score      int64
Event             object
dtype: object

In [None]:
df.isnull().sum()

Rank                0
Mark                0
WIND             3876
Competitor          0
DOB                 1
Unnamed: 5          0
Pos                19
Unnamed: 7       3876
Venue               0
Date                0
Results Score       0
Event               0
dtype: int64

#### Cleaning function will need to:
- [x] Rename comlumn 'Unnamed: 5' to 'Nat'
- [x] Remove all athletes with no DOB
- [x] Remove columns
    - WIND
    - Unnamed: 7
    - Results Score
    - Pos


In [None]:
# FIX: I need this to run a test cleaning and not save just print the df.head() to verify results

raw_dir = [
    Path("../data/interim/combined/w_high_jump.csv")
]

for folder in raw_dir:
    for csv_file in folder.glob("*.csv"):
        print(f"Cleaning {csv_file.name}")
        df = pd.read_csv(csv_file)
        # Rename 'Unnamed:5' to 'country'
        df = df.rename(columns={'Unnamed: 5': 'country'})
        # Drop the column 'WIND', 'Results Score'
        df = df.drop(columns=['WIND', 'Unnamed: 7', 'Results Score', 'Pos'], errors='ignore')
        # Remove athlete data with no DOB
        df = df[df["DOB"].notna()]
        # Create a gender column
        gender_letter = folder.name.split("_")[0].lower()
        df["Gender"] = gender_letter.upper()


    df.head()

## Triple Jump Data Exploration
<img src="https://i.makeagif.com/media/5-14-2018/LLGTic.gif" align="left"/>


#### Exploring Men's Triple Jump Data

In [None]:
df = pd.read_csv("../data/interim/combined/m_triple_jump.csv")

print(df.shape)
df.head()

(8697, 13)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score,Age_category,Event
0,1,18.43,2.4,Jonathan EDWARDS,10 MAY 1966,GBR,1,,Villeneuve d'Ascq (FRA),25 JUN 1995,1304,senior,triple jump
1,2,18.29,1.3,Jonathan EDWARDS,10 MAY 1966,GBR,1,,"Ullevi Stadium, Göteborg (SWE)",07 AUG 1995,1303,senior,triple jump
2,3,18.21,0.2,Christian TAYLOR,18 JUN 1990,USA,1,,"National Stadium, Beijing (CHN)",27 AUG 2015,1294,senior,triple jump
3,4,18.2,5.2,Willie BANKS,11 MAR 1956,USA,1,,"Indianapolis, IN (USA)",16 JUL 1988,1262,senior,triple jump
4,5,18.18,-0.3,Jordan A. DÍAZ FORTUN,23 FEB 2001,ESP,1,,"Stadio Olimpico, Roma (ITA)",11 JUN 2024,1292,senior,triple jump


In [None]:
df.isnull().sum()

Rank                0
Mark                0
WIND             2144
Competitor          0
DOB                 1
Unnamed: 5          0
Pos               228
Unnamed: 7       8697
Venue               0
Date                0
Results Score       0
Event               0
dtype: int64

In [None]:
null_DOB_rows = df[df["DOB"].isna()]
print(null_DOB_rows)

      Rank   Mark  WIND     Competitor  DOB Unnamed: 5  Pos  Unnamed: 7  \
7340  7172  16.85  +3.9  Shawn JOHNSON  NaN        USA  2q1         NaN   

                                Venue         Date  Results Score        Event  
7340  Hayward Field, Eugene, OR (USA)  07 JUL 2016           1126  triple jump  


#### Exploring Women's Triple Jump Data

In [None]:
df = pd.read_csv("../data/interim/combined/w_triple_jump.csv")

print(df.shape)
df.head()

(7465, 13)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score,Age_category,Event
0,1,15.74,,Yulimar ROJAS,21 OCT 1995,VEN,1.0,,"Štark Arena, Beograd (SRB) (i)",20 MAR 2022,1297,senior,triple jump
1,2,15.67,0.7,Yulimar ROJAS,21 OCT 1995,VEN,1.0,,"National Stadium, Tokyo (JPN)",01 AUG 2021,1290,senior,triple jump
2,3,15.56,3.5,Yulimar ROJAS,21 OCT 1995,VEN,1.0,,"Stade Olympique de la Pontaise, Lausanne (SUI)",26 AUG 2021,1257,senior,triple jump
3,4,15.52,0.6,Yulimar ROJAS,21 OCT 1995,VEN,,,"Stade Olympique de la Pontaise, Lausanne (SUI)",26 AUG 2021,1274,senior,triple jump
4,5,15.5,0.9,Inessa KRAVETS,05 OCT 1966,UKR,1.0,,"Ullevi Stadium, Göteborg (SWE)",10 AUG 1995,1272,senior,triple jump


In [None]:
df.isnull().sum()

Rank                0
Mark                0
WIND             1669
Competitor          0
DOB                 0
Unnamed: 5          0
Pos               230
Unnamed: 7       7465
Venue               0
Date                0
Results Score       0
Event               0
dtype: int64

## Long jump

<img src="https://i.pinimg.com/originals/00/fb/9a/00fb9a207618494916b6b2bc5673340b.gif" align="left"/>

#### Exploring Men's Long Jump Data

In [None]:
df = pd.read_csv("../data/cleaned/m_long_jump.csv")

print(df.shape)


(9860, 12)


In [None]:
df = pd.read_csv("../data/interim/combined/m_long_jump.csv")

print(df.shape)


(9886, 13)


In [None]:
df.isnull().sum()

Rank                 0
Mark                 0
WIND              2354
Competitor           0
DOB                 28
Unnamed: 5          86
Pos                299
Unnamed: 7       11191
Venue                0
Date                 0
Results Score        0
Event                0
Nat              11105
dtype: int64

In [None]:
null_DOB_rows = df[df["DOB"].isna()]
print(null_DOB_rows)

    Rank  Mark  WIND        Competitor  DOB  Nat Pos  Unnamed: 7  \
2      3  8.24  +4.5  Dodley THERMITUS  NaN  USA   1         NaN   
16    16  8.03   NaN    Reece PONTIOUS  NaN  USA   1         NaN   
31    32  7.92   NaN       Justin GUNN  NaN  USA   5         NaN   

                         Venue         Date  Results Score  
2              Hobbs, NM (USA)  16 MAY 2019           1163  
16             Mason, OH (USA)  18 MAY 2012           1144  
31  Fayetteville, AR (USA) (i)  14 MAR 2008           1120  


In [None]:
df = pd.read_csv('../data/raw/m_long_jump/u20_m_long_jump.csv')

print(df.shape)
df.head()

(305, 11)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,8.44,2.2,Mattia FURLANI,07 FEB 2005,ITA,1,,"Centro Sportivo Fontanassa, Savona (ITA)",24 MAY 2023,1221
1,2,8.4,3.2,Kareem STREETE-THOMPSON,30 MAR 1973,CAY,1,,"Houston, TX (USA)",05 MAY 1991,1206
2,3,8.39,2.3,Kareem STREETE-THOMPSON,30 MAR 1973,CAY,q,,"Austin, TX (USA)",03 JUN 1992,1210
3,4,8.38,-0.5,Mattia FURLANI,07 FEB 2005,ITA,2,,"Stadio Olimpico, Roma (ITA)",08 JUN 2024,1224
4,5,8.36,1.4,Mattia FURLANI,07 FEB 2005,ITA,1,,"Centro Sportivo Fontanassa, Savona (ITA)",15 MAY 2024,1216


In [None]:
df.isnull().sum()

Rank               0
Mark               0
WIND              42
Competitor         0
DOB                3
Unnamed: 5         0
Pos                6
Unnamed: 7       305
Venue              0
Date               0
Results Score      0
dtype: int64

In [None]:
null_DOB_rows = df[df["DOB"].isna()]
print(null_DOB_rows)

     Rank  Mark  WIND        Competitor  DOB Unnamed: 5 Pos  Unnamed: 7  \
32     30  8.24  +4.5  Dodley THERMITUS  NaN        USA   1         NaN   
148   141  8.08   NWI       Tony MARTIN  NaN        USA   1         NaN   
234   224  8.03   NaN    Reece PONTIOUS  NaN        USA   1         NaN   

                 Venue         Date  Results Score  
32     Hobbs, NM (USA)  16 MAY 2019           1163  
148  Saginaw, MI (USA)  25 APR 2019           1125  
234    Mason, OH (USA)  18 MAY 2012           1144  


In [None]:
df = pd.read_csv('../data/raw/m_long_jump/senior_m_long_jump.csv')

print(df.shape)
df.head()

(10800, 11)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,8.99,4.4,Mike POWELL,10 NOV 1963,USA,1,,Sestriere (ITA),21 JUL 1992,1329
1,2,8.95,0.3,Mike POWELL,10 NOV 1963,USA,1,,"National Stadium, Tokyo (JPN)",30 AUG 1991,1346
2,2,8.95,3.9,Mike POWELL,10 NOV 1963,USA,1,,Sestriere (ITA),31 JUL 1994,1323
3,4,8.92,3.3,Juan Miguel ECHEVARRÍA,11 AUG 1998,CUB,1,,La Habana (CUB),10 MAR 2019,1321
4,5,8.91,2.9,Carl LEWIS,01 JUL 1961,USA,2,,"National Stadium, Tokyo (JPN)",30 AUG 1991,1320


In [None]:
df.isnull().sum()

Rank                 0
Mark                 0
WIND              2293
Competitor           0
DOB                 22
Unnamed: 5           0
Pos                293
Unnamed: 7       10800
Venue                0
Date                 0
Results Score        0
dtype: int64

In [None]:
null_DOB_rows = df[df["DOB"].isna()]
print(null_DOB_rows)

       Rank  Mark  WIND         Competitor  DOB Unnamed: 5  Pos  Unnamed: 7  \
2031   1908  8.24  +4.5   Dodley THERMITUS  NaN        USA    1         NaN   
3312   3072  8.18  +5.3     Jonathon WATTS  NaN        USA    1         NaN   
3567   3567  8.16   NWI  Sergey VASILYENKO  NaN        URS    1         NaN   
4490   4324  8.13   NaN       Trent HUNTER  NaN        USA    1         NaN   
4601   4324  8.13   NaN      Joshua RIVERS  NaN        USA    1         NaN   
5779   5729  8.09   NaN      Derek SHELTON  NaN        USA    1         NaN   
5967   5729  8.09  +5.1       Kiwan LAWSON  NaN        USA    1         NaN   
5997   5729  8.09  +4.3   Clive CHAFAUSIPO  NaN        ZIM    1         NaN   
6078   5729  8.09  +0.7      William JONES  NaN        USA    1         NaN   
6462   6112  8.08   NWI        Tony MARTIN  NaN        USA    1         NaN   
6811   6532  8.07   NaN    Maurice CALENDA  NaN        USA  1h1         NaN   
7264   6946  8.06  +4.9   Clive CHAFAUSIPO  NaN     

Dodley THERMITUS and Reece PONTIOUS appear on U18-Senior tables

In [None]:
df = pd.read_csv('../data/raw/w_long_jump/u18_w_long_jump.csv')

print(df.shape)
df.head()

(32, 11)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,7.02,4.0,Heike DAUTE,16 DEC 1964,GDR,1,,"Atletiekbaan Overvecht, Utrecht (NED)",20 AUG 1981,1198
1,2,6.91,1.0,Heike DAUTE,16 DEC 1964,GDR,1,,Jena (GDR),09 AUG 1981,1198
2,3,6.82,,Natalya SHEVCHENKO,28 DEC 1966,URS,1,,Novokuznetsk (RUS),17 JUL 1983,1178
3,4,6.71,4.0,Nicole BOEGMAN,05 MAR 1967,AUS,1,,Sydney (AUS),15 SEP 1983,1130
4,4,6.71,3.6,Susana HERNÁNDEZ,18 JAN 1999,MEX,1,,"Walnut, CA (USA)",17 APR 2015,1133


In [None]:
df.isnull().sum()

Rank              0
Mark              0
WIND              9
Competitor        0
DOB               0
Unnamed: 5        0
Pos               0
Unnamed: 7       32
Venue             0
Date              0
Results Score     0
dtype: int64

In [None]:
df = pd.read_csv('../data/raw/w_long_jump/u20_w_long_jump.csv')

print(df.shape)
df.head()

(100, 11)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,7.27,2.2,Heike DAUTE,16 DEC 1964,GDR,1,,"Olympiastadion, Helsinki (FIN)",14 AUG 1983,1265
1,2,7.14,1.1,Heike DAUTE,16 DEC 1964,GDR,1,,Bratislava (TCH),04 JUN 1983,1249
2,3,7.08,1.0,Heike DAUTE,16 DEC 1964,GDR,1,,Leipzig (GER),27 JUL 1983,1236
3,4,7.03,1.3,Darya KLISHINA,15 JAN 1991,RUS,1,,"Meteor, Zhukovskiy (RUS)",26 JUN 2010,1224
4,5,7.02,4.0,Heike DAUTE,16 DEC 1964,GDR,1,,"Atletiekbaan Overvecht, Utrecht (NED)",20 AUG 1981,1198


In [None]:
df.isnull().sum()

Rank               0
Mark               0
WIND              20
Competitor         0
DOB                0
Unnamed: 5         0
Pos                4
Unnamed: 7       100
Venue              0
Date               0
Results Score      0
dtype: int64

In [None]:
df = pd.read_csv('../data/raw/w_long_jump/senior_w_long_jump.csv')

print(df.shape)
df.head()

(3868, 11)


Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,7.63,2.1,Heike DRECHSLER,16 DEC 1964,GER,1,,Sestriere (ITA),21 JUL 1992,1346
1,2,7.52,1.4,Galina CHISTYAKOVA,26 JUL 1962,URS,1,,Leningrad (URS),11 JUN 1988,1333
2,3,7.49,1.3,Jackie JOYNER-KERSEE,03 MAR 1962,USA,1,,"New York, NY (USA)",22 MAY 1994,1327
3,3,7.49,1.7,Jackie JOYNER-KERSEE,03 MAR 1962,USA,1,,Sestriere (ITA),31 JUL 1994,1327
4,5,7.48,1.2,Heike DRECHSLER,16 DEC 1964,GDR,1,,Neubrandenburg (GDR),09 JUL 1988,1324


In [None]:
df.isnull().sum()

Rank                0
Mark                0
WIND              812
Competitor          0
DOB                 7
Unnamed: 5          0
Pos                94
Unnamed: 7       3868
Venue               0
Date                0
Results Score       0
dtype: int64

In [None]:
null_DOB_rows = df[df["DOB"].isna()]
print(null_DOB_rows)

      Rank  Mark  WIND       Competitor  DOB Unnamed: 5 Pos  Unnamed: 7  \
1745  1719  6.87  +1.2   Meosha HUBBARD  NaN        USA   1         NaN   
2120  2059  6.84  +2.7   Meosha HUBBARD  NaN        USA   4         NaN   
2477  2352  6.82  +4.6  Chioma AGWUNOBI  NaN        USA   1         NaN   
2835  2692  6.80  +2.0     Dorothy CHOW  NaN        USA   1         NaN   
3028  2860  6.79  +2.4   Sydney JOHNSON  NaN        USA   1         NaN   
3048  3038  6.78   NaN  Natalya POZDINA  NaN        URS   2         NaN   
3865  3632  6.75  +0.9    Jasmine AKINS  NaN        USA   1         NaN   

                                                  Venue         Date  \
1745                                Flagstaff, AZ (USA)  06 JUL 2000   
2120                               Sacramento, CA (USA)  16 JUL 2000   
2477                                  El Paso, TX (USA)  16 MAY 2015   
2835                      Olympia HS, Orlando, FL (USA)  27 JUN 2021   
3028                 Anteater Stadium, 

# Venues

In [None]:
df = pd.read_csv("../data/interim/venues/geocoded_venues.csv")

print(df.shape)
df.head()

(2955, 5)


Unnamed: 0,Venue,latitude,longitude,elevation,geocode_result
0,"Stade Charléty, Paris (FRA)",,,,NOT FOUND
1,"Stadio Olimpico, Roma (ITA)",41.89332,12.482932,50.0,"Roma, Roma Capitale, Lazio, Italia"
2,Sofia (BUL),14.892394,120.794209,6.0,"Sofia, Metropolis North Subdivision, Sergio Ba..."
3,Arnstadt (GER) (i),50.83491,10.946148,286.0,"Arnstadt, Ilm-Kreis, Thüringen, 99310, Deutsch..."
4,Zagreb (CRO),45.81366,15.977115,135.0,"Cro.K, 1, Pod zidom, Kaptol, Mjesni odbor ""Aug..."


In [None]:
df.isnull().sum()

Venue              0
latitude          23
longitude         23
elevation         23
geocode_result     0
dtype: int64

In [None]:
null_elevation_rows = df[df["elevation"].isna()]
print(null_elevation_rows)


                                                  Venue  latitude  longitude  \
0                           Stade Charléty, Paris (FRA)       NaN        NaN   
179                                     Roodeport (RSA)       NaN        NaN   
561                                     Paris (FRA) (i)       NaN        NaN   
592                      Filothei Stadium, Athina (GRE)       NaN        NaN   
661                  Stadion ŠC Sloboda, Varaždin (YUG)       NaN        NaN   
756                                        Berane (MNE)       NaN        NaN   
1123               Olimpiyets Arena, Mogilyov (BLR) (i)       NaN        NaN   
1144                                     Tiraspol (MDA)       NaN        NaN   
1195                                      Bambous (MRI)       NaN        NaN   
1256              Atıcılar Atletizm Sahası, Bursa (TUR)       NaN        NaN   
1556                                    Ashkhabad (TKM)       NaN        NaN   
1861           Estadio Luguelín Santos, 

# Phase 3

Combining

In [None]:
input_dir = Path("../data/interim/m_high_jump_with_age")

output_dir = Path("../data/interim/combined")
output_dir.mkdir(parents=True, exist_ok=True)

FOLDERS = [
    Path("../data/interim/w_high_jump_with_age"),
]

all_athletes = []

for folder in FOLDERS:
    print(f"\nLooking in {folder}.")

    for csv_file in folder.glob("*.csv"):
        print(f"{csv_file.name}")
        df = pd.read_csv(csv_file)

        if "Competitor" not in df.columns:
            print(f"Skipping {csv_file.name}")
            continue

        df["source_file"] = csv_file.stem
        all_athletes.append(df)

if not all_athletes:
    print("No athlete data found.")
else:
    combined = combined.sort_values(["Competitor"], ascending=[True])
    appearance_counts = combined.groupby("Competitor")["source_file"].nunique()

    multi_level_athletes = appearance_counts[appearance_counts >= 3].index

    if len(multi_level_athletes) == 0:
        print("Atheltes found appearing u18, u20, senior")
    else:
        print("Athlete who appear in multiple categories u18, u20, senior:")
        for athlete in multi_level_athletes:
            print("\n No athletes found: \n")
            print("-" * 80)
            athlete_data = combined[combined["Competitor"] == athlete]


            print(athlete_data.to_string(index=False))
            print("\n")

output_file = output_dir / "m_high_jump_athletes.csv"
combined_df.to_csv(output_file, index=False)

print("\n Combined .csv saved to:", output_file)


Looking in ..\data\interim\w_high_jump_with_age.
senior_w_high_jump.csv
u18_w_high_jump.csv
u20_w_high_jump.csv
Athlete who appear in multiple categories u18, u20, senior:

 No athletes found: 

--------------------------------------------------------------------------------
 Rank  Mark    Competitor        DOB country                                             Venue       Date Gender Age_group     Event  Age        source_file
 1103  1.98 Alina ASTAFEI 1969-06-07     GER             Estadio de la Comunidad, Madrid (ESP) 1996-06-02      W    senior high_jump 26.0 senior_w_high_jump
 1103  1.98 Alina ASTAFEI 1969-06-07     ROU                                 München (GER) (i) 1990-02-25      W    senior high_jump 20.0 senior_w_high_jump
 2039  1.96 Alina ASTAFEI 1969-06-07     ROU                           Villeneuve d'Ascq (FRA) 1993-07-02      W    senior high_jump 24.0 senior_w_high_jump
 1103  1.98 Alina ASTAFEI 1969-06-07     GER                  Globe Arena, Stockholm (SWE) (i) 

In [None]:
df = pd.read_csv('../data/interim/combined/m_high_jump.csv')

print(df.shape)
df.head()

(13776, 12)


Unnamed: 0,Rank,Mark,Competitor,DOB,Country,Venue,Date,Gender,Age_group,Event,Age,source_file
0,1,2.45,Javier SOTOMAYOR,1967-10-13,CUB,Salamanca (ESP),1993-07-27,M,senior,high_jump,25.0,senior_m_high_jump
1,2,2.44,Javier SOTOMAYOR,1967-10-13,CUB,San Juan (PUR),1989-07-29,M,senior,high_jump,21.0,senior_m_high_jump
2,3,2.43,Javier SOTOMAYOR,1967-10-13,CUB,Salamanca (ESP),1988-09-08,M,senior,high_jump,20.0,senior_m_high_jump
3,3,2.43,Javier SOTOMAYOR,1967-10-13,CUB,"Budapest Sportcsarnok, Budapest (HUN) (i)",1989-03-04,M,senior,high_jump,21.0,senior_m_high_jump
4,3,2.43,Mutaz Essa BARSHIM,1991-06-24,QAT,"Boudewijnstadion, Bruxelles (BEL)",2014-09-05,M,senior,high_jump,23.0,senior_m_high_jump


In [None]:
df.dtypes

Rank             int64
Mark           float64
Competitor      object
DOB             object
Country         object
Venue           object
Date            object
Gender          object
Age_group       object
Event           object
Age            float64
source_file     object
dtype: object