# Data loading

In [1]:
import pandas as pd
import zipfile

In [2]:
# Extract every member of the zipped results archive into the data/ directory.
with zipfile.ZipFile('data/contest_results.csv.zip') as archive:
    archive.extractall(path='data')

In [4]:
# Load the contest results; contest_date is parsed to datetimes at read time.
data = pd.read_csv(
    'data/contest_results.csv',
    parse_dates=["contest_date"],
)

In [5]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,contest_url,competitor_name,competitor_url,organization,contest_name,contest_date,division,class,placing,scraped_timestamp,is_loaded
0,https://contests.npcnewsonline.com/contests/20...,laura mcintosh,https://contests.npcnewsonline.com/contests/20...,cpa,cpa fedel clarke classic,2018-03-25,bikini,class a,6.0,2023-11-07 03:47:46.494501 UTC,False
1,https://contests.npcnewsonline.com/contests/20...,tasha chase,https://contests.npcnewsonline.com/contests/20...,cpa,cpa fedel clarke classic,2018-03-25,bikini,class a,5.0,2023-11-07 03:47:46.494 UTC,False
2,https://contests.npcnewsonline.com/contests/20...,sarah farrer,https://contests.npcnewsonline.com/contests/20...,cpa,cpa fedel clarke classic,2018-03-25,bikini,class a,4.0,2023-11-07 03:47:46.493408 UTC,False
3,https://contests.npcnewsonline.com/contests/20...,neha dhanda,https://contests.npcnewsonline.com/contests/20...,cpa,cpa fedel clarke classic,2018-03-25,bikini,class a,3.0,2023-11-07 03:47:46.492877 UTC,False
4,https://contests.npcnewsonline.com/contests/20...,reina espineli,https://contests.npcnewsonline.com/contests/20...,cpa,cpa fedel clarke classic,2018-03-25,bikini,class a,2.0,2023-11-07 03:47:46.492311 UTC,False


# Quick facts

In [12]:
# Earliest and latest contest dates present in the data
contest_dates = data["contest_date"]
oldest_date = contest_dates.min()
newest_date = contest_dates.max()
print(f'Oldest competition on record: {oldest_date}')
print(f'Most recent competition on record: {newest_date}')

Oldest competition on record: 2012-02-18 00:00:00
Most recent competition on record: 2024-03-17 00:00:00


In [17]:
# Headline counts: unique contest URLs, unique competitor names, and total rows
n_contests = data["contest_url"].nunique()
n_competitors = data["competitor_name"].nunique()
n_entries = data.shape[0]
print(f"""
      Records of {n_contests} distinct competitions,
      {n_competitors} distinct competitors,
      and {n_entries} distinct contest entries.
      """)


      Records of 4093 distinct competitions,
      214314 distinct competitors,
      and 783707 distinct contest entries.
      


In [63]:
# Rows whose class mentions a pro card, and distinct competitor names that
# appear either in such a class or in an IFBB entry
has_pro_card_class = data["class"].str.contains("pro card")
is_ifbb = data["organization"] == "ifbb"
n_pro_card_rows = has_pro_card_class.sum()
n_pro_competitors = data.loc[has_pro_card_class | is_ifbb, "competitor_name"].nunique()
print(
    f"""
      Dataset contains {n_pro_card_rows} pro card wins.
      There are {n_pro_competitors} professional competitors on record.
"""
)


      Dataset contains 5812 pro card wins.
      There are 17315 professional competitors on record.



# Distribution facts

In [19]:
# Entries per organization (NPC accounts for most of them)
organization_counts = data["organization"].value_counts()
organization_counts

organization
npc     616391
npcw     92896
ifbb     57393
cpa      17027
Name: count, dtype: int64

In [40]:
# Contest names of every row whose class mentions a pro card
awards_pro_card = data["class"].str.contains("pro card")
pro_quals = data.loc[awards_pro_card, "contest_name"]

In [41]:
# Flag every entry whose contest name appears among the pro card contest names
data["pro_qualifier"] = data["contest_name"].isin(pro_quals)

In [42]:
# Number of distinct pro-qualifier contest names with a 2023 contest date
held_in_2023 = data["contest_date"].dt.year == 2023
data.loc[data["pro_qualifier"] & held_in_2023, "contest_name"].nunique()

34

In [44]:
# Ten largest NPC contests by entry count, shown with their pro-qualifier flag
# (pro qualifier competitions have the most contest entries)
npc_entries = data[data["organization"] == "npc"]
entries_per_contest = npc_entries.groupby(["contest_name", "pro_qualifier"]).size()
entries_per_contest.sort_values(ascending=False).head(10)

contest_name                                          pro_qualifier
npc teen collegiate & masters national championships  True             15953
npc national championships                            True             10901
npc north american championships                      True             10698
npc usa championships                                 True              8668
npc universe championships                            True              6081
npc junior nationals                                  True              5954
npc emerald cup                                       False             5689
npc universe & npc national fitness championships     True              5509
npc junior usa championships                          True              4969
npc midwest championships                             False             4614
dtype: int64

In [45]:
# Entries per division; bikini has the most
# NOTE(review): "wellness" and "women's wellness" are tallied as separate
# divisions — confirm whether they should be merged
division_counts = data["division"].value_counts()
division_counts

division
bikini                    244135
men's physique            184536
men's bodybuilding        104988
figure                     99596
men's classic physique     86265
women's physique           25755
wellness                   19884
women's wellness            8661
women's bodybuilding        4661
fitness                     3120
212                         2106
Name: count, dtype: int64

In [61]:
# How many distinct class labels appear in the data
# (author's note: ifbb has only open and master's classes — not checked in this cell)
n_classes = data["class"].nunique()
print(
    f"""
      Total number of distinct classes:
      {n_classes}
      """
)


      Total number of distinct classes:
      1248
      


In [60]:
# Entry counts for the lettered classes (labels starting with "class ")
# NOTE(review): the original comment calls these height-based classes — confirm
class_labels = data["class"]
class_labels[class_labels.str.startswith("class ")].value_counts()

class
class b    76626
class a    72164
class c    63720
class d    44539
class e    17132
class f    16579
class h     4098
class g     3375
class j        7
class i        6
Name: count, dtype: int64