# Data load

In [23]:
import pandas as pd

data = pd.read_csv('athlete_events.csv')
data

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271111,135569,Andrzej ya,M,29.0,179.0,89.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,
271112,135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",
271113,135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
271114,135571,Tomasz Ireneusz ya,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,


In [2]:
data.dtypes

ID          int64
Name       object
Sex        object
Age       float64
Height    float64
Weight    float64
Team       object
NOC        object
Games      object
Year        int64
Season     object
City       object
Sport      object
Event      object
Medal      object
dtype: object

# Clean up

In [3]:
# drop unnecessary columns
ol_data = data.drop(['ID', 'Games'], axis=1)
ol_data.head()

Unnamed: 0,Name,Sex,Age,Height,Weight,Team,NOC,Year,Season,City,Sport,Event,Medal
0,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920,Summer,Antwerpen,Football,Football Men's Football,
3,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


# Analysis

## Valores faltantes

In [21]:
# count the total number cells
print("Total cells:", ol_data.size)

# count the total number of cells with missing values
print("Total missing cells:", ol_data.isnull().sum().sum())

Total cells: 3524508
Total missing cells: 363853


In [27]:
# Count the number of unique values in each column
unique_values = ol_data.nunique()

# Convert the Series to a DataFrame
unique_values_df = unique_values.to_frame().reset_index()

# Rename the columns
unique_values_df.columns = ["Columna", "Dominio"]

# Save the DataFrame to a CSV file
unique_values_df.to_csv("descriptors/olympics_unique_values.csv", index=False, decimal='.')
unique_values_df

Unnamed: 0,Column,Unique Values
0,Name,134732
1,Sex,2
2,Age,74
3,Height,95
4,Weight,220
5,Team,1184
6,NOC,230
7,Year,35
8,Season,2
9,City,42


In [7]:
# count the number of missing values in each column
missing = ol_data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.drop('Medal', inplace=True)
missing

Age        9474
Height    60171
Weight    62875
dtype: int64

## `.describe` con valores de skewness y curtosis

In [6]:
from statistics_calc import descriptors

# Generate descriptors
desc = descriptors(data)

# Export the DataFrame to a CSV file
desc.to_csv("descriptors/olympics.csv", decimal=",")

desc

Unnamed: 0,ID,Age,Height,Weight,Year
count,271116.0,261642.0,210945.0,208241.0,271116.0
mean,68248.954396,25.556898,175.33897,70.702393,1978.37848
std,39022.286345,6.393561,10.518462,14.34802,29.877632
min,1.0,10.0,127.0,25.0,1896.0
25%,34643.0,21.0,168.0,60.0,1960.0
50%,68205.0,24.0,175.0,70.0,1988.0
75%,102097.25,28.0,183.0,79.0,2002.0
max,135571.0,97.0,226.0,214.0,2016.0
skewness,-0.004681,1.747123,0.018477,0.797169,-0.817736
kurtosis,-1.197292,6.270642,0.177728,2.017523,-0.206948


## Variables cualitativas

In [9]:
from statistics_calc import qualitative_stats

# Columnas cualitativas de interés
cols_cualitativas = [
    "Name",
    "Sex",
    "Team",
    "NOC",
    "Season",
    "City",
    "Sport",
    "Event",
    "Medal",
]

# Generate statistics
estadisticas = qualitative_stats(data, cols_cualitativas)

# Guardar datos en un archivo CSV
estadisticas.to_csv("descriptors/olympics_quality.csv", index=False, decimal=",")

# Mostrar el DataFrame de estadísticas
estadisticas

  estadisticas = pd.concat([estadisticas, nueva_fila], ignore_index=True)


Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Valores unicos (#),Valores unicos (%),Valores nulos (%)
0,Name,Robert Tait McKenzie,58,0.021393,134732,49.695333,0.0
1,Sex,M,196594,72.512873,2,0.000738,0.0
2,Team,United States,17847,6.582791,1184,0.436713,0.0
3,NOC,USA,18853,6.95385,230,0.084835,0.0
4,Season,Summer,222552,82.087372,2,0.000738,0.0
5,City,London,22426,8.271736,42,0.015492,0.0
6,Sport,Athletics,38624,14.2463,66,0.024344,0.0
7,Event,Football Men's Football,5733,2.114593,765,0.282167,0.0
8,Medal,Gold,13372,4.932206,3,0.001107,85.326207


## Entradas por temporada

In [46]:
# count the number of rows with each season
seasons = ol_data['Season'].value_counts()
seasons

Season
Summer    222552
Winter     48564
Name: count, dtype: int64

## Ciudades que han hosteado las olimpiadas

In [47]:
# count the number of cities in which the olympics were held
cities = ol_data['City'].nunique()
cities

42

## Veces que se han hosteado las olimpiadas en cada ciudad

In [49]:
# count how many times the olympics were held in each city
city_counts = ol_data['City'].value_counts()
city_counts.head()

City
London            22426
Athina            15556
Sydney            13821
Atlanta           13780
Rio de Janeiro    13688
Name: count, dtype: int64

## Atletas por pais

In [51]:
# count the number of athletes from each country
athletes = ol_data['NOC'].value_counts()
athletes.head()

NOC
USA    18853
FRA    12758
GBR    12256
ITA    10715
GER     9830
Name: count, dtype: int64

## Atletas por temporada

In [55]:
# count the number of athletes that are from each season
athletes_season = ol_data.groupby('Season')['Name'].nunique()
athletes_season

Season
Summer    116122
Winter     18923
Name: Name, dtype: int64

## Atletas por año

In [57]:
# count the number of athletes per year
athletes_year = ol_data.groupby('Year')['Name'].nunique()
athletes_year.head()

Year
1896     176
1900    1220
1904     650
1906     841
1908    2024
Name: Name, dtype: int64

## Medallas ganadas por pais

In [66]:
# Count the number of medals won by each country
medal_counts = ol_data.groupby('NOC')['Medal'].count()
medal_counts.sort_values(ascending=False).head()

NOC
USA    5637
URS    2503
GER    2165
GBR    2068
FRA    1777
Name: Medal, dtype: int64

## Medallas ganadas por atleta

In [67]:
# count the atheletes who won the most medals
most_medals = ol_data.groupby('Name')['Medal'].count()
most_medals.sort_values(ascending=False).head()

Name
Michael Fred Phelps, II               28
Larysa Semenivna Latynina (Diriy-)    18
Nikolay Yefimovich Andrianov          15
Takashi Ono                           13
Borys Anfiyanovych Shakhlin           13
Name: Medal, dtype: int64

## Atletas por deporte

In [68]:
# count the most popular sports == the sports with the most athletes
popular_sports = ol_data['Sport'].value_counts()
popular_sports.head()

Sport
Athletics     38624
Gymnastics    26707
Swimming      23195
Shooting      11448
Cycling       10859
Name: count, dtype: int64

## Atletas por evento

In [69]:
# count the most popular events == the events with the most athletes
popular_events = ol_data['Event'].value_counts()
popular_events.head()

Event
Football Men's Football        5733
Ice Hockey Men's Ice Hockey    4762
Hockey Men's Hockey            3958
Water Polo Men's Water Polo    3358
Basketball Men's Basketball    3280
Name: count, dtype: int64

## Deportes antiguos que aun se existen

In [82]:
# Find the first year each event was held
first_year = ol_data.groupby('Event')['Year'].min()

# Find the most recent year in the data
latest_year = ol_data['Year'].max()

# Find the events held in the most recent year
latest_events = ol_data[ol_data['Year'] == latest_year]['Event'].unique()

# Count the number of events that were first held in the earliest year and are still being held
oldest_events_still_held = first_year[first_year.index.isin(latest_events)]

oldest_events_still_held

Event
Archery Men's Individual                          1972
Archery Men's Team                                1988
Archery Women's Individual                        1972
Archery Women's Team                              1988
Athletics Men's 1,500 metres                      1896
                                                  ... 
Wrestling Women's Flyweight, Freestyle            2004
Wrestling Women's Heavyweight, Freestyle          2004
Wrestling Women's Light-Heavyweight, Freestyle    2016
Wrestling Women's Lightweight, Freestyle          2004
Wrestling Women's Middleweight, Freestyle         2004
Name: Year, Length: 306, dtype: int64

## Atletas por sexo

In [83]:
# count the number of athletes for each sex
athletes_sex = data['Sex'].value_counts()
athletes_sex

Sex
M    196594
F     74522
Name: count, dtype: int64

## Atletas por sexo y pais

In [73]:
# count the athletes by sex and country
athletes_by_sex_country = data.groupby(['Sex', 'NOC']).size().reset_index(name='Count')
athletes_by_sex_country.head()


Unnamed: 0,Sex,NOC,Count
0,F,AFG,5
1,F,AHO,12
2,F,ALB,27
3,F,ALG,94
4,F,AND,42


## Atletas por sexo y deporte

In [75]:
# count the athletes by sex and sport
athletes_by_sex_sport = data.groupby(['Sex', 'Sport']).size().reset_index(name='Count')
athletes_by_sex_sport.head()

Unnamed: 0,Sex,Sport,Count
0,F,Alpine Skiing,3398
1,F,Alpinism,1
2,F,Archery,1015
3,F,Art Competitions,377
4,F,Athletics,11666


## Medallas ganadas por pais (por tipo de medalla)

In [76]:
# count the countries that have won the most of each medal
gold_countries = data[data['Medal'] == 'Gold']['NOC'].value_counts()
silver_countries = data[data['Medal'] == 'Silver']['NOC'].value_counts()
bronze_countries = data[data['Medal'] == 'Bronze']['NOC'].value_counts()

gold_countries.head(), silver_countries.head(), bronze_countries.head()

(NOC
 USA    2638
 URS    1082
 GER     745
 GBR     678
 ITA     575
 Name: count, dtype: int64,
 NOC
 USA    1641
 GBR     739
 URS     732
 GER     674
 FRA     610
 Name: count, dtype: int64,
 NOC
 USA    1358
 GER     746
 URS     689
 FRA     666
 GBR     651
 Name: count, dtype: int64)

## Deportes por temporada

In [78]:
# count the number of sports held in each season
sports_season = data.groupby('Season')['Sport'].nunique()
sports_season

Season
Summer    52
Winter    17
Name: Sport, dtype: int64

## Eventos por temporada

In [79]:
# count the number of events held in each season
events_season = data.groupby('Season')['Event'].nunique()
events_season

Season
Summer    651
Winter    119
Name: Event, dtype: int64