# Importing files and library

### Import Library

In [1]:
# import python libraries

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt # visualizing data
%matplotlib inline
import seaborn as sns

In [6]:
# import csv file
df_athlete = pd.read_csv(r'../data/athlete_events.csv', encoding= 'unicode_escape')

df_noc = pd.read_csv(r'../data/noc_regions.csv', encoding= 'unicode_escape')

# Data Processing

In [7]:
athlete_shape = df_athlete.shape #shows Rows, Column 
noc_shape = df_noc.shape #shows Rows, Column

print("Athlete Shape: ", athlete_shape)
print("NOC Shape: ", noc_shape)

Athlete Shape:  (271116, 15)
NOC Shape:  (230, 3)


In [8]:
athlete_columns = df_athlete.columns #shows columns of the dataset
noc_columns = df_noc.columns #shows columns of the dataset
print("Athlete Columns: \n ", athlete_columns)
print("="*50)
print("NOC Columns: \n ", noc_columns)

Athlete Columns: 
  Index(['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Year', 'Season', 'City', 'Sport', 'Event', 'Medal'],
      dtype='object')
NOC Columns: 
  Index(['NOC', 'region', 'notes'], dtype='object')


In [9]:
# Merge the DataFrames on the 'NOC' column
df_merged = pd.merge(df_athlete, df_noc[['NOC', 'region']], on='NOC', how='left')

# Display the first few rows of the merged DataFrame

print("Merged Columns: \n", df_merged.columns)

Merged Columns: 
 Index(['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Year', 'Season', 'City', 'Sport', 'Event', 'Medal', 'region'],
      dtype='object')


In [10]:
df_merged.head(4)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,region
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,China
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,China
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,,Denmark
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,Denmark


In [11]:
df_merged.describe

<bound method NDFrame.describe of             ID                      Name Sex   Age  Height  Weight  \
0            1                 A Dijiang   M  24.0   180.0    80.0   
1            2                  A Lamusi   M  23.0   170.0    60.0   
2            3       Gunnar Nielsen Aaby   M  24.0     NaN     NaN   
3            4      Edgar Lindenau Aabye   M  34.0     NaN     NaN   
4            5  Christine Jacoba Aaftink   F  21.0   185.0    82.0   
...        ...                       ...  ..   ...     ...     ...   
271111  135569                Andrzej ya   M  29.0   179.0    89.0   
271112  135570                  Piotr ya   M  27.0   176.0    59.0   
271113  135570                  Piotr ya   M  27.0   176.0    59.0   
271114  135571        Tomasz Ireneusz ya   M  30.0   185.0    96.0   
271115  135571        Tomasz Ireneusz ya   M  34.0   185.0    96.0   

                  Team  NOC        Games  Year  Season            City  \
0                China  CHN  1992 Summer  1992  Sum

In [12]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 16 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      271116 non-null  int64  
 1   Name    271116 non-null  object 
 2   Sex     271116 non-null  object 
 3   Age     261642 non-null  float64
 4   Height  210945 non-null  float64
 5   Weight  208241 non-null  float64
 6   Team    271116 non-null  object 
 7   NOC     271116 non-null  object 
 8   Games   271116 non-null  object 
 9   Year    271116 non-null  int64  
 10  Season  271116 non-null  object 
 11  City    271116 non-null  object 
 12  Sport   271116 non-null  object 
 13  Event   271116 non-null  object 
 14  Medal   39783 non-null   object 
 15  region  270746 non-null  object 
dtypes: float64(3), int64(2), object(11)
memory usage: 33.1+ MB


In [13]:
pd.isnull(df_merged).sum()

ID             0
Name           0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
region       370
dtype: int64

In [14]:
# Calculate mean values for Age, Height, and Weight
age_mean = df_merged['Age'].mean()
height_mean = df_merged['Height'].mean()
weight_mean = df_merged['Weight'].mean()

# Print the mean values
print(f"Mean Age: {age_mean:.2f}")
print(f"Mean Height: {height_mean:.2f}")
print(f"Mean Weight: {weight_mean:.2f}")


Mean Age: 25.56
Mean Height: 175.34
Mean Weight: 70.70


In [15]:
# Enable IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Iterative Imputer
iterative_imputer = IterativeImputer()
df_merged[['Age', 'Height', 'Weight']] = iterative_imputer.fit_transform(df_merged[['Age', 'Height', 'Weight']])




In [28]:
iterative_imputer = IterativeImputer(max_iter=20, random_state=0)
df_merged[['Age', 'Height', 'Weight']] = iterative_imputer.fit_transform(df_merged[['Age', 'Height', 'Weight']])


In [16]:
df_merged.isnull().sum()

ID             0
Name           0
Sex            0
Age            0
Height         0
Weight         0
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
region       370
dtype: int64