In [1]:
import pandas as pd
import sqlite3

# Olympic Athletes & Events (ETL): Extract

In [2]:
#import countries competing in Olympics
noc_regions = pd.read_csv("../Data/01_noc_regions.csv")
print(noc_regions.shape)
noc_regions.head()

(230, 3)


Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,


In [3]:
#import list of athletes competing in events
athlete_events = pd.read_csv("../Data/01_athlete_events.csv")
athlete_events.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


# Olympic Athletes & Event Results (ETL): Transform

In [4]:
#drop extraneous country columns
noc_regions = noc_regions.drop(['notes'], axis=1)
print(noc_regions.shape)
noc_regions.head()

(230, 2)


Unnamed: 0,NOC,region
0,AFG,Afghanistan
1,AHO,Curacao
2,ALB,Albania
3,ALG,Algeria
4,AND,Andorra


In [5]:
#drop extraneous athlete and event columns
athlete_events = athlete_events.drop(['Name',
                                      'Team',
                                      'Games',
                                      'Season',
                                      'City',
                                      'Event',
                                      'Medal',
                                      'ID'], axis=1)

print(athlete_events.shape)
athlete_events.head()

(271116, 7)


Unnamed: 0,Sex,Age,Height,Weight,NOC,Year,Sport
0,M,24.0,180.0,80.0,CHN,1992,Basketball
1,M,23.0,170.0,60.0,CHN,2012,Judo
2,M,24.0,,,DEN,1920,Football
3,M,34.0,,,DEN,1900,Tug-Of-War
4,F,21.0,185.0,82.0,NED,1988,Speed Skating


In [6]:
#create clean athletes table for combination of years and competing countries
athletes = pd.DataFrame()

years = athlete_events['Year'].unique().tolist()
years.sort()

NOCs = athlete_events['NOC'].unique().tolist()
NOCs.sort()

for year in years:
    for NOC in NOCs:
        athletes = athletes.append({'Year': year, 'NOC': NOC}, ignore_index=True)
    
athletes['Year'] = athletes['Year'].astype(int)
print(athletes.shape)
athletes.head()

(8050, 2)


Unnamed: 0,NOC,Year
0,AFG,1896
1,AHO,1896
2,ALB,1896
3,ALG,1896
4,AND,1896


In [7]:
#count total athletes by year and country and merge into athletes table
counts = athlete_events.groupby(['Year', 'NOC']).count()
counts.rename(columns={'Sex': 'Total_Athletes'}, inplace=True)
counts = counts.drop(['Age', 'Height', 'Weight', 'Sport'], axis=1)

athletes = pd.merge(athletes, counts,  how='left', left_on=['Year','NOC'], right_on = ['Year','NOC'])
athletes.head()

Unnamed: 0,NOC,Year,Total_Athletes
0,AFG,1896,
1,AHO,1896,
2,ALB,1896,
3,ALG,1896,
4,AND,1896,


In [8]:
#find males by year and country and merge into athletes table
males = athlete_events[athlete_events.Sex != 'F']
males = males.groupby(['Year', 'NOC']).count()
males = males.drop(['Age', 'Height', 'Weight', 'Sport'], axis=1)
males.rename(columns={'Sex': 'Males'}, inplace=True)

athletes = pd.merge(athletes, males,  how='left', left_on=['Year','NOC'], right_on = ['Year','NOC'])
athletes['Male_Ratio'] = athletes['Males'] / athletes['Total_Athletes']
athletes.head()

Unnamed: 0,NOC,Year,Total_Athletes,Males,Male_Ratio
0,AFG,1896,,,
1,AHO,1896,,,
2,ALB,1896,,,
3,ALG,1896,,,
4,AND,1896,,,


In [9]:
#find sports competed by year and country and merge into athletes table
sports = athlete_events.groupby(['Year', 'NOC']).count()
sports = sports.drop(['Age', 'Height', 'Weight', 'Sex'], axis=1)
sports.rename(columns={'Sport': 'Sports_Competed'}, inplace=True)

athletes = pd.merge(athletes, sports,  how='left', left_on=['Year','NOC'], right_on = ['Year','NOC'])
athletes.head()

Unnamed: 0,NOC,Year,Total_Athletes,Males,Male_Ratio,Sports_Competed
0,AFG,1896,,,,
1,AHO,1896,,,,
2,ALB,1896,,,,
3,ALG,1896,,,,
4,AND,1896,,,,


In [10]:
#find mean age, height, weight by year and country and merge into athletes table
means = athlete_events.groupby(['Year', 'NOC']).mean()
means

athletes = pd.merge(athletes, means,  how='left', left_on=['Year','NOC'], right_on = ['Year','NOC'])
athletes.head()

Unnamed: 0,NOC,Year,Total_Athletes,Males,Male_Ratio,Sports_Competed,Age,Height,Weight
0,AFG,1896,,,,,,,
1,AHO,1896,,,,,,,
2,ALB,1896,,,,,,,
3,ALG,1896,,,,,,,
4,AND,1896,,,,,,,


In [11]:
#drop rows with null values from athletes table
print(athletes.shape)
athletes = athletes.dropna()

athletes['Total_Athletes'] = athletes['Total_Athletes'].astype(int)
athletes['Males'] = athletes['Males'].astype(int)
athletes['Sports_Competed'] = athletes['Sports_Competed'].astype(int)

print(athletes.shape)
athletes.head()

(8050, 9)
(3069, 9)


Unnamed: 0,NOC,Year,Total_Athletes,Males,Male_Ratio,Sports_Competed,Age,Height,Weight
74,GBR,1896,25,25,1.0,25,26.526316,188.0,102.0
79,GER,1896,94,94,1.0,94,24.261905,163.681818,64.714286
81,GRE,1896,148,148,1.0,148,21.235294,175.666667,78.0
216,USA,1896,27,27,1.0,27,23.074074,179.875,72.461538
242,AUS,1900,6,6,1.0,6,22.0,178.0,71.0


# Olympic Athletes & Event Results (ETL): Load

In [12]:
#creating SQL connection
conn = sqlite3.connect('../Data/olympics.db')
c = conn.cursor()

#function to close connection
def close_c_conn():
    c.close()
    conn.close()

In [13]:
#create regions table in SQL
noc_regions.to_sql('regions', conn, if_exists='replace')

#check SQL regions table
regions = pd.read_sql_query("SELECT * FROM regions;", conn)
print(regions.shape)
regions.head()

(230, 3)


Unnamed: 0,index,NOC,region
0,0,AFG,Afghanistan
1,1,AHO,Curacao
2,2,ALB,Albania
3,3,ALG,Algeria
4,4,AND,Andorra


In [14]:
#create athletes table in SQL
athletes.to_sql('athletes', conn, if_exists='replace')

#check SQL athletes table
athletes = pd.read_sql_query("SELECT * FROM athletes;", conn)
print(athletes.shape)
athletes.head()

(3069, 10)


Unnamed: 0,index,NOC,Year,Total_Athletes,Males,Male_Ratio,Sports_Competed,Age,Height,Weight
0,74,GBR,1896,25,25,1.0,25,26.526316,188.0,102.0
1,79,GER,1896,94,94,1.0,94,24.261905,163.681818,64.714286
2,81,GRE,1896,148,148,1.0,148,21.235294,175.666667,78.0
3,216,USA,1896,27,27,1.0,27,23.074074,179.875,72.461538
4,242,AUS,1900,6,6,1.0,6,22.0,178.0,71.0


In [15]:
#close connection
close_c_conn()