In [56]:
import pandas as pd
import sqlite3

# Olympic Sports and Medals (ETL): Extract

In [82]:
summer = pd.read_csv("../Data/02_summer.csv")
print(summer.shape)
summer.head()

(31165, 9)


Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [83]:
winter = pd.read_csv("../Data/02_winter.csv")
print(winter.shape)
winter.head()

(5770, 9)


Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1924,Chamonix,Biathlon,Biathlon,"BERTHET, G.",FRA,Men,Military Patrol,Bronze
1,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, C.",FRA,Men,Military Patrol,Bronze
2,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, Maurice",FRA,Men,Military Patrol,Bronze
3,1924,Chamonix,Biathlon,Biathlon,"VANDELLE, André",FRA,Men,Military Patrol,Bronze
4,1924,Chamonix,Biathlon,Biathlon,"AUFDENBLATTEN, Adolf",SUI,Men,Military Patrol,Gold


In [115]:
winners = pd.concat([summer, winter])
print(winners.shape)
winners.head()

(36935, 9)


Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


# Olympic Sports and Medals (ETL): Transform

In [116]:
#drop extraneous athlete and event columns
winners = winners.drop(['City',
                        'Sport',
                        'Discipline',
                        'Athlete',
                        'Gender',
                        'Event'], axis=1)
print(winners.shape)
winners.head()

(36935, 3)


Unnamed: 0,Year,Country,Medal
0,1896,HUN,Gold
1,1896,AUT,Silver
2,1896,GRE,Bronze
3,1896,GRE,Gold
4,1896,GRE,Silver


In [117]:
#create clean medals table for combination of years and competing countries
medals = pd.DataFrame()

years = winners['Year'].unique().tolist()
years.sort()

NOCs = winners['Country'].dropna().unique().tolist()
NOCs.sort()

for year in years:
    for NOC in NOCs:
        medals = medals.append({'Year': year, 'NOC': NOC}, ignore_index=True)
    
medals['Year'] = medals['Year'].astype(int)
print(medals.shape)
medals.head()

(4884, 2)


Unnamed: 0,NOC,Year
0,AFG,1896
1,AHO,1896
2,ALG,1896
3,ANZ,1896
4,ARG,1896


In [118]:
#count total medals by year and country and merge into medals table
totals = winners.groupby(['Year', 'Country']).count()
totals.rename(columns={'Medal': 'Total_Medals'}, inplace=True)

medals = pd.merge(medals, totals,  how='left', left_on=['Year','NOC'], right_on = ['Year','Country'])
medals.head()

Unnamed: 0,NOC,Year,Total_Medals
0,AFG,1896,
1,AHO,1896,
2,ALG,1896,
3,ANZ,1896,
4,ARG,1896,


In [119]:
#count gold medals by year and country and merge into medals table
golds = winners[winners.Medal == 'Gold']
golds = golds.groupby(['Year', 'Country']).count()
golds.rename(columns={'Medal': 'Golds'}, inplace=True)

medals = pd.merge(medals, golds,  how='left', left_on=['Year','NOC'], right_on = ['Year','Country'])
medals.head()

Unnamed: 0,NOC,Year,Total_Medals,Golds
0,AFG,1896,,
1,AHO,1896,,
2,ALG,1896,,
3,ANZ,1896,,
4,ARG,1896,,


In [120]:
#count silver medals by year and country and merge into medals table
silvers = winners[winners.Medal == 'Silver']
silvers = silvers.groupby(['Year', 'Country']).count()
silvers.rename(columns={'Medal': 'Silvers'}, inplace=True)

medals = pd.merge(medals, silvers,  how='left', left_on=['Year','NOC'], right_on = ['Year','Country'])
medals.head()

Unnamed: 0,NOC,Year,Total_Medals,Golds,Silvers
0,AFG,1896,,,
1,AHO,1896,,,
2,ALG,1896,,,
3,ANZ,1896,,,
4,ARG,1896,,,


In [121]:
#count bronze medals by year and country and merge into medals table
bronzes = winners[winners.Medal == 'Bronze']
bronzes = bronzes.groupby(['Year', 'Country']).count()
bronzes.rename(columns={'Medal': 'Bronzes'}, inplace=True)

medals = pd.merge(medals, bronzes,  how='left', left_on=['Year','NOC'], right_on = ['Year','Country'])
medals.tail()

Unnamed: 0,NOC,Year,Total_Medals,Golds,Silvers,Bronzes
4879,VIE,2014,,,,
4880,YUG,2014,,,,
4881,ZAM,2014,,,,
4882,ZIM,2014,,,,
4883,ZZX,2014,,,,


In [122]:
#drop rows from medals table with zero values
medals = medals.dropna()

medals['Total_Medals'] = medals['Total_Medals'].astype(int)
medals['Golds'] = medals['Golds'].astype(int)
medals['Silvers'] = medals['Silvers'].astype(int)
medals['Bronzes'] = medals['Bronzes'].astype(int)

print(medals.shape)
medals.head()

(696, 6)


Unnamed: 0,NOC,Year,Total_Medals,Golds,Silvers,Bronzes
7,AUT,1896,5,2,1,2
32,DEN,1896,6,1,2,3
44,FRA,1896,11,5,4,2
47,GBR,1896,7,2,3,2
50,GER,1896,33,26,5,2


In [112]:
#count total medals by year
total_annual_medals = medals.groupby(['Year']).sum()
total_annual_medals.rename(columns={'Total_Medals': 'Total_Annual_Medals'}, inplace=True)
total_annual_medals

Unnamed: 0_level_0,Total_Annual_Medals,Golds,Silvers,Bronzes
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1896,146,61,45,40
1900,457,170,172,115
1904,420,156,144,120
1908,757,307,259,191
1912,826,295,284,247
1920,1245,496,406,343
1924,938,315,333,290
1928,722,207,253,262
1932,669,226,220,223
1936,890,309,306,275


In [123]:
#count win percentage of total medals by year and country and merge into medals table
win_percentage = total_annual_medals.drop(['Golds',
                        'Silvers',
                        'Bronzes'], axis=1)
win_percentage.rename(columns={'Total_Annual_Medals': 'Win_Percentage'}, inplace=True)

medals = pd.merge(medals, win_percentage,  how='left', left_on=['Year'], right_on = ['Year'])
medals['Win_Percentage'] = medals['Total_Medals'] / medals['Win_Percentage'] * 100
medals.head()

Unnamed: 0,NOC,Year,Total_Medals,Golds,Silvers,Bronzes,Win_Percentage
0,AUT,1896,5,2,1,2,3.424658
1,DEN,1896,6,1,2,3,4.109589
2,FRA,1896,11,5,4,2,7.534247
3,GBR,1896,7,2,3,2,4.794521
4,GER,1896,33,26,5,2,22.60274


In [125]:
medals.tail(100)

Unnamed: 0,NOC,Year,Total_Medals,Golds,Silvers,Bronzes,Win_Percentage
596,GBR,2008,77,31,25,21,4.273030
597,GER,2008,101,42,16,43,5.604883
598,HUN,2008,27,16,8,3,1.498335
599,INA,2008,7,2,2,3,0.388457
600,ITA,2008,42,8,14,20,2.330744
601,JAM,2008,17,9,3,5,0.943396
602,JPN,2008,51,23,11,17,2.830189
603,KAZ,2008,13,2,4,7,0.721421
604,KEN,2008,14,6,4,4,0.776915
605,KOR,2008,78,41,11,26,4.328524


# Olympic Sports and Medals (ETL): Load

In [10]:
#creating SQL connection
conn = sqlite3.connect('../Data/olympics.db')
c = conn.cursor()

#function to close connection
def close_c_conn():
    c.close()
    conn.close()

In [13]:
#check SQL regions table
regions = pd.read_sql_query("SELECT * FROM regions;", conn)
print(regions.shape)
regions.head()

(230, 2)


Unnamed: 0,NOC,Region
0,AFG,Afghanistan
1,AHO,Curacao
2,ALB,Albania
3,ALG,Algeria
4,AND,Andorra


In [25]:
#close connection
close_c_conn()