In [1]:
import pandas as pd
import sqlite3

# Olympic Sports and Medals (ETL): Extract

In [2]:
# Extract: read the Summer Games medal records, report (rows, columns) and preview the first rows
summer = pd.read_csv(filepath_or_buffer="../Data/02_summer.csv")
print(summer.shape)
summer.head(n=5)

(31165, 9)


Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [3]:
# Extract: read the Winter Games medal records, report (rows, columns) and preview the first rows
winter = pd.read_csv(filepath_or_buffer="../Data/02_winter.csv")
print(winter.shape)
winter.head(n=5)

(5770, 9)


Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1924,Chamonix,Biathlon,Biathlon,"BERTHET, G.",FRA,Men,Military Patrol,Bronze
1,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, C.",FRA,Men,Military Patrol,Bronze
2,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, Maurice",FRA,Men,Military Patrol,Bronze
3,1924,Chamonix,Biathlon,Biathlon,"VANDELLE, André",FRA,Men,Military Patrol,Bronze
4,1924,Chamonix,Biathlon,Biathlon,"AUFDENBLATTEN, Adolf",SUI,Men,Military Patrol,Gold


In [4]:
# Stack the summer rows on top of the winter rows; each frame keeps its own row labels
winners = pd.concat(objs=[summer, winter], axis=0)
print(winners.shape)
winners.head(n=5)

(36935, 9)


Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


# Olympic Sports and Medals (ETL): Transform

In [5]:
# Discard the athlete- and event-level columns that the medal counts do not use
winners = winners.drop(
    columns=['City', 'Sport', 'Discipline', 'Athlete', 'Gender', 'Event']
)
print(winners.shape)
winners.head(n=5)

(36935, 3)


Unnamed: 0,Year,Country,Medal
0,1896,HUN,Gold
1,1896,AUT,Silver
2,1896,GRE,Bronze
3,1896,GRE,Gold
4,1896,GRE,Silver


In [6]:
# Build the scaffold medals table: one row for every (year, country code) combination.
# Years and codes are taken from the winners data; rows with a missing Country are
# ignored when collecting the codes.
years = sorted(winners['Year'].unique().tolist())
NOCs = sorted(winners['Country'].dropna().unique().tolist())

# Create the frame once from a list of records instead of appending row by row:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration (quadratic in the number of rows).
# Row order is year-major, code-minor; columns are NOC, Year.
medals = pd.DataFrame(
    [(NOC, year) for year in years for NOC in NOCs],
    columns=['NOC', 'Year'],
)

medals['Year'] = medals['Year'].astype(int)
print(medals.shape)
medals.head()

(4884, 2)


Unnamed: 0,NOC,Year
0,AFG,1896
1,AHO,1896
2,ALG,1896
3,ANZ,1896
4,ARG,1896


In [7]:
# Medals per (Year, Country): count the rows in each group and label the count Total_Medals
totals = (
    winners
    .groupby(['Year', 'Country'])
    .count()
    .rename(columns={'Medal': 'Total_Medals'})
)

# Left-join the counts onto the year x NOC table, then discard the rows that contain NaN
# (combinations with no matching count)
medals = medals.merge(
    totals,
    how='left',
    left_on=['Year', 'NOC'],
    right_on=['Year', 'Country'],
).dropna()
print(medals.shape)
medals.head(n=5)

(1323, 3)


Unnamed: 0,NOC,Year,Total_Medals
6,AUS,1896,2.0
7,AUT,1896,5.0
32,DEN,1896,6.0
44,FRA,1896,11.0
47,GBR,1896,7.0


In [8]:
# Gold medals per (Year, Country), left-joined onto the medals table as Golds
golds = (
    winners[winners['Medal'] == 'Gold']
    .groupby(['Year', 'Country'])
    .count()
    .rename(columns={'Medal': 'Golds'})
)

medals = medals.merge(
    golds,
    how='left',
    left_on=['Year', 'NOC'],
    right_on=['Year', 'Country'],
)
medals.head(n=5)

Unnamed: 0,NOC,Year,Total_Medals,Golds
0,AUS,1896,2.0,2.0
1,AUT,1896,5.0,2.0
2,DEN,1896,6.0,1.0
3,FRA,1896,11.0,5.0
4,GBR,1896,7.0,2.0


In [9]:
# Silver medals per (Year, Country), left-joined onto the medals table as Silvers
silvers = (
    winners[winners['Medal'] == 'Silver']
    .groupby(['Year', 'Country'])
    .count()
    .rename(columns={'Medal': 'Silvers'})
)

medals = medals.merge(
    silvers,
    how='left',
    left_on=['Year', 'NOC'],
    right_on=['Year', 'Country'],
)
medals.head(n=5)

Unnamed: 0,NOC,Year,Total_Medals,Golds,Silvers
0,AUS,1896,2.0,2.0,
1,AUT,1896,5.0,2.0,1.0
2,DEN,1896,6.0,1.0,2.0
3,FRA,1896,11.0,5.0,4.0
4,GBR,1896,7.0,2.0,3.0


In [10]:
# Bronze medals per (Year, Country), left-joined onto the medals table as Bronzes
bronzes = (
    winners[winners['Medal'] == 'Bronze']
    .groupby(['Year', 'Country'])
    .count()
    .rename(columns={'Medal': 'Bronzes'})
)

medals = medals.merge(
    bronzes,
    how='left',
    left_on=['Year', 'NOC'],
    right_on=['Year', 'Country'],
)
medals.tail(n=5)

Unnamed: 0,NOC,Year,Total_Medals,Golds,Silvers,Bronzes
1318,SUI,2014,32.0,6.0,4.0,22.0
1319,SVK,2014,1.0,1.0,,
1320,SWE,2014,55.0,8.0,35.0,12.0
1321,UKR,2014,5.0,4.0,,1.0
1322,USA,2014,65.0,10.0,31.0,24.0


In [11]:
# The left joins leave NaN where a country won no medal of a given colour in a year:
# replace those with 0, then convert the four count columns to integers
medals = medals.fillna(0).astype(
    {'Total_Medals': int, 'Golds': int, 'Silvers': int, 'Bronzes': int}
)

print(medals.shape)
medals.head(n=5)

(1323, 6)


Unnamed: 0,NOC,Year,Total_Medals,Golds,Silvers,Bronzes
0,AUS,1896,2,2,0,0
1,AUT,1896,5,2,1,2
2,DEN,1896,6,1,2,3
3,FRA,1896,11,5,4,2
4,GBR,1896,7,2,3,2


In [12]:
# Sum the medal counts over all countries for each year.
# The numeric columns are selected explicitly: since pandas 2.0 a bare
# groupby(...).sum() no longer drops non-numeric columns, so the NOC strings
# would be concatenated into an unwanted extra column.
total_annual_medals = (
    medals
    .groupby(['Year'])[['Total_Medals', 'Golds', 'Silvers', 'Bronzes']]
    .sum()
    .rename(columns={'Total_Medals': 'Total_Annual_Medals'})
)
total_annual_medals

Unnamed: 0_level_0,Total_Annual_Medals,Golds,Silvers,Bronzes
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1896,151,64,47,40
1900,512,178,192,142
1904,470,188,159,123
1908,804,311,282,211
1912,885,301,300,284
1920,1298,497,446,355
1924,1002,339,337,326
1928,799,259,267,273
1932,731,246,250,235
1936,983,335,331,317


In [13]:
# Share of each year's medals won by each country, as a percentage.
# win_percentage first carries the yearly medal total under the name Win_Percentage.
win_percentage = (
    total_annual_medals
    .drop(columns=['Golds', 'Silvers', 'Bronzes'])
    .rename(columns={'Total_Annual_Medals': 'Win_Percentage'})
)

medals = medals.merge(
    win_percentage,
    how='left',
    left_on=['Year'],
    right_on=['Year'],
)
# Overwrite the joined yearly total with the actual percentage
medals['Win_Percentage'] = medals['Total_Medals'].div(medals['Win_Percentage']).mul(100)
medals.head(n=5)

Unnamed: 0,NOC,Year,Total_Medals,Golds,Silvers,Bronzes,Win_Percentage
0,AUS,1896,2,2,0,0,1.324503
1,AUT,1896,5,2,1,2,3.311258
2,DEN,1896,6,1,2,3,3.97351
3,FRA,1896,11,5,4,2,7.284768
4,GBR,1896,7,2,3,2,4.635762


# Olympic Sports and Medals (ETL): Load

In [14]:
# Connect to the SQLite database file and obtain a cursor on that connection
conn = sqlite3.connect(database='../Data/olympics.db')
c = conn.cursor()

# Helper that releases the database handles
def close_c_conn():
    """Close the module-level cursor ``c`` first, then the connection ``conn``."""
    for handle in (c, conn):
        handle.close()

In [15]:
# Load: write the medals frame to the SQL table 'medals', replacing the table if it exists.
# The frame's row index is stored too and comes back as the 'index' column.
medals.to_sql(name='medals', con=conn, if_exists='replace')

# Read the table back to check what was written
medals = pd.read_sql_query(sql="SELECT * FROM medals;", con=conn)
print(medals.shape)
medals.head(n=5)

(1323, 8)


Unnamed: 0,index,NOC,Year,Total_Medals,Golds,Silvers,Bronzes,Win_Percentage
0,0,AUS,1896,2,2,0,0,1.324503
1,1,AUT,1896,5,2,1,2,3.311258
2,2,DEN,1896,6,1,2,3,3.97351
3,3,FRA,1896,11,5,4,2,7.284768
4,4,GBR,1896,7,2,3,2,4.635762


In [16]:
# Close the cursor and the SQLite connection now that the load step is finished
close_c_conn()