# SQL queries on pandas

**Additional resources:**

* Comparison with SQL on the pandas documentation: https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html

<img src = "https://i.imgflip.com/47k7yb.jpg">

# libraries

In [139]:
import pandas

import seaborn
import matplotlib.pyplot as plt

import time
import sys
import os

## data tables

In [140]:
os.listdir("data/olympics")

['.DS_Store',
 'athletes.csv',
 'medalist.csv',
 'games.csv',
 'athlete_events.csv',
 'events.csv']

In [143]:
# Keep only the CSV tables. Matching on the extension (instead of a substring test)
# excludes names that merely contain ".csv", such as "games.csv.bak", as well as
# hidden files like ".DS_Store".
[file for file in os.listdir("data/olympics") if file.endswith(".csv")]

['athletes.csv',
 'medalist.csv',
 'games.csv',
 'athlete_events.csv',
 'events.csv']

In [174]:
# Load the four Olympic tables from the CSV files under data/olympics.
games = pandas.read_csv("data/olympics/games.csv")
events = pandas.read_csv("data/olympics/events.csv")
athletes = pandas.read_csv("data/olympics/athletes.csv")
# NOTE: the file name is singular ("medalist.csv") while the variable is plural.
medalists = pandas.read_csv("data/olympics/medalist.csv")

In [162]:
games.head()

Unnamed: 0,id_games,Games,Year,Season,City
0,0,1896 Summer,1896,Summer,Athina
1,1,1900 Summer,1900,Summer,Paris
2,2,1904 Summer,1904,Summer,St. Louis
3,3,1906 Summer,1906,Summer,Athina
4,4,1908 Summer,1908,Summer,London


In [163]:
events.head()

Unnamed: 0,id_event,Games,Sport,Event,id_games
0,0,1896 Summer,Athletics,Athletics Men's High Jump,0
1,1,1896 Summer,Athletics,Athletics Men's 100 metres,0
2,2,1896 Summer,Tennis,Tennis Men's Singles,0
3,3,1896 Summer,Cycling,Cycling Men's 333 metres Time Trial,0
4,4,1896 Summer,Tennis,Tennis Men's Doubles,0


In [164]:
athletes.head()

Unnamed: 0,id_athlete,Name,Sex,Age,Height,Weight,NOC,id_event,id_games
0,1,A Dijiang,M,24.0,180.0,80.0,CHN,3723,37
1,2,A Lamusi,M,23.0,170.0,60.0,CHN,5676,48
2,3,Gunnar Nielsen Aaby,M,24.0,,,DEN,671,6
3,4,Edgar Lindenau Aabye,M,34.0,,,DEN,127,1
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,3593,36


In [165]:
medalists.head()

Unnamed: 0,id_athlete,id_event,Medal
0,35698,4,Bronze
1,12929,4,Gold
2,101352,4,Bronze
3,121713,4,Gold
4,18785,4,Silver


## SELECT columns

`select columns from games`

In [121]:
games.head()

Unnamed: 0,id_games,Games,Year,Season,City
0,0,1896 Summer,1896,Summer,Athina
1,1,1900 Summer,1900,Summer,Paris
2,2,1904 Summer,1904,Summer,St. Louis
3,3,1906 Summer,1906,Summer,Athina
4,4,1908 Summer,1908,Summer,London


In [71]:
games.head()[["id_games", "Games"]]

Unnamed: 0,id_games,Games
0,0,1896 Summer
1,1,1900 Summer
2,2,1904 Summer
3,3,1906 Summer
4,4,1908 Summer


In [72]:
games.head().get(["id_games", "Games"])

Unnamed: 0,id_games,Games
0,0,1896 Summer
1,1,1900 Summer
2,2,1904 Summer
3,3,1906 Summer
4,4,1908 Summer


*select columns by pattern*

In [73]:
athletes.head()

Unnamed: 0,id_athlete,Name,Sex,Age,Height,Weight,NOC,id_event,id_games
0,1,A Dijiang,M,24.0,180.0,80.0,CHN,3723,37
1,2,A Lamusi,M,23.0,170.0,60.0,CHN,5676,48
2,3,Gunnar Nielsen Aaby,M,24.0,,,DEN,671,6
3,4,Edgar Lindenau Aabye,M,34.0,,,DEN,127,1
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,3593,36


In [74]:
# Select the key columns by name pattern. Anchoring on the "id_" prefix avoids picking
# up columns that merely contain the letters "id" somewhere in their name.
columns_selected = [column for column in athletes.columns if column.startswith("id_")]
athletes.get(columns_selected)

Unnamed: 0,id_athlete,id_event,id_games
0,1,3723,37
1,2,5676,48
2,3,671,6
3,4,127,1
4,5,3593,36
...,...,...,...
187447,135568,5932,50
187448,135569,2822,30
187449,135570,5834,49
187450,135571,4308,41


*select column by type*

In [75]:
# Preview the athletes table: it is the one holding the float columns
# (Age, Height, Weight) that the select-by-type example works on.
athletes.head()

Unnamed: 0,id_athlete,Name,Sex,Age,Height,Weight,NOC,id_event,id_games
0,1,A Dijiang,M,24.0,180.0,80.0,CHN,3723,37
1,2,A Lamusi,M,23.0,170.0,60.0,CHN,5676,48
2,3,Gunnar Nielsen Aaby,M,24.0,,,DEN,671,6
3,4,Edgar Lindenau Aabye,M,34.0,,,DEN,127,1
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,3593,36


In [76]:
# Select columns by dtype. This runs on the athletes table, whose Age/Height/Weight
# columns are floats; the medal table as displayed in this notebook only has key
# columns and the Medal label.
athletes.select_dtypes("float")

Unnamed: 0,Age,Height,Weight
0,24.0,180.0,80.0
1,23.0,170.0,60.0
2,24.0,,
3,34.0,,
4,21.0,185.0,82.0
...,...,...,...
187447,33.0,171.0,69.0
187448,29.0,179.0,89.0
187449,27.0,176.0,59.0
187450,30.0,185.0,96.0


## JOINS

`
select a.columns, b.columns
from a left join b
on a.key = b.key`

In [150]:
athletes.head()

Unnamed: 0,id_athlete,Name,Sex,Age,Height,Weight,NOC,id_event,id_games
0,1,A Dijiang,M,24.0,180.0,80.0,CHN,3723,37
1,2,A Lamusi,M,23.0,170.0,60.0,CHN,5676,48
2,3,Gunnar Nielsen Aaby,M,24.0,,,DEN,671,6
3,4,Edgar Lindenau Aabye,M,34.0,,,DEN,127,1
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,3593,36


In [151]:
len(athletes)

187452

In [152]:
events.head()

Unnamed: 0,id_event,Games,Sport,Event,id_games
0,0,1896 Summer,Athletics,Athletics Men's High Jump,0
1,1,1896 Summer,Athletics,Athletics Men's 100 metres,0
2,2,1896 Summer,Tennis,Tennis Men's Singles,0
3,3,1896 Summer,Cycling,Cycling Men's 333 metres Time Trial,0
4,4,1896 Summer,Tennis,Tennis Men's Doubles,0


*join on one key vs join on two keys*

In [153]:
# Two-key left join: every athlete row is kept and the event whose id_event AND
# id_games both match is attached to it.
athletes_by_event = athletes.merge(
    events,
    how="left",
    on=["id_event", "id_games"],
)

In [154]:
len(athletes_by_event)

187452

In [155]:
athletes_by_event.head()

Unnamed: 0,id_athlete,Name,Sex,Age,Height,Weight,NOC,id_event,id_games,Games,Sport,Event
0,1,A Dijiang,M,24.0,180.0,80.0,CHN,3723,37,1992 Summer,Basketball,Basketball Men's Basketball
1,2,A Lamusi,M,23.0,170.0,60.0,CHN,5676,48,2012 Summer,Judo,Judo Men's Extra-Lightweight
2,3,Gunnar Nielsen Aaby,M,24.0,,,DEN,671,6,1920 Summer,Football,Football Men's Football
3,4,Edgar Lindenau Aabye,M,34.0,,,DEN,127,1,1900 Summer,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,3593,36,1988 Winter,Speed Skating,Speed Skating Women's 500 metres


In [156]:
# One-key left join on "id_games", stated explicitly. Without `on=`, pandas joins on
# every column name the two frames have in common, so the join key would change
# silently if the list of selected columns changed.
athletes_event_games = pandas.merge(
    athletes_by_event,
    games[["id_games", "Season", "City"]],
    on="id_games",
    how="left",
)

In [157]:
len(athletes_event_games)

187452

In [158]:
athletes_event_games.head()

Unnamed: 0,id_athlete,Name,Sex,Age,Height,Weight,NOC,id_event,id_games,Games,Sport,Event,Season,City
0,1,A Dijiang,M,24.0,180.0,80.0,CHN,3723,37,1992 Summer,Basketball,Basketball Men's Basketball,Summer,Barcelona
1,2,A Lamusi,M,23.0,170.0,60.0,CHN,5676,48,2012 Summer,Judo,Judo Men's Extra-Lightweight,Summer,London
2,3,Gunnar Nielsen Aaby,M,24.0,,,DEN,671,6,1920 Summer,Football,Football Men's Football,Summer,Antwerpen
3,4,Edgar Lindenau Aabye,M,34.0,,,DEN,127,1,1900 Summer,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Summer,Paris
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,3593,36,1988 Winter,Speed Skating,Speed Skating Women's 500 metres,Winter,Calgary


In [175]:
medalists.head()

Unnamed: 0,athlete_id,id_event,Medal
0,35698,4,Bronze
1,12929,4,Gold
2,101352,4,Bronze
3,121713,4,Gold
4,18785,4,Silver


In [177]:
# Left join the medal table onto the athlete/event/games rows. The athlete key has a
# different name on each side, so the key columns are listed separately per side.
full_dataset = athletes_event_games.merge(
    medalists,
    how="left",
    left_on=["id_athlete", "id_event"],
    right_on=["athlete_id", "id_event"],
)

In [178]:
len(full_dataset)

187452

In [179]:
full_dataset.head()

Unnamed: 0,id_athlete,Name,Sex,Age,Height,Weight,NOC,id_event,id_games,Games,Sport,Event,Season,City,athlete_id,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,CHN,3723,37,1992 Summer,Basketball,Basketball Men's Basketball,Summer,Barcelona,,
1,2,A Lamusi,M,23.0,170.0,60.0,CHN,5676,48,2012 Summer,Judo,Judo Men's Extra-Lightweight,Summer,London,,
2,3,Gunnar Nielsen Aaby,M,24.0,,,DEN,671,6,1920 Summer,Football,Football Men's Football,Summer,Antwerpen,,
3,4,Edgar Lindenau Aabye,M,34.0,,,DEN,127,1,1900 Summer,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Summer,Paris,4.0,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,3593,36,1988 Winter,Speed Skating,Speed Skating Women's 500 metres,Winter,Calgary,,


In [180]:
# First rows where the medal lookup found a match (Medal is not null).
full_dataset[full_dataset["Medal"].notnull()].head()

Unnamed: 0,id_athlete,Name,Sex,Age,Height,Weight,NOC,id_event,id_games,Games,Sport,Event,Season,City,athlete_id,Medal
3,4,Edgar Lindenau Aabye,M,34.0,,,DEN,127,1,1900 Summer,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Summer,Paris,4.0,Gold
20,15,Arvo Ossian Aaltonen,M,30.0,,,FIN,672,6,1920 Summer,Swimming,Swimming Men's 200 metres Breaststroke,Summer,Antwerpen,15.0,Bronze
22,16,Juhamatti Tapio Aaltonen,M,28.0,184.0,85.0,FIN,5800,49,2014 Winter,Ice Hockey,Ice Hockey Men's Ice Hockey,Winter,Sochi,16.0,Bronze
23,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,FIN,1347,15,1948 Summer,Gymnastics,Gymnastics Men's Individual All-Around,Summer,London,17.0,Bronze
28,20,Kjetil Andr Aamodt,M,22.0,176.0,85.0,NOR,3993,39,1994 Winter,Alpine Skiing,Alpine Skiing Men's Downhill,Winter,Lillehammer,20.0,Silver


In [181]:
# Drop every key column (any column whose name contains "id") now that the joins are done.
# .copy() makes the result an independent frame, so overwriting or adding columns on it
# later does not raise pandas' SettingWithCopyWarning.
full_dataset = full_dataset[
    [column for column in full_dataset.columns if "id" not in column]
].copy()
full_dataset.head()

Unnamed: 0,Name,Sex,Age,Height,Weight,NOC,Games,Sport,Event,Season,City,Medal
0,A Dijiang,M,24.0,180.0,80.0,CHN,1992 Summer,Basketball,Basketball Men's Basketball,Summer,Barcelona,
1,A Lamusi,M,23.0,170.0,60.0,CHN,2012 Summer,Judo,Judo Men's Extra-Lightweight,Summer,London,
2,Gunnar Nielsen Aaby,M,24.0,,,DEN,1920 Summer,Football,Football Men's Football,Summer,Antwerpen,
3,Edgar Lindenau Aabye,M,34.0,,,DEN,1900 Summer,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Summer,Paris,Gold
4,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,1988 Winter,Speed Skating,Speed Skating Women's 500 metres,Winter,Calgary,


## NVL
`select nvl(column, value_to_fill) from table`

In [182]:
# NVL equivalent: replace missing medals with an explicit label.
# assign() returns a new frame instead of writing into the existing one, which avoids
# SettingWithCopyWarning when full_dataset was itself derived by selecting columns.
full_dataset = full_dataset.assign(Medal=full_dataset["Medal"].fillna("No Medal"))

In [183]:
full_dataset.head()

Unnamed: 0,Name,Sex,Age,Height,Weight,NOC,Games,Sport,Event,Season,City,Medal
0,A Dijiang,M,24.0,180.0,80.0,CHN,1992 Summer,Basketball,Basketball Men's Basketball,Summer,Barcelona,No Medal
1,A Lamusi,M,23.0,170.0,60.0,CHN,2012 Summer,Judo,Judo Men's Extra-Lightweight,Summer,London,No Medal
2,Gunnar Nielsen Aaby,M,24.0,,,DEN,1920 Summer,Football,Football Men's Football,Summer,Antwerpen,No Medal
3,Edgar Lindenau Aabye,M,34.0,,,DEN,1900 Summer,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Summer,Paris,Gold
4,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,1988 Winter,Speed Skating,Speed Skating Women's 500 metres,Winter,Calgary,No Medal


## SELECT WHERE

`select columns from table where condition`

In [186]:
# WHERE with two conditions. The mask is built from full_dataset's own columns: a mask
# taken from another frame (athletes) only selects the right rows as long as both frames
# happen to share exactly the same index.
full_dataset.loc[(full_dataset["Sex"] == "F") & (full_dataset["NOC"] == "PER")]

Unnamed: 0,Name,Sex,Age,Height,Weight,NOC,Games,Sport,Event,Season,City,Medal
2047,Fiorella Ata Junek,F,23.0,170.0,65.0,PER,2000 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Sydney,No Medal
6840,Wilma Yanet Arizapana Yucra,F,29.0,164.0,54.0,PER,2012 Summer,Athletics,Athletics Women's Marathon,Summer,London,No Medal
7379,Olga Asato Hichiva,F,19.0,166.0,65.0,PER,1968 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Mexico City,No Medal
8418,Mara Pia Ayora,F,18.0,178.0,68.0,PER,1980 Summer,Swimming,Swimming Women's 400 metres Freestyle,Summer,Moskva,No Medal
9728,E. Gladys Baldwin Lopez (-de Seminario-),F,31.0,167.0,67.0,PER,1968 Summer,Shooting,"Shooting Mixed Small-Bore Rifle, Prone, 50 metres",Summer,Mexico City,No Medal
...,...,...,...,...,...,...,...,...,...,...,...,...
173780,Norma Velarde Alvarez,F,21.0,169.0,74.0,PER,1968 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Mexico City,No Medal
173796,Claudia Silvana Velsquez Ponzoni,F,16.0,,,PER,1992 Summer,Swimming,Swimming Women's 100 metres Breaststroke,Summer,Barcelona,No Medal
174952,Mara Luisa Vilca Alzola,F,24.0,164.0,58.0,PER,1972 Summer,Athletics,Athletics Women's 100 metres,Summer,Munich,No Medal
185060,Yulissa Noelia Zamudio Orl,F,20.0,185.0,75.0,PER,1996 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Atlanta,No Medal


`select column from table where column like '%pattern%'`

*Winklevoss* 

In [187]:
# LIKE '%Winklevoss%'. The mask comes from the frame being filtered (not from athletes),
# and na=False treats missing names as "no match" so the mask stays purely boolean.
full_dataset.loc[full_dataset["Name"].str.contains("Winklevoss", na=False)]

Unnamed: 0,Name,Sex,Age,Height,Weight,NOC,Games,Sport,Event,Season,City,Medal
181082,Cameron Howard Winklevoss,M,26.0,196.0,96.0,USA,2008 Summer,Rowing,Rowing Men's Coxless Pairs,Summer,Beijing,No Medal
181083,Tyler Howard Winklevoss,M,26.0,196.0,95.0,USA,2008 Summer,Rowing,Rowing Men's Coxless Pairs,Summer,Beijing,No Medal


In [188]:
# Case-insensitive LIKE '%montoya%'. case=False replaces the manual .str.lower() step;
# the mask comes from the frame being filtered, and na=False treats missing names as
# "no match".
full_dataset.loc[full_dataset["Name"].str.contains("montoya", case=False, na=False)]

Unnamed: 0,Name,Sex,Age,Height,Weight,NOC,Games,Sport,Event,Season,City,Medal
65447,Sergio Luis Henao Montoya,M,24.0,170.0,61.0,COL,2012 Summer,Cycling,"Cycling Men's Road Race, Individual",Summer,London,No Medal
65448,Sergio Luis Henao Montoya,M,28.0,170.0,61.0,COL,2016 Summer,Cycling,"Cycling Men's Road Race, Individual",Summer,Rio de Janeiro,No Medal
66223,Armando Herrera Montoya,M,24.0,170.0,69.0,MEX,1960 Summer,Basketball,Basketball Men's Basketball,Summer,Roma,No Medal
66224,Armando Herrera Montoya,M,28.0,170.0,69.0,MEX,1964 Summer,Basketball,Basketball Men's Basketball,Summer,Tokyo,No Medal
92573,Jos Luis Laverdeza Montoya,M,19.0,167.0,67.0,CUB,1980 Summer,Fencing,"Fencing Men's Sabre, Individual",Summer,Moskva,No Medal
107602,Luis Medina Montoya,M,24.0,176.0,63.0,CUB,1976 Summer,Athletics,Athletics Men's 800 metres,Summer,Montreal,No Medal
112455,Alejandro Montoya Vera,M,20.0,169.0,74.0,CUB,1972 Summer,Boxing,Boxing Men's Middleweight,Summer,Munich,No Medal
112456,Ana Mara Montoya Prophater,F,20.0,169.0,62.0,COL,2012 Summer,Football,Football Women's Football,Summer,London,No Medal
112457,Daniela Montoya Quiroz,F,21.0,158.0,55.0,COL,2012 Summer,Football,Football Women's Football,Summer,London,No Medal
112458,Elmer Roberto Montoya Meza,M,22.0,175.0,72.0,HON,2000 Summer,Football,Football Men's Football,Summer,Sydney,No Medal


In [189]:
# Same filter with the conditions in the other order. The mask is built from
# full_dataset itself rather than from the intermediate athletes_by_event frame, so it
# does not depend on the two frames sharing the same index.
full_dataset.loc[(full_dataset["NOC"] == "PER") & (full_dataset["Sex"] == "F")]

Unnamed: 0,Name,Sex,Age,Height,Weight,NOC,Games,Sport,Event,Season,City,Medal
2047,Fiorella Ata Junek,F,23.0,170.0,65.0,PER,2000 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Sydney,No Medal
6840,Wilma Yanet Arizapana Yucra,F,29.0,164.0,54.0,PER,2012 Summer,Athletics,Athletics Women's Marathon,Summer,London,No Medal
7379,Olga Asato Hichiva,F,19.0,166.0,65.0,PER,1968 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Mexico City,No Medal
8418,Mara Pia Ayora,F,18.0,178.0,68.0,PER,1980 Summer,Swimming,Swimming Women's 400 metres Freestyle,Summer,Moskva,No Medal
9728,E. Gladys Baldwin Lopez (-de Seminario-),F,31.0,167.0,67.0,PER,1968 Summer,Shooting,"Shooting Mixed Small-Bore Rifle, Prone, 50 metres",Summer,Mexico City,No Medal
...,...,...,...,...,...,...,...,...,...,...,...,...
173780,Norma Velarde Alvarez,F,21.0,169.0,74.0,PER,1968 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Mexico City,No Medal
173796,Claudia Silvana Velsquez Ponzoni,F,16.0,,,PER,1992 Summer,Swimming,Swimming Women's 100 metres Breaststroke,Summer,Barcelona,No Medal
174952,Mara Luisa Vilca Alzola,F,24.0,164.0,58.0,PER,1972 Summer,Athletics,Athletics Women's 100 metres,Summer,Munich,No Medal
185060,Yulissa Noelia Zamudio Orl,F,20.0,185.0,75.0,PER,1996 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Atlanta,No Medal


In [190]:
# Women who competed for Peru, then the number of rows per athlete, highest count first.
athletes_PER = full_dataset[(full_dataset["Sex"] == "F") & (full_dataset["NOC"] == "PER")]
(
    athletes_PER[["Name", "Games"]]
    .groupby("Name", as_index=False)
    .count()
    .sort_values(by="Games", ascending=False)
)

Unnamed: 0,Name,Games
71,Natalia Mara Mlaga Dibos,4
11,Cecilia Roxana Tait Villacorta,3
82,Santa Ins Melchor Huiza,3
50,Mara Cecilia del Risco,3
79,Rosa Gisella Garca Rivas,3
...,...,...
39,Kimberly Garca Len,1
38,"Katherine ""Kathy"" Horny",1
37,Karin Brandes,1
36,Karen Horning,1


In [191]:
# All rows for one athlete. The equality mask is taken from full_dataset's own Name
# column instead of from the separate athletes frame.
full_dataset.loc[full_dataset["Name"] == "Natalia Mara Mlaga Dibos"]

Unnamed: 0,Name,Sex,Age,Height,Weight,NOC,Games,Sport,Event,Season,City,Medal
101868,Natalia Mara Mlaga Dibos,F,16.0,170.0,59.0,PER,1980 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Moskva,No Medal
101869,Natalia Mara Mlaga Dibos,F,20.0,170.0,59.0,PER,1984 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Los Angeles,No Medal
101870,Natalia Mara Mlaga Dibos,F,24.0,170.0,59.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
101871,Natalia Mara Mlaga Dibos,F,36.0,170.0,59.0,PER,2000 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Sydney,No Medal


In [193]:
# Peru's volleyball roster at the 1988 Summer Games: three conditions combined with AND.
full_dataset[
    (full_dataset["Games"] == "1988 Summer")
    & (full_dataset["Sport"] == "Volleyball")
    & (full_dataset["NOC"] == "PER")
]

Unnamed: 0,Name,Sex,Age,Height,Weight,NOC,Games,Sport,Event,Season,City,Medal
26619,Luisa Haydee Cervera Cevedon,F,24.0,173.0,70.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
35983,Alejandra de la Guerra,F,20.0,173.0,59.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
46458,Denisse Fajardo Garca,F,24.0,171.0,62.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
52523,Miriam Gallardo,F,20.0,168.0,57.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
53348,Rosa Gisella Garca Rivas,F,24.0,175.0,69.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
65911,Sonia Isabel Heredia,F,24.0,175.0,69.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
68832,"Katherine ""Kathy"" Horny",F,18.0,186.0,76.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
101870,Natalia Mara Mlaga Dibos,F,24.0,170.0,59.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
128653,"Gabriela Lourdes ""Gaby"" Prez del Solar Cuculiza",F,20.0,194.0,72.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver
162855,Cecilia Roxana Tait Villacorta,F,26.0,182.0,70.0,PER,1988 Summer,Volleyball,Volleyball Women's Volleyball,Summer,Seoul,Silver


## SELECT DISTINCT

`select distinct column from table`

In [194]:
full_dataset.head()

Unnamed: 0,Name,Sex,Age,Height,Weight,NOC,Games,Sport,Event,Season,City,Medal
0,A Dijiang,M,24.0,180.0,80.0,CHN,1992 Summer,Basketball,Basketball Men's Basketball,Summer,Barcelona,No Medal
1,A Lamusi,M,23.0,170.0,60.0,CHN,2012 Summer,Judo,Judo Men's Extra-Lightweight,Summer,London,No Medal
2,Gunnar Nielsen Aaby,M,24.0,,,DEN,1920 Summer,Football,Football Men's Football,Summer,Antwerpen,No Medal
3,Edgar Lindenau Aabye,M,34.0,,,DEN,1900 Summer,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Summer,Paris,Gold
4,Christine Jacoba Aaftink,F,21.0,185.0,82.0,NED,1988 Winter,Speed Skating,Speed Skating Women's 500 metres,Winter,Calgary,No Medal


In [195]:
full_dataset["City"].unique()

array(['Barcelona', 'London', 'Antwerpen', 'Paris', 'Calgary',
       'Albertville', 'Lillehammer', 'Los Angeles', 'Salt Lake City',
       'Helsinki', 'Lake Placid', 'Sydney', 'Atlanta', 'Stockholm',
       'Sochi', 'Nagano', 'Torino', 'Beijing', 'Rio de Janeiro', 'Athina',
       'Squaw Valley', 'Innsbruck', 'Sarajevo', 'Mexico City', 'Munich',
       'Seoul', 'Berlin', 'Oslo', "Cortina d'Ampezzo", 'Melbourne',
       'Roma', 'Amsterdam', 'Montreal', 'Moskva', 'Tokyo', 'Vancouver',
       'Grenoble', 'Sapporo', 'Chamonix', 'St. Louis', 'Sankt Moritz',
       'Garmisch-Partenkirchen'], dtype=object)

In [196]:
full_dataset.get("City").drop_duplicates()

0                   Barcelona
1                      London
2                   Antwerpen
3                       Paris
4                     Calgary
5                 Albertville
6                 Lillehammer
11                Los Angeles
12             Salt Lake City
13                   Helsinki
14                Lake Placid
15                     Sydney
16                    Atlanta
19                  Stockholm
22                      Sochi
29                     Nagano
31                     Torino
32                    Beijing
33             Rio de Janeiro
35                     Athina
36               Squaw Valley
37                  Innsbruck
39                   Sarajevo
41                Mexico City
42                     Munich
43                      Seoul
46                     Berlin
57                       Oslo
58          Cortina d'Ampezzo
69                  Melbourne
70                       Roma
73                  Amsterdam
80                   Montreal
119       

In [197]:
full_dataset["City"].nunique()

42

## GROUP BY COUNT

`select column, count(*)
from table
group by column`

In [200]:
# Games hosted per city: de-duplicate the (City, Games) pairs first so each edition is
# counted once, then count the remaining rows per city.
(
    full_dataset[["City", "Games"]]
    .drop_duplicates()
    .groupby("City")
    .count()
)

Unnamed: 0_level_0,Games
City,Unnamed: 1_level_1
Albertville,1
Amsterdam,1
Antwerpen,1
Athina,3
Atlanta,1
Barcelona,1
Beijing,1
Berlin,1
Calgary,1
Chamonix,1


In [201]:
# Same per-city count of distinct (City, Games) pairs, stored for sorting afterwards.
df_count = (
    full_dataset[["City", "Games"]]
    .drop_duplicates()
    .groupby("City")
    .count()
)

df_count.head()

Unnamed: 0_level_0,Games
City,Unnamed: 1_level_1
Albertville,1
Amsterdam,1
Antwerpen,1
Athina,3
Atlanta,1


In [203]:
df_count.sort_values(by = "Games", ascending = False).head()

Unnamed: 0_level_0,Games
City,Unnamed: 1_level_1
Athina,3
London,3
Paris,2
Los Angeles,2
Lake Placid,2


In [204]:
games.loc[games["City"] == "London"]

Unnamed: 0,id_games,Games,Year,Season,City
4,4,1908 Summer,1908,Summer,London
15,15,1948 Summer,1948,Summer,London
48,48,2012 Summer,2012,Summer,London


In [205]:
games.loc[games["City"] == "Paris"]

Unnamed: 0,id_games,Games,Year,Season,City
1,1,1900 Summer,1900,Summer,Paris
7,7,1924 Summer,1924,Summer,Paris


## Union

In [217]:
os.listdir("data/olympics/")

['split2.csv',
 'split1.csv',
 '.DS_Store',
 'athletes.csv',
 'medalist.csv',
 'games.csv',
 'athlete_events.csv',
 'events.csv']

In [218]:
# UNION ALL equivalent: `select * from split1 union all select * from split2`.
df1 = pandas.read_csv("data/olympics/split1.csv")
df2 = pandas.read_csv("data/olympics/split2.csv")

# Stack the two splits row-wise. ignore_index renumbers the rows 0..n-1 instead of
# repeating each file's own index. Chain .drop_duplicates() for SQL's plain UNION.
df_union = pandas.concat([df1, df2], ignore_index = True)
df_union.head()

# Contact

Manuel Montoya 
* Mail: manuel.montoya@pucp.edu.pe
* Linkedin: https://www.linkedin.com/in/manuel-montoya-gamio/

Numpy pain: https://www.reddit.com/r/ProgrammerHumor/comments/aouyj1/when_you_program_python_for_a_year_and_realize/ 