In [25]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

from IPython.core.display import display, HTML

In [26]:
# Load the athlete-level events table from the local CSV file.

PATH_1 = "./data/athlete_events.csv"

df = pd.read_csv(PATH_1)
# Preview the first rows to check the columns were parsed as expected.
df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [27]:
# preprocessing
# Keep only the Summer Games rows. `.copy()` makes the result an independent
# frame, so the column assignment below does not act on a slice of the original
# (which would raise pandas' SettingWithCopyWarning).
df = df[df["Season"] == "Summer"].copy()

# Replace the medal label (missing when no medal was won) with a boolean
# "won a medal" flag. The dtype guard keeps the cell safe to re-run: calling
# `notna()` on the already-boolean column would mark every row as True.
if df["Medal"].dtype != bool:
    df["Medal"] = df["Medal"].notna()

In [28]:
def to_flourish(data, var):
    """Build a wide table of cumulative medal counts per value of ``var``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``var``, ``"Year"`` and ``"Medal"``.
        ``"Medal"`` is summed, so it is expected to be boolean or numeric.
    var : str
        Name of the column whose distinct values become the rows of the result.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``var`` (sorted) and one column per
        distinct year (sorted). Each cell is the running total (float) of
        ``"Medal"`` for that row up to and including that year; years without
        any entry for a row carry the previous total forward (0 before the
        first entry).
    """
    # Select "Medal" before aggregating so unrelated (string) columns are never summed.
    medals_per_year = data.groupby([var, "Year"])["Medal"].sum()

    years = sorted(data["Year"].unique())
    categories = sorted(data[var].unique())

    # Rows = years, columns = categories. Missing (category, year) pairs count
    # as 0 medals, so the cumulative sum simply carries the last total forward.
    running_totals = (
        medals_per_year
        .unstack(level=0, fill_value=0)
        .reindex(index=years, columns=categories, fill_value=0)
        .cumsum()
        .astype(float)
    )

    # Drop the axis names left by groupby/unstack so the exported CSV header
    # does not gain extra labels.
    running_totals.index.name = None
    running_totals.columns.name = None

    return running_totals.T

In [29]:
# Cumulative medal counts with one row per NOC and one column per year.
# NOTE(review): despite the "top15" name, no top-15 filtering happens here;
# the frame holds every NOC present in `df`.
df_top15 = to_flourish(df, "NOC")

In [30]:
# Show the resulting table (the notebook renders a truncated view of large frames).
df_top15

Unnamed: 0,1896,1900,1904,1906,1908,1912,1920,1924,1928,1932,...,1980,1984,1988,1992,1996,2000,2004,2008,2012,2016
AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0
AHO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ALB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,2.0,4.0,7.0,12.0,12.0,14.0,15.0,17.0
AND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YMD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YUG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,14.0,14.0,...,233.0,320.0,383.0,383.0,383.0,383.0,383.0,383.0,383.0,383.0
ZAM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0


In [11]:
# What remains is to add a column with the URL of each country's flag image.


# Lookup table that maps each NOC code to a region (country) name.

PATH_2 = "./data/noc_regions.csv"

noc_reg = pd.read_csv(PATH_2)
noc_reg.head()

Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,


In [12]:
# function to get one url given a NOC

def get_link_flag(NOC):
    """Return the flag image address (without scheme) for an NOC code.

    The NOC is looked up in ``noc_reg`` to get its region name, the Portuguese
    Wikipedia file page ``Ficheiro:Flag_of_<region>.svg`` is downloaded, and the
    ``src`` of the image inside the ``fullImageLink`` div is returned.

    Parameters
    ----------
    NOC : str
        NOC code as listed in ``noc_reg["NOC"]``.

    Returns
    -------
    str
        Image address with any leading ``//`` / ``http(s)://`` removed and cut
        right after the first ``.png``; callers prepend ``"https://"``.

    Raises
    ------
    IndexError
        If the NOC is not in ``noc_reg``, its region is not a string, or the
        page contains no ``fullImageLink`` image. Callers rely on this
        exception type to detect "flag not found".
    requests.exceptions.RequestException
        On network failures, including the request timing out.
    """
    # `.values[0]` raises IndexError when the NOC is unknown.
    countr = noc_reg[noc_reg["NOC"] == NOC]["region"].values[0]  # The name of the country
    if not isinstance(countr, str):
        # A missing region cannot name a flag page; skip the pointless request.
        raise IndexError(f"no region name available for NOC {NOC!r}")

    url = f"https://pt.wikipedia.org/wiki/Ficheiro:Flag_of_{countr}.svg"  # the wikipedia file page of the flag
    # Timeout so that a stalled connection cannot hang the whole scraping loop.
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    CLASS_ = "fullImageLink"  # The class of the div that wraps the full-size image
    container = soup.find("div", {"class": CLASS_})
    image = container.find("img") if container is not None else None
    if image is None or not image.get("src"):
        raise IndexError(f"no flag image found for NOC {NOC!r} at {url}")

    # Read the attribute through the parser instead of slicing the raw HTML at
    # fixed offsets, then strip the scheme because the caller adds "https://".
    src = image["src"]
    for prefix in ("https://", "http://", "//"):
        if src.startswith(prefix):
            src = src[len(prefix):]
            break

    # Keep everything up to and including the first ".png"; if there is none,
    # return the address unchanged rather than a truncated fragment.
    cut = src.find(".png")
    return src[:cut + 4] if cut != -1 else src

In [13]:
# Scrape a flag URL for every NOC listed in noc_reg.
# `found` receives exactly one entry per NOC, in noc_reg order: the full URL on
# success, or the NOC code itself as a placeholder when the lookup fails.
# `not_found` records the NOCs that still need a URL.

not_found = []
found = []

for noc_code in noc_reg["NOC"].values:
    try:
        flag_address = get_link_flag(noc_code)
    except IndexError:
        not_found.append(noc_code)
        found.append(noc_code)
    else:
        found.append("https://" + flag_address)

In [14]:
# Add a FLAGS column holding the scraped flag URL (or placeholder) for each row.
# NOTE(review): `found` is a plain list, so values are assigned by position; this
# presumably relies on noc_reg listing NOCs in the same order as df_top15's index - confirm.
df_top15["FLAGS"] = found
df_top15.head()

Unnamed: 0,1896,1900,1904,1906,1908,1912,1920,1924,1928,1932,...,1984,1988,1992,1996,2000,2004,2008,2012,2016,FLAGS
AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,https://upload.wikimedia.org/wikipedia/commons...
AHO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,https://upload.wikimedia.org/wikipedia/commons...
ALB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://upload.wikimedia.org/wikipedia/commons...
ALG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,4.0,7.0,12.0,12.0,14.0,15.0,17.0,https://upload.wikimedia.org/wikipedia/commons...
AND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://upload.wikimedia.org/wikipedia/commons...


In [15]:
# Fill in, by hand, the flags that could not be scraped.

# Replacement image URLs, paired with `not_found` BY POSITION: the n-th URL is
# assigned to the n-th NOC that was appended to `not_found`.
URLS = ["https://upload.wikimedia.org/wikipedia/commons/thumb/d/de/Flag_of_Bolivia_%28state%29.svg/800px-Flag_of_Bolivia_%28state%29.svg.png",
        "https://upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Flag_of_the_Democratic_Republic_of_the_Congo.svg/800px-Flag_of_the_Democratic_Republic_of_the_Congo.svg.png",
        "https://upload.wikimedia.org/wikipedia/commons/0/08/Flag_of_the_Individual_Olympic_Athletes_%282000_Summer_Olympics%29.svg",
        "https://upload.wikimedia.org/wikipedia/commons/thumb/f/f8/Flag_of_the_United_States_Virgin_Islands.svg/1275px-Flag_of_the_United_States_Virgin_Islands.svg.png",
        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/42/Flag_of_the_British_Virgin_Islands.svg/1200px-Flag_of_the_British_Virgin_Islands.svg.png",
        "https://compote.slate.com/images/57d479a0-63d2-4462-97b0-1275e4efefe1.jpg",
        "https://upload.wikimedia.org/wikipedia/commons/f/fe/Flag_of_Saint_Kitts_and_Nevis.svg",
        "https://www.countryflags.com/wp-content/uploads/trinidad-and-tobago-flag-png-large.png",
        "https://upload.wikimedia.org/wikipedia/commons/thumb/3/38/Flag_of_Tuvalu.svg/1200px-Flag_of_Tuvalu.svg.png",
        "https://upload.wikimedia.org/wikipedia/commons/2/2e/Unknown_flag_-_European_version.png",
        "https://upload.wikimedia.org/wikipedia/commons/thumb/6/6d/Flag_of_Saint_Vincent_and_the_Grenadines.svg/1200px-Flag_of_Saint_Vincent_and_the_Grenadines.svg.png",
        "https://upload.wikimedia.org/wikipedia/commons/thumb/1/15/Flag_of_the_West_Indies_Federation_%281958%E2%80%931962%29.svg/1200px-Flag_of_the_West_Indies_Federation_%281958%E2%80%931962%29.svg.png"]

# Because the pairing is positional, a different number of scrape failures
# would attach flags to the wrong countries (or leave placeholders behind).
# Fail loudly instead of exporting wrong data.
assert len(not_found) == len(URLS), (
    f"expected {len(URLS)} NOCs without a scraped flag, "
    f"got {len(not_found)}: {not_found}"
)

for noc_code, url in zip(not_found, URLS):
    df_top15.loc[noc_code, "FLAGS"] = url

# These two overrides are needed because the region name used for the lookup was:
#   - "Russia" for URS, which should be the Soviet Union (URSS);
#   - "Germany" for GDR, which should be East Germany.

df_top15.loc["GDR","FLAGS"] = "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a1/Flag_of_East_Germany.svg/1280px-Flag_of_East_Germany.svg.png"
df_top15.loc["URS","FLAGS"] = "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/Flag_of_the_Soviet_Union.svg/1200px-Flag_of_the_Soviet_Union.svg.png"

In [16]:
# Export the table (cumulative counts plus FLAGS column) as CSV for upload to Flourish.
PATH_3 = "./data/df_for_flourish_top15.csv"

df_top15.to_csv(PATH_3)

In [None]:
# TODO: add a link to the Flourish visualisation and a brief description of it.

In [21]:
# After building the chart in Flourish: embed the published visualisation
# (id 6409642) in the notebook through its iframe snippet.

flourish_url = "<iframe src='https://flo.uri.sh/visualisation/6409642/embed' title='Interactive or visual content' class='flourish-embed-iframe' frameborder='0' scrolling='no' style='width:100%;height:600px;' sandbox='allow-same-origin allow-forms allow-scripts allow-downloads allow-popups allow-popups-to-escape-sandbox allow-top-navigation-by-user-activation'></iframe><div style='width:100%!;margin-top:4px!important;text-align:right!important;'><a class='flourish-credit' href='https://public.flourish.studio/visualisation/6409642/?utm_source=embed&utm_campaign=visualisation/6409642' target='_top' style='text-decoration:none!important'><img alt='Made with Flourish' src='https://public.flourish.studio/resources/made_with_flourish.svg' style='width:105px!important;height:16px!important;border:none!important;margin:0!important;'> </a></div>"

# Render the raw HTML snippet as rich output.
display(HTML(flourish_url))

In [38]:
# Per-country subsets of the events table: Brazil (BRA) and the United States (USA).
br = df[df["NOC"] == "BRA"]
us = df[df["NOC"] == "USA"]

In [39]:
# Cumulative medal counts per sport (rows) and year (columns) for each country.
br_final = to_flourish(br, "Sport")
us_final = to_flourish(us, "Sport")

In [40]:
# Export the Brazil per-sport table as CSV.
PATH_4 = "./data/df_for_flourish_BR.csv"

br_final.to_csv(PATH_4)

In [41]:
# Export the United States per-sport table as CSV.
PATH_5 = "./data/df_for_flourish_US.csv"

us_final.to_csv(PATH_5)

In [45]:
# Embed the published Flourish visualisation with id 6500927
# (presumably the Brazil chart, given the variable name - confirm).
flourish_url_br = "<iframe src='https://flo.uri.sh/visualisation/6500927/embed' title='Interactive or visual content' class='flourish-embed-iframe' frameborder='0' scrolling='no' style='width:100%;height:600px;' sandbox='allow-same-origin allow-forms allow-scripts allow-downloads allow-popups allow-popups-to-escape-sandbox allow-top-navigation-by-user-activation'></iframe><div style='width:100%!;margin-top:4px!important;text-align:right!important;'><a class='flourish-credit' href='https://public.flourish.studio/visualisation/6500927/?utm_source=embed&utm_campaign=visualisation/6500927' target='_top' style='text-decoration:none!important'><img alt='Made with Flourish' src='https://public.flourish.studio/resources/made_with_flourish.svg' style='width:105px!important;height:16px!important;border:none!important;margin:0!important;'> </a></div>"

# Render the raw HTML snippet as rich output.
display(HTML(flourish_url_br))

In [43]:
# Embed the published Flourish visualisation with id 6502031
# (presumably the United States chart, given the variable name - confirm).
flourish_url_us = "<iframe src='https://flo.uri.sh/visualisation/6502031/embed' title='Interactive or visual content' class='flourish-embed-iframe' frameborder='0' scrolling='no' style='width:100%;height:600px;' sandbox='allow-same-origin allow-forms allow-scripts allow-downloads allow-popups allow-popups-to-escape-sandbox allow-top-navigation-by-user-activation'></iframe><div style='width:100%!;margin-top:4px!important;text-align:right!important;'><a class='flourish-credit' href='https://public.flourish.studio/visualisation/6502031/?utm_source=embed&utm_campaign=visualisation/6502031' target='_top' style='text-decoration:none!important'><img alt='Made with Flourish' src='https://public.flourish.studio/resources/made_with_flourish.svg' style='width:105px!important;height:16px!important;border:none!important;margin:0!important;'> </a></div>"

# Render the raw HTML snippet as rich output.
display(HTML(flourish_url_us))