In [18]:
import math
import matplotlib
import matplotlib.ticker as mtick
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
#import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pd.options.display.max_rows = 200

In [19]:
# For exporting SVGs with editable text
new_rc_params = {'text.usetex': False,


    "svg.fonttype": 'none'


}

matplotlib.rcParams.update(new_rc_params)

# COLORS
light_blue = "#00a5ff"
medium_blue = "#0064b4"
dark_blue = "#002d5a"   
chart_gray = "#cbd2d8"
highlight_gray = "#7f8891"
bg_gray = "#f1f3f5"
orange = '#eb6e14'

In [20]:
df = pd.read_csv("all-competitors.csv")

In [21]:
# Removes Art events
df = df[df.abbrv!='ART'].reset_index()

In [22]:
# Adds mising 2020 events
missing_2020 = pd.read_csv("missing-events-at-080821.csv")

In [23]:
# Joins both
df = pd.concat([df, missing_2020])

In [24]:
# Makes everything lowercase to avoid confusion
df["medal"] = df.medal.str.lower()

In [25]:
# Isolates 2020
df_2020 = df[df.year==2020]

#### Best and worst competitor/medal ratio
Ammount of competitors (individual entrants in a single event) / total medal wons by country 

In [26]:
def victory_ratio(df):

    competitor_count = df.groupby("noc").competitor.count().reset_index().rename(columns={"competitor":"competitors"})
    
    medal_count = (df[~df.medal.isna()]
                     .pivot_table(index='noc',
                                  columns='medal',
                                  aggfunc='count')
                     .event
                     .fillna(0)
                     .reset_index())
    
    medal_count["medals"] = medal_count.gold + medal_count.silver + medal_count.bronze

    merged = competitor_count.merge(medal_count, how='left').fillna(0)
    merged["medal_ratio"] = merged.medals / merged.competitors
    merged["fail_ratio"] = 1 - merged.medal_ratio

    
    
    merged['blewed_chances'] = merged['competitors'] - merged['medals']
    
    return merged.sort_values(by=['medal_ratio', 'competitors'], ascending=False)

In [27]:
vr = victory_ratio(df)

This list changes a lot of common perceptions about Olympic powers: countries like Kenya, Azerbaijan and Ehtiopia would end up in the top 10 all-time medal winners, ahead of countries like Germany, Japan, France...

In [28]:
vr.head(20)

Unnamed: 0,noc,competitors,bronze,gold,silver,medals,medal_ratio,fail_ratio,blewed_chances
134,MIX,12,4.0,3.0,2.0,9.0,0.75,0.25,3.0
215,URS,3092,296.0,395.0,319.0,1010.0,0.326649,0.673351,2082.0
77,GDR,1465,127.0,153.0,129.0,409.0,0.279181,0.720819,1056.0
67,EUN,422,29.0,45.0,38.0,112.0,0.265403,0.734597,310.0
217,USA,10908,744.0,1064.0,837.0,2645.0,0.242483,0.757517,8263.0
42,CHN,3157,173.0,262.0,198.0,633.0,0.200507,0.799493,2524.0
169,ROC,355,22.0,20.0,27.0,69.0,0.194366,0.805634,286.0
107,KEN,656,36.0,35.0,42.0,113.0,0.172256,0.827744,543.0
172,RUS,2552,154.0,150.0,129.0,433.0,0.169671,0.830329,2119.0
111,KOS,19,0.0,3.0,0.0,3.0,0.157895,0.842105,16.0


On the other hand, there are many countries that had hundreds of chances of winnng and never made it. The neighbors El Salvador, Honduras and Nicaragua are all among them. Angola, Bolivia, Madacasgar, Seychelles, Myanmar, Papua New Guinea and Antigua & Barbuda too.

In [36]:
vr.sort_values(by='gold', ascending=False).head(10)

Unnamed: 0,noc,competitors,bronze,gold,silver,medals,medal_ratio,fail_ratio,blewed_chances
217,USA,10908,744.0,1064.0,837.0,2645.0,0.242483,0.757517,8263.0
215,URS,3092,296.0,395.0,319.0,1010.0,0.326649,0.673351,2082.0
75,GBR,7722,312.0,288.0,320.0,920.0,0.11914,0.88086,6802.0
42,CHN,3157,173.0,262.0,198.0,633.0,0.200507,0.799493,2524.0
80,GER,5187,281.0,229.0,257.0,767.0,0.14787,0.85213,4420.0
70,FRA,7821,280.0,227.0,252.0,759.0,0.097046,0.902954,7062.0
101,ITA,5605,211.0,216.0,186.0,613.0,0.109367,0.890633,4992.0
91,HUN,4788,173.0,181.0,154.0,508.0,0.106099,0.893901,4280.0
105,JPN,4685,177.0,169.0,150.0,496.0,0.10587,0.89413,4189.0
12,AUS,4767,208.0,163.0,170.0,541.0,0.113489,0.886511,4226.0


In [32]:
vr[vr.medals==0].sort_values(by='medal_ratio', ascending=True).head(20)

Unnamed: 0,noc,competitors,bronze,gold,silver,medals,medal_ratio,fail_ratio,blewed_chances
63,ESA,200,0.0,0.0,0.0,0.0,0.0,1.0,200.0
119,LCA,31,0.0,0.0,0.0,0.0,0.0,1.0,31.0
184,SOL,32,0.0,0.0,0.0,0.0,0.0,1.0,32.0
102,IVB,32,0.0,0.0,0.0,0.0,0.0,1.0,32.0
125,MAL,34,0.0,0.0,0.0,0.0,0.0,1.0,34.0
22,BHU,34,0.0,0.0,0.0,0.0,0.0,1.0,34.0
185,SOM,37,0.0,0.0,0.0,0.0,0.0,1.0,37.0
226,YEM,40,0.0,0.0,0.0,0.0,0.0,1.0,40.0
222,VIN,40,0.0,0.0,0.0,0.0,0.0,1.0,40.0
61,EOR,40,0.0,0.0,0.0,0.0,0.0,1.0,40.0


And what about the 2020 victory ratios?

In [14]:
vr_2020 = victory_ratio(df_2020).head(100).reset_index()

In [40]:
vr_2020
df2020 = vr_2020.sort_values(by='medal_ratio', ascending=False).head(10)
df2020
df2020.to_csv('victory_ratio2020.csv', index = False )


Let's print some emojis to illustrate that.

In [15]:
def emoji_magic(df):
        
    gold = "🥇"
    silver = "🥈"
    bronze = "🥉"
    blew = "❌"
    
    for index, row in df.head(10).iterrows():
        
        print(row.noc, end=' - ')
        print(gold * int(row.gold), end='')
        print(silver * int(row.silver), end='')
        print(bronze * int(row.bronze), end='')
        print(blew * int(row.blewed_chances), end='')
        print()

In [16]:
emoji_magic(vr_2020)

BER - 🥇❌
SMR - 🥈🥉🥉❌❌❌❌
CHN - 🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌
FIJ - 🥇🥉❌❌❌❌❌❌
GEO - 🥇🥇🥈🥈🥈🥈🥈🥉❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌
USA - 🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥇🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥈🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉🥉❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌
ARM - 🥈🥈🥉🥉❌❌❌❌❌❌❌❌❌❌❌❌❌❌❌
CUB - 🥇🥇🥇🥇🥇🥇🥇🥈🥈🥈🥉🥉🥉🥉🥉❌❌❌❌❌❌❌❌❌❌❌❌❌

Argh, too many emojis to make sense of what is going on. Let's try a different approach.

In [16]:
def make_pies(df, nocs, cols=3):
    
    
    # Based on the amount of nations, we compute
    # how many rows we will need, always using ncols
    # columns
    rows = math.ceil(len(nocs) / cols)
    
    # Generates three positions for each row
    positions = []
    for row in range(rows):
        for column in range(1,cols+1):
            positions.append((row+1,column))
            
    
    # Defines marker colors         
    marker_colors = [light_blue, chart_gray]
            
    
    # Create subplots, using 'domain' type for pie charts
    specs = []
    for row in range(rows):
        spec_line = [{'type': 'domain'} for col in range(cols)]
        specs.append(spec_line)
    fig = make_subplots(rows=rows, cols=cols, specs=specs, subplot_titles=nocs)
    fig.update_layout(height=200*rows, width=1920/2)

    
    
    for noc, position in zip(nocs, positions):

        subset = df[df.noc==noc]
        
        assert subset.shape[0] == 1
        
        subset = subset.iloc[0]
        
        row = position[0]
        col = position[1]
            
        fig.add_trace(go.Pie(values=[subset["medal_ratio"], subset["fail_ratio"]], name='noc',
                             marker_colors=marker_colors), row=row, col=col)
        
        

    # Tune layout and hover info
    fig.update_traces(hoverinfo='label+percent+name', textinfo='none')
    fig.update(layout_title_text='',
               layout_showlegend=False)

    fig = go.Figure(fig)
    fig.show()


In [17]:
make_pies(vr_2020, vr_2020.noc.head(9).tolist() + ['JPN', 'GER','ROC'])

NameError: name 'light_blue' is not defined

Cool, but I'd rather make stacked bars, maybe.

In [19]:
def stacked_bars_vr(df, nocs):
    
        fig = go.Figure()
        
        fig.update_layout(height=600, width=620)
        
        subset = df[df.noc.isin(nocs)]
        
        subset = subset.sort_values(by='medal_ratio')
        
        fig.add_trace(go.Bar(
            y=subset.noc,
            x=subset.medal_ratio,
            orientation='h',
            showlegend=False,
            marker=dict(
                color=light_blue,
            )
        ))

        fig.add_trace(go.Bar(
            y=subset.noc,
            x=subset.fail_ratio,
            orientation='h',
            showlegend=False,
            marker=dict(
                color=chart_gray,
            )
        ))
        
        
        fig.update_layout(
            title="",
            xaxis_title="",
            yaxis_title="",
            paper_bgcolor=bg_gray,
            plot_bgcolor=bg_gray,
            margin=dict(pad=30),
        )
        
        fig.update_yaxes(visible=True,
             showgrid=False,
             tickfont=dict(family='Noto Sans', size=12),
             gridcolor=chart_gray,
             zeroline=False,
        )
        

        fig.update_layout(barmode='stack')
        fig.update_xaxes(
                         range=[0, 1],
                         tickformat="%",
                         showgrid=False,
                         tickfont=dict(family='Noto Sans', size=12),
                         gridcolor=chart_gray,
                         zeroline=False,
                         )
        fig.show()
        
        fig.write_image(f"../output/victory-ratio.svg")

In [20]:
stacked_bars_vr(vr_2020, vr_2020.noc.head(10).tolist() + ['JPN'])

#### The ones who lost the most in Tokyo
Which countries blew more chances at getting a medal?

In [21]:
vr_2020.sort_values(by='blewed_chances', ascending=False)

Unnamed: 0,index,noc,competitors,bronze,gold,silver,medals,medal_ratio,fail_ratio,blewed_chances
5,198,USA,503,33.0,38.0,41.0,112.0,0.222664,0.777336,391.0
24,97,JPN,395,16.0,27.0,14.0,57.0,0.144304,0.855696,338.0
46,73,GER,352,16.0,10.0,11.0,37.0,0.105114,0.894886,315.0
11,156,ROC,355,22.0,20.0,27.0,69.0,0.194366,0.805634,286.0
43,65,FRA,313,11.0,10.0,12.0,33.0,0.105431,0.894569,280.0
34,93,ITA,308,19.0,9.0,10.0,38.0,0.123377,0.876623,270.0
23,10,AUS,316,21.0,18.0,7.0,46.0,0.14557,0.85443,270.0
2,38,CHN,345,18.0,38.0,31.0,87.0,0.252174,0.747826,258.0
56,33,CAN,283,12.0,7.0,6.0,25.0,0.088339,0.911661,258.0
8,69,GBR,320,22.0,22.0,21.0,65.0,0.203125,0.796875,255.0


#### Countries that never won medals that got the closest to winning

But what about the countries that never won that got close? Let's take a look at all the 4th and 5th places that could have been the first medals if things have went differently.

In [22]:
almost = df[(df.noc.isin(vr[vr.medals == 0].noc)) &
   (df.position.isin(["4", "5"]))
  ]

In [23]:
almost

Unnamed: 0,index,abbrv,discipline,event,event_id,year,competitor,medal,noc,position
2736,2736.0,WLT,Weightlifting,"Middleweight (≤73 kilograms), Men",19002038,2020,Briken Calja,,ALB,4
2924,2924.0,FTB,Football,"Football, Men",359500,2016,Honduras,,HON,4
22501,23169.0,BOX,Boxing,"Light-Middleweight (≤71 kilograms), Men",26792,1996,Rival Cadeau,,SEY,5
33484,34358.0,FTB,Football,"Football, Men",34224,2004,Mali,,MLI,5
34532,35439.0,WLT,Weightlifting,"Lightweight (≤69 kilograms), Men",354308,2016,Briken Calja,,ALB,5
37425,38355.0,ATH,Athletics,"400 metres Hurdles, Men",19000451,2020,Kyron McMaster,,IVB,4
71935,73157.0,WLT,Weightlifting,"Flyweight (≤52 kilograms), Men",29487,1972,Gyi Aung Maung,,MYA,5
75558,76787.0,WLT,Weightlifting,"Flyweight (≤48 kilograms), Women",30233,2000,Win Kay Thi,,MYA,4
76611,77854.0,WLT,Weightlifting,"Featherweight (≤53 kilograms), Women",30242,2000,Win Swe Swe,,MYA,5
77330,78573.0,WLT,Weightlifting,"Middleweight (≤77 kilograms), Men",30188,2000,Ilirian Suli,,ALB,5


We can add to this list the British Virgin Islands athlete that finished 4th in 2020's Men's 400m hurdles: Kyron McMaster.

And what about the country that won more 4th places ever?

In [46]:
df4000 = df[df.position=="4"].groupby("noc").count().event.sort_values(ascending=False).head(20)
df4000

df4000.to_csv('4thplace.csv', index = True )

Not much to see, to be fair. This ressembles closely the all-time medal table.

#### Nations that spend the most time competing without winning

In [47]:
def elapsed_time(df):

    intervals = []
        
    medal_counts =  df.groupby(['noc', 'year']).count().unstack(fill_value=0).stack().reset_index()
    medal_counts = medal_counts[medal_counts.event > 0] # Removes competitions country wasn't part of
    medal_counts = medal_counts[["noc", "year", "medal"]]
    medal_counts["won_medals"] = np.where(medal_counts.medal > 0 , True, False)
    
    
    #display(medal_counts[medal_counts.noc=='ANG'])

    for noc in medal_counts.noc.unique():
        
        
        beginning = None
        end = None
        last_row = False
        subset = medal_counts[medal_counts.noc==noc].reset_index()
        
        # If nation won a medal at any point in time, we want to ignore it
        if subset.medal.sum() > 0:
            continue
        
        for i, (index, row) in enumerate(subset.iterrows()):
            
            if i == subset.shape[0] - 1:
                last_row = True
                     
            # Start counting at the first games with no medals
            if row.won_medals is False and beginning is None:
                beginning = row.year
                
            # Stop at the first game with medals after those
            if row.won_medals is True and beginning is not None:
                end = row.year
            
            # Add after the end of the interval
            if beginning is not None and end is not None:
                interval = { "noc": noc, "beginning": beginning, "end": end, "span": end - beginning}
                intervals.append(interval)
                beginning = None
                end = None
                
            # If the country never won, add it too
            if beginning is not None and end is None and last_row is True:
                end = 2024
                interval = { "noc": noc, "beginning": beginning, "end": end, "span": end - beginning}
                intervals.append(interval)
                beginning = None
                end = None

            
        
    return intervals

In [48]:
et = pd.DataFrame(elapsed_time(df)).sort_values(by='span', ascending=False)

In [49]:
# Removes some countries that now compete onder other names
# or that competed as part of other nations (eg USSR)
et = et[~et.noc.isin(['CHI ', 'BOH', 'MIX', 'SRB', 'UNK', 'SAA', 'RUS', 'LTU', 'NBO', 'RHO', 'CHN'])]

In [50]:
et = et[et.end==2024]

In [52]:
et = et.head(15)
et

Unnamed: 0,noc,beginning,end,span
48,MON,1920,2024,104
47,MLT,1928,2024,96
40,LIE,1936,2024,88
11,BOL,1936,2024,88
50,MYA,1948,2024,76
74,VNM,1952,2024,72
37,LBR,1956,2024,68
42,MAL,1956,2024,68
66,SOM,1960,2024,64
53,NEP,1964,2024,60


In [30]:
# Plots!
def plot_timespan(df, xrange=[1896, 2024]):

    # Creates the figure
    fig = go.Figure()
    
    df = df.sort_values(by='span')
    
    height = 600
    width = 620
    fig.update_layout(height=height, width=800)

    
    # Add lollipop 'bars'
    # point a -> % medals
    # point b -> % athletes
    x_vals = []
    
    nocs = df.noc
    beginnings = df.beginning
    ends = df.end
    for noc,  beginning, end in zip(nocs, beginnings, ends):
        fig.add_trace(go.Scatter(y=[noc, noc],
                x=[beginning, end],
                mode='lines',
                line=dict(color=light_blue,
                         width=3),
                showlegend=False))
        
        

            
    # Start
    fig.add_trace(go.Scatter(y=df.noc,
                    x=df.beginning,
                    mode='markers+text',
                    textposition="middle left",
                    text=df.beginning,
                    line_shape='linear',
                    marker=dict(size=8, 
                                color=light_blue,
                                line=dict(width=2)),
                    showlegend=False))
    
    
    # End
    fig.add_trace(go.Scatter(y=df[df.end!=2024].noc,
                    x=df[df.end!=2024].end,
                    mode='markers',
                    line_shape='linear',
                    marker=dict(size=8, 
                                color=light_blue,
                                line=dict(width=2)
                    ),
                    showlegend=False))
    
    
    # Adds a 'now' line
    y = [df.iloc[0].noc, df.iloc[-1].noc]
    fig.add_trace(go.Scatter(y=y, 
                             x=[2020,2020],
                             mode='lines',
                             showlegend=False,
                             marker_color=highlight_gray,
                             line=dict(width=3)))
    
        
    
    fig.update_yaxes(visible=True, 
         showgrid=False,
         gridcolor=chart_gray,
         zeroline=False,
    )




    fig.update_xaxes(
        range=xrange,
        showgrid=True,
        tickvals=[1896, 2020],
#         tickvals=[1896, 1900, 1904, 1908, 1912, 1916, 1920, 1924, 1928, 1932, 
#                    1936, 1940, 1944, 1948, 1952, 1956, 1960, 1964, 1968, 1972,
#                    1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012,
#                    2016, 2020],
        tickfont=dict(family='Noto Sans', size=12),
        gridcolor=chart_gray,
        zeroline=False,
        zerolinecolor=chart_gray,
    )
    
    
    fig.update_layout(
        title="",
        xaxis_title="",
        yaxis_title="",
        paper_bgcolor=bg_gray,
        plot_bgcolor=bg_gray,
        margin=dict(pad=30),
    )

    fig.update_yaxes(visible=True,
         showgrid=False,
         tickfont=dict(family='Noto Sans', size=12),
         gridcolor=chart_gray,
         zeroline=False,
    )



        

    fig.show()
    
    fig.write_image(f"../output/time-without-medal.svg")

In [31]:
plot_timespan(et)

#### One hit wonders
Countries were a single person (or team) won all the medals.

In [32]:
def one_hit_wonders(df):

    df_ = df[~df.medal.isna()]
    for noc in df_.noc.unique():

        subset = df_[df_.noc==noc]
        
        competitors = subset.competitor.unique().tolist()
        sports = subset.discipline.unique().tolist()
        years = subset.year.unique().tolist()
        #print(len(competitors))
        
        if len(competitors) == 1:
            print(noc, competitors, sports, years)

In [33]:
one_hit_wonders(df)

FIJ ['Fiji'] ['Rugby Sevens'] [2016, 2020]
GUA ['Erick Barrondo'] ['Athletics'] [2012]
MNE ['Montenegro'] ['Handball'] [2012]
AFG ['Rohullah Nikpai'] ['Taekwondo'] [2012, 2008]
SEN ['Amadou Dia Bâ'] ['Athletics'] [1988]
ISV ['Peter Holmberg'] ['Sailing'] [1988]
SUR ['Anthony Nesty'] ['Swimming'] [1988, 1992]
PAR ['Paraguay'] ['Football'] [2004]
TOG ['Benjamin Boukpeti'] ['Canoe Slalom'] [2008]
SAM ['Ele Opeloge'] ['Weightlifting'] [2008]
MOZ ['Maria Mutola'] ['Athletics'] [1996, 2000]
GAB ['Anthony Obame'] ['Taekwondo'] [2012]
CYP ['Pavlos Kontides'] ['Sailing'] [2012]
GRN ['Kirani James'] ['Athletics', 'athletics'] [2016, 2012, 2020]
UNK [nan] ['Wrestling'] [1912]
ERI ['Zersenay Tadese'] ['Athletics'] [2004]
AHO ['Jan Boersma'] ['Sailing'] [1988]
GUY ['Michael Anthony'] ['Boxing'] [1980]
DJI ['Ahmed Salah'] ['Athletics'] [1988]
SUD ['Ismail Ahmed Ismail'] ['Athletics'] [2008]
BAR ['Obadele Thompson'] ['Athletics'] [2000]
MRI ['Bruno Julie'] ['Boxing'] [2008]
TKM ['Polina Guryeva'] ['W

In the list above, we should add the San Marino shooter mentioned above and weightlifter Polina Guryeva, first turkmen to win an Olympic medal. Note that Kuwait had another medal winner in 2016, Abdullah Al-Rashidi, but he competed under the banner of the Independent Olympic Athletes.

Some of those might not be one hit wonders anymore. Kosovo, for instance, has another judoka that won medals in 2020.

#### Distribution between groups
Are the top nations getting more or less medals in recent editions of the Olympics?

In [53]:
def totals_by_year(df):
    
    medal_rankings = df.groupby(['year', 'noc']).medal.count()

    dataset = []
    
    for year in sorted(df.year.unique().tolist()):
        
        subset = medal_rankings[year].reset_index().sort_values(by='medal', ascending=False).reset_index(drop=True)
        
        total_medals = subset.medal.sum()
        top_10 = subset.loc[0:9].medal.sum()
        _11_to_20 = subset.loc[10:19].medal.sum()
        _21_to_40 = subset.loc[20:39].medal.sum()
        all_others = subset.loc[40:].medal.sum()

        
        row = [ year,
                top_10 / total_medals, 
                _11_to_20 / total_medals,
                _21_to_40 / total_medals, 
                all_others / total_medals ]
        
        
        dataset.append(row)
    return pd.DataFrame(dataset, columns=["year", "top_10", "11_to_20", "21_to_40", "all_others"])

In [57]:
tby = totals_by_year(df)
tby
tby.to_csv('medal_spread.csv', index = True )

In [45]:
def stacked_share_by_year(df):

    fig = go.Figure()
    
        
    height = 600
    width = 620

    fig.add_trace(go.Bar(
        y=df.year,
        x=df.top_10,
        name='Top 10',
        orientation='h',
        showlegend=False,
        marker=dict(
            color=dark_blue,
        )
    ))

    fig.add_trace(go.Bar(
        y=df.year,
        x=df["11_to_20"],
        name='11 to 20',
        orientation='h',
        showlegend=False,
        marker=dict(
            color=medium_blue,
        )
    ))

    fig.add_trace(go.Bar(
        y=df.year,
        x=df["21_to_40"],
        name='21 to 40',
        orientation='h',
        showlegend=False,
        marker=dict(
            color=light_blue,
        )
    ))

    fig.add_trace(go.Bar(
        y=df.year,
        x=df["all_others"],
        name='40 and under',
        orientation='h',
        showlegend=False,
        marker=dict(
            color=orange,
        )
    ))


    fig.update_yaxes(visible=True, 
         showgrid=False,
         gridcolor=chart_gray,
         zeroline=False,
    )


    fig.update_xaxes(
        tickformat="%",
        showgrid=True,
        tickfont=dict(family='Noto Sans', size=12),
        gridcolor=chart_gray,
        zeroline=False,
        zerolinecolor=chart_gray,
    )


    fig.update_layout(
        title="",
        xaxis_title="",
        yaxis_title="",
        paper_bgcolor=bg_gray,
        plot_bgcolor=bg_gray,
        margin=dict(pad=30),
    )

    fig.update_yaxes(visible=True,
         showgrid=False,
         tickfont=dict(family='Noto Sans', size=12),
         gridcolor=chart_gray,
         zeroline=False,
    )




    fig.update_layout(barmode='stack')
    fig.show()
    fig.write_image(f"../output/stacked_share_medals_by_group.svg")

In [37]:
stacked_share_by_year(tby)

Yay! Apparently the losers are winning more. See how the green and blue nations, lower in the medal rankings, are winning an increasing share of medals.

#### Save charts 

In [38]:
vr.to_csv("../output/csvs/victory_ratio.csv", index=False)

In [39]:
tby.to_csv("../output/csvs/totals_by_year.csv", index=False)