In [104]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

In [105]:
import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides any pandas / statsmodels warnings raised by later
# cells — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [106]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [107]:
# Read the input table; the preview below shows one row per (Year, NOC).
df = pd.read_csv(filepath_or_buffer='../data/all_data/timeseries_data.csv')

In [108]:
# Display the freshly loaded frame to inspect its columns.
df

Unnamed: 0.1,Unnamed: 0,Year,NOC,Athletes,Sports,Events,Gold,Silver,Bronze,Men,Women
0,0,1960,AFG,16,2,13,0,0,0,16,0
1,1,1960,AHO,5,1,4,0,0,0,5,0
2,2,1960,ARG,116,15,54,0,3,1,115,1
3,3,1960,AUS,277,19,125,10,21,11,219,58
4,4,1960,AUT,141,18,83,1,2,0,96,45
...,...,...,...,...,...,...,...,...,...,...,...
2696,2696,2016,VIE,28,10,24,1,1,0,12,16
2697,2697,2016,VIN,2,1,2,0,0,0,1,1
2698,2698,2016,YEM,3,3,3,0,0,0,2,1
2699,2699,2016,ZAM,6,3,6,0,0,0,4,2


In [109]:
# Drop the 'Unnamed: 0' column that came in with the CSV (in the preview above it
# simply mirrors the row index).
# `errors='ignore'` keeps this cell safe to re-run once the column is already gone;
# without it a second execution raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [110]:
# Display the frame again to confirm the 'Unnamed: 0' column is gone.
df

Unnamed: 0,Year,NOC,Athletes,Sports,Events,Gold,Silver,Bronze,Men,Women
0,1960,AFG,16,2,13,0,0,0,16,0
1,1960,AHO,5,1,4,0,0,0,5,0
2,1960,ARG,116,15,54,0,3,1,115,1
3,1960,AUS,277,19,125,10,21,11,219,58
4,1960,AUT,141,18,83,1,2,0,96,45
...,...,...,...,...,...,...,...,...,...,...
2696,2016,VIE,28,10,24,1,1,0,12,16
2697,2016,VIN,2,1,2,0,0,0,1,1
2698,2016,YEM,3,3,3,0,0,0,2,1
2699,2016,ZAM,6,3,6,0,0,0,4,2


In [111]:
# Parse the four-digit year into a timestamp and make it the index.
# Written as a single chained expression that rebinds `df` rather than mutating it.
df = (
    df
    .assign(Year=pd.to_datetime(df['Year'], format='%Y'))
    .set_index('Year')
)

In [112]:
# Keep only the USA rows. `.copy()` makes `usa_data` an independent frame, so
# adding columns to it in later cells does not write into a slice of `df`.
usa_data = df[df['NOC'] == 'USA'].copy()

In [113]:
# Total medals per row = gold + silver + bronze.
usa_data['Total_Medals'] = usa_data['Gold'].add(usa_data['Silver']).add(usa_data['Bronze'])

In [114]:
# Display the USA subset, now including the Total_Medals column.
usa_data

Unnamed: 0_level_0,NOC,Athletes,Sports,Events,Gold,Silver,Bronze,Men,Women,Total_Medals
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1960-01-01,USA,474,23,164,97,24,22,361,113,143
1964-01-01,USA,619,29,190,96,39,42,452,167,177
1968-01-01,USA,664,28,199,100,37,36,480,184,173
1972-01-01,USA,719,31,216,72,77,46,521,198,195
1976-01-01,USA,660,29,225,73,61,40,446,214,174
1980-01-01,USA,144,10,38,24,4,1,103,41,29
1984-01-01,USA,842,35,255,190,121,50,538,304,361
1988-01-01,USA,896,37,274,89,67,58,549,347,214
1992-01-01,USA,955,40,303,94,57,86,594,361,237
1994-01-01,USA,232,12,59,6,8,5,144,88,19


In [115]:
# NOC is constant ('USA') after the filter; dropping it leaves only the numeric
# columns shown in the preview above.
# `errors='ignore'` lets the cell be re-run after the column has already been removed.
usa_data = usa_data.drop(columns=['NOC'], errors='ignore')

## Historical Athlete Participation

In [116]:
# One line per metric: the Sports, Events and Total_Medals columns of the USA frame.
# Each entry is (column, legend label, line colour).
participation_traces = [
    ('Sports', 'Sports', 'green'),
    ('Events', 'Events', 'red'),
    ('Total_Medals', 'Total Medals', 'purple'),
]

fig1 = go.Figure()
for column, label, colour in participation_traces:
    fig1.add_trace(
        go.Scatter(
            x=usa_data.index,
            y=usa_data[column],
            mode='lines+markers',
            name=label,
            line=dict(color=colour),
        )
    )

# Title, axis labels and legend heading
fig1.update_layout(
    title='Historical Athlete Participation for USA',
    xaxis_title='Year',
    yaxis_title='Count',
    legend_title='Metrics',
    template='plotly_white',
)

# Render the figure
fig1.show()


In [117]:
# Historical Medal Count
# Each entry is (column, line colour); the legend label is "<column> Medals".
medal_line_colours = [
    ('Gold', 'gold'),
    ('Silver', 'silver'),
    ('Bronze', 'brown'),
]

fig2 = go.Figure()
for medal, colour in medal_line_colours:
    fig2.add_trace(
        go.Scatter(
            x=usa_data.index,
            y=usa_data[medal],
            mode='lines+markers',
            name=f'{medal} Medals',
            line=dict(color=colour),
        )
    )

# Title, axis labels and legend heading
fig2.update_layout(
    title='Historical Medal Count for USA',
    xaxis_title='Year',
    yaxis_title='Count',
    legend_title='Medal Type',
    template='plotly_white',
)

# Render the figure
fig2.show()


In [118]:
fig3 = go.Figure()

# Total athletes plus the men / women breakdown.
# Each legend name matches the plotted column; the Men and Women traces were
# previously both labelled 'Sports', which made the legend wrong and ambiguous.
fig3.add_trace(go.Scatter(x=usa_data.index, y=usa_data['Athletes'], mode='lines+markers', name='Athletes', line=dict(color='green')))
fig3.add_trace(go.Scatter(x=usa_data.index, y=usa_data['Men'], mode='lines+markers', name='Men', line=dict(color='blue')))
fig3.add_trace(go.Scatter(x=usa_data.index, y=usa_data['Women'], mode='lines+markers', name='Women', line=dict(color='magenta')))

# Set title and labels
fig3.update_layout(
    title='Historical Athlete Participation for USA',
    xaxis_title='Year',
    yaxis_title='Count',
    legend_title='Metrics',
    template='plotly_white'
)

# Show the plot
fig3.show()


In [119]:
# Display the full (all-NOC) frame, now indexed by Year.
df

Unnamed: 0_level_0,NOC,Athletes,Sports,Events,Gold,Silver,Bronze,Men,Women
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1960-01-01,AFG,16,2,13,0,0,0,16,0
1960-01-01,AHO,5,1,4,0,0,0,5,0
1960-01-01,ARG,116,15,54,0,3,1,115,1
1960-01-01,AUS,277,19,125,10,21,11,219,58
1960-01-01,AUT,141,18,83,1,2,0,96,45
...,...,...,...,...,...,...,...,...,...
2016-01-01,VIE,28,10,24,1,1,0,12,16
2016-01-01,VIN,2,1,2,0,0,0,1,1
2016-01-01,YEM,3,3,3,0,0,0,2,1
2016-01-01,ZAM,6,3,6,0,0,0,4,2


In [120]:
# Per-row medal total for every NOC, stored next to the individual medal counts.
df['Total_Medals'] = df['Gold'].add(df['Silver']).add(df['Bronze'])

In [121]:
# Choropleth Map for Total Medals
# `df` holds one row per (Year, NOC), but a choropleth needs a single value per
# location. Sum the medal counts over all years so each NOC appears exactly once
# and the colour reflects its total across the whole data set.
medal_columns = ['Gold', 'Silver', 'Bronze', 'Total_Medals']
medals_by_noc = df.groupby('NOC', as_index=False)[medal_columns].sum()

# NOTE(review): the NOC values look like IOC codes, which differ from ISO-3 for some
# countries (e.g. VIE vs VNM, ZAM vs ZMB); such rows will not be matched by
# `locationmode='ISO-3'` unless they are mapped first — confirm.
fig = px.choropleth(medals_by_noc,
                    locations="NOC",
                    locationmode='ISO-3',
                    color="Total_Medals",
                    hover_name="NOC",
                    hover_data=["Gold", "Silver", "Bronze"],
                    title="Geographical Distribution of Total Medals",
                    color_continuous_scale=px.colors.sequential.Viridis,
                    labels={'Total_Medals': 'Total Medals'},
                    template="plotly_white")

fig.update_geos(showcoastlines=True, coastlinecolor="Gray", showland=True, landcolor="LightGray")
fig.show()


In [122]:
# Scatter plot for Men vs Women Participation Over Time
# `Year` is the index of `usa_data`; expose it as a regular column for plotly express.
usa_by_year = usa_data.reset_index()
gender_columns = ['Men', 'Women']

fig = px.scatter(
    usa_by_year,
    x='Year',
    y=gender_columns,
    title='Men vs Women Participation Over Time for USA',
    labels={'value': 'Count', 'variable': 'Gender'},
    template='plotly_white',
)
fig.show()


## Timeseries model

In [123]:
from statsmodels.tsa.vector_ar.var_model import VAR

In [124]:
# Use the whole USA history as training data (no held-out split).
# `train` is another name for the same frame, not a copy.
train = usa_data

In [125]:
# Fit a vector autoregression jointly on every column of `train`.
# `fit()` is called with its defaults (no explicit lag order is requested).
# NOTE(review): Total_Medals is the sum of Gold, Silver and Bronze, so the inputs are
# linearly dependent — confirm this is intended for the model.
model = VAR(endog=train)
model_fit = model.fit()

In [126]:
# Project the fitted VAR eight steps past the end of the training data.
# NOTE(review): a step is one row of `train`, not one calendar year — confirm the rows
# are evenly spaced before reading the eighth step as "2024".
forecast = model_fit.forecast(y=train.values, steps=8)

In [127]:
# Print forecasted values for 2024
print("Forecasted values for 2024:")

# One label per forecast row, starting at 2017.
# 'YS' (year start) replaces the 'Y' alias, which newer pandas versions deprecate,
# and `periods` follows the forecast length so the index can never disagree with
# the number of forecast rows. For eight rows the labels are unchanged:
# '2017-01-01' ... '2024-01-01'.
years = pd.date_range(start='2017-01-01', periods=len(forecast), freq='YS')
forecast_df = pd.DataFrame(forecast, columns=usa_data.columns, index=years)

# Store the index as 'YYYY-01-01' strings (the format the plotting cells use).
forecast_df.index = forecast_df.index.strftime('%Y-01-01')
forecast_df

Forecasted values for 2024:


Unnamed: 0,Athletes,Sports,Events,Gold,Silver,Bronze,Men,Women,Total_Medals
2017-01-01,499.964689,18.59906,128.595257,32.906313,49.640479,35.839786,299.496501,200.468187,118.386578
2018-01-01,738.011885,33.201735,254.783564,120.347327,72.89177,64.240939,361.468928,376.542956,257.480036
2019-01-01,411.326845,16.19986,110.517292,19.10858,34.91371,31.169142,230.701446,180.625399,85.191432
2020-01-01,633.111344,28.692958,217.671545,114.376407,67.097086,55.916916,303.577908,329.533436,237.390409
2021-01-01,455.47805,18.291682,126.358179,39.83685,46.277317,33.913979,250.751271,204.72678,120.028146
2022-01-01,667.698042,29.486326,225.003818,109.821237,68.505189,57.057747,330.621574,337.076468,235.384174
2023-01-01,466.492343,19.100316,133.422435,39.233018,44.221852,35.938602,251.526563,214.965779,119.393471
2024-01-01,629.101477,27.742147,210.788147,102.023354,64.174548,54.215677,310.593289,318.508188,220.413579


## Visualizations for 2024 - USA

In [128]:
def _history_forecast_figure(series_specs, title, yaxis_title):
    """Build a figure with history, forecast and the last forecast point per series.

    Reads the notebook-level frames ``usa_data`` (history) and ``forecast_df``
    (forecast).

    Parameters
    ----------
    series_specs : list of dict
        One dict per series with the keys ``column`` (column present in both
        frames), ``label`` (legend text), ``marker_name`` and ``marker_color``
        (legend text and colour of the highlighted last forecast point) and,
        optionally, ``line_color`` (shared by the history and forecast lines).
    title : str
        Figure title.
    yaxis_title : str
        Y-axis label.

    Returns
    -------
    plotly.graph_objects.Figure
        Traces are added in three passes: all history lines, all forecast
        lines, then all highlighted end points.
    """
    def _line_style(spec, **extra):
        # Merge the optional per-series colour into the given line settings.
        style = dict(extra)
        if 'line_color' in spec:
            style['color'] = spec['line_color']
        return style

    fig = go.Figure()

    # Observed values.
    for spec in series_specs:
        fig.add_trace(go.Scatter(
            x=usa_data.index,
            y=usa_data[spec['column']],
            mode='lines+markers',
            name=spec['label'],
            line=_line_style(spec),
        ))

    # Forecast values. The legend suffix and dashed line distinguish them from the
    # history traces, which previously carried exactly the same legend name.
    for spec in series_specs:
        fig.add_trace(go.Scatter(
            x=forecast_df.index,
            y=forecast_df[spec['column']],
            mode='lines+markers',
            name=f"{spec['label']} (forecast)",
            line=_line_style(spec, dash='dash'),
        ))

    # Highlight the final forecast row with a large marker.
    for spec in series_specs:
        fig.add_trace(go.Scatter(
            x=[forecast_df.index[-1]],
            y=[forecast_df[spec['column']].iloc[-1]],
            mode='markers',
            name=spec['marker_name'],
            marker=dict(color=spec['marker_color'], size=14),
        ))

    fig.update_layout(title=title, xaxis_title='Year', yaxis_title=yaxis_title)
    return fig


# Total medals
fig1 = _history_forecast_figure(
    [
        dict(column='Total_Medals', label='Total Medals',
             marker_name='2024 Forecast', marker_color='red'),
    ],
    title='Total Medals Over the Years with 2024 Forecast',
    yaxis_title='Total Medals',
)
fig1.show()

# Medals by type
fig2 = _history_forecast_figure(
    [
        dict(column='Gold', label='Gold Medals', line_color='gold',
             marker_name='Gold 2024 Forecast', marker_color='gold'),
        dict(column='Silver', label='Silver Medals', line_color='silver',
             marker_name='Silver 2024 Forecast', marker_color='silver'),
        dict(column='Bronze', label='Bronze Medals', line_color='#CD7F32',
             marker_name='Bronze 2024 Forecast', marker_color='#CD7F32'),
    ],
    title='Medals (Gold, Silver, Bronze) Over the Years with 2024 Forecast',
    yaxis_title='Number of Medals',
)
fig2.show()

# Athletes, events and sports
fig3 = _history_forecast_figure(
    [
        dict(column='Athletes', label='Athletes',
             marker_name='Athletes 2024 Forecast', marker_color='blue'),
        dict(column='Events', label='Events',
             marker_name='Events 2024 Forecast', marker_color='green'),
        dict(column='Sports', label='Sports',
             marker_name='Sports 2024 Forecast', marker_color='orange'),
    ],
    title='Athletes, Events, and Sports Over the Years with 2024 Forecast',
    yaxis_title='Count',
)
fig3.show()


In [129]:
# Men / women athlete counts: observed history, forecast, and the last forecast point.
# Maps each column to the colour of its highlighted end-point marker.
gender_marker_colors = {'Men': 'blue', 'Women': 'magenta'}

fig4 = go.Figure()

# Observed values.
for gender in gender_marker_colors:
    fig4.add_trace(go.Scatter(x=usa_data.index, y=usa_data[gender], mode='lines+markers', name=gender))

# Forecast values. The legend suffix and dashed line tell them apart from the history
# traces, which previously had exactly the same legend name.
for gender in gender_marker_colors:
    fig4.add_trace(go.Scatter(x=forecast_df.index, y=forecast_df[gender], mode='lines+markers',
                              name=f'{gender} (forecast)', line=dict(dash='dash')))

# Highlight the final forecast row.
for gender, marker_color in gender_marker_colors.items():
    fig4.add_trace(go.Scatter(x=[forecast_df.index[-1]], y=[forecast_df[gender].iloc[-1]], mode='markers',
                              name=f'{gender} 2024 Forecast', marker=dict(color=marker_color, size=14)))

fig4.update_layout(title='Men and Women Over the Years with 2024 Forecast',
                   xaxis_title='Year',
                   yaxis_title='Count')
fig4.show()

In [139]:
# Last row of the forecast table (labelled 2024-01-01 in the output above).
data = forecast_df.iloc[-1]

In [140]:
# Pick the medal figures out of the last forecast row, keyed by column name.
medal_keys = ('Gold', 'Silver', 'Bronze', 'Total_Medals')
medals_data = {key: data[key] for key in medal_keys}

In [142]:
# Sunburst of the forecast medal mix: a "Total Medals" root with one child per medal type.
fig = go.Figure(go.Sunburst(
    labels=["Total Medals", "Gold", "Silver", "Bronze"],
    parents=["", "Total Medals", "Total Medals", "Total Medals"],
    values=[medals_data['Total_Medals'], medals_data['Gold'], medals_data['Silver'], medals_data['Bronze']],
    # "total": the root value is read as the value of all of its children combined.
    branchvalues="total",
    # One colour per label, in label order. The first entry belongs to the root;
    # with only three colours listed, the gold/silver/bronze shades were each
    # shifted onto the wrong sector.
    marker=dict(colors=["#EEEEEE", "#FFD700", "#C0C0C0", "#CD7F32"]),
))

fig.update_layout(
    title="Medal Composition for USA",
    template="plotly_white"
)

fig.show()

## Validation

In [79]:
# Read the 2021 medal table from the Excel workbook using the openpyxl engine.
medal_2021 = pd.read_excel(io='../data/2021/Medals.xlsx', engine='openpyxl')

In [80]:
# Select the row(s) whose Team/NOC is the United States.
is_usa_row = medal_2021['Team/NOC'].eq('United States of America')
medal_2021_usa = medal_2021.loc[is_usa_row]

In [81]:
# Display the 2021 USA medal row used for validation.
medal_2021_usa

Unnamed: 0,Rank,Team/NOC,Gold,Silver,Bronze,Total,Rank by Total
0,1,United States of America,39,41,33,113,1


In [82]:
# Total medals: history, forecast, and the actual 2021 total as a single red marker.
history_trace = go.Scatter(
    x=usa_data.index,
    y=usa_data['Total_Medals'],
    mode='lines+markers',
    name='Total Medals',
)
forecast_trace = go.Scatter(
    x=forecast_df.index,
    y=forecast_df['Total_Medals'],
    mode='lines+markers',
    name='Total Medals',
)
actual_2021_trace = go.Scatter(
    x=['2021'],
    y=medal_2021_usa['Total'],
    mode='markers',
    name='2021 Actual',
    marker=dict(color='red', size=14),
)

fig1 = go.Figure(data=[history_trace, forecast_trace, actual_2021_trace])
fig1.update_layout(
    title='Total Medals Over the Years with 2024 Forecast',
    xaxis_title='Year',
    yaxis_title='Total Medals',
)
fig1.show()


In [83]:

# Medals by type: history, forecast, and the actual 2021 counts as large markers.
# Each entry is (column, colour); the colour is shared by the lines and the marker.
medal_colors = [('Gold', 'gold'), ('Silver', 'silver'), ('Bronze', '#CD7F32')]

fig2 = go.Figure()

# Observed values.
for medal, color in medal_colors:
    fig2.add_trace(go.Scatter(x=usa_data.index, y=usa_data[medal], mode='lines+markers',
                              name=f'{medal} Medals', line=dict(color=color)))

# Forecast values.
for medal, color in medal_colors:
    fig2.add_trace(go.Scatter(x=forecast_df.index, y=forecast_df[medal], mode='lines+markers',
                              name=f'{medal} Medals', line=dict(color=color)))

# Actual 2021 results.
# x is the date string '2021' (as in the total-medals validation figure): the other
# traces put dates on this axis, so the bare integer 2021 would not land on the year 2021.
# The legend names say "2021 Actual" because these are observed values, not forecasts.
for medal, color in medal_colors:
    fig2.add_trace(go.Scatter(x=['2021'], y=medal_2021_usa[medal], mode='markers',
                              name=f'{medal} 2021 Actual', marker=dict(color=color, size=14)))

fig2.update_layout(title='Medals (Gold, Silver, Bronze) Over the Years with 2024 Forecast',
                   xaxis_title='Year',
                   yaxis_title='Number of Medals')
fig2.show()

## Function

In [84]:
def forecast(country_data, country, steps=8):
    """Fit a VAR model on one country's history and forecast future rows.

    Parameters
    ----------
    country_data : pandas.DataFrame
        Rows for a single NOC. Must contain 'Gold', 'Silver' and 'Bronze'; an
        'NOC' column, if present, is dropped before fitting. The caller's frame
        is not modified.
    country : str
        NOC code. Not used in the computation; kept so existing calls still work.
    steps : int, default 8
        Number of forecast rows to produce.

    Returns
    -------
    pandas.DataFrame
        ``steps`` rows with the same columns as the model input, indexed by
        'YYYY-01-01' strings starting at '2017-01-01'.

    Notes
    -----
    Exceptions raised while fitting or forecasting are not caught here; the
    caller decides how to handle countries that cannot be modelled.
    """
    # Build the model input on a new frame (no writes into a slice of the caller's data).
    history = (
        country_data
        .assign(Total_Medals=country_data['Gold'] + country_data['Silver'] + country_data['Bronze'])
        .drop(columns=['NOC'], errors='ignore')
    )

    # Fit VAR model
    model_fit = VAR(history).fit()

    # Local name differs from the function name to avoid shadowing it.
    predicted = model_fit.forecast(history.values, steps=steps)

    # One label per forecast row; 'YS' replaces the deprecated 'Y' alias and
    # `periods` keeps the index length tied to `steps`.
    years = pd.date_range(start='2017-01-01', periods=steps, freq='YS').strftime('%Y-01-01')

    # Return the forecast table for saving to CSV
    return pd.DataFrame(predicted, columns=history.columns, index=years)

In [85]:
unique_countries = df['NOC'].unique()
forecasts = {}

# Forecast every NOC independently; a failure for one country must not stop the others.
for country in unique_countries:
    # Copy so the forecasting step never writes into a slice of `df`.
    country_data = df[df['NOC'] == country].copy()
    try:
        forecasts[country] = forecast(country_data, country)
    except Exception as exc:
        # `Exception` rather than a bare `except` lets KeyboardInterrupt / SystemExit
        # through, and the reason is printed so each failure can be diagnosed.
        print(f"Error forecasting for {country}: {type(exc).__name__}: {exc}")

Error forecasting for MAL
Error forecasting for RHO
Error forecasting for UAR
Error forecasting for WIF
Error forecasting for SOM
Error forecasting for GAM
Error forecasting for YAR
Error forecasting for YMD
Error forecasting for EUN
Error forecasting for GBS
Error forecasting for SKN
Error forecasting for STP
Error forecasting for FSM
Error forecasting for KIR
Error forecasting for TLS
Error forecasting for MHL
Error forecasting for KOS
Error forecasting for ROT
Error forecasting for SSD


In [86]:
# Save forecasts to CSV files for each country
output_dir = Path('../data/all_data/forecasts')
# Create the target folder if it is missing; `to_csv` does not create directories.
output_dir.mkdir(parents=True, exist_ok=True)

# The loop variable is `country_forecast` (not `forecast_df`) so the USA
# `forecast_df` used by the plotting cells is not overwritten.
for country, country_forecast in forecasts.items():
    country_forecast.to_csv(output_dir / f'forecast_{country}.csv')

print("Forecasts saved to CSV files.")

Forecasts saved to CSV files.
