In [490]:
import pandas as pd
from sqlalchemy import false
from tqdm import *
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# from
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import math

In [491]:
# Team index table; its "NOC" column is iterated when training per-team models
# and when scoring every team with the classifier.
team_df = pd.read_csv("data/team_index.csv")
# List of program ids; get_weight uses it to label feature importances after
# the first ("Host") entry.
programs_index = pd.read_csv('data/programs_index.csv')['program_id'].tolist()
# NOC codes used to filter the classifier results in the "Question 2" cell.
team_never_get_medal_list = pd.read_csv("data/team_never_get_medal.csv")["NOC"].tolist()

In [492]:
# NOTE(review): this module-level list is not read by any visible cell;
# train_model_each_team builds and returns its own local list of the same name.
team_no_data = []
# Passed as `sum_num` to rate_to_medal, which multiplies each predicted rate by it.
# Presumably the number of medals of each colour awarded in 2028 - TODO confirm.
number_each_medal = 329

In [502]:
def train_one(dataset, cross_val=False, cross_val_cv=5, val_method="neg_mean_squared_error"):
    """Fit one RandomForestRegressor per medal-rate target.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns "Year", "NOC", "Gold_rate", "Silver_rate",
        "Bronze_rate" and "Total_rate". Every other column is used as a feature.
    cross_val : bool
        When True, also cross-validate each fitted estimator.
    cross_val_cv : int
        Number of folds handed to ``cross_val_score``. ``-1`` means one fold
        per row of ``dataset``.
    val_method : str
        Scoring name forwarded to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(g, s, b, t)`` when ``cross_val`` is False, otherwise
        ``(g, s, b, t, g_mean, s_mean, b_mean, t_mean)`` where the last four
        entries are the mean cross-validation scores.
    """
    target_cols = ["Gold_rate", "Silver_rate", "Bronze_rate", "Total_rate"]

    # Drop by *column*. The previous call omitted axis=1, so pandas searched
    # the row index for these labels and raised KeyError.
    X = dataset.drop(columns=["Year", "NOC"] + target_cols)

    # One independent forest per target, fitted in gold/silver/bronze/total order.
    models = []
    for col in target_cols:
        model = RandomForestRegressor()
        model.fit(X, dataset[col])
        models.append(model)

    if not cross_val:
        return tuple(models)

    if cross_val_cv == -1:
        # One fold per sample.
        cross_val_cv = dataset.shape[0]

    # Each target is scored with its own estimator (the Total score used to be
    # computed with the bronze model).
    mean_scores = [
        cross_val_score(model, X, dataset[col], cv=cross_val_cv, scoring=val_method).mean()
        for model, col in zip(models, target_cols)
    ]
    return (*models, *mean_scores)

In [504]:
def predict_one(data_set, model_g, model_s, model_b, model_t):
    """Run the four rate models on one team's prediction rows.

    The "Year" and "NOC" columns are removed before calling ``predict``; the
    result carries the NOC of the first input row plus one column per model
    output ("Gold_rate", "Silver_rate", "Bronze_rate", "Total_rate").
    """
    features = data_set.drop(["Year", "NOC"], axis=1)

    # Predict with every model first, in gold/silver/bronze/total order.
    model_by_column = (
        ("Gold_rate", model_g),
        ("Silver_rate", model_s),
        ("Bronze_rate", model_b),
        ("Total_rate", model_t),
    )
    predictions = [(column, model.predict(features)) for column, model in model_by_column]

    # Assemble the single-row result frame.
    result = pd.DataFrame()
    result["NOC"] = [data_set["NOC"].iloc[0]]
    for column, values in predictions:
        result[column] = values
    return result

In [509]:
def constraint_func(x):
    """Clamp ``x`` into [0, 1]: below 0 gives 0, above 1 gives 1, otherwise ``x`` itself."""
    return 0 if x < 0 else (1 if x > 1 else x)


def rate_to_medal(rate_df, sum_num):
    """Convert predicted rates into rounded medal counts, in place.

    Each "*_rate" column is clamped to [0, 1]. "Gold", "Silver" and "Bronze"
    are the rounded rate times ``sum_num``; "Total" is the rounded
    "Total_rate" times ``sum_num`` times 3; "Sum" adds the three medal
    columns. The same (mutated) ``rate_df`` object is returned.
    """
    # Clamp every rate column.
    for rate_col in ("Gold_rate", "Silver_rate", "Bronze_rate", "Total_rate"):
        rate_df[rate_col] = rate_df[rate_col].apply(constraint_func)

    # Per-colour medal counts.
    for medal in ("Gold", "Silver", "Bronze"):
        rate_df[medal] = (rate_df[medal + "_rate"] * sum_num).round()

    # The total rate is scaled by all three colours.
    rate_df["Total"] = (rate_df["Total_rate"] * sum_num * 3).round()
    rate_df["Sum"] = rate_df["Gold"] + rate_df["Silver"] + rate_df["Bronze"]
    return rate_df

In [496]:
def sort_medal(df):
    """Return a copy of ``df`` ordered by "Gold", then "Sum", both descending."""
    ranked = df.sort_values(by=["Gold", "Sum"], ascending=[False, False])
    # A "Rank" column (1..n) was once added here; it is intentionally left out.
    return ranked

## Part 1: Train and evaluate a separate model for each country, applied to that specific country

In [497]:
def train_model_each_team(cross_val=False, cross_val_cv=5, val_method="neg_mean_squared_error"):
    """Train one set of gold/silver/bronze/total models per team.

    For every NOC in ``team_df`` the file ``data/dataset/train/<NOC>.csv`` is
    loaded and handed to ``train_one``.

    Parameters
    ----------
    cross_val : bool
        When True, each team's models are also cross-validated and the scores
        averaged over all trained teams are printed. Teams with fewer than
        8 training rows are skipped in this mode.
    cross_val_cv : int
        Fold count forwarded to ``train_one`` (``-1`` = one fold per row).
    val_method : str
        Scoring name forwarded to ``train_one``; also used to label the
        printed averages.

    Returns
    -------
    tuple
        ``(model_list, team_no_data)``. ``model_list`` holds one
        ``[noc, model_g, model_s, model_b, model_t]`` entry per trained team.
        ``team_no_data`` lists the NOCs whose training file was missing or empty.
    """
    model_list = []
    team_no_data = []
    team_sum = len(team_df)
    # Running sums of the per-team mean CV scores, keyed by medal label.
    score_totals = {"Gold": 0, "Silver": 0, "Bronze": 0, "Total": 0}

    tqdm_bar = tqdm(range(team_sum))
    for i in tqdm_bar:
        noc = team_df.iloc[i]["NOC"]
        try:
            df_train = pd.read_csv("data/dataset/train/" + noc + ".csv")
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable training file for this team.
            team_no_data.append(noc)
            continue

        # Too few rows to cross-validate; such teams are skipped, not recorded.
        if cross_val and df_train.shape[0] < 8:
            continue

        if cross_val:
            result = train_one(df_train, cross_val=True, cross_val_cv=cross_val_cv, val_method=val_method)
            models, scores = result[:4], result[4:]
            for medal, score in zip(score_totals, scores):
                score_totals[medal] += score
        else:
            models = train_one(df_train)
        model_list.append([noc, *models])

        tqdm_bar.set_description('Team [{}/{}], NOC: {} '.format(i+1, team_sum, noc))

    if cross_val:
        team_finish_num = len(model_list)
        if team_finish_num == 0:
            # Previously this divided by zero.
            print("No team had enough data for cross validation.")
        else:
            # Label with the real scoring name instead of a hard-coded "MSE".
            for medal, total in score_totals.items():
                print(medal + " " + val_method + ": ", total / team_finish_num)

    return model_list, team_no_data

## Part 2: Train one shared model on all countries and evaluate it on specific countries

In [498]:
def train_model_all_team(cross_val=False, cross_val_cv=5, val_method="neg_mean_squared_error"):
    """Train the shared gold/silver/bronze/total models on the pooled dataset.

    Reads ``data/dataset/train/sum_dataset.csv`` and delegates to ``train_one``.

    Parameters
    ----------
    cross_val : bool
        When True, also cross-validate and print the mean score per target.
    cross_val_cv : int
        Fold count forwarded to ``train_one``.
    val_method : str
        Scoring name forwarded to ``train_one`` and used in the printed labels.

    Returns
    -------
    list
        ``[model_g, model_s, model_b, model_t]``, followed by the four mean
        scores when ``cross_val`` is True.
    """
    df_train_sum = pd.read_csv("data/dataset/train/sum_dataset.csv")

    if not cross_val:
        return list(train_one(df_train_sum))

    # Train once only; the models used to be fitted a second time in this branch.
    model_g, model_s, model_b, model_t, g_score, s_score, b_score, t_score = train_one(
        df_train_sum, cross_val=True, cross_val_cv=cross_val_cv, val_method=val_method
    )
    print("Gold " + val_method + ": ", g_score)
    print("Silver " + val_method + ": ", s_score)
    print("Bronze " + val_method + ": ", b_score)
    print("Total " + val_method + ": ", t_score)
    return [model_g, model_s, model_b, model_t, g_score, s_score, b_score, t_score]

In [506]:
def predict(model_single_list_, model_all, rate=0.5):
    """Blend per-team and shared-model rate predictions for every trained team.

    Parameters
    ----------
    model_single_list_ : list
        Entries of the form ``[noc, model_g, model_s, model_b, model_t]``.
    model_all : sequence
        Shared models; only the first four items (gold, silver, bronze, total)
        are used.
    rate : float
        Weight of the per-team prediction; the shared model gets ``1 - rate``.

    Returns
    -------
    pandas.DataFrame
        One row per team with the columns "NOC", "Gold_rate", "Silver_rate",
        "Bronze_rate" and "Total_rate". Empty when ``model_single_list_`` is empty.

    Raises
    ------
    FileNotFoundError
        If ``data/dataset/predict/<NOC>.csv`` is missing for a listed team.
    """
    rate_columns = ("Gold_rate", "Silver_rate", "Bronze_rate", "Total_rate")

    # Collect plain records and build the frame once, instead of re-concatenating
    # a growing DataFrame on every iteration.
    records = []
    for noc, model_g, model_s, model_b, model_t in (entry[:5] for entry in model_single_list_):
        df_predict = pd.read_csv("data/dataset/predict/" + noc + ".csv")
        tmp_single = predict_one(df_predict, model_g, model_s, model_b, model_t)
        tmp_all = predict_one(df_predict, model_all[0], model_all[1], model_all[2], model_all[3])

        record = {"NOC": noc}
        for column in rate_columns:
            # predict_one returns a single-row frame, so row 0 is the prediction.
            record[column] = tmp_single[column].iloc[0] * rate + tmp_all[column].iloc[0] * (1 - rate)
        records.append(record)

    if not records:
        # Match the previous result for an empty model list.
        return pd.DataFrame()
    return pd.DataFrame(records, columns=["NOC", *rate_columns])

# Verify the Models

In [434]:
# Cross-validate the shared all-team model with 200 folds, using the default
# scoring (neg_mean_squared_error).
train_model_all_team(cross_val=True, cross_val_cv=200)

Gold neg_mean_squared_error:  -0.0002512572208773606
Silver neg_mean_squared_error:  -0.00012386293091468797
Bronze neg_mean_squared_error:  -0.00010492190927115364


[RandomForestRegressor(),
 RandomForestRegressor(),
 RandomForestRegressor(),
 -0.0002512572208773606,
 -0.00012386293091468797,
 -0.00010492190927115364]

In [447]:
# cross_val_cv=-1 makes train_one use one fold per training row.
train_model_each_team(cross_val=True, cross_val_cv=-1) # build a separate model for each team

Team [206/206], NOC: LES : 100%|██████████| 206/206 [07:31<00:00,  2.19s/it]

Gold MSE:  -7.588751385789039e-05
Silver MSE:  -6.265619274818169e-05
Bronze MSE:  -5.7720508444064835e-05





([['CHN',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['DEN',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['NED',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['FIN',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['NOR',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['ROU',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['FRA',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['MAR',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['ESP',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['EGY',
   RandomForestRegressor(),
   RandomForestRegressor(),
   RandomForestRegressor()],
  ['IRI',
   RandomForestRegressor(),
   RandomFor

In [445]:
# Same 200-fold cross-validation of the shared model, scored with R^2 instead.
train_model_all_team(cross_val=True, cross_val_cv=200, val_method="r2")

Gold r2:  -0.2708612145743632
Silver r2:  -0.1513944970765121
Bronze r2:  -0.101107255493585


[RandomForestRegressor(),
 RandomForestRegressor(),
 RandomForestRegressor(),
 -0.2708612145743632,
 -0.1513944970765121,
 -0.101107255493585]

In [446]:
# train_model_each_team(cross_val=True, cross_val_cv=4, val_method="r2") # build a separate model for each team

# Get the weight of each program

In [500]:
def get_weight(noc, single_model_list, without_host=False):
    """Tabulate the feature importances of one team's four models.

    Parameters
    ----------
    noc : str
        Team code to look up; the first matching entry is used.
    single_model_list : list
        Entries of the form ``[noc, model_g, model_s, model_b, model_t]`` whose
        models expose ``feature_importances_``.
    without_host : bool
        When True, the row for feature index 0 (labelled "Host") is omitted.

    Returns
    -------
    pandas.DataFrame
        Columns "NOC", "Feature", "Gold", "Silver", "Bronze", "Total"; one row
        per feature. Features after index 0 are labelled with
        ``programs_index[i]``. Empty when ``noc`` is not in the list.
    """
    for single_models in single_model_list:
        if single_models[0] != noc:
            continue

        # Look the importance arrays up once instead of on every row.
        gold, silver, bronze, total = (
            single_models[k].feature_importances_ for k in (1, 2, 3, 4)
        )

        def _row(feature, idx):
            return {
                "NOC": noc,
                "Feature": feature,
                "Gold": gold[idx],
                "Silver": silver[idx],
                "Bronze": bronze[idx],
                "Total": total[idx],
            }

        # Gather plain records and build the frame once, rather than
        # concatenating one single-row DataFrame per feature.
        records = []
        if not without_host:
            records.append(_row("Host", 0))
        for i in range(len(gold) - 1):
            # Feature i + 1 of the model corresponds to programs_index[i].
            records.append(_row(programs_index[i], i + 1))

        if not records:
            return pd.DataFrame()
        return pd.DataFrame(
            records, columns=["NOC", "Feature", "Gold", "Silver", "Bronze", "Total"]
        )

    # No entry for this NOC.
    return pd.DataFrame()

# Train model and predict

In [507]:

# Stage 1: one set of models per team. no_data_list receives the NOCs whose
# training file was missing or empty.
print("Start training single model")
model_single_list, no_data_list = train_model_each_team() # build a separate model for each team
print("Done")

# Stage 2: one shared set of models trained on the pooled dataset.
print("Start training all_team model")
model_all_list = train_model_all_team()                   # build one model for all teams
print("Done")

# Stage 3: blend both predictions per team (default rate=0.5, i.e. equal weights).
print("Start predicting")
predict_result = predict(model_single_list, model_all_list) # predict and average the two models' results
print("Done")

Start training single model


Team [206/206], NOC: LES : 100%|██████████| 206/206 [01:12<00:00,  2.84it/s]


Done
Start training all_team model
Done
Start predicting
Done


# Question 1: The medal count in 2028

In [510]:
# rate_to_medal mutates and returns the same frame, so predict_result gains the
# Gold/Silver/Bronze/Total/Sum columns here.
predict_result = rate_to_medal(predict_result, number_each_medal)
# Order by gold count, then by the gold+silver+bronze sum, both descending.
medal_tops = sort_medal(predict_result)
# medal_tops = medal_tops[["NOC", "Gold", "Silver", "Bronze", "Sum"]]
medal_tops

Unnamed: 0,NOC,Gold_rate,Silver_rate,Bronze_rate,Total_rate,Gold,Silver,Bronze,Total,Sum
25,USA,0.149252,0.124031,0.132196,0.135717,49.0,41.0,43.0,134.0,133.0
0,CHN,0.123116,0.090379,0.083602,0.098633,41.0,30.0,28.0,97.0,99.0
78,GBR,0.116957,0.099882,0.090777,0.103666,38.0,33.0,30.0,102.0,101.0
164,KOR,0.063131,0.032990,0.035208,0.038032,21.0,11.0,12.0,38.0,44.0
38,CAN,0.062240,0.034922,0.054471,0.048269,20.0,11.0,18.0,48.0,49.0
...,...,...,...,...,...,...,...,...,...,...
198,MHL,0.000244,0.000292,0.000378,0.000317,0.0,0.0,0.0,0.0,0.0
199,KIR,0.000244,0.000292,0.000378,0.000317,0.0,0.0,0.0,0.0,0.0
200,TUV,0.000244,0.000292,0.000378,0.000317,0.0,0.0,0.0,0.0,0.0
201,TGA,0.000244,0.000292,0.000378,0.000317,0.0,0.0,0.0,0.0,0.0


# Question 2: The teams most likely to win their first medal in 2028

In [521]:
# team_never_get_medal_df = predict_result[predict(model_single_list, model_all_list, rate=0)["NOC"].isin(team_never_get_medal_list)]
# team_never_get_medal_df = team_never_get_medal_df.drop(["Gold", "Silver", "Bronze", "Sum", "Gold_rate", "Silver_rate", "Bronze_rate"], axis=1)
# team_never_get_medal_df = team_never_get_medal_df.sort_values(by="Total_rate", ascending=False)
# team_never_get_medal_df["Total_rate"] = team_never_get_medal_df["Total_rate"] * number_each_medal * 3
# team_never_get_medal_df
# wgmm_dataset =


def train_WGMM(data_set):
    """Fit a LogisticRegression on ``data_set``.

    The "Total_rate" column is the target; every column except "Year", "NOC"
    and "Total_rate" is a feature. Returns the fitted estimator.
    """
    features = data_set.drop(columns=["Year", "NOC", "Total_rate"])
    target = data_set["Total_rate"]
    classifier = LogisticRegression()
    classifier.fit(features, target)
    return classifier

def predict_WGMM(data_set, model):
    """Return column 1 of ``model.predict_proba`` for the rows of ``data_set``.

    The "Year" and "NOC" columns are removed before the model is called.
    """
    features = data_set.drop(columns=["Year", "NOC"])
    probabilities = model.predict_proba(features)
    return probabilities[:, 1]

In [527]:
# Fit the classifier on the pooled classification dataset.
classify_dataset = pd.read_csv("data/dataset/train/classify_dataset.csv")
WGMM_model = train_WGMM(classify_dataset)

# Score every team that has a usable prediction file.
classify_result_df = pd.DataFrame()
for each_noc in team_df["NOC"].to_list():
    try:
        df = pd.read_csv("data/dataset/predict/" + each_noc + ".csv")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Missing or empty prediction file: nothing to score for this team.
        continue
    result_df = pd.DataFrame({"NOC": [each_noc], "Medal_rate": predict_WGMM(df, WGMM_model)})
    classify_result_df = pd.concat([classify_result_df, result_df], ignore_index=True)

# Keep only teams on the never-medalled list, highest predicted rate first.
team_never_get_medal_df = classify_result_df.loc[
    classify_result_df["NOC"].isin(team_never_get_medal_list)
].sort_values(by="Medal_rate", ascending=False)
team_never_get_medal_df


Unnamed: 0,NOC,Medal_rate
167,PRK,0.626590
88,SLO,0.599849
67,SUI,0.587492
178,HKG,0.404873
142,MDA,0.383522
...,...,...
108,SKN,0.269527
107,MTN,0.269527
105,SEY,0.269527
104,MRI,0.269527


# Question 3: The importance of each program in each country

In [475]:
# Feature importances of one team's per-team models; pass the NOC you want to inspect.
weight = get_weight("USA", model_single_list, without_host=False) # importances for whichever NOC is passed in
weight

Unnamed: 0,NOC,Feature,Gold,Silver,Bronze,Medal
0,USA,Host,6.482985e-05,7.704708e-05,3e-06,4.8e-05
1,USA,SWA,0.0,0.0,0.0,0.0
2,USA,DIV,0.07169196,0.0384987,0.135865,0.082018
3,USA,OWS,6.149409e-05,7.989697e-07,9.5e-05,5.2e-05
4,USA,SWM,0.07875126,0.06120127,0.146828,0.095594
5,USA,WPO,0.2926688,0.2919516,0.120356,0.234992
6,USA,ARC,0.0625557,0.09308486,0.031224,0.062288
7,USA,ATH,0.109902,0.08278955,0.025861,0.072851
8,USA,BK3,0.0,0.0,0.0,0.0
9,USA,BOX,0.01225141,0.005694585,0.010254,0.0094


[['CHN',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['DEN',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['NED',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['FIN',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['NOR',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['ROU',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['EST',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['FRA',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['MAR',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['ESP',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],
 ['EGY',
  RandomForestRegressor(),
  RandomForestRegressor(),
  RandomForestRegressor()],