In [369]:
import pandas as pd
from sqlalchemy import false
from tqdm import *
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# from
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import math

In [324]:
team_df = pd.read_csv("data/team_index.csv")
programs_index = pd.read_csv('data/programs_index.csv')['program_id'].tolist()

In [325]:
team_no_data = []
number_each_medal = 329

In [399]:
def train_one(dataset, cross_val=False, cross_val_cv=5):
    X = dataset.drop(["Year", "NOC", "Gold_rate", "Silver_rate", "Bronze_rate"], axis=1)
    Y = dataset[['Gold_rate', 'Silver_rate', 'Bronze_rate']]
    y_g = Y['Gold_rate']
    y_s = Y['Silver_rate']
    y_b = Y['Bronze_rate']
    # print(X, y_g)
    g = RandomForestRegressor()
    s = RandomForestRegressor()
    b = RandomForestRegressor()
    g.fit(X, y_g)
    s.fit(X, y_s)
    b.fit(X, y_b)
    if cross_val:
        if cross_val_cv == -1:
            cross_val_cv = dataset.shape[0]
            # print("Cross validation cv is set to ", cross_val_cv)
        g_score = cross_val_score(g, X, y_g, cv=cross_val_cv, scoring='neg_mean_squared_error')
        s_score = cross_val_score(s, X, y_s, cv=cross_val_cv, scoring='neg_mean_squared_error')
        b_score = cross_val_score(b, X, y_b, cv=cross_val_cv, scoring='neg_mean_squared_error')
        return g, s, b, g_score.mean(), s_score.mean(), b_score.mean()

    return g, s, b

In [391]:
def predict_one(data_set, model_g, model_s, model_b):
    X = data_set.drop(["Year", "NOC"], axis=1)
    y_g_pred = model_g.predict(X)
    y_s_pred = model_s.predict(X)
    y_b_pred = model_b.predict(X)
    df = pd.DataFrame()
    df["NOC"] = [data_set["NOC"].iloc[0]]
    df["Gold_rate"] = y_g_pred
    df["Silver_rate"] = y_s_pred
    df["Bronze_rate"] = y_b_pred
    return df

In [392]:
def constraint_func(x):
    if x < 0:
        return 0
    elif x > 1:
        return 1
    else:
        return x

def rate_to_medal(rate_df, sum_num):
    rate_df["Gold_rate"] = rate_df["Gold_rate"].apply(constraint_func)
    rate_df["Silver_rate"] = rate_df["Silver_rate"].apply(constraint_func)
    rate_df["Bronze_rate"] = rate_df["Bronze_rate"].apply(constraint_func)
    rate_df["Gold"] = (rate_df["Gold_rate"] * sum_num).round()
    rate_df["Silver"] = (rate_df["Silver_rate"] * sum_num).round()
    rate_df["Bronze"] = (rate_df["Bronze_rate"] * sum_num).round()
    rate_df["Sum"] = rate_df["Gold"] + rate_df["Silver"] + rate_df["Bronze"]
    return rate_df

In [393]:
def sort_medal(df):
    df = df.sort_values(by=["Gold", "Sum"], ascending=False)
    #df["Rank"] = range(1, df.shape[0] + 1)
    return df

## 部分1 为每个国家单独建立模型评估，应用于特定国家

In [394]:
def train_model_each_team(cross_val=False, cross_val_cv=5):
    model_list = []
    predict_result_df = pd.DataFrame()
    team_sum = len(team_df)
    team_no_data = []
    tqdm_bar = tqdm(range(team_sum))
    g_score = 0
    s_score = 0
    b_score = 0
    for i in tqdm_bar:
        noc = team_df.iloc[i]["NOC"]
        try:
            df_train = pd.read_csv("data/dataset/train/" + noc + ".csv")
        except FileNotFoundError:
            team_no_data.append(noc)
            continue
        except pd.errors.EmptyDataError:
            team_no_data.append(noc)
            continue

        if cross_val:
            model_g, model_s, model_b, tmp_g_score, tmp_s_score, tmp_b_score = train_one(df_train, cross_val=True, cross_val_cv=cross_val_cv)
            model_list.append([noc, model_g, model_s, model_b])
            g_score += tmp_g_score
            s_score += tmp_s_score
            b_score += tmp_b_score
        else:
            model_g, model_s, model_b = train_one(df_train)
            model_list.append([noc, model_g, model_s, model_b])

        tqdm_bar.set_description('Team [{}/{}], NOC: {} '.format(i+1, team_sum, noc))

    if cross_val:
        team_finish_num = len(model_list)
        g_score /= team_finish_num
        s_score /= team_finish_num
        b_score /= team_finish_num
        print("Gold MSE: ", g_score)
        print("Silver MSE: ", s_score)
        print("Bronze MSE: ", b_score)

    return model_list, team_no_data

## 部分2 为所有国家建立相同的模型，并对特定国家进行评估

In [395]:
def train_model_all_team(cross_val=False, cross_val_cv=5):
    df_train_sum = pd.read_csv("data/dataset/train/sum_dataset.csv")
    model_g, model_s, model_b = train_one(df_train_sum)
    if cross_val:
        model_g, model_s, model_b, g_score, s_score, b_score = train_one(df_train_sum, cross_val=True, cross_val_cv=cross_val_cv)
        print("Gold MSE: ", g_score)
        print("Silver MSE: ", s_score)
        print("Bronze MSE: ", b_score)
        return [model_g, model_s, model_b, g_score, s_score, b_score]
    else:
        return [model_g, model_s, model_b]

In [396]:
def predict(model_single_list, model_all):
    predict_result_df = pd.DataFrame()
    for model_single in model_single_list:
        noc = model_single[0]
        model_g = model_single[1]
        model_s = model_single[2]
        model_b = model_single[3]
        df_predict = pd.read_csv("data/dataset/predict/" + noc + ".csv")
        tmp_single = predict_one(df_predict, model_g, model_s, model_b)
        tmp_all = predict_one(df_predict, model_all[0], model_all[1], model_all[2])
        tmp_result = pd.DataFrame()
        tmp_result["NOC"] = [noc]
        tmp_result["Gold_rate"] = (tmp_single["Gold_rate"] + tmp_all["Gold_rate"]) / 2
        tmp_result["Silver_rate"] = (tmp_single["Silver_rate"] + tmp_all["Silver_rate"]) / 2
        tmp_result["Bronze_rate"] = (tmp_single["Bronze_rate"] + tmp_all["Bronze_rate"]) / 2
        predict_result_df = pd.concat([predict_result_df, tmp_result], ignore_index=True)
    return predict_result_df

# 模型训练验证

In [409]:
model_all_list = train_model_all_team(cross_val=True, cross_val_cv=40)

Gold MSE:  -0.0002588449128203271
Silver MSE:  -0.00012394391770041607
Bronze MSE:  -0.00010126312560158558


In [410]:
model_single_list, no_data_list = train_model_each_team(cross_val=True, cross_val_cv=-1) # 对每个队伍单独建立模型

Team [206/206], NOC: LES : 100%|██████████| 206/206 [09:28<00:00,  2.76s/it]

Gold MSE:  -5.360827036610374e-05
Silver MSE:  -4.8441736002987295e-05
Bronze MSE:  -4.285461989795019e-05





# 模型实际运行

In [408]:

print("Start training single model")
model_single_list, no_data_list = train_model_each_team() # 对每个队伍单独建立模型
print("Done")

print("Start training all_team model")
model_all_list = train_model_all_team()                   # 对所有队伍建立模型
print("Done")

print("Start predicting")
predict_result = predict(model_single_list, model_all_list) # 预测并进行模型结果平均
print("Done")

Start training single model


Team [206/206], NOC: LES : 100%|██████████| 206/206 [00:53<00:00,  3.88it/s]


Done
Start training all_team model
Done
Start predicting
Done


In [411]:
predict_result = rate_to_medal(predict_result, number_each_medal)
medal_tops = sort_medal(predict_result)
# medal_tops = medal_tops[["NOC", "Gold", "Silver", "Bronze", "Sum"]]
medal_tops

Unnamed: 0,NOC,Gold_rate,Silver_rate,Bronze_rate,Gold,Silver,Bronze,Sum
25,USA,0.141779,0.123208,0.119498,47.0,41.0,39.0,127.0
0,CHN,0.121992,0.078855,0.063548,40.0,26.0,21.0,87.0
65,JPN,0.055587,0.036236,0.030806,18.0,12.0,10.0,40.0
78,GBR,0.051397,0.064337,0.064783,17.0,21.0,21.0,59.0
40,AUS,0.046768,0.052128,0.044479,15.0,17.0,15.0,47.0
...,...,...,...,...,...,...,...,...
198,MHL,0.000007,0.000026,0.000034,0.0,0.0,0.0,0.0
199,KIR,0.000007,0.000026,0.000034,0.0,0.0,0.0,0.0
200,TUV,0.000007,0.000026,0.000034,0.0,0.0,0.0,0.0
201,TGA,0.000007,0.000026,0.000034,0.0,0.0,0.0,0.0
