# 导入必要的库

In [110]:
import pandas as pd
import numpy as np

# 读取数据

In [111]:
athletes_df = pd.read_csv(r'D:\mcm2025\data\processed\athletes.csv', encoding='ISO-8859-1')
hosts_df = pd.read_csv(r'D:\mcm2025\data\processed\hosts.csv', encoding='ISO-8859-1')
medal_counts_df = pd.read_csv(r'D:\mcm2025\data\processed\medal_counts.csv', encoding='ISO-8859-1')
programs_df = pd.read_csv(r'D:\mcm2025\data\processed\programs.csv', encoding='ISO-8859-1')

In [112]:
print(athletes_df.head())
print(hosts_df.head())
print(medal_counts_df.head())
print(programs_df.head())

                    Name Sex            Team  NOC  Year         City  \
0              A Dijiang   M           China  CHN  1992    Barcelona   
1               A Lamusi   M           China  CHN  2012       London   
2            Gunnar Aaby   M         Denmark  DEN  1920    Antwerpen   
3            Edgar Aabye   M  Denmark/Sweden  DEN  1900        Paris   
4  Cornelia (-strannood)   F     Netherlands  NED  1932  Los Angeles   

        Sport                         Event     Medal  
0  Basketball   Basketball Men's Basketball  No medal  
1        Judo  Judo Men's Extra-Lightweight  No medal  
2    Football       Football Men's Football  No medal  
3  Tug-Of-War   Tug-Of-War Men's Tug-Of-War      Gold  
4   Athletics  Athletics Women's 100 metres  No medal  
   Year                         Host
0  1896            Â Athens,Â Greece
1  1900             Â Paris,Â France
2  1904  Â St. Louis,Â United States
3  1908    Â London,Â United Kingdom
4  1912         Â Stockholm,Â Sweden
   Rank  

# XGboost

In [113]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [114]:
athletes_data = pd.read_csv(r'D:\mcm2025\data\processed\athletes.csv', encoding='ISO-8859-1')
hosts_data = pd.read_csv(r'D:\mcm2025\data\processed\hosts.csv', encoding='ISO-8859-1')
medal_counts_data = pd.read_csv(r'D:\mcm2025\data\processed\medal_counts.csv', encoding='ISO-8859-1')
programs_data = pd.read_csv(r'D:\mcm2025\data\processed\programs.csv', encoding='ISO-8859-1')

In [115]:
medal_data = medal_counts_data[["NOC", "Year", "Gold", "Silver", "Bronze", "Total"]]

medal_data = medal_data.merge(hosts_data, on="Year", how="left")
medal_data["Host_Status"] = (medal_data["Host"] == medal_data["NOC"]).astype(int)


In [116]:
athletes_summary = athletes_data.groupby(["NOC", "Year"]).agg(
    Total_Athletes=("Name", "count"),  # Total number of athletes
    Total_Sports=("Sport", "nunique")  # Unique sports represented
).reset_index()

medal_data = medal_data.merge(athletes_summary, on=["NOC", "Year"], how="left")

medal_data["Total_Athletes"] = medal_data["Total_Athletes"].fillna(0)
medal_data["Total_Sports"] = medal_data["Total_Sports"].fillna(0)


In [117]:
programs_data_melted = programs_data.melt(
    id_vars=["Sport", "Discipline", "Code", "Sports Governing Body"], 
    var_name="Year", 
    value_name="Event_Count"
)

programs_data_melted["Year"] = pd.to_numeric(programs_data_melted["Year"], errors="coerce")
programs_data_melted = programs_data_melted.dropna(subset=["Year", "Event_Count"])

programs_events = programs_data_melted.groupby("Year").agg({"Event_Count": "sum"}).reset_index()
programs_events.rename(columns={"Event_Count": "Total_Events"}, inplace=True)

medal_data = medal_data.merge(programs_events, on="Year", how="left")


In [118]:
original_noc = medal_data["NOC"]

medal_data["NOC"] = medal_data["NOC"].astype("category").cat.codes

noc_mapping = dict(enumerate(original_noc.astype("category").cat.categories))


In [119]:
# 添加历史表现的金牌数和总奖牌数
medal_data["Historical_Gold"] = (
    medal_data.groupby("NOC")["Gold"]
    .apply(lambda x: x.shift().cumsum())
    .fillna(0)  # Replace NaN with 0 for the first year
    .reset_index(drop=True)
)

medal_data["Historical_Total"] = (
    medal_data.groupby("NOC")["Total"]
    .apply(lambda x: x.shift().cumsum())
    .fillna(0)  # Replace NaN with 0 for the first year
    .reset_index(drop=True)
)

In [120]:
# 添加加权历史表现和滚动平均值
def calculate_weighted_and_rolling_features(df, group_col, value_col, span):
    """
    :param df: DataFrame
    :param group_col: 分组列（国家）
    :param value_col: 计算的数值列（如 Gold 或 Total）
    :param span: EWMA 的跨度
    :return: 包含加权历史和滚动平均值的新列
    """
    df[f"Weighted_{value_col}"] = (
        df.groupby(group_col)[value_col]
        .apply(lambda x: x.shift().ewm(span=span, adjust=False).mean())
        .fillna(0)  # 首次历史值填充为0
        .reset_index(drop=True)
    )
    df[f"Rolling_{value_col}"] = (
        df.groupby(group_col)[value_col]
        .apply(lambda x: x.shift().rolling(window=3, min_periods=1).mean())
        .fillna(0)  # 首次滚动值填充为0
        .reset_index(drop=True)
    )
    return df

In [121]:
# 应用函数到金牌数和总奖牌数
medal_data = calculate_weighted_and_rolling_features(medal_data, "NOC", "Gold", span=3)
medal_data = calculate_weighted_and_rolling_features(medal_data, "NOC", "Total", span=3)

In [122]:
X = medal_data[[
    "Year", "NOC", "Host_Status", "Total_Events", 
    "Total_Athletes", "Total_Sports", 
    "Historical_Gold", "Historical_Total", 
    "Weighted_Gold", "Rolling_Gold", 
    "Weighted_Total", "Rolling_Total"
]]
y = medal_data["Gold"]  # Predicting Gold medal count

# 确保所有数据类型正确
X["Year"] = X["Year"].astype(int)
X["Host_Status"] = X["Host_Status"].astype(int)
X["Total_Events"] = X["Total_Events"].astype(int)
X["Total_Athletes"] = X["Total_Athletes"].astype(int)
X["Total_Sports"] = X["Total_Sports"].astype(int)
X["Historical_Gold"] = X["Historical_Gold"].astype(int)
X["Historical_Total"] = X["Historical_Total"].astype(int)
X["Weighted_Gold"] = X["Weighted_Gold"].astype(int)
X["Rolling_Gold"] = X["Rolling_Gold"].astype(int)
X["Weighted_Total"] = X["Weighted_Total"].astype(int)
X["Rolling_Total"] = X["Rolling_Total"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Year"] = X["Year"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Host_Status"] = X["Host_Status"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Total_Events"] = X["Total_Events"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [123]:
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=200,
    max_depth=5,
    learning_rate=0.2,
    random_state=42
)

xgb_model.fit(X_train, y_train)


In [124]:
# 模型评估
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 54.26072788338403


In [126]:
# 预测结果并将其转换为正整数
predictions = pd.DataFrame({
    "Year": X_test["Year"],
    "NOC": X_test["NOC"],
    "Actual_Gold": y_test,
    "Predicted_Gold": y_pred
})

# 将预测结果转换为正整数
predictions["Predicted_Gold"] = predictions["Predicted_Gold"].round().astype(int)
predictions["Predicted_Gold"] = predictions["Predicted_Gold"].apply(lambda x: max(x, 0))  # 确保不为负数

# 将 NOC 编码还原为国家名称
predictions["NOC_Name"] = predictions["NOC"].map(noc_mapping)

# 按年份和国家排序
predictions = predictions.sort_values(by=["Year", "NOC_Name"]).reset_index(drop=True)

# 显示部分结果
print(predictions.head(10))

# 保存结果到 CSV 文件
predictions.to_csv("predicted_gold_medals.csv", index=False)


   Year  NOC  Actual_Gold  Predicted_Gold       NOC_Name
0  1900   16            6               2        Belgium
1  1900   19            0              12        Bohemia
2  1900   30            1               1         Canada
3  1900   84            0               0          India
4  1904    7            0               3      Australia
5  1904   62            0               6         France
6  1904  194           76              23  United States
7  1908    9            0               1        Austria
8  1908   30            3               1         Canada
9  1908   66            3               7        Germany
