## 序列数据加载

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:


# 1. Load the raw CSV.
file_path = 'data/summerOly_athletes.csv'
data = pd.read_csv(file_path)

# 2. Preprocessing

# Derive the three tensor axes from the *unfiltered* table, so that every
# sport / year / NOC appearing anywhere in the file gets an index, even if
# it never appears on a medal row.
sports = sorted(data['Sport'].unique())  # sports            -> axis S
years = sorted(data['Year'].unique())    # Olympic years     -> axis T
nocs = sorted(data['NOC'].unique())      # national teams    -> axis N

# Keep only the rows that carry a medal.
data = data[data['Medal'] != 'No medal']

# Value -> axis-position lookup tables.
sport_to_idx = {sport: i for i, sport in enumerate(sports)}
year_to_idx = {year: i for i, year in enumerate(years)}
noc_to_idx = {noc: i for i, noc in enumerate(nocs)}
print(sport_to_idx)
print(year_to_idx)

# Allocate the result tensor (all zeros = "no medal").
S, T, N = len(sports), len(years), len(nocs)
result_tensor = np.zeros((S, T, N), dtype=int)

# 3. Fill the tensor.
# Translate each medal row into its (sport, year, NOC) coordinates in one
# vectorised pass instead of iterating with DataFrame.iterrows(), then set all
# of those cells at once. Repeated coordinates are harmless: the cell is
# simply set to 1 again, exactly as in a row-by-row loop.
sport_idx = data['Sport'].map(sport_to_idx).to_numpy(dtype=int)
year_idx = data['Year'].map(year_to_idx).to_numpy(dtype=int)
noc_idx = data['NOC'].map(noc_to_idx).to_numpy(dtype=int)
result_tensor[sport_idx, year_idx, noc_idx] = 1  # this NOC won a medal in this sport that year

# 4. Save the result.
output_path = 'data/olympic_medals.npy'
np.save(output_path, result_tensor)
print(f"结果张量已保存到 {output_path}")
print(f"张量形状: {result_tensor.shape}")  # shape is (S, T, N)


{'3x3 Basketball': 0, 'Aeronautics': 1, 'Alpinism': 2, 'Archery': 3, 'Art Competitions': 4, 'Artistic Gymnastics': 5, 'Artistic Swimming': 6, 'Athletics': 7, 'Badminton': 8, 'Baseball': 9, 'Baseball/Softball': 10, 'Basketball': 11, 'Basque Pelota': 12, 'Beach Volleyball': 13, 'Boxing': 14, 'Breaking': 15, 'Canoe Slalom': 16, 'Canoe Sprint': 17, 'Canoeing': 18, 'Cricket': 19, 'Croquet': 20, 'Cycling': 21, 'Cycling BMX Freestyle': 22, 'Cycling BMX Racing': 23, 'Cycling Mountain Bike': 24, 'Cycling Road': 25, 'Cycling Road, Cycling Mountain Bike': 26, 'Cycling Road, Cycling Track': 27, 'Cycling Road, Triathlon': 28, 'Cycling Track': 29, 'Diving': 30, 'Equestrian': 31, 'Equestrianism': 32, 'Fencing': 33, 'Figure Skating': 34, 'Football': 35, 'Golf': 36, 'Gymnastics': 37, 'Handball': 38, 'Hockey': 39, 'Ice Hockey': 40, 'Jeu De Paume': 41, 'Judo': 42, 'Karate': 43, 'Lacrosse': 44, 'Marathon Swimming': 45, 'Marathon Swimming, Swimming': 46, 'Modern Pentathlon': 47, 'Motorboating': 48, 'Polo':

In [4]:
# Number of (year, NOC) cells set to 1 for the sport at index 8.
t = result_tensor[8]
print(t.sum())
# Reverse lookup (tensor index -> sport name), then list every sport whose
# slice contains fewer than 10 medal cells in total.
sport_to_idx_inv = {position: name for name, position in sport_to_idx.items()}
selected_sports = [
    sport_to_idx_inv[i]
    for i in range(S)
    if result_tensor[i].sum() < 10
]
print(selected_sports)


57
['Aeronautics', 'Alpinism', 'Artistic Swimming', 'Baseball/Softball', 'Basque Pelota', 'Breaking', 'Cricket', 'Croquet', 'Cycling BMX Racing', 'Cycling Mountain Bike', 'Cycling Road, Cycling Mountain Bike', 'Cycling Road, Cycling Track', 'Cycling Road, Triathlon', 'Figure Skating', 'Ice Hockey', 'Jeu De Paume', 'Lacrosse', 'Marathon Swimming', 'Marathon Swimming, Swimming', 'Motorboating', 'Racquets', 'Roque', 'Surfing', 'Trampoline Gymnastics']


## 平滑操作

In [5]:
# Take one sport's (year x NOC) slice of the saved medal tensor as the
# float32 time series to train on.
idx = 8  # index of the chosen sport
medal_dist = np.load('data/olympic_medals.npy')[idx].astype(np.float32)

# The athlete-count-per-country table of the same sport supplies the
# smoothing parameters.
sport = sport_to_idx_inv[idx]
file_path = f'data/athletes_count/{sport}.csv'
athletes_count = pd.read_csv(file_path)
# The column labels are not stored in order, so sort them and reindex.
# NOTE: this rebinds `years`, which previously held the sorted Olympic years,
# to the sorted column labels of this table.
years = sorted(athletes_count.columns)
athletes_count = athletes_count[years]
# Convert to a NumPy array, drop the last column (per the original note it is
# the NOC label once the columns are sorted) and transpose, so that each row
# holds one year's athlete distribution over countries.
athletes_per_noc = athletes_count.to_numpy()[:, :-1].T



def smooth_distribution_with_decrease(A, B, smooth_factor=1.0):
    """Smooth the values ``A`` using the reference distribution ``B``.

    ``B`` is normalised by its total sum (or replaced by zeros when that sum
    is not positive) and scaled by ``smooth_factor`` to give a per-entry
    shift. Each entry of the result is then:

    * the shift itself, where ``A`` is exactly 0;
    * ``A`` minus the shift, where ``A`` is non-zero, with the reduction
      capped at ``A`` so the subtraction never overshoots;
    * 0 wherever ``B`` is exactly 0, regardless of the two rules above.

    Args:
        A: array-like of values to smooth (the notebook passes one year's
            1-D medal vector). Converted to a float array; not modified.
        B: array-like with the same shape as ``A``. Converted to a float
            array; not modified. Normalisation uses the sum over *all*
            entries, also for multi-dimensional input.
        smooth_factor: multiplier applied to the normalised ``B``.

    Returns:
        A new float ``numpy.ndarray`` with the same shape as ``A``.

    Raises:
        AssertionError: if ``A`` and ``B`` differ in shape.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)
    assert A.shape == B.shape, f"A({A.shape}) and B({B.shape}) must have the same shape."

    # Normalise B; fall back to all zeros when there is nothing to distribute.
    total = B.sum()
    B_norm = B / total if total > 0 else np.zeros_like(B)
    shift = B_norm * smooth_factor

    # Vectorised form of the per-entry rule: zeros receive the shift, non-zero
    # entries give up at most their own value.
    A_smoothed = np.where(A == 0, shift, A - np.minimum(A, shift))

    # Entries without any mass in B must end up at exactly 0.
    A_smoothed[B == 0] = 0

    return A_smoothed


# Smooth each year's medal vector with that year's athlete distribution,
# writing the result back into the same row.
for year_pos in range(T):
    medals_that_year = medal_dist[year_pos]
    athletes_that_year = athletes_per_noc[year_pos]
    medal_dist[year_pos] = smooth_distribution_with_decrease(medals_that_year, athletes_that_year)
# Persist the smoothed series.
np.save('data/medals_dist_smoothed.npy', medal_dist)


## 建立模型 （针对某一项运动）

In [None]:

# Load the smoothed series as a float32 tensor.
# (Alternative input: the raw, unsmoothed slice np.load('data/olympic_medals.npy')[idx].)
smoothed_path = 'data/medals_dist_smoothed.npy'
x = torch.tensor(np.load(smoothed_path), dtype=torch.float32)

# T = number of time steps, N = length of the vector at each step.
T, N = x.shape

# 定义权重优化模型
class WeightOptimizer(nn.Module):
    """Learn one global, non-decreasing weight sequence for history-weighted prediction.

    The module owns a single trainable vector of raw weights. In ``forward``
    the raw weights are passed through softplus (making every increment
    positive) and a cumulative sum (making the sequence increasing). Step
    ``k`` of the input series is then predicted as the mean of the first
    ``k`` steps, each multiplied by its weight.
    """

    def __init__(self, T):
        """Create ``T`` raw weights, all initialised to 1.

        Args:
            T: length of the weight sequence.
        """
        super().__init__()
        # Raw (unconstrained) global weight sequence, trainable.
        self.weights = nn.Parameter(torch.ones(T))

    def forward(self, x):
        """Return the prediction loss of the current weights on the series ``x``.

        For every ``k`` in ``1 .. T-1`` the prediction is
        ``mean(weights[:k, None] * x[:k], dim=0)`` and its squared Euclidean
        distance to ``x[k]`` is added to the loss.

        Args:
            x: 2-D tensor of shape ``(T, N)``; unpacking its size fails for
                any other rank.

        Returns:
            A scalar tensor: the summed squared errors divided by ``T``.
            Only ``T - 1`` terms are summed; the divisor ``T`` is kept as a
            constant scale factor so existing loss values stay comparable.

        Note:
            Only ``weights[:k]`` with ``k <= T - 1`` is ever used, so when
            ``x`` has as many rows as there are weights, the last raw weight
            receives zero gradient and stays at its initial value.
        """
        T, _ = x.size()  # enforces a 2-D input; only the length is needed

        # The effective weights do not depend on k, so compute them once:
        # softplus keeps increments positive, cumsum makes them increasing.
        weights = torch.cumsum(nn.functional.softplus(self.weights), dim=0)  # (T,)

        # Start from a tensor so the result is a tensor even when T == 1.
        total_loss = weights.new_zeros(())
        for k in range(1, T):  # predict every step from the second one on
            # Weighted mean of the first k steps.
            x_pred = torch.mean(weights[:k].unsqueeze(1) * x[:k], dim=0)  # (N,)
            # Squared error as a plain sum of squares (no sqrt-then-square).
            total_loss = total_loss + (x_pred - x[k]).pow(2).sum()

        return total_loss / T

# Build the model with one raw weight per time step.
model = WeightOptimizer(T)

# Adam over the model's parameters.
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: every epoch evaluates the loss on the whole series x.
num_epochs = 1000
for epoch in range(num_epochs):
    optimizer.zero_grad()   # reset accumulated gradients
    loss = model(x)         # forward pass returns the loss directly
    loss.backward()         # backpropagate
    optimizer.step()        # update the raw weights
    # Report every 50th epoch (1-based numbering).
    if not (epoch + 1) % 50:
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")

# Show the learned effective weights: softplus followed by a cumulative sum,
# the same transform the model applies in its forward pass.
raw_weights = model.weights
positive_steps = torch.nn.functional.softplus(raw_weights)
weights = torch.cumsum(positive_steps, dim=0)
print("Learned weights:", weights)


Epoch 50/1000, Loss: 5.8723
Epoch 100/1000, Loss: 3.2642
Epoch 150/1000, Loss: 2.0102
Epoch 200/1000, Loss: 1.3615
Epoch 250/1000, Loss: 1.0005
Epoch 300/1000, Loss: 0.7868
Epoch 350/1000, Loss: 0.6538
Epoch 400/1000, Loss: 0.5675
Epoch 450/1000, Loss: 0.5098
Epoch 500/1000, Loss: 0.4701
Epoch 550/1000, Loss: 0.4423
Epoch 600/1000, Loss: 0.4225
Epoch 650/1000, Loss: 0.4081
Epoch 700/1000, Loss: 0.3977
Epoch 750/1000, Loss: 0.3900
Epoch 800/1000, Loss: 0.3843
Epoch 850/1000, Loss: 0.3800
Epoch 900/1000, Loss: 0.3768
Epoch 950/1000, Loss: 0.3744
Epoch 1000/1000, Loss: 0.3726
Learned weights: tensor([0.2363, 0.4725, 0.7088, 0.9450, 1.1813, 1.4175, 1.6538, 1.8901, 2.1263,
        2.3626, 2.5988, 2.8351, 3.0713, 3.3076, 3.5438, 3.7801, 4.0164, 4.2526,
        4.4889, 4.7251, 4.9614, 5.1976, 5.4339, 5.6593, 5.8821, 6.0946, 6.3149,
        6.5198, 6.7180, 6.9205, 8.2338], grad_fn=<CumsumBackward0>)
