In [136]:
import pandas as pd
import re
import numpy as np
from datetime import datetime

## Preprocess the data

In [137]:
raw_data_path = './dataset/raw_data.csv'
wind_list_path = './dataset/wind_list.pkl'

### Preprocess the data
###     1. process birthday attribute with dropping the place of the birth in the string
###     2. process record of the performance with replacing '-' into '-1 (h0)'
###     3. extract weight and height attribute from body and drop the rows with no body information
###     4. drop some unuseful columns

raw_data = pd.read_csv(raw_data_path)
raw_data.dropna(subset=['Birth', 'Body', 'Year'], inplace=True)
raw_data['Birthday'] = raw_data['Birth'].apply(lambda x: x.split("in")[0])
raw_data.loc[raw_data['R1'] == '–', 'R1'] = '-1 (h0)'
raw_data.loc[raw_data['R2'] == '–', 'R2'] = '-1 (h0)'
raw_data.loc[raw_data['R3'] == '–', 'R3'] = '-1 (h0)'
drop_index = []
for idx in raw_data.index:
    try:
        raw_data.loc[idx, 'Weight'] = int(raw_data['Body'][idx].split()[0])
        raw_data.loc[idx, 'Height'] = int(raw_data['Body'][idx].split()[3])
    except:
        drop_index.append(idx)
raw_data.drop(index=drop_index, inplace=True)
raw_data.drop(columns=['Gold', 'Silver', 'Bronze', 'Birth', 'Body'], inplace=True)

### Process the information of wind

wind_list = pd.read_pickle(wind_list_path)
wind_dict = {}
for idx in range(len(wind_list)):
    wind_dict[1948 + idx * 4] = wind_list[idx]
print(wind_dict)

### Process the data of performance record
###     1. to make sure the data could be convert to float type
###     2. extract the heat information

raw_data['R1'] = raw_data['R1'].apply(lambda x: re.sub(r'[\[\]w]', '', x))
raw_data['R2'] = raw_data['R2'].apply(lambda x: re.sub(r'[\[\]w]', '', x))
raw_data['R3'] = raw_data['R3'].apply(lambda x: re.sub(r'[\[\]w]', '', x))

raw_data['R1'] = raw_data['R1'].apply(lambda x: re.sub('–', '-1', x))
raw_data['R2'] = raw_data['R2'].apply(lambda x: re.sub('–', '-1', x))
raw_data['R3'] = raw_data['R3'].apply(lambda x: re.sub('–', '-1', x))
raw_data['R4'] = raw_data['R3'].apply(lambda x: re.sub('–', '-1', x))
for i in raw_data.index:
    p1 = raw_data['R1'][i].find('h')
    p2 = raw_data['R2'][i].find('h')
    p3 = raw_data['R3'][i].find('h')
    raw_data.loc[i, 'heat_r1'] = int(raw_data['R1'][i][p1+1:-1])
    raw_data.loc[i, 'heat_r2'] = int(raw_data['R2'][i][p2+1:-1])
    raw_data.loc[i, 'heat_r3'] = int(raw_data['R3'][i][p3+1:-1])
    p1 = raw_data['R1'][i].find('(')
    p2 = raw_data['R2'][i].find('(')
    p3 = raw_data['R3'][i].find('(')
    p4 = raw_data['R4'][i].find('(')
    raw_data.loc[i, 'R1'] = float(raw_data['R1'][i][:p1])
    raw_data.loc[i, 'R2'] = float(raw_data['R2'][i][:p2])
    raw_data.loc[i, 'R3'] = float(raw_data['R3'][i][:p3])
    raw_data.loc[i, 'R4'] = float(raw_data['R4'][i][:p3])


num_heat = [raw_data.groupby('Year')['heat_r1'].max(), # use to match the wind information
            raw_data.groupby('Year')['heat_r2'].max(), 
            raw_data.groupby('Year')['heat_r3'].max()]
raw_data

{1948: [1.1, 0.3, 0.4, 1.3, 0.9, 1.0, 0.9, 0.3, 0.6, 3.3, 0.9, 0.2, 0.0, 1.5, 1.9, 1.5, 1.3, 1.6], 1952: [], 1956: [0.0, -1.2, -2.8, 0.0, 0.1, -0.5, -0.6, 0.0, -0.9, -1.2, -0.7, 0.0, -1.4, 0.0, -1.0, -2.2, -2.3, -1.1, -2.5], 1960: [-0.3, -0.2, -0.3, -0.2, -0.2, -0.5, -0.3, -0.1, -0.2, -0.5, -0.3, -1.3, -2.3, 0.0, 0.0, 0.0], 1964: [0.6, -2.5, -2.8, -0.7, 0.3, 1.6, -1.8, 0.2, 0.2, -0.5, 1.9, 1.7, 1.0, 1.7, 5.3, -1.3, 1.0], 1968: [2.8, 0.8, 0.0, 0.6, 0.7, 3.8, 0.4, 0.0, 0.0, 1.8, 0.5, 4.2, 2.0, 1.6, 0.0, 0.3], 1972: [-0.7, -2.3, 0.5, 2.3, 0.8, 1.2, -1.9, 2.1, 0.6, -0.3, -2.2, -0.3, 1.8, -2.3, 0.0, 3.4, 0.3, 0.0, 0.2, 0.3], 1976: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1, 0.1, 0.1, 0.1, 1.1, 0.7, 0.0], 1980: [0.0, 0.1, 0.5, -0.2, 0.9, 0.0, -0.1, -0.1, 0.3, 1.4, 0.2, 0.3, 0.3, 0.5, 0.5, 1.2], 1984: [-0.4, 1.8, 1.8, -0.8, 1.4, 1.9, 0.8, -0.8, -1.7, -1.4, 1.4, -0.7, 0.1, 1.4, 0.8, 0.8, 0.7, -1.5, 0.2], 1988: [0.6, 0.9, 0.7, 0.8, 1.1, 1.4, 1.8, 2.0, 1.0, 1.4, 1.0, 1.4, 0.9, 1.2, 1.7, 0

Unnamed: 0,Name,Nation,R1,R2,R3,R4,Year,Birthday,Weight,Height,heat_r1,heat_r2,heat_r3
0,Harrison Dillard,USA,10.4,10.4,10.5,10.5,1948,8 July 1923,178.0,69.0,5.0,1.0,1.0
1,Barney Ewell,USA,10.5,10.5,10.5,10.5,1948,25 February 1918,180.0,71.0,1.0,2.0,1.0
2,Lloyd LaBeach,PAN,10.5,10.5,10.5,10.5,1948,28 June 1922,185.0,73.0,3.0,4.0,2.0
3,Alastair McCorquodale,GBR,10.5,10.5,10.7,10.7,1948,5 December 1925,183.0,78.0,1.0,3.0,1.0
4,Mel Patton,USA,10.6,10.4,10.4,10.4,1948,16 November 1924,185.0,72.0,2.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1422,Hassan Saaid,MDV,10.7,-1.0,-1.0,-1.0,2020,4 March 1992,157.0,60.0,1.0,0.0,0.0
1425,Arturo Rojas,BOL,10.64,-1.0,-1.0,-1.0,2020,27 May 1993,183.0,67.0,2.0,0.0,0.0
1428,Didier Kiki,BEN,10.69,-1.0,-1.0,-1.0,2020,30 November 1995,185.0,84.0,2.0,0.0,0.0
1437,Karalo Maibuca,TUV,11.42,-1.0,-1.0,-1.0,2020,10 June 1999,176.0,64.0,3.0,0.0,0.0


## Process the data

### Extract train data from raw data

In [138]:
train_data = pd.DataFrame(columns=['Name', 'NOC', 'Weight', 'Height', 'Birthday' , 'Year', 'Round', 'Wind', 'Label'])
for _, row in raw_data.iterrows():
    for round in range(1, 5):
        col = f'R{round}'
        if row[col] == -1:
            continue
        try:
            wind_idx = 0 if round == 1 else num_heat[round - 2][row['Year']]
            wind_idx = wind_idx + row[f'heat_r{round}'] - 1 if round != 4 else -1
            wind = wind_dict[row['Year']][int(wind_idx)]
        except:
            break
        new_row = list(row[['Name', 'Nation', 'Weight', 'Height', 'Birthday' , 'Year']])
        new_row += [round, wind, row[col]]
        train_data.loc[len(train_data)] = new_row
train_data

Unnamed: 0,Name,NOC,Weight,Height,Birthday,Year,Round,Wind,Label
0,Harrison Dillard,USA,178.0,69.0,8 July 1923,1948,1,0.9,10.40
1,Harrison Dillard,USA,178.0,69.0,8 July 1923,1948,2,0.0,10.40
2,Harrison Dillard,USA,178.0,69.0,8 July 1923,1948,3,0.9,10.50
3,Harrison Dillard,USA,178.0,69.0,8 July 1923,1948,4,1.6,10.50
4,Barney Ewell,USA,180.0,71.0,25 February 1918,1948,1,1.1,10.50
...,...,...,...,...,...,...,...,...,...
2291,Didier Kiki,BEN,185.0,84.0,30 November 1995,2020,1,0.0,10.69
2292,Karalo Maibuca,TUV,176.0,64.0,10 June 1999,2020,1,0.9,11.42
2293,Chijindu Ujah,GBR,182.0,81.0,5 March 1994,2020,2,0.8,10.08
2294,Chijindu Ujah,GBR,182.0,81.0,5 March 1994,2020,3,0.8,10.11


### Process the train data

In [139]:
import calendar
olympic_list = {
    1948: ['GBR', '19480730'],
    1952: ['FIN', '19520721'],
    1956: ['AUS', '19561123'],
    1960: ['ITA', '19600831'],
    1964: ['JPN', '19641014'],
    1968: ['MEX', '19681013'], 
    1972: ['GER', '19720831'],
    1976: ['CAN', '19760723'],
    1980: ['URS', '19800724'],
    1984: ['USA', '19840803'],
    1988: ['KOR', '19880923'],
    1992: ['ESP', '19920731'],
    1996: ['USA', '19960726'],
    2000: ['AUS', '20000922'],
    2004: ['GRE', '20040821'],
    2008: ['CHN', '20080815'],
    2012: ['GBR', '20120804'],
    2016: ['BRA', '20160813'],
    2020: ['JPN', '20210731'],
}

### Process the data with 

nation_cnt = dict(train_data.groupby(by=['NOC']).count()['Label'].sort_values(ascending=False))
nation_list = {}
nation_index = 0
print(nation_cnt)
for k, i in nation_cnt.items():
    nation_list[k] = nation_index
    nation_index += 1
print(nation_list)

drop_index = []
for idx, row in train_data.iterrows():
    birthday = row['Birthday']
    try:
        month = birthday.split()[1]
    except:
        drop_index.append(idx)
        continue
    birthday = birthday.replace(month, str(list(calendar.month_name).index(month)))
    while birthday[-1] == ' ':
        birthday = birthday[:-1]
    birthday = datetime.strptime(birthday, "%d %m %Y")
    train_data.loc[idx, "Age"] = (datetime.strptime(olympic_list[row['Year']][1], "%Y%m%d") - birthday).days / 365.25
    train_data.loc[idx, "isHometown"] = int(row['NOC'] == olympic_list[row['Year']][0])
    train_data.loc[idx, "BMI"] = row['Weight'] * 10000 / row['Height'] / row['Height']
    train_data.loc[idx, 'Nation'] = nation_list[row['NOC']]

train_data_path = './dataset/train_data.csv'
cols = ['Name', 'Age', 'Nation', 'Weight', 'Height', 'BMI', 'Year', 'Round', 'Wind', 'isHometown', 'Label']
train_data = train_data[cols]
train_data.drop(index=drop_index, inplace=True)
train_data.to_csv(train_data_path, index=False)
train_data


{'USA': 181, 'GBR': 128, 'JAM': 116, 'FRA': 95, 'CAN': 87, 'NGR': 73, 'TTO': 70, 'BRA': 64, 'URS': 57, 'GHA': 56, 'JPN': 56, 'POL': 53, 'CUB': 43, 'AUS': 39, 'BAH': 36, 'GDR': 36, 'CIV': 35, 'ITA': 33, 'GER': 31, 'CHN': 27, 'SKN': 26, 'FRG': 26, 'BAR': 22, 'HUN': 21, 'SEN': 20, 'INA': 19, 'VEN': 19, 'ESP': 18, 'KEN': 17, 'QAT': 15, 'RSA': 15, 'GRE': 14, 'POR': 13, 'CMR': 13, 'UGA': 13, 'ANT': 13, 'NAM': 12, 'PAK': 12, 'BUL': 12, 'BEL': 12, 'CGO': 11, 'CYP': 11, 'PAN': 11, 'DOM': 11, 'GAM': 11, 'SUR': 11, 'MAD': 10, 'ISV': 10, 'TPE': 10, 'THA': 10, 'IRI': 9, 'AHO': 9, 'PUR': 9, 'MDV': 9, 'SGP': 9, 'SUI': 9, 'ZAM': 8, 'UKR': 8, 'NOR': 8, 'OMA': 8, 'MAS': 8, 'BER': 8, 'CHI': 8, 'BUR': 8, 'NZL': 8, 'KOR': 7, 'NED': 7, 'MEX': 7, 'MLI': 7, 'KSA': 7, 'GAB': 7, 'BAN': 7, 'LBR': 7, 'RUS': 7, 'TGA': 7, 'SLE': 7, 'TCH': 7, 'GUY': 6, 'CAY': 6, 'ARG': 6, 'AUT': 6, 'BEN': 6, 'TUR': 6, 'LAO': 6, 'GUI': 6, 'SWE': 6, 'ISL': 6, 'MRI': 6, 'EUN': 6, 'FIJ': 6, 'MAR': 5, 'PLW': 5, 'URU': 5, 'PHI': 5, 'LES':

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(index=drop_index, inplace=True)


Unnamed: 0,Name,Age,Nation,Weight,Height,BMI,Year,Round,Wind,isHometown,Label
0,Harrison Dillard,25.062286,0.0,178.0,69.0,373.871035,1948,1,0.9,0.0,10.40
1,Harrison Dillard,25.062286,0.0,178.0,69.0,373.871035,1948,2,0.0,0.0,10.40
2,Harrison Dillard,25.062286,0.0,178.0,69.0,373.871035,1948,3,0.9,0.0,10.50
3,Harrison Dillard,25.062286,0.0,178.0,69.0,373.871035,1948,4,1.6,0.0,10.50
4,Barney Ewell,30.425736,0.0,180.0,71.0,357.072010,1948,1,1.1,0.0,10.50
...,...,...,...,...,...,...,...,...,...,...,...
2291,Didier Kiki,25.667351,81.0,185.0,84.0,262.188209,2020,1,0.0,0.0,10.69
2292,Karalo Maibuca,22.140999,131.0,176.0,64.0,429.687500,2020,1,0.9,0.0,11.42
2293,Chijindu Ujah,27.405886,1.0,182.0,81.0,277.396738,2020,2,0.8,0.0,10.08
2294,Chijindu Ujah,27.405886,1.0,182.0,81.0,277.396738,2020,3,0.8,0.0,10.11
