In [1]:
from bs4 import BeautifulSoup

In [2]:
from urllib.request import urlopen

In [3]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import kendalltau

## Scrape scores from result pages

In [22]:
# Result-page URL for each event, keyed by a short event code.
# 'WR' is the event later split out as `world_scores`; every other key
# contributes to `season_scores`.
# NOTE: 'JP' points at a differently named page (data0190.htm) than the other
# events (CAT001RS.HTM); parse_score has a dedicated branch for it.
links = {
    'US': 'http://www.isuresults.com/results/season1718/gpusa2017/CAT001RS.HTM',
    'CA': 'http://www.isuresults.com/results/season1718/gpcan2017/CAT001RS.HTM',
    'FR': 'http://www.isuresults.com/results/season1718/gpfra2017/CAT001RS.HTM',
    'RU': 'http://www.isuresults.com/results/season1718/gprus2017/CAT001RS.HTM',
    'CN': 'http://www.isuresults.com/results/season1718/gpchn2017/CAT001RS.HTM',
    'JP': 'http://www.isuresults.com/results/season1718/gpjpn2017/data0190.htm',
    'EU': 'http://www.isuresults.com/results/season1718/ec2018/CAT001RS.HTM',
    '4C': 'http://www.isuresults.com/results/season1718/fc2018/CAT001RS.HTM',
    'WR': 'http://www.isuresults.com/results/season1718/wc2018/CAT001RS.HTM'
}

In [25]:
def parse_score(link, event_name, year):
    """Scrape one result page into a tidy table of skater scores.

    Parameters
    ----------
    link : str
        URL of the result page. Only the first ``<table>`` element of the
        page is parsed.
    event_name : str
        Short event code written to the ``event`` column (e.g. ``'US'``).
    year : int
        Value written to the ``year`` column. Together with ``event_name``
        it selects the column layout used to extract names and scores.

    Returns
    -------
    pandas.DataFrame
        One row per skater with columns ``name``, ``score`` (float),
        ``event`` and ``year``.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    IndexError
        If the page contains no ``<table>`` element.
    """
    # Local import so this cell does not depend on another cell's imports.
    from io import StringIO

    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(link) as response:
        bs = BeautifulSoup(response.read(), 'html.parser')

    table_str = str(bs.find_all('table')[0])
    # Wrap the markup in a file-like object: newer pandas releases deprecate
    # passing literal HTML strings, and file-like input works on old ones too.
    table = pd.read_html(StringIO(table_str))[0]
    if year == 2017 and event_name == 'JP':
        # This page uses a different layout (presumably because it is not the
        # standard CAT001RS page): name/score sit in columns 1 and 5, and the
        # first and last remaining rows are not skater rows.
        table = table[[1, 5]].dropna().iloc[1:-1]
    else:
        # Column 8 is selected only so that dropna() also discards rows where
        # it is empty; it is dropped again by the ``:2`` column slice.
        table = table[[1, 6, 8]].dropna().iloc[2:, :2]

    # Work on an independent copy so the assignments below do not write into
    # a slice of the parsed table (avoids SettingWithCopyWarning).
    table = table.copy()
    table.columns = ['name', 'score']
    table['event'] = event_name
    table['score'] = table['score'].astype(float)
    table['year'] = year
    return table

In [33]:
# Scraping path (currently disabled): run parse_score over every entry in
# `links`, concatenate the per-event tables, replace non-breaking spaces
# ('\xa0') in the names with regular spaces, and reset the index.
# scores = pd.concat((parse_score(link, event_name, 2017) for event_name, link in links.items()), axis=0)
# scores['name'] = scores['name'].str.replace('\xa0', ' ')
# scores.reset_index(drop=True, inplace=True)
# Active path: load the scores from a local CSV instead of hitting the network.
# Column names are supplied explicitly, so pandas treats the first line of the
# file as data; presumably the file was saved from the scraping path above
# without a header row -- TODO confirm.
scores = pd.read_csv('scores/2017.csv', names=['name', 'score', 'event', 'year'])

In [35]:
# Split on the event code: rows tagged 'WR' form world_scores,
# rows from every other event form season_scores.
season_scores = scores[scores['event'].ne('WR')]
world_scores = scores[scores['event'].eq('WR')]

In [36]:
# Mean score per skater across the non-'WR' events, highest average first.
season_avg = (
    season_scores
    .groupby('name')['score']
    .mean()
    .sort_values(ascending=False)
)
season_avg.head()

name
Shoma UNO           290.786667
Yuzuru HANYU        290.770000
Nathan CHEN         284.835000
Javier FERNANDEZ    277.440000
Boyang JIN          270.486667
Name: score, dtype: float64

## Implement the Kendall tau metric

In [39]:
# Restrict both rankings to skaters that appear in both tables.
# season_ranking keeps the order of season_avg (descending season average);
# world_ranking keeps the row order of world_scores.
season_ranking = season_avg.index[season_avg.index.isin(world_scores['name'])].tolist()
world_ranking = world_scores.loc[world_scores['name'].isin(season_avg.index), 'name'].tolist()

In [43]:
# Every ordered pair (a, b) means "a is ranked ahead of b" in that ranking.
season_pairs = set(combinations(season_ranking, 2))
world_pairs = set(combinations(world_ranking, 2))
# A pair is concordant when both rankings put the two skaters in the same order.
concordant_pairs = season_pairs.intersection(world_pairs)
# tau = (concordant - discordant) / total, where the discordant pairs are the
# season pairs that are not concordant.
kendall = (len(concordant_pairs) - len(season_pairs - concordant_pairs)) / len(season_pairs)
kendall

0.5434782608695652

The result agrees with `kendalltau` from SciPy:

In [44]:
# The season ranking is the reference order, so its ranks are simply 0..n-1.
season_numeric_rank = [position for position, _ in enumerate(season_ranking)]
# Each world finisher, in world order, mapped to that skater's season position.
world_numeric_rank = [season_ranking.index(skater) for skater in world_ranking]

In [45]:
# Cross-check the hand-computed `kendall` value against SciPy's implementation
# using the numeric ranks built in the previous cell.
kendalltau(season_numeric_rank, world_numeric_rank)

KendalltauResult(correlation=0.5434782608695652, pvalue=0.00019870245032364205)

RMSE with mean model

In [47]:
# Display the per-skater season averages (sorted from highest to lowest).
season_avg

name
Shoma UNO                290.786667
Yuzuru HANYU             290.770000
Nathan CHEN              284.835000
Javier FERNANDEZ         277.440000
Boyang JIN               270.486667
Mikhail KOLYADA          269.780000
Sergei VORONOV           264.305000
Adam RIPPON              264.220000
Jason BROWN              258.770000
Misha GE                 254.210000
Keiji TANAKA             253.740000
Max AARON                250.780000
Alexei BYCHENKO          245.983333
Dmitri ALIEV             245.706667
Patrick CHAN             245.700000
Alexander SAMARIN        244.333333
Vincent ZHOU             239.435000
Elladj BALDE             238.200000
Han YAN                  236.956667
Deniss VASILJEVS         235.283333
Moris KVITELASHVILI      233.743333
Kazuki TOMONO            231.930000
Nam NGUYEN               230.160000
Nicolas NADEAU           229.430000
Jorik HENDRICKX          227.740000
Michal BREZINA           227.563333
Keegan MESSING           226.775000
Kevin REYNOLDS         

In [63]:
# Show the pandas version this notebook was run with.
pd.__version__

'0.23.0'

In [61]:
# Attach each skater's season average to every one of their event rows.
# Both sides carry a 'score' column, so the result shows them as
# score_x (event score) and score_y (season average).
season_scores.merge(season_avg.to_frame(), left_on='name', right_index=True)

Unnamed: 0,name,score_x,event,year,score_y
0,Nathan CHEN,275.88,US,2017,284.835000
33,Nathan CHEN,293.79,RU,2017,284.835000
1,Adam RIPPON,266.45,US,2017,264.220000
58,Adam RIPPON,261.99,JP,2017,264.220000
2,Sergei VORONOV,257.49,US,2017,264.305000
57,Sergei VORONOV,271.12,JP,2017,264.305000
3,Boyang JIN,246.03,US,2017,270.486667
46,Boyang JIN,264.48,CN,2017,270.486667
92,Boyang JIN,300.95,4C,2017,270.486667
4,Han YAN,228.33,US,2017,236.956667
