# Create Badminton Rankings

Create a dictionary for each event that records the rating of each player/pair.
We'll try several rating algorithms (TrueSkill and Elo) and test them on upcoming tournaments to evaluate their accuracy.

### Grab some saved data

In [263]:
import pandas as pd
from tqdm import tqdm
import re
pd.options.mode.chained_assignment = None 

In [224]:
# Join the files: read every per-tournament CSV (2018 folder first, then 2017)
# and stack them into the single match dataframe `df`.
import glob

tournament_frames = []
for pattern in (
    '/Users/andrewzhuang/Desktop/Badminton Data/jnotebooks/2018 Tournaments/*.csv',
    '/Users/andrewzhuang/Desktop/Badminton Data/jnotebooks/2017 Tournaments/*.csv',
):
    for filename in glob.glob(pattern):
        df_temp = pd.read_csv(filename)
        # Drop the 'Unnamed: 0' column (presumably the index saved when the CSV
        # was written -- verify against the scraper that produced the files).
        del df_temp['Unnamed: 0']
        tournament_frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call; concatenate once instead. ignore_index=True yields the same fresh
# 0..n-1 index the old reset_index() + del df['index'] produced.
# pd.concat raises on an empty list, so keep the empty-frame result in that case.
if tournament_frames:
    df = pd.concat(tournament_frames, ignore_index=True)
else:
    df = pd.DataFrame()


### Create and populate dictionary for each event

In [227]:
def filter_name(string):
    """Return ``string`` without square brackets or ASCII digits, trimmed.

    Every '[' , ']' and digit 0-9 is removed wherever it occurs (e.g. a
    trailing '[1]' marker), then leading/trailing whitespace is stripped.
    """
    unwanted = '[]1234567890'
    kept_chars = [ch for ch in string if ch not in unwanted]
    return ''.join(kept_chars).strip()

In [228]:
#Take in line, player team you want to extract and singles or doubles
def split_players(x,player_num,s_or_d):
    """Extract one side of a match from the raw PLAYERS string.

    x          -- text containing the player names wrapped in single quotes
    player_num -- 0 for the first side listed, 1 for the second
    s_or_d     -- 's' for singles (2 quoted names expected),
                  'd' for doubles (4 quoted names expected)

    Returns the cleaned name (singles) or the two cleaned partner names in
    alphabetical order joined by ',' (doubles). Returns 'no opponent' when the
    number of quoted names is not the expected one or player_num is above 1.
    Any other s_or_d value falls through and returns None.
    """
    #Every single-quoted chunk of the line is one raw name
    quoted = re.findall("'(.*?)'",x)

    #Singles
    if s_or_d == 's':
        #only if there are two opponents
        if len(quoted) != 2 or player_num > 1:
            return 'no opponent'
        names = [filter_name(raw) for raw in quoted]
        return names[player_num]

    #Doubles
    if s_or_d == 'd':
        #only if position exists
        if len(quoted) != 4 or player_num > 1:
            return 'no opponent'
        names = [filter_name(raw) for raw in quoted]
        #sort the two partners alphabetically so a pair always gets the same key
        first, second = sorted([names[player_num*2], names[player_num*2 + 1]])
        return filter_name(first) + ',' + second

In [318]:
import trueskill

#This function will create a dictionary with trueskill rankings for each player
def create_TS_event_dict(df):
    """Map every name found in df['WINNER'] or df['LOSER'] to a new trueskill.Rating().

    Each name appears once as a key, however many matches it is listed in.
    """
    players = set()
    players.update(df['WINNER'].values)
    players.update(df['LOSER'].values)

    #One independent default Rating object per player
    return {player: trueskill.Rating() for player in players}

In [320]:
#This function will update Trueskill rankings in dictionary based on match dataframe
def update_TS_ratings(event_df, event_dict):
    """Replay the matches in ``event_df`` and update TrueSkill ratings in place.

    event_df   -- dataframe with 'WINNER' and 'LOSER' columns; rows are
                  processed in their existing order
    event_dict -- maps each name in those columns to a trueskill rating;
                  it is mutated and also returned

    Rows whose winner and loser are the same key are skipped. split_players
    puts 'no opponent' on both sides of a row it cannot parse, and rating that
    placeholder against itself would only overwrite its own entry.
    """
    #Iterate the two columns directly instead of building a Series per row
    #with event_df.iloc[i]; total= keeps the progress bar length unchanged.
    matches = zip(event_df['WINNER'], event_df['LOSER'])
    for winner, loser in tqdm(matches, total=len(event_df)):
        if winner == loser:
            continue
        event_dict[winner], event_dict[loser] = trueskill.rate_1vs1(event_dict[winner],event_dict[loser])
    return (event_dict)

In [242]:
#This will display the rankings in a dataframe for a specific event dictionary
def get_rankings(event_dict):
    """Build a ranking table from a TrueSkill rating dictionary.

    Returns a dataframe with a 'PLAYERS' column (the dictionary keys) and a
    'RATING' column (trueskill.expose of each value), sorted by 'RATING' from
    highest to lowest.
    """
    table = pd.DataFrame({
        'PLAYERS': list(event_dict.keys()),
        'RATING': [trueskill.expose(player_rating) for player_rating in event_dict.values()],
    })
    return table.sort_values(by = 'RATING', ascending = False)

### Rankings for all events

In [229]:
#Create dictionary and event dataframes sorted by match_week
MS_dict, WS_dict, MD_dict, WD_dict, XD_dict = {},{},{},{},{}

def _event_matches(event, s_or_d):
    """Return the rows of ``df`` for one event, sorted by match_week, with
    WINNER and LOSER columns parsed from PLAYERS ('s' = singles, 'd' = doubles)."""
    matches = df[df['EVENT'] == event].sort_values(by = 'match_week')
    #Side 0 of PLAYERS is stored as the winner, side 1 as the loser
    matches['WINNER'] = matches['PLAYERS'].apply(lambda x: split_players(x,0,s_or_d))
    matches['LOSER'] = matches['PLAYERS'].apply(lambda x: split_players(x,1,s_or_d))
    return matches

#Singles events
df_MS = _event_matches('MS', 's')
df_WS = _event_matches('WS', 's')

#Doubles events
df_MD = _event_matches('MD', 'd')
df_WD = _event_matches('WD', 'd')
df_XD = _event_matches('XD', 'd')

In [232]:
#Initialize the Ranking dictionaries and create the rankings:
#each event gets a fresh TrueSkill rating per player, which is then updated
#by replaying that event's matches in dataframe order.
MS_dict = update_TS_ratings(df_MS, create_TS_event_dict(df_MS))
WS_dict = update_TS_ratings(df_WS, create_TS_event_dict(df_WS))
MD_dict = update_TS_ratings(df_MD, create_TS_event_dict(df_MD))
WD_dict = update_TS_ratings(df_WD, create_TS_event_dict(df_WD))
XD_dict = update_TS_ratings(df_XD, create_TS_event_dict(df_XD))

In [248]:
#Show the five highest-rated MS entries (get_rankings sorts by RATING, descending)
get_rankings(MS_dict).head()

Unnamed: 0,PLAYERS,RATING
914,Kento Momota,34.669369
615,Shi Yuqi,34.182693
552,Kidambi Srikanth,33.444534
718,Chen Long,33.233323
1152,Chou Tien Chen,32.590268


### How well does this predict?

Can we predict the winner?
Can we predict a close game?

Let's take out the last 100 matches of each event and try to predict them

In [342]:
def make_TS_predictions(player,event_dict):
    """Return trueskill.expose() of the rating stored for ``player`` in ``event_dict``."""
    player_rating = event_dict[player]
    return trueskill.expose(player_rating)

def correct_pred(row):
    """Return 1 when the row's 'WINNER RATING' is strictly above its
    'LOSER RATING', otherwise 0 (equal ratings count as 0)."""
    winner_rated_higher = row['WINNER RATING'] > row['LOSER RATING']
    return 1 if winner_rated_higher else 0

In [353]:
def test_TS_model(event, test_size=100):
    """Train TrueSkill on one event and score it on the held-out last rows.

    event     -- value of the 'EVENT' column to evaluate; 'MS' and 'WS' are
                 parsed as singles, any other value as doubles
    test_size -- number of rows at the end of the match_week-sorted dataframe
                 to hold out for testing (default 100)

    Returns the held-out rows with three added columns: 'WINNER RATING',
    'LOSER RATING' (exposed TrueSkill values after training) and
    'Correct Prediction' (1 when the winner was rated strictly higher, else 0).
    """
    #Create the dataframe for the event, ordered by match_week
    event_df = df[df['EVENT'] == event].sort_values(by = 'match_week')

    #Split players into winners and losers of a match
    s_or_d = 's' if event in ('MS', 'WS') else 'd'
    event_df['WINNER'] = event_df['PLAYERS'].apply(lambda x: split_players(x,0,s_or_d))
    event_df['LOSER'] = event_df['PLAYERS'].apply(lambda x: split_players(x,1,s_or_d))

    #Initialize the Ranking dictionary from the whole event so that players
    #who only appear in the held-out rows still have a (default) rating.
    #The original called create_event_dict, which is not defined in this notebook.
    event_dict = create_TS_event_dict(event_df)

    #split into test and training; clamp at 0 so an event with fewer rows
    #than test_size ends up entirely in the test set
    split_at = max(len(event_df) - test_size, 0)
    df_train = event_df.iloc[:split_at]
    #copy so the new columns are written to an independent frame, not a slice
    df_test = event_df.iloc[split_at:].copy()
    event_dict = update_TS_ratings(df_train, event_dict)

    df_test['WINNER RATING'] = df_test['WINNER'].apply(lambda x: make_TS_predictions(x,event_dict))
    df_test['LOSER RATING'] = df_test['LOSER'].apply(lambda x: make_TS_predictions(x,event_dict))
    #NOTE(review): rows where both sides are 'no opponent' have equal ratings
    #and are therefore always scored 0 -- confirm whether they should be dropped.
    df_test['Correct Prediction'] = df_test.apply(correct_pred,axis = 1)
    return (df_test)

In [354]:
#Report the share of held-out matches TrueSkill called correctly, per event
for event_code in ('MS', 'WS', 'MD', 'WD', 'XD'):
    hit_rate = test_TS_model(event_code)['Correct Prediction'].mean()
    print (event_code + ' prediction rate: ' + str(hit_rate))

100%|██████████| 3806/3806 [00:02<00:00, 1517.64it/s]
  6%|▋         | 161/2486 [00:00<00:01, 1605.11it/s]

MS prediction rate: 0.38


100%|██████████| 2486/2486 [00:01<00:00, 1556.98it/s]
  0%|          | 0/2099 [00:00<?, ?it/s]

WS prediction rate: 0.61


100%|██████████| 2099/2099 [00:01<00:00, 1457.29it/s]
  0%|          | 0/1612 [00:00<?, ?it/s]

MD prediction rate: 0.56


100%|██████████| 1612/1612 [00:01<00:00, 1538.99it/s]
  0%|          | 0/2064 [00:00<?, ?it/s]

WD prediction rate: 0.6


100%|██████████| 2064/2064 [00:01<00:00, 1501.77it/s]

XD prediction rate: 0.54





Looks like WS is the only event I'm willing to bet on! And I might not even do that!!

# Try Elo ratings

In [311]:
import elo as elo
#Everyone starts at 0

In [319]:
#This function will create a dictionary with ELO ranking = 0 for each player
def create_ELO_event_dict(df):
    """Map every name found in df['WINNER'] or df['LOSER'] to a starting rating of 0.

    Each name appears once as a key, however many matches it is listed in.
    """
    players = set()
    players.update(df['WINNER'].values)
    players.update(df['LOSER'].values)

    return dict.fromkeys(players, 0)

In [326]:
#This function will update rankings in dictionary based on match dataframe
def update_ELO_ratings(event_df, event_dict):
    """Replay the matches in ``event_df`` and update Elo ratings in place.

    event_df   -- dataframe with 'WINNER' and 'LOSER' columns; rows are
                  processed in their existing order
    event_dict -- maps each name in those columns to its current rating;
                  it is mutated and also returned

    Rows whose winner and loser are the same key are skipped. split_players
    puts 'no opponent' on both sides of a row it cannot parse, and rating that
    placeholder against itself would only overwrite its own entry.
    """
    #Iterate the two columns directly instead of building a Series per row
    #with event_df.iloc[i]; total= keeps the progress bar length unchanged.
    matches = zip(event_df['WINNER'], event_df['LOSER'])
    for winner, loser in tqdm(matches, total=len(event_df)):
        if winner == loser:
            continue
        event_dict[winner], event_dict[loser] = elo.rate_1vs1(event_dict[winner],event_dict[loser])
    return (event_dict)

In [327]:
#Initialize the Ranking dictionaries and create the rankings:
#every player of an event starts at 0 and is then updated by replaying that
#event's matches in dataframe order.
#Note: this rebinds MS_dict ... XD_dict, so from here on they hold Elo numbers.
MS_dict = update_ELO_ratings(df_MS, create_ELO_event_dict(df_MS))
WS_dict = update_ELO_ratings(df_WS, create_ELO_event_dict(df_WS))
MD_dict = update_ELO_ratings(df_MD, create_ELO_event_dict(df_MD))
WD_dict = update_ELO_ratings(df_WD, create_ELO_event_dict(df_WD))
XD_dict = update_ELO_ratings(df_XD, create_ELO_event_dict(df_XD))

100%|██████████| 3906/3906 [00:01<00:00, 3681.06it/s]
100%|██████████| 2586/2586 [00:00<00:00, 3600.60it/s]
100%|██████████| 2199/2199 [00:00<00:00, 3484.04it/s]
100%|██████████| 1712/1712 [00:00<00:00, 3477.64it/s]
100%|██████████| 2164/2164 [00:00<00:00, 3606.43it/s]


In [330]:
def get_elo_rankings(event_dict):
    """Build a ranking table from a name -> rating dictionary.

    Returns a dataframe with a 'Name' column (the keys) and a 'Rating' column
    (the values), sorted by 'Rating' from highest to lowest.
    """
    table = pd.DataFrame({
        'Name': list(event_dict.keys()),
        'Rating': list(event_dict.values()),
    })
    return table.sort_values(by = 'Rating', ascending = False)

In [339]:
#Show the first five rows of the MD table (get_elo_rankings sorts by Rating, highest first)
get_elo_rankings(MD_dict).head()

Unnamed: 0,Name,Rating
269,"Lu Ching Yao,Yang Po Han",85.845531
19,"Liu Cheng,Zhang Nan",77.731795
387,"Li Junhui,Liu Yuchen",76.450373
443,"Fajar Alfian,Muhammad Rian Ardianto",73.776528
1063,"Carsten Mogensen,Mathias Boe",70.115577


In [343]:
def make_ELO_predictions(player,event_dict):
    """Return the rating stored for ``player`` in ``event_dict`` (KeyError if absent)."""
    player_rating = event_dict[player]
    return player_rating

In [344]:
def test_ELO_model(event, test_size=100):
    """Train Elo on one event and score it on the held-out last rows.

    event     -- value of the 'EVENT' column to evaluate; 'MS' and 'WS' are
                 parsed as singles, any other value as doubles
    test_size -- number of rows at the end of the match_week-sorted dataframe
                 to hold out for testing (default 100)

    Returns the held-out rows with three added columns: 'WINNER RATING',
    'LOSER RATING' (ratings after training) and 'Correct Prediction'
    (1 when the winner was rated strictly higher, else 0).
    """
    #Create the dataframe for the event, ordered by match_week
    event_df = df[df['EVENT'] == event].sort_values(by = 'match_week')

    #Split players into winners and losers of a match
    s_or_d = 's' if event in ('MS', 'WS') else 'd'
    event_df['WINNER'] = event_df['PLAYERS'].apply(lambda x: split_players(x,0,s_or_d))
    event_df['LOSER'] = event_df['PLAYERS'].apply(lambda x: split_players(x,1,s_or_d))

    #Initialize the Ranking dictionary from the whole event so that players
    #who only appear in the held-out rows still have a (starting) rating
    event_dict = create_ELO_event_dict(event_df)

    #split into test and training; clamp at 0 so an event with fewer rows
    #than test_size ends up entirely in the test set
    split_at = max(len(event_df) - test_size, 0)
    df_train = event_df.iloc[:split_at]
    #copy so the new columns are written to an independent frame, not a slice
    df_test = event_df.iloc[split_at:].copy()
    event_dict = update_ELO_ratings(df_train, event_dict)

    df_test['WINNER RATING'] = df_test['WINNER'].apply(lambda x: make_ELO_predictions(x,event_dict))
    df_test['LOSER RATING'] = df_test['LOSER'].apply(lambda x: make_ELO_predictions(x,event_dict))
    #NOTE(review): rows where both sides are 'no opponent' have equal ratings
    #and are therefore always scored 0 -- confirm whether they should be dropped.
    df_test['Correct Prediction'] = df_test.apply(correct_pred,axis = 1)
    return (df_test)

In [352]:
#Report the share of held-out matches Elo called correctly, per event
for event_code in ('MS', 'WS', 'MD', 'WD', 'XD'):
    hit_rate = test_ELO_model(event_code)['Correct Prediction'].mean()
    print (event_code + ' prediction rate: ' + str(hit_rate))

100%|██████████| 3806/3806 [00:00<00:00, 4575.96it/s]
 16%|█▋        | 404/2486 [00:00<00:00, 4035.19it/s]

MS prediction rate: 0.49


100%|██████████| 2486/2486 [00:00<00:00, 4262.26it/s]
  0%|          | 0/2099 [00:00<?, ?it/s]

WS prediction rate: 0.54


100%|██████████| 2099/2099 [00:00<00:00, 4152.17it/s]
  0%|          | 0/1612 [00:00<?, ?it/s]

MD prediction rate: 0.48


100%|██████████| 1612/1612 [00:00<00:00, 4114.87it/s]
  0%|          | 0/2064 [00:00<?, ?it/s]

WD prediction rate: 0.59


100%|██████████| 2064/2064 [00:00<00:00, 4339.86it/s]

XD prediction rate: 0.53



