# Create Badminton Rankings

Create a dictionary for each event that records the rating of each player/pair.
We'll try several Elo-style rating algorithms (this notebook uses TrueSkill) and test them on upcoming tournaments to evaluate their accuracy.

### Grab some saved data

In [263]:
import pandas as pd
from tqdm import tqdm
import re
pd.options.mode.chained_assignment = None 

In [224]:
# Join the files
import glob

# Root folder holding one sub-folder of tournament CSVs per season. Edit this
# single constant to run the notebook on another machine.
DATA_DIR = '/Users/andrewzhuang/Desktop/Badminton Data/jnotebooks'
# Sub-folders are loaded in this order (2018 first, then 2017).
TOURNAMENT_FOLDERS = ['2018 Tournaments', '2017 Tournaments']

# Read every CSV once and concatenate in a single call: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies all the
# previously loaded rows on every iteration.
tournament_frames = []
for folder in TOURNAMENT_FOLDERS:
    # glob returns matches in arbitrary order; sorting keeps the row order
    # (and therefore everything derived from it) reproducible between runs.
    for filename in sorted(glob.glob(DATA_DIR + '/' + folder + '/*.csv')):
        df_temp = pd.read_csv(filename)
        # Presumably the row index written out when the CSV was saved -- TODO confirm.
        del df_temp['Unnamed: 0']
        tournament_frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# CSV was found. ignore_index=True gives a fresh 0..n-1 index.
if tournament_frames:
    df = pd.concat(tournament_frames, ignore_index=True)
else:
    df = pd.DataFrame()


### Create and populate dictionary for each event

In [227]:
def filter_name(string):
    """Return `string` without ASCII digits or square brackets, trimmed of surrounding whitespace.

    Every occurrence of '[', ']' and '0'-'9' is dropped; all other characters
    are kept in their original order.
    """
    unwanted = set('[]1234567890')
    kept_chars = [ch for ch in string if ch not in unwanted]
    return ''.join(kept_chars).strip()

In [228]:
#Take in line, player team you want to extract and singles or doubles
def split_players(x,player_num,s_or_d):
    """Extract one side of a match from the text of a PLAYERS cell.

    Names are taken to be the substrings of `x` enclosed in single quotes and
    are cleaned with filter_name.

    Parameters
    ----------
    x : str
        Text containing the quoted player names (2 for singles, 4 for doubles).
    player_num : int
        Which side to return: 0 for the first side, 1 for the second.
    s_or_d : str
        's' for singles, 'd' for doubles.

    Returns
    -------
    str
        Singles: the cleaned name of the requested player.
        Doubles: the two cleaned names of the requested pair, sorted
        alphabetically and joined with ','.
        'no opponent' when `x` is not a string, does not contain the expected
        number of names, or `player_num` is not 0 or 1.

    Raises
    ------
    ValueError
        If `s_or_d` is neither 's' nor 'd' (previously this silently returned None).
    """
    if s_or_d not in ('s', 'd'):
        raise ValueError("s_or_d must be 's' or 'd', got %r" % (s_or_d,))

    #A missing cell (e.g. NaN) has no names to extract
    if not isinstance(x, str):
        return ('no opponent')

    #Need to filter out using regex
    # NOTE(review): a name that itself contains an apostrophe would be split
    # by this pattern and the row would fall into 'no opponent' -- confirm
    # whether the data contains such names.
    names = re.findall("'(.*?)'",x)

    #only if every expected position exists; anything other than 0/1 is rejected
    #so a negative player_num can no longer index from the end of the list
    expected_names = 2 if s_or_d == 's' else 4
    if len(names) != expected_names or player_num not in (0, 1):
        return ('no opponent')

    names = [filter_name(name) for name in names]

    #Singles
    if s_or_d == 's':
        return (names[player_num])

    #Doubles: sort names alphabetically so a pair has one key regardless of listing order
    first, second = sorted(names[player_num*2:player_num*2 + 2])
    return (first + ',' + second)

In [231]:
import trueskill

#This function will create a dictionary with trueskill rankings for each player
def create_event_dict(df):
    """Map every name found in df['WINNER'] or df['LOSER'] to a new default trueskill.Rating().

    Each distinct name gets exactly one entry, however many matches it appears in.
    """
    players = set()
    for column in ('WINNER', 'LOSER'):
        players.update(df[column].values)

    return {player: trueskill.Rating() for player in players}

In [233]:
#This function will update rankings in dictionary based on match dataframe
def update_ratings(event_df, event_dict):
    """Replay the matches in `event_df` in row order and update the ratings in `event_dict`.

    Parameters
    ----------
    event_df : DataFrame
        One row per match with 'WINNER' and 'LOSER' columns; rows are processed
        top to bottom, so the frame should already be in chronological order.
    event_dict : dict
        Maps each name to its current rating. It is updated IN PLACE.

    Returns
    -------
    dict
        The same `event_dict` object, for convenience.
    """
    # Pull both columns out once instead of building a row Series per lookup
    # with .iloc[i][col]; total= keeps the progress bar sized since zip has no len().
    matches = zip(event_df['WINNER'].values, event_df['LOSER'].values)
    for winner, loser in tqdm(matches, total=len(event_df)):
        # A name cannot play itself. Such rows arise when both sides are the
        # same placeholder (e.g. 'no opponent'), and rating them would only
        # distort that placeholder's own entry.
        if winner == loser:
            continue
        event_dict[winner], event_dict[loser] = trueskill.rate_1vs1(event_dict[winner],event_dict[loser])
    return (event_dict)

In [242]:
#This will display the rankings in a dataframe for a specific event dictionary
def get_rankings(event_dict):
    """Return a PLAYERS/RATING table for `event_dict`, highest rating first.

    RATING is trueskill.expose() of each stored rating. The row labels are the
    positions the players had in `event_dict` before sorting.
    """
    players = list(event_dict)
    exposed = [trueskill.expose(event_dict[name]) for name in players]

    table = pd.DataFrame().assign(PLAYERS=players, RATING=exposed)
    return table.sort_values(by='RATING', ascending=False)

### Rankings for all events

In [229]:
#Create dictionary and event dataframes sorted by date
MS_dict, WS_dict, MD_dict, WD_dict, XD_dict = {},{},{},{},{}

def _build_event_frame(event_code, s_or_d):
    """Return the rows of `df` for one event, ordered by match_week, with WINNER/LOSER columns.

    `s_or_d` is forwarded to split_players ('s' singles, 'd' doubles). Side 0
    of the PLAYERS cell is stored as WINNER and side 1 as LOSER.
    """
    matches = df[df['EVENT'] == event_code].sort_values(by = 'match_week')
    #Split players into winners and losers of a match
    matches['WINNER'] = matches['PLAYERS'].apply(lambda cell: split_players(cell,0,s_or_d))
    matches['LOSER'] = matches['PLAYERS'].apply(lambda cell: split_players(cell,1,s_or_d))
    return matches

#Singles events
df_MS = _build_event_frame('MS','s')
df_WS = _build_event_frame('WS','s')
#Doubles events
df_MD = _build_event_frame('MD','d')
df_WD = _build_event_frame('WD','d')
df_XD = _build_event_frame('XD','d')

In [232]:
#Initialize the Ranking dictionaries and create the rankings!
#Each event starts from default ratings for all of its players, then its
#matches are replayed in the order of its dataframe.
MS_dict = update_ratings(df_MS, create_event_dict(df_MS))
WS_dict = update_ratings(df_WS, create_event_dict(df_WS))
MD_dict = update_ratings(df_MD, create_event_dict(df_MD))
WD_dict = update_ratings(df_WD, create_event_dict(df_WD))
XD_dict = update_ratings(df_XD, create_event_dict(df_XD))

In [248]:
#Show rankings
# First rows of the MS table; get_rankings sorts by RATING in descending order,
# so these are the highest-rated entries.
get_rankings(MS_dict).head()

Unnamed: 0,PLAYERS,RATING
914,Kento Momota,34.669369
615,Shi Yuqi,34.182693
552,Kidambi Srikanth,33.444534
718,Chen Long,33.233323
1152,Chou Tien Chen,32.590268


### How well does this predict?

Can we predict the winner?
Can we predict a close game?

Let's hold out the last 100 matches of an event (starting with MS) and try to predict them

In [253]:
def make_predictions(player,event_dict):
    """Return trueskill.expose() of the rating stored for `player` in `event_dict`.

    Raises KeyError if `player` has no entry.
    """
    player_rating = event_dict[player]
    return trueskill.expose(player_rating)

def correct_pred(row):
    """Return 1 when row['WINNER RATING'] is strictly greater than row['LOSER RATING'], else 0.

    Equal ratings therefore count as an incorrect prediction.
    """
    winner_was_favourite = row['WINNER RATING'] > row['LOSER RATING']
    return 1 if winner_was_favourite else 0

In [295]:
def test_model(event, n_test=100):
    """Rate one event on its earlier matches and score predictions on its last `n_test` matches.

    Parameters
    ----------
    event : str
        Value of the EVENT column to evaluate. 'MS' and 'WS' are parsed as
        singles, anything else as doubles.
    n_test : int, default 100
        Number of most recent matches (by match_week order) held out for
        testing. If the event has fewer matches, all of them are used for
        testing and none for training.

    Returns
    -------
    DataFrame
        The held-out matches with three extra columns: 'WINNER RATING',
        'LOSER RATING' (exposed ratings after training) and
        'Correct Prediction' (1 if the winner was rated strictly higher, else 0).

    Notes
    -----
    Players that only appear in the held-out matches keep the default rating,
    and rows parsed as 'no opponent' are kept in the test set; in both cases
    equal ratings are scored as an incorrect prediction by correct_pred.
    """
    #Create the dataframe for the event, sorted by date
    event_df = df[df['EVENT'] == event].sort_values(by = 'match_week')

    #Split players into winners and losers of a match
    s_or_d = 's' if event in ('MS', 'WS') else 'd'
    event_df['WINNER'] = event_df['PLAYERS'].apply(lambda x: split_players(x,0,s_or_d))
    event_df['LOSER'] = event_df['PLAYERS'].apply(lambda x: split_players(x,1,s_or_d))

    #Initialize the Ranking dictionary (covers players from train AND test rows)
    event_dict = create_event_dict(event_df)

    #split into test and training; clamp so a short event never yields a
    #negative split point (which would slice from the wrong end)
    split_at = max(len(event_df) - n_test, 0)
    df_train = event_df.iloc[:split_at]
    #copy so the new columns are written to an independent frame, not a slice of event_df
    df_test = event_df.iloc[split_at:].copy()

    #update_ratings mutates event_dict in place, so the return value is not needed
    update_ratings(df_train, event_dict)

    df_test['WINNER RATING'] = df_test['WINNER'].apply(lambda x: make_predictions(x,event_dict))
    df_test['LOSER RATING'] = df_test['LOSER'].apply(lambda x: make_predictions(x,event_dict))
    df_test['Correct Prediction'] = df_test.apply(correct_pred,axis = 1)
    return (df_test)

In [296]:
# Share of held-out MS matches in which the winner had the strictly higher rating
test_model('MS')['Correct Prediction'].mean()

100%|██████████| 3806/3806 [00:02<00:00, 1527.06it/s]


0.38

In [297]:
# Share of held-out WS matches in which the winner had the strictly higher rating
test_model('WS')['Correct Prediction'].mean()

100%|██████████| 2486/2486 [00:01<00:00, 1567.94it/s]


0.61

In [298]:
# Share of held-out WD matches in which the winning pair had the strictly higher rating
test_model('WD')['Correct Prediction'].mean()

100%|██████████| 1612/1612 [00:01<00:00, 1579.81it/s]


0.6

In [299]:
# Share of held-out MD matches in which the winning pair had the strictly higher rating
test_model('MD')['Correct Prediction'].mean()

100%|██████████| 2099/2099 [00:01<00:00, 1565.11it/s]


0.56

In [300]:
# Share of held-out XD matches in which the winning pair had the strictly higher rating
test_model('XD')['Correct Prediction'].mean()

100%|██████████| 2064/2064 [00:01<00:00, 1539.96it/s]


0.54

Looks like WS is the only event I'm willing to bet on! And I might not even do that!!