In [857]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
#import tensorflow as tf 


# Preprocess Each Dataset (Fighters and Bouts)

In [858]:
#just fighters
fighters_data = pd.read_csv('Fights_scraper/spiders/scraped_fighters.csv') 
#just bouts
bouts_data = pd.read_csv('Bouts_Scraper/bouts_scraped/bouts_scraped/spiders/scraped_bouts.csv') 
#combined dataset, created with SQL Query, joined on fighter1 and 2 names
fighter_bouts = pd.read_csv('fighters_bouts_joined.csv') 


In [859]:
fighter_bouts = fighter_bouts.drop(columns=['round','time'])

In [860]:
#shuffles winners in the df to make the classes of winners and losers balanced
import math
negative_index = np.random.choice(len(fighter_bouts),
                                  size= math.ceil(len(fighter_bouts)/2),
                                  replace = False)

fighter_bouts.iloc[negative_index,[2,3]] = fighter_bouts.iloc[negative_index,[3,2]].values



In [861]:
#make winner column align correctly with negative index
fighter_bouts['winner'].iloc[negative_index] = fighter_bouts['fighter2'].iloc[negative_index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [862]:
#creates two lists of the relevant columns stats for f1 and f2
f1_list = [col for col in fighter_bouts.columns if 'f1' in col]
f2_list = [col for col in fighter_bouts.columns if 'f2' in col]


In [863]:
f1_index = []
for col in fighter_bouts.columns:
    if col in f1_list:
        f1_index.append(fighter_bouts.columns.get_loc(col))

f2_index = []
for col in fighter_bouts.columns:
    if col in f2_list:
        f2_index.append(fighter_bouts.columns.get_loc(col))

In [864]:
fb_copy = fighter_bouts.copy()
#all values at the negative index of f1's become the values of negative index of f2's
fighter_bouts.iloc[negative_index,f1_index] = fighter_bouts.iloc[negative_index,f2_index].values
#all values at the negative index of f2's become the values of negative index of f1's using copy of fb_copy of fighter bouts
fighter_bouts.iloc[negative_index,f2_index] =fb_copy.iloc[negative_index,f1_index].values

In [865]:
#seperate df consisting only of categorical dtypes
categorical_fb = fighter_bouts.select_dtypes(include='object')
#drop those types for now from fighter bouts leaving only numeric values
fighter_bouts = fighter_bouts.drop(columns=categorical_fb.columns)


In [866]:
#some fighters have '--' as a null for their DOB and height
categorical_fb['f1_dob'] =  categorical_fb['f1_dob'].replace('--',None) 
categorical_fb['f2_dob'] =  categorical_fb['f2_dob'].replace('--',None)
#same as above for height
categorical_fb['f1_height'] =  categorical_fb['f1_height'].replace('--',None) 
categorical_fb['f2_height'] =  categorical_fb['f2_height'].replace('--',None)

In [867]:
categorical_fb['event_date'] = pd.to_datetime(categorical_fb['event_date'])
categorical_fb['f1_dob'] = pd.to_datetime(categorical_fb['f1_dob'])
categorical_fb['f2_dob'] = pd.to_datetime(categorical_fb['f2_dob'])


In [868]:
#weights can be seen as an ordinal category value so this list declares that order
weights_ordered = ["Women's Strawweight","Women's Flyweight","Women's Bantamweight",
               "Women's Featherweight", "Flyweight", "Bantamweight", "Featherweight",
               "Lightweight","Welterweight", "Middleweight","Light Heavyweight",
               "Heavyweight", "Super Heavyweight", "Open Weight","Catch Weight"]



In [869]:
categorical_fb["weight_class"] = categorical_fb.weight_class.astype("category",
                                 ordered=True,
                                 categories= weights_ordered).cat.codes


  This is separate from the ipykernel package so we can avoid doing imports until


In [870]:
#Creating a new feature for fighter1 and fighter2: ageAtFight. It's ordinal so no need to one hot encode,
categorical_fb['f1_ageAtFight'] = (categorical_fb.event_date - categorical_fb.f1_dob).dt.days
categorical_fb['f2_ageAtFight'] = (categorical_fb.event_date - categorical_fb.f2_dob).dt.days

In [871]:
#starting to clean up the the fighters' records categories, first we replace text ('Record') with nothing
categorical_fb = categorical_fb.replace('Record: ',"",regex=True) 

In [872]:
#make these values all zero (just in case something goes wrong with record_split()
categorical_fb['f1_win'],categorical_fb['f1_loss'],categorical_fb['f1_draw'],categorical_fb['f1_nc'] = 0,0,0,0
categorical_fb['f2_win'],categorical_fb['f2_loss'],categorical_fb['f2_draw'],categorical_fb['f2_nc'] = 0,0,0,0


In [873]:
#a small function to handle the remainder of the record strings
#takes in a row split the values on '-'
#if 'NC' is contained the draw var(the only place it could be) then we split draw on brackets first
#draw is simply equal to the 0th element of that split
#nc is equal to the 1st element of that split with a regex to remove any remaining non numeric values
#cast all values to int and return all 4 
import re

def record_split(row):
        win,loss, draw = row.split('-')
        nc= 0
        if 'NC' in draw:       
            draw = draw.split('(')
            nc = re.sub('[^0-9]','', draw[1])
            draw = draw[0]
        else:
            nc = 0     
            
        win,loss, draw, nc = int(win),int(loss), int(draw), int(nc)
            
        return win,loss,draw,nc
        

In [874]:
#this is a messy way to do these assignments but works. 
#use the pd apply function to apply our record_split() function on each row in df
#zip(*...) unpacks our results nicely
categorical_fb['f1_win'],\
categorical_fb['f1_loss'],\
categorical_fb['f1_draw'],\
categorical_fb['f1_nc'] = zip(*categorical_fb.apply(lambda x: record_split(x['f1_record']), axis=1))

categorical_fb['f2_win'],\
categorical_fb['f2_loss'],\
categorical_fb['f2_draw'],\
categorical_fb['f2_nc'] = zip(*categorical_fb.apply(lambda x: record_split(x['f2_record']), axis=1))



In [875]:
categorical_fb = categorical_fb.drop(columns =['f1_record','f2_record'])

In [876]:

#adds winner to end of column for readability 
cols = list(categorical_fb.columns.values) 
cols.pop(cols.index('winner')) 
categorical_fb = categorical_fb[cols+['winner']] 



In [877]:
categorical_fb.head()

Unnamed: 0,event_date,figher1,fighter2,weight_class,win_method_finish,win_method_type,f1_dob,f1_height,f1_stance,f2_dob,...,f2_ageAtFight,f1_win,f1_loss,f1_draw,f1_nc,f2_win,f2_loss,f2_draw,f2_nc,winner
0,2018-01-14,Dooho Choi,Jeremy Stephens,6,Punch,KO/TKO,1991-04-10,5' 10,Orthodox,1986-05-25,...,11557,14,3,0,0,28,16,0,0,Jeremy Stephens
1,2018-01-14,Jessica-Rose Clark,Paige VanZant,1,,U-DEC,1987-11-28,5' 5,Orthodox,1994-03-26,...,8695,9,5,0,1,8,4,0,0,Jessica-Rose Clark
2,2018-01-14,Emil Meek,Kamaru Usman,8,,U-DEC,1988-08-20,5' 11,Switch,1987-05-11,...,11206,9,4,1,1,15,1,0,0,Kamaru Usman
3,2018-01-14,Michael Johnson,Darren Elkins,6,Rear Naked Choke,SUB,1986-06-04,5' 10,Southpaw,1984-05-16,...,12296,20,14,0,0,25,7,0,0,Darren Elkins
4,2018-01-14,Alex White,James Krause,7,,U-DEC,1988-10-22,6' 0,Southpaw,1986-06-04,...,11547,13,5,0,0,25,8,0,0,James Krause


In [878]:
def parse_height(height):
    #expected format is 5' 10, 6'3 etc
    ht = height.split("' ")
    ft = float(ht[0])
    inch = float(ht[1])
    return (12*ft) + inch
        

In [879]:
categorical_fb.f2_height = categorical_fb.f2_height.apply(lambda x: parse_height(x))
categorical_fb.f1_height = categorical_fb.f1_height.apply(lambda x: parse_height(x))

In [880]:
categorical_fb = categorical_fb.join(pd.get_dummies(categorical_fb.win_method_type,prefix= 'win_method'))
categorical_fb = categorical_fb.join(pd.get_dummies(categorical_fb.win_method_finish,prefix= 'win_finish'))
categorical_fb = categorical_fb.join(pd.get_dummies(categorical_fb.f1_stance,prefix= 'f1_stance'))
categorical_fb = categorical_fb.join(pd.get_dummies(categorical_fb.f2_stance,prefix= 'f2_stance'))

In [881]:
categorical_fb = categorical_fb.drop(columns=['win_method_type','win_method_finish','f1_stance','f2_stance','f1_dob','f2_dob','event_date'])

In [882]:

#adds winner to end of column for readability 
cols = list(categorical_fb.columns.values) 
cols.pop(cols.index('winner')) 
categorical_fb = categorical_fb[cols+['winner']] 



In [883]:
categorical_fb['winner'] = (categorical_fb['fighter2'] == categorical_fb['winner']).astype('int')
categorical_fb['figher1'] = 0
categorical_fb['fighter2'] = 1

In [884]:
fbs_joined = pd.concat([fighter_bouts,categorical_fb],axis=1)

In [885]:
target = fbs_joined.winner

In [886]:
fbs_joined = fbs_joined.drop(columns=['event_attendence'])

In [887]:
#we'll be using this later for the second model prep
fbj_copy = fbs_joined.copy()

In [888]:
fbs_joined = fbs_joined.drop(columns=['winner'])

In [889]:
fbs_joined.head()

Unnamed: 0,pass_stat_f1,pass_stat_f2,str_stat_f1,str_stat_f2,sub_stat_f1,sub_stat_f2,td_stat_f1,td_stat_f2,f1_reach,f1_sapm,...,f1_stance_Open Stance,f1_stance_Orthodox,f1_stance_Sideways,f1_stance_Southpaw,f1_stance_Switch,f2_stance_Open Stance,f2_stance_Orthodox,f2_stance_Sideways,f2_stance_Southpaw,f2_stance_Switch
0,0,0,44,49,0,0,0,0,70.0,6.26,...,0,1,0,0,0,0,1,0,0,0
1,4,0,55,54,1,0,2,0,64.0,4.13,...,0,1,0,0,0,0,1,0,0,0
2,0,7,30,50,2,0,0,8,74.0,2.64,...,0,0,0,0,1,0,0,0,0,1
3,0,2,45,18,0,1,0,1,73.0,3.82,...,0,0,0,1,0,0,1,0,0,0
4,0,4,56,35,0,2,1,3,71.0,2.87,...,0,0,0,1,0,0,1,0,0,0


In [890]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy= 'median',copy= False)
scaler = StandardScaler()

In [891]:
scaled_fbs = scaler.fit_transform(fbs_joined)
imputed_fbs = imputer.fit_transform(scaled_fbs)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [None]:
scaled_fbs[0:10]

In [893]:
from sklearn.model_selection import StratifiedShuffleSplit
sss= StratifiedShuffleSplit(n_splits=20,test_size=0.2,random_state=42)

for train_index, test_index in sss.split(imputed_fbs, target):
    X_train, X_test = imputed_fbs[train_index], imputed_fbs[test_index]
    y_train, y_test = target[train_index], target[test_index]

In [894]:
np.savetxt("model_data/X_train.csv", X_train, delimiter=",")
np.savetxt("model_data/y_train.csv", y_train, delimiter=",")
np.savetxt("model_data/X_test.csv", X_test, delimiter=",")
np.savetxt("model_data/y_test.csv", y_test, delimiter=",")

# Fight Stats Prediction Data Prep

This next step is about preparing the data for the second model that will be used.
The aim of this model is to produce multiple linear regressions that will try to predict the 
performance (fight stats) of two fighters in a bout and then pass those on to our main Neural Net

In [895]:
drop_cols =[]
for col in fbj_copy.columns:
    if 'win_method_' in col or 'win_finish_' in col:
        drop_cols.append(col)
fbj_copy = fbj_copy.drop(columns=drop_cols)        

In [896]:
fbj_copy.columns

Index(['pass_stat_f1', 'pass_stat_f2', 'str_stat_f1', 'str_stat_f2',
       'sub_stat_f1', 'sub_stat_f2', 'td_stat_f1', 'td_stat_f2', 'f1_reach',
       'f1_sapm', 'f1_slpm', 'f1_stk_acc', 'f1_stk_def', 'f1_sub_avg',
       'f1_td_acc', 'f1_td_avg', 'f1_td_def', 'f1_weight', 'f2_reach',
       'f2_sapm', 'f2_slpm', 'f2_stk_acc', 'f2_stk_def', 'f2_sub_avg',
       'f2_td_acc', 'f2_td_avg', 'f2_td_def', 'f2_weight', 'figher1',
       'fighter2', 'weight_class', 'f1_height', 'f2_height', 'f1_ageAtFight',
       'f2_ageAtFight', 'f1_win', 'f1_loss', 'f1_draw', 'f1_nc', 'f2_win',
       'f2_loss', 'f2_draw', 'f2_nc', 'f1_stance_Open Stance',
       'f1_stance_Orthodox', 'f1_stance_Sideways', 'f1_stance_Southpaw',
       'f1_stance_Switch', 'f2_stance_Open Stance', 'f2_stance_Orthodox',
       'f2_stance_Sideways', 'f2_stance_Southpaw', 'f2_stance_Switch',
       'winner'],
      dtype='object')

In [897]:
predictor_cols = ['pass_stat_f1', 'pass_stat_f2', 'str_stat_f1', 'str_stat_f2',
       'sub_stat_f1', 'sub_stat_f2', 'td_stat_f1', 'td_stat_f2',]

In [898]:
fbj_copy[predictor_cols].head()

Unnamed: 0,pass_stat_f1,pass_stat_f2,str_stat_f1,str_stat_f2,sub_stat_f1,sub_stat_f2,td_stat_f1,td_stat_f2
0,0,0,44,49,0,0,0,0
1,4,0,55,54,1,0,2,0
2,0,7,30,50,2,0,0,8
3,0,2,45,18,0,1,0,1
4,0,4,56,35,0,2,1,3


In [899]:
targets = fbj_copy[predictor_cols]

In [900]:
fbj_copy =fbj_copy.drop(columns= targets.columns)

In [901]:
fbj_copy.head()

Unnamed: 0,f1_reach,f1_sapm,f1_slpm,f1_stk_acc,f1_stk_def,f1_sub_avg,f1_td_acc,f1_td_avg,f1_td_def,f1_weight,...,f1_stance_Orthodox,f1_stance_Sideways,f1_stance_Southpaw,f1_stance_Switch,f2_stance_Open Stance,f2_stance_Orthodox,f2_stance_Sideways,f2_stance_Southpaw,f2_stance_Switch,winner
0,70.0,6.26,5.64,53,54,0.0,20,0.55,50,145.0,...,1,0,0,0,0,1,0,0,0,1
1,64.0,4.13,4.6,50,61,0.7,66,1.33,57,125.0,...,1,0,0,0,0,1,0,0,0,0
2,74.0,2.64,2.24,38,32,1.3,50,0.33,48,170.0,...,0,0,0,1,0,0,0,0,1,1
3,73.0,3.82,4.15,37,58,0.1,45,0.56,78,145.0,...,0,0,1,0,0,1,0,0,0,1
4,71.0,2.87,3.77,42,62,0.6,50,1.05,72,155.0,...,0,0,1,0,0,1,0,0,0,1


In [902]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
imputer = SimpleImputer(strategy= 'median',copy= False)
scaler = StandardScaler()
normalizer = Normalizer()

In [903]:
scaled_fbj = scaler.fit_transform(fbj_copy)
imputed_fbj = imputer.fit_transform(fbj_copy)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [904]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(imputed_fbj, targets, test_size=0.10, random_state=42)


In [905]:
np.savetxt("model_data/X_train_stats.csv", X_train, delimiter=",")
np.savetxt("model_data/y_train_stats.csv", y_train, delimiter=",")
np.savetxt("model_data/X_test_stats.csv", X_test, delimiter=",")
np.savetxt("model_data/y_test_stats.csv", y_test, delimiter=",")

In [664]:
print(targets)

[[0.         0.         0.74404871 ... 0.         0.         0.        ]
 [0.05180408 0.         0.71230617 ... 0.         0.02590204 0.        ]
 [0.         0.11803529 0.50586551 ... 0.         0.         0.13489747]
 ...
 [0.10724978 0.08043734 0.85799827 ... 0.         0.05362489 0.18768712]
 [0.         0.         0.65845909 ... 0.         0.0117582  0.        ]
 [0.01528366 0.         0.91701964 ... 0.         0.03056732 0.        ]]
