# Data Clean and wrangling

Downdloaded data files from Github and imported as a batch

In [752]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.random import seed
import seaborn as sns
from os import listdir
import re

In [753]:
filepaths = ['data/tennis_atp/match_00_19/'+f for f in listdir("data/tennis_atp/match_00_19") if f.endswith('.csv')]
df = pd.concat(map(pd.read_csv, filepaths), sort=False)

In [754]:
filepaths

['data/tennis_atp/match_00_19/atp_matches_2000.csv',
 'data/tennis_atp/match_00_19/atp_matches_2001.csv',
 'data/tennis_atp/match_00_19/atp_matches_2002.csv',
 'data/tennis_atp/match_00_19/atp_matches_2003.csv',
 'data/tennis_atp/match_00_19/atp_matches_2004.csv',
 'data/tennis_atp/match_00_19/atp_matches_2005.csv',
 'data/tennis_atp/match_00_19/atp_matches_2006.csv',
 'data/tennis_atp/match_00_19/atp_matches_2007.csv',
 'data/tennis_atp/match_00_19/atp_matches_2008.csv',
 'data/tennis_atp/match_00_19/atp_matches_2009.csv',
 'data/tennis_atp/match_00_19/atp_matches_2010.csv',
 'data/tennis_atp/match_00_19/atp_matches_2011.csv',
 'data/tennis_atp/match_00_19/atp_matches_2012.csv',
 'data/tennis_atp/match_00_19/atp_matches_2013.csv',
 'data/tennis_atp/match_00_19/atp_matches_2014.csv',
 'data/tennis_atp/match_00_19/atp_matches_2015.csv',
 'data/tennis_atp/match_00_19/atp_matches_2016.csv',
 'data/tennis_atp/match_00_19/atp_matches_2017.csv',
 'data/tennis_atp/match_00_19/atp_matches_2018

In [755]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61560 entries, 0 to 2780
Data columns (total 49 columns):
tourney_id            61560 non-null object
tourney_name          61560 non-null object
surface               61442 non-null object
draw_size             2781 non-null float64
tourney_level         61560 non-null object
tourney_date          61560 non-null int64
match_num             61560 non-null int64
winner_id             61560 non-null int64
winner_seed           25567 non-null object
winner_entry          7346 non-null object
winner_name           61560 non-null object
winner_hand           61542 non-null object
winner_ht             56229 non-null float64
winner_ioc            61560 non-null object
winner_age            61545 non-null float64
loser_id              61560 non-null int64
loser_seed            13973 non-null object
loser_entry           12107 non-null object
loser_name            61560 non-null object
loser_hand            61514 non-null object
loser_ht       

In [756]:
df = df[df.score.str.contains('RET') == False]
print('1890 retired rows are removed')

1890 retired rows are removed


In [757]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59670 entries, 0 to 2780
Data columns (total 49 columns):
tourney_id            59670 non-null object
tourney_name          59670 non-null object
surface               59557 non-null object
draw_size             2722 non-null float64
tourney_level         59670 non-null object
tourney_date          59670 non-null int64
match_num             59670 non-null int64
winner_id             59670 non-null int64
winner_seed           24958 non-null object
winner_entry          7058 non-null object
winner_name           59670 non-null object
winner_hand           59653 non-null object
winner_ht             54523 non-null float64
winner_ioc            59670 non-null object
winner_age            59657 non-null float64
loser_id              59670 non-null int64
loser_seed            13458 non-null object
loser_entry           11886 non-null object
loser_name            59670 non-null object
loser_hand            59624 non-null object
loser_ht       

In [758]:
df.groupby('tourney_level').tourney_id.count()

tourney_level
A    32651
D     5890
F      286
G     9746
M    11097
Name: tourney_id, dtype: int64

In [759]:
Total_A_origin = df[df.tourney_level=='A'].tourney_id.count()
Total_D_origin = df[df.tourney_level=='D'].tourney_id.count()
Total_F_origin = df[df.tourney_level=='F'].tourney_id.count()
Total_G_origin = df[df.tourney_level=='G'].tourney_id.count()
Total_M_origin = df[df.tourney_level=='M'].tourney_id.count()
print('Total games of level A = ' + str(Total_A_origin))
print('Total games of level D = ' + str(Total_D_origin))
print('Total games of level F = ' + str(Total_F_origin))
print('Total games of level G = ' + str(Total_G_origin))
print('Total games of level M = ' + str(Total_M_origin))

Total games of level A = 32651
Total games of level D = 5890
Total games of level F = 286
Total games of level G = 9746
Total games of level M = 11097


In [760]:
# remove tourey_level F with 286 rows.  So small unkown data set will not impact overll analysis
df = df[df.tourney_level!='F']

In [761]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59384 entries, 0 to 2780
Data columns (total 49 columns):
tourney_id            59384 non-null object
tourney_name          59384 non-null object
surface               59271 non-null object
draw_size             2707 non-null float64
tourney_level         59384 non-null object
tourney_date          59384 non-null int64
match_num             59384 non-null int64
winner_id             59384 non-null int64
winner_seed           24850 non-null object
winner_entry          7058 non-null object
winner_name           59384 non-null object
winner_hand           59367 non-null object
winner_ht             54258 non-null float64
winner_ioc            59384 non-null object
winner_age            59371 non-null float64
loser_id              59384 non-null int64
loser_seed            13350 non-null object
loser_entry           11886 non-null object
loser_name            59384 non-null object
loser_hand            59338 non-null object
loser_ht       

Remove columns will not be used for late analysis

In [762]:
print('Verify results')
df.groupby('tourney_level').tourney_id.count()

Verify results


tourney_level
A    32651
D     5890
G     9746
M    11097
Name: tourney_id, dtype: int64

In [763]:
# remove rows dont have players attributes such as hadedneess, age, etc.  
df = df[df.winner_hand.notnull()]
df = df[df.winner_ht.notnull()]
df = df[df.winner_age.notnull()]
df = df[df.winner_rank.notnull()]
df = df[df.loser_hand.notnull()]
df = df[df.loser_ht.notnull()]
df = df[df.loser_age.notnull()]
df = df[df.loser_rank.notnull()]

In [764]:
# drop columns with low ratio of complete data entries
columns = ['draw_size','winner_seed','winner_entry','loser_seed','loser_entry']
df.drop(columns, inplace=True, axis=1)

In [765]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48953 entries, 0 to 2773
Data columns (total 44 columns):
tourney_id            48953 non-null object
tourney_name          48953 non-null object
surface               48917 non-null object
tourney_level         48953 non-null object
tourney_date          48953 non-null int64
match_num             48953 non-null int64
winner_id             48953 non-null int64
winner_name           48953 non-null object
winner_hand           48953 non-null object
winner_ht             48953 non-null float64
winner_ioc            48953 non-null object
winner_age            48953 non-null float64
loser_id              48953 non-null int64
loser_name            48953 non-null object
loser_hand            48953 non-null object
loser_ht              48953 non-null float64
loser_ioc             48953 non-null object
loser_age             48953 non-null float64
score                 48953 non-null object
best_of               48953 non-null int64
round        

In [766]:
print('Remove numeric fields which are null.')
df = df[df.minutes.notnull()]
df=df[df['w_ace'].notnull()]
df=df[df['w_df'].notnull()]
df=df[df['w_svpt'].notnull()]
df=df[df['w_1stIn'].notnull()]
df=df[df['w_1stWon'].notnull()]
df=df[df['w_2ndWon'].notnull()]
df=df[df['w_SvGms'].notnull()]
df=df[df['w_bpSaved'].notnull()]
df=df[df['w_bpFaced'].notnull()]
df=df[df['l_ace'].notnull()]
df=df[df['l_df'].notnull()]
df=df[df['l_svpt'].notnull()]
df=df[df['l_1stIn'].notnull()]
df=df[df['l_1stWon'].notnull()]
df=df[df['l_2ndWon'].notnull()]
df=df[df['l_SvGms'].notnull()]
df=df[df['l_bpSaved'].notnull()]
df=df[df['l_bpFaced'].notnull()]

Remove numeric fields which are null.


In [767]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45441 entries, 0 to 2773
Data columns (total 44 columns):
tourney_id            45441 non-null object
tourney_name          45441 non-null object
surface               45441 non-null object
tourney_level         45441 non-null object
tourney_date          45441 non-null int64
match_num             45441 non-null int64
winner_id             45441 non-null int64
winner_name           45441 non-null object
winner_hand           45441 non-null object
winner_ht             45441 non-null float64
winner_ioc            45441 non-null object
winner_age            45441 non-null float64
loser_id              45441 non-null int64
loser_name            45441 non-null object
loser_hand            45441 non-null object
loser_ht              45441 non-null float64
loser_ioc             45441 non-null object
loser_age             45441 non-null float64
score                 45441 non-null object
best_of               45441 non-null int64
round        

In [768]:
print('Verify results')
df.groupby('tourney_level').tourney_id.count()

Verify results


tourney_level
A    27131
D      137
G     8390
M     9783
Name: tourney_id, dtype: int64

In [769]:
# After the above clean up, level D only has 139 complete data entry against total 6060 rows.  Leve D data donst have good data.  Remove D level
df = df[df.tourney_level!='D']

In [770]:
print('Verify results')
df.groupby('tourney_level').tourney_id.count()

Verify results


tourney_level
A    27131
G     8390
M     9783
Name: tourney_id, dtype: int64

In [771]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45304 entries, 0 to 2588
Data columns (total 44 columns):
tourney_id            45304 non-null object
tourney_name          45304 non-null object
surface               45304 non-null object
tourney_level         45304 non-null object
tourney_date          45304 non-null int64
match_num             45304 non-null int64
winner_id             45304 non-null int64
winner_name           45304 non-null object
winner_hand           45304 non-null object
winner_ht             45304 non-null float64
winner_ioc            45304 non-null object
winner_age            45304 non-null float64
loser_id              45304 non-null int64
loser_name            45304 non-null object
loser_hand            45304 non-null object
loser_ht              45304 non-null float64
loser_ioc             45304 non-null object
loser_age             45304 non-null float64
score                 45304 non-null object
best_of               45304 non-null int64
round        

In [772]:
# calculate percentae of serve stats for winners
df['w_ace_pct'] = np.around(df.w_ace/df.w_svpt, 2)
df['w_df_pct'] = np.around(df.w_df/df.w_svpt, 2)
df['w_1stIn_pct'] = np.around(df.w_1stIn / df.w_svpt, 2)
df['w_2ndIn_pct'] = np.around(((df.w_svpt - df.w_1stIn) / df.w_svpt), 2)
df['w_1stWon_pct'] = np.around(df.w_1stWon / df.w_svpt, 2)
df['w_2ndWon_pct'] = np.around(df.w_2ndWon/df.w_svpt, 2)
# calculate total games won and loss
df['w_GmsWon'] = df.w_SvGms-(df.w_bpFaced - df.w_bpSaved) + (df.l_bpFaced - df.l_bpSaved)
df['w_GmsLoss'] = df.l_SvGms-(df.l_bpFaced - df.l_bpSaved) + (df.w_bpFaced - df.w_bpSaved)
# calculate percentae of serve stats for losers
df['l_ace_pct'] = np.around(df.l_ace/df.l_svpt, 2)
df['l_df_pct'] = np.around(df.l_df/df.l_svpt, 2)
df['l_1stIn_pct'] = np.around(df.l_1stIn / df.l_svpt, 2)
df['l_2ndIn_pct'] = np.around(((df.l_svpt - df.l_1stIn) / df.l_svpt), 2)
df['l_1stWon_pct'] = np.around(df.l_1stWon / df.l_svpt, 2)
df['l_2ndWon_pct'] = np.around(df.l_2ndWon/df.l_svpt, 2)

In [773]:
# add year of game
df['year'] = df['tourney_date'].astype(str).str.slice(0,4)

In [774]:
df.year.unique()

array(['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019'], dtype=object)

In [775]:
df[df.year=='2019'].winner_ht.mean()

185.96961690885072

In [776]:
# add winner age bucket
df.loc[df.winner_age <20, 'winner_age_bucket'] = '<20'
df.loc[df.winner_age >=20, 'winner_age_bucket'] = '20-24'
df.loc[df.winner_age >=25, 'winner_age_bucket'] = '25-29'
df.loc[df.winner_age >=30, 'winner_age_bucket'] = '30-34'
df.loc[df.winner_age >=35, 'winner_age_bucket'] = '35-39'
df.loc[df.winner_age >=40, 'winner_age_bucket'] = '>=40'

In [777]:
df[df.winner_age_bucket.isnull()].winner_age

Series([], Name: winner_age, dtype: float64)

In [778]:
# still have data issue or need to understand data
#df['w_svptWon_pct'] = np.around((df.w_ace + df.w_1stWon + df.w_2ndWon - df.w_df) / (df.w_svpt + df.w_ace), 2)
#df['l_svptWon_pct'] = np.around((df.l_ace + df.l_1stWon + df.l_2ndWon - df.l_df) / (df.l_svpt + df.w_ace), 2)

In [779]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45304 entries, 0 to 2588
Data columns (total 60 columns):
tourney_id            45304 non-null object
tourney_name          45304 non-null object
surface               45304 non-null object
tourney_level         45304 non-null object
tourney_date          45304 non-null int64
match_num             45304 non-null int64
winner_id             45304 non-null int64
winner_name           45304 non-null object
winner_hand           45304 non-null object
winner_ht             45304 non-null float64
winner_ioc            45304 non-null object
winner_age            45304 non-null float64
loser_id              45304 non-null int64
loser_name            45304 non-null object
loser_hand            45304 non-null object
loser_ht              45304 non-null float64
loser_ioc             45304 non-null object
loser_age             45304 non-null float64
score                 45304 non-null object
best_of               45304 non-null int64
round        

In [780]:
print('These two exception is because no data captured for loser.  Need to exclue them.')
df = df[df.l_ace_pct.notnull()]

These two exception is because no data captured for loser.  Need to exclue them.


In [781]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45302 entries, 0 to 2588
Data columns (total 60 columns):
tourney_id            45302 non-null object
tourney_name          45302 non-null object
surface               45302 non-null object
tourney_level         45302 non-null object
tourney_date          45302 non-null int64
match_num             45302 non-null int64
winner_id             45302 non-null int64
winner_name           45302 non-null object
winner_hand           45302 non-null object
winner_ht             45302 non-null float64
winner_ioc            45302 non-null object
winner_age            45302 non-null float64
loser_id              45302 non-null int64
loser_name            45302 non-null object
loser_hand            45302 non-null object
loser_ht              45302 non-null float64
loser_ioc             45302 non-null object
loser_age             45302 non-null float64
score                 45302 non-null object
best_of               45302 non-null int64
round        

In [782]:
df.winner_hand.unique()

array(['R', 'L'], dtype=object)

In [783]:
df[df.loser_hand=='U']

Unnamed: 0,tourney_id,tourney_name,surface,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,winner_ht,...,w_GmsWon,w_GmsLoss,l_ace_pct,l_df_pct,l_1stIn_pct,l_2ndIn_pct,l_1stWon_pct,l_2ndWon_pct,year,winner_age_bucket
1606,2003-500,Halle,Grass,A,20030609,4,101962,Younes El Aynaoui,R,193.0,...,12.0,3.0,0.04,0.06,0.57,0.43,0.28,0.19,2003,30-34


In [784]:
df = df[df.loser_hand!='U']

In [785]:
df.loser_hand.unique()

array(['R', 'L'], dtype=object)

In [786]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45301 entries, 0 to 2588
Data columns (total 60 columns):
tourney_id            45301 non-null object
tourney_name          45301 non-null object
surface               45301 non-null object
tourney_level         45301 non-null object
tourney_date          45301 non-null int64
match_num             45301 non-null int64
winner_id             45301 non-null int64
winner_name           45301 non-null object
winner_hand           45301 non-null object
winner_ht             45301 non-null float64
winner_ioc            45301 non-null object
winner_age            45301 non-null float64
loser_id              45301 non-null int64
loser_name            45301 non-null object
loser_hand            45301 non-null object
loser_ht              45301 non-null float64
loser_ioc             45301 non-null object
loser_age             45301 non-null float64
score                 45301 non-null object
best_of               45301 non-null int64
round        

In [787]:
# get games between right hand and left hand players
df_RL = df[df.winner_hand != df.loser_hand]

In [788]:
df.query('winner_hand=="L" & loser_hand=="R"').head()

Unnamed: 0,tourney_id,tourney_name,surface,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,winner_ht,...,w_GmsWon,w_GmsLoss,l_ace_pct,l_df_pct,l_1stIn_pct,l_2ndIn_pct,l_1stWon_pct,l_2ndWon_pct,year,winner_age_bucket
10,2000-339,Adelaide,Hard,A,20000103,11,102652,Christian Vinck,L,183.0,...,15.0,13.0,0.02,0.07,0.64,0.36,0.46,0.14,2000,20-24
35,2000-891,Chennai,Hard,A,20000103,5,102259,Jerome Golmard,L,188.0,...,12.0,7.0,0.1,0.03,0.34,0.66,0.23,0.38,2000,25-29
43,2000-891,Chennai,Hard,A,20000103,13,103714,Andreas Vinciguerra,L,180.0,...,12.0,6.0,0.05,0.09,0.49,0.51,0.31,0.18,2000,<20
49,2000-891,Chennai,Hard,A,20000103,19,102259,Jerome Golmard,L,188.0,...,12.0,5.0,0.12,0.0,0.56,0.44,0.37,0.29,2000,25-29
56,2000-891,Chennai,Hard,A,20000103,26,102259,Jerome Golmard,L,188.0,...,12.0,7.0,0.08,0.06,0.53,0.47,0.39,0.22,2000,25-29


In [789]:
df_RL.groupby('winner_hand').tourney_id.count()

winner_hand
L    4805
R    5261
Name: tourney_id, dtype: int64

In [790]:
Total_A_cln = df[df.tourney_level=='A'].tourney_id.count()
Total_G_cln = df[df.tourney_level=='G'].tourney_id.count()
Total_M_cln = df[df.tourney_level=='M'].tourney_id.count()

In [791]:
print ('Level A data quality % = '+ str(np.around(Total_A_cln / Total_A_origin, 2)))
print ('Level G data quality % = '+ str(np.around(Total_G_cln / Total_G_origin, 2)))
print ('Level M data quality % = '+ str(np.around(Total_M_cln / Total_M_origin, 2)))

Level A data quality % = 0.83
Level G data quality % = 0.86
Level M data quality % = 0.88


In [792]:
df_RL.groupby(['surface', 'winner_hand']).tourney_id.count()

surface  winner_hand
Carpet   L               104
         R               135
Clay     L              1709
         R              1666
Grass    L               543
         R               597
Hard     L              2449
         R              2863
Name: tourney_id, dtype: int64

In [793]:
df_RL.groupby(['tourney_level','surface', 'winner_hand']).tourney_id.count()

tourney_level  surface  winner_hand
A              Carpet   L                81
                        R               106
               Clay     L              1104
                        R              1095
               Grass    L               332
                        R               356
               Hard     L              1397
                        R              1618
G              Clay     L               239
                        R               239
               Grass    L               211
                        R               241
               Hard     L               415
                        R               486
M              Carpet   L                23
                        R                29
               Clay     L               366
                        R               332
               Hard     L               637
                        R               759
Name: tourney_id, dtype: int64

In [794]:
# exam round values
df.loc[:,'round'].unique()

array(['R32', 'R16', 'QF', 'SF', 'F', 'R128', 'R64', 'RR', 'BR'],
      dtype=object)

In [795]:
df[df['round']=='RR'].tourney_level.unique()

array(['A'], dtype=object)

In [796]:
df[df['round']=='BR'].tourney_level.unique()

array(['A'], dtype=object)

In [797]:
df.groupby('round').winner_id.count()

round
BR          1
F        1183
QF       4449
R128     4994
R16      8631
R32     15627
R64      7814
RR        342
SF       2260
Name: winner_id, dtype: int64

In [798]:
df[df.tourney_level=='A'].groupby('round').count()

Unnamed: 0_level_0,tourney_id,tourney_name,surface,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,winner_ht,...,w_GmsWon,w_GmsLoss,l_ace_pct,l_df_pct,l_1stIn_pct,l_2ndIn_pct,l_1stWon_pct,l_2ndWon_pct,year,winner_age_bucket
round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BR,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
F,942,942,942,942,942,942,942,942,942,942,...,942,942,942,942,942,942,942,942,942,942
QF,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,...,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512
R16,6795,6795,6795,6795,6795,6795,6795,6795,6795,6795,...,6795,6795,6795,6795,6795,6795,6795,6795,6795,6795
R32,12016,12016,12016,12016,12016,12016,12016,12016,12016,12016,...,12016,12016,12016,12016,12016,12016,12016,12016,12016,12016
R64,1724,1724,1724,1724,1724,1724,1724,1724,1724,1724,...,1724,1724,1724,1724,1724,1724,1724,1724,1724,1724
RR,342,342,342,342,342,342,342,342,342,342,...,342,342,342,342,342,342,342,342,342,342
SF,1796,1796,1796,1796,1796,1796,1796,1796,1796,1796,...,1796,1796,1796,1796,1796,1796,1796,1796,1796,1796


In [799]:
print('Round code RR and BR are from level A - Other tourney level')

Round code RR and BR are from level A - Other tourney level


In [800]:
print('Added sequence number for each round')
df.loc[df['round']=='R128', 'round_num'] = 1
df.loc[df['round']=='R64', 'round_num'] = 2
df.loc[df['round']=='R32', 'round_num'] = 3
df.loc[df['round']=='R16', 'round_num'] = 4
df.loc[df['round']=='QF', 'round_num'] = 5
df.loc[df['round']=='SF', 'round_num'] = 6
df.loc[df['round']=='F', 'round_num'] = 7
print('Assigned temp value 11 and 10 for BR and RR.  RR and BR only used at leve lA games. Need more understanding on how RR and BR working in round sequence')
df.loc[df['round']=='RR', 'round_num'] = 10
df.loc[df['round']=='BR', 'round_num'] = 11

Added sequence number for each round
Assigned temp value 11 and 10 for BR and RR.  RR and BR only used at leve lA games. Need more understanding on how RR and BR working in round sequence


In [801]:
df[df.round_num.isnull()]

Unnamed: 0,tourney_id,tourney_name,surface,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,winner_ht,...,w_GmsLoss,l_ace_pct,l_df_pct,l_1stIn_pct,l_2ndIn_pct,l_1stWon_pct,l_2ndWon_pct,year,winner_age_bucket,round_num


In [802]:
w_columns = ['tourney_id','tourney_name','surface','tourney_level','tourney_date','match_num','winner_id','winner_name','winner_hand','winner_ht','winner_ioc','winner_age','score','best_of','round','minutes','w_ace','w_df','w_svpt','w_1stIn','w_1stWon','w_2ndWon','w_SvGms','w_bpSaved','w_bpFaced','winner_rank','winner_rank_points','w_ace_pct','w_df_pct','w_1stIn_pct','w_2ndIn_pct','w_1stWon_pct','w_2ndWon_pct','w_GmsWon','w_GmsLoss','year','loser_id', 'loser_name']
l_columns = ['tourney_id','tourney_name','surface','tourney_level','tourney_date','match_num','loser_id','loser_name','loser_hand','loser_ht','loser_ioc','loser_age','score','best_of','round','minutes','l_ace','l_df','l_svpt','l_1stIn','l_1stWon','l_2ndWon','l_SvGms','l_bpSaved','l_bpFaced','loser_rank','loser_rank_points','l_ace_pct','l_df_pct','l_1stIn_pct','l_2ndIn_pct','l_1stWon_pct','l_2ndWon_pct','w_GmsLoss','w_GmsWon','year','winner_id', 'winner_name']
columns = ['tourney_id','tourney_name','surface','tourney_level','tourney_date','match_num','player_id','player_name','player_hand','player_ht','player_ioc','player_age','score','best_of','round','minutes','ace','df','svpt','sv1stIn','sv1stWon','sv2ndWon','SvGms','bpSaved','bpFaced','player_rank','player_rank_points','ace_pct','df_pct','sv1stIn_pct','sv2ndIn_pct','sv1stWon_pct','sv2ndWon_pct','GmsWon','GmsLoss','year','opponent_id', 'opponent_name', 'won_flag']

In [803]:
df_all_l = df[l_columns]
df_all_w = df[w_columns]
df_all_l['won_flag']=0
df_all_w['won_flag']=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [804]:
df_all_w.columns=columns
df_all_l.columns=columns
df_all = df_all_w.append(df_all_l)

In [805]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90602 entries, 0 to 2588
Data columns (total 39 columns):
tourney_id            90602 non-null object
tourney_name          90602 non-null object
surface               90602 non-null object
tourney_level         90602 non-null object
tourney_date          90602 non-null int64
match_num             90602 non-null int64
player_id             90602 non-null int64
player_name           90602 non-null object
player_hand           90602 non-null object
player_ht             90602 non-null float64
player_ioc            90602 non-null object
player_age            90602 non-null float64
score                 90602 non-null object
best_of               90602 non-null int64
round                 90602 non-null object
minutes               90602 non-null float64
ace                   90602 non-null float64
df                    90602 non-null float64
svpt                  90602 non-null float64
sv1stIn               90602 non-null float64
sv1stWon 

In [806]:
# add player age bucket
df_all.loc[df_all.player_age <20, 'player_age_bucket'] = '<20'
df_all.loc[df_all.player_age >=20, 'player_age_bucket'] = '20-24'
df_all.loc[df_all.player_age >=25, 'player_age_bucket'] = '25-29'
df_all.loc[df_all.player_age >=30, 'player_age_bucket'] = '30-34'
df_all.loc[df_all.player_age >=35, 'player_age_bucket'] = '35-39'
df_all.loc[df_all.player_age >=40, 'player_age_bucket'] = '>=40'

In [807]:
df_all_rl_l = df_RL[l_columns]
df_all_rl_w = df_RL[w_columns]

In [808]:
df_all_rl_l['won_flag']=0
df_all_rl_w['won_flag']=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [809]:
df_all_rl_w.columns=columns
df_all_rl_l.columns=columns
df_all_rl = df_all_rl_w.append(df_all_rl_l)

In [810]:
df_all_rl.loc[df_all_rl.player_age <20, 'player_age_bucket'] = '<20'
df_all_rl.loc[df_all_rl.player_age >=20, 'player_age_bucket'] = '20-24'
df_all_rl.loc[df_all_rl.player_age >=25, 'player_age_bucket'] = '25-29'
df_all_rl.loc[df_all_rl.player_age >=30, 'player_age_bucket'] = '30-34'
df_all_rl.loc[df_all_rl.player_age >=35, 'player_age_bucket'] = '35-39'
df_all_rl.loc[df_all_rl.player_age >=40, 'player_age_bucket'] = '>=40'

In [811]:
df_all_rl.describe()

Unnamed: 0,tourney_date,match_num,player_id,player_ht,player_age,best_of,minutes,ace,df,svpt,...,ace_pct,df_pct,sv1stIn_pct,sv2ndIn_pct,sv1stWon_pct,sv2ndWon_pct,GmsWon,GmsLoss,opponent_id,won_flag
count,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0,...,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0,20132.0
mean,20083780.0,63.714981,103899.153537,185.544854,26.730311,3.367177,105.76207,6.008593,3.084095,79.883618,...,0.075237,0.038562,0.611268,0.388718,0.436195,0.199405,12.496175,12.496175,103899.153537,0.5
std,54908.22,98.979493,1055.752391,6.377392,3.88311,0.774315,40.208425,5.296905,2.495345,28.252196,...,0.059738,0.028936,0.083404,0.083398,0.085619,0.060696,4.935331,4.935331,1055.752391,0.500012
min,20000100.0,1.0,100644.0,163.0,15.55,3.0,10.0,0.0,0.0,17.0,...,0.0,0.0,0.26,0.09,0.08,0.0,0.0,0.0,100644.0,0.0
25%,20040110.0,13.0,103196.75,183.0,23.81,3.0,77.0,2.0,1.0,59.0,...,0.03,0.02,0.56,0.33,0.38,0.16,9.0,9.0,103196.75,0.0
50%,20080720.0,25.0,103852.0,185.0,26.66,3.0,98.0,5.0,3.0,75.0,...,0.06,0.03,0.61,0.39,0.43,0.2,12.0,12.0,103852.0,0.5
75%,20130210.0,58.0,104745.0,190.0,29.49,3.0,127.0,8.0,4.0,95.0,...,0.1,0.05,0.67,0.44,0.49,0.24,15.0,15.0,104745.0,1.0
max,20191030.0,1701.0,106401.0,208.0,44.06,5.0,1146.0,75.0,20.0,237.0,...,0.46,0.3,0.91,0.74,1.33,0.76,45.0,45.0,106401.0,1.0


#### Export cleansed data to csv files for Data Story and EAD:

In [812]:
print('export cleansed data to csv files for Data Story and EAD')
df.to_csv('data/all_h_00_19.csv')
df_all.to_csv('data/all_v_00_19.csv')
df_RL.to_csv('data/RL_h.csv')
df_all_rl.to_csv('data/RL_v.csv')

export cleansed data to csv files for Data Story and EAD
