## NLL Goalies Stats Cleaning

Work by Alexander Palensky

For questions, 
contact me on [Twitter](https://twitter.com/AlPalensky)
or view my [Kaggle Account](https://www.kaggle.com/apalensky) for supplemental content

In [75]:
import numpy as np
import pandas as pd
import datetime
from datetime import date
import re

In [76]:
# Lift pandas' display limits so frames are rendered in full
# (no row/column truncation, no line wrapping, no cell-text clipping).
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

In [77]:
# Load the raw per-game goalie stats from the CSV in the working directory.
RAW_STATS_CSV = 'NLLGoaliesGameStats.csv'
df = pd.read_csv(RAW_STATS_CSV)

In [78]:
# 'Name' holds the goalie name optionally followed by '\n ' and a credit
# marker such as '(win)'; split once and store the two parts separately.
name_parts = df['Name'].str.split('\n ', n=1, expand=True)
df['Name'], df['Credit'] = name_parts[0], name_parts[1]

In [79]:
# Preview the first five rows after splitting 'Name' into name and credit.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,#,Name,MIN,SOG,SV Q1,SV Q2,SV Q3,SV Q4,SV,GA,Date,Location,Day,Team,SV OT,Credit
0,0,0,D.Lee,15:00,0,37,0,0,,37,0,1993-03-28,New York,Sun,Baltimore Goalies,,(loss)
1,1,Totals:,15:00,0,37,0,0,,37,0,,1993-03-28,New York,Sun,,,
2,2,#,Name,MIN,SOG,SV Q1,SV Q2,SV Q3,SV Q4,SV,GA,1993-03-28,New York,Sun,,,
3,3,0,S.LoCascio,15:00,0,29,0,0,,29,0,1993-03-28,New York,Sun,New York Goalies,,(win)
4,4,Totals:,15:00,0,29,0,0,,29,0,,1993-03-28,New York,Sun,,,


In [80]:
# Remove the repeated header rows embedded in the data (rows whose '#'
# cell is the literal column label '#').
header_row_labels = df.index[df['#'] == '#']
df.drop(index=header_row_labels, inplace=True)

In [81]:
# Remove the per-team summary rows (rows whose '#' cell reads 'Totals:').
totals_row_labels = df.index[df['#'] == 'Totals:']
df.drop(index=totals_row_labels, inplace=True)

In [82]:
# Number of rows left after removing the header and totals rows.
df.shape[0]

5800

In [83]:
# Manually add P.O'Toole's (Rochester) line for the 2006-02-10 game in Toronto.
# NOTE(review): presumably this game is missing from the CSV - confirm.
new_row1 = {
    'Unnamed: 0': 10293, '#': 39, 'Name': "P.O'Toole", 'MIN': '59:50', 'SOG': 50,
    'SV Q1': 5, 'SV Q2': 8, 'SV Q3': 16, 'SV Q4': 10, 'SV': 39, 'GA': 11,
    'Date': '2006-02-10', 'Location': 'Toronto', 'Day': 'Fri',
    'Team': 'Rochester Goalies', 'SV OT': 0, 'Credit': '(loss)',
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the supported equivalent. ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([df, pd.DataFrame([new_row1])], ignore_index=True)

In [84]:
# Manually add G.Crawley's (Rochester) line for the 2006-02-10 game in Toronto;
# all stat fields are left missing and the credit marker is '(b)'.
# NOTE(review): presumably this game is missing from the CSV - confirm.
new_row2 = {
    'Unnamed: 0': 10294, '#': 52, 'Name': "G.Crawley", 'MIN': np.nan, 'SOG': np.nan,
    'SV Q1': np.nan, 'SV Q2': np.nan, 'SV Q3': np.nan, 'SV Q4': np.nan,
    'SV': np.nan, 'GA': np.nan,
    'Date': '2006-02-10', 'Location': 'Toronto', 'Day': 'Fri',
    'Team': 'Rochester Goalies', 'SV OT': np.nan, 'Credit': '(b)',
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the supported equivalent. ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([df, pd.DataFrame([new_row2])], ignore_index=True)

In [85]:
# Manually add B.Watson's (Toronto) line for the 2006-02-10 game in Toronto.
# NOTE(review): presumably this game is missing from the CSV - confirm.
new_row3 = {
    'Unnamed: 0': 10295, '#': 29, 'Name': "B.Watson", 'MIN': '60:10', 'SOG': 43,
    'SV Q1': 10, 'SV Q2': 4, 'SV Q3': 3, 'SV Q4': 16, 'SV': 33, 'GA': 10,
    'Date': '2006-02-10', 'Location': 'Toronto', 'Day': 'Fri',
    'Team': 'Toronto Goalies', 'SV OT': 0, 'Credit': '(win)',
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the supported equivalent. ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([df, pd.DataFrame([new_row3])], ignore_index=True)

In [86]:
# Manually add P.Wetherup's (Toronto) line for the 2006-02-10 game in Toronto;
# all stat fields are left missing and the credit marker is '(b)'.
# NOTE(review): presumably this game is missing from the CSV - confirm.
new_row4 = {
    'Unnamed: 0': 10296, '#': 90, 'Name': "P.Wetherup", 'MIN': np.nan, 'SOG': np.nan,
    'SV Q1': np.nan, 'SV Q2': np.nan, 'SV Q3': np.nan, 'SV Q4': np.nan,
    'SV': np.nan, 'GA': np.nan,
    'Date': '2006-02-10', 'Location': 'Toronto', 'Day': 'Fri',
    'Team': 'Toronto Goalies', 'SV OT': np.nan, 'Credit': '(b)',
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the supported equivalent. ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([df, pd.DataFrame([new_row4])], ignore_index=True)

In [87]:
# Rebuild a fresh integer index; the previous index is kept as a new
# leading 'index' column.
df = df.reset_index(drop=False)
# Inspect the end of the frame, where the manually added rows sit.
df.tail(n=25)

Unnamed: 0.1,index,Unnamed: 0,#,Name,MIN,SOG,SV Q1,SV Q2,SV Q3,SV Q4,SV,GA,Date,Location,Day,Team,SV OT,Credit
5779,5779,10257,31,A.Shute,,,,,,,,,2020-03-07,Saskatchewan,Sat,Saskatchewan Goalies,,(b)
5780,5780,10259,49,R.Hartley,38:54,46.0,13.0,11.0,7.0,,31.0,15.0,2020-03-07,San Diego,Sat,Rochester Goalies,,(loss)
5781,5781,10260,1,C.Wende,21:06,22.0,,,5.0,13.0,18.0,4.0,2020-03-07,San Diego,Sat,Rochester Goalies,,
5782,5782,10263,92,F.Scigliano,51:41,31.0,7.0,5.0,9.0,4.0,25.0,6.0,2020-03-07,San Diego,Sat,San Diego Goalies,,(win)
5783,5783,10264,39,N.Damude,8:19,4.0,,,,4.0,4.0,0.0,2020-03-07,San Diego,Sat,San Diego Goalies,,
5784,5784,10266,48,M.Vinc,59:59,53.0,10.0,13.0,12.0,8.0,43.0,10.0,2020-03-08,Halifax,Sun,Buffalo Goalies,,(loss)
5785,5785,10267,57,D.Buchan,,,,,,,,,2020-03-08,Halifax,Sun,Buffalo Goalies,,(b)
5786,5786,10270,76,W.Hill,60:00,55.0,13.0,13.0,13.0,7.0,46.0,9.0,2020-03-08,Halifax,Sun,Halifax Goalies,,(win)
5787,5787,10271,35,P.Dubenski,,,,,,,,,2020-03-08,Halifax,Sun,Halifax Goalies,,(b)
5788,5788,10273,35,C.Del Bianco,60:00,49.0,5.0,12.0,13.0,9.0,39.0,10.0,2020-03-08,Toronto,Sun,Calgary Goalies,,(win)


In [88]:
# Keep only the columns of interest, in presentation order. Columns are
# selected by name rather than by position so the cell does not depend on the
# exact column layout left by earlier cells, and can be re-run safely.
column_order = [
    'Day', 'Date', 'Location', '#', 'Name', 'Credit', 'Team', 'MIN',
    'SV Q1', 'SV Q2', 'SV Q3', 'SV Q4', 'SV OT', 'SV', 'SOG', 'GA',
]
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice (avoids SettingWithCopyWarning).
df = df[column_order].copy()

In [89]:
# Reduce e.g. 'Baltimore Goalies' to the bare team name: keep the text before
# the first 'Goalies', then strip the whitespace the split leaves behind
# (without the strip the value would be 'Baltimore ' with a trailing space).
df['Team'] = df['Team'].str.split('Goalies', n=1).str[0].str.strip()

In [90]:
# Convert the 'Date' strings (e.g. '1993-03-28') into pandas datetimes.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [91]:
# Normalise the credit marker, e.g. ' (loss)' -> 'loss'.
# The vectorised .str methods pass missing values (None or NaN) through
# untouched, and do not depend on the index labels being 0..n-1, unlike a
# row-by-row loop that compares against None and indexes by label.
df['Credit'] = (
    df['Credit']
    .str.strip()
    .str.replace('(', '', regex=False)  # regex=False: treat '(' literally
    .str.replace(')', '', regex=False)
)

In [92]:
# Cast the save / shot / goal columns from text to numeric dtypes.
stat_columns = ["SV Q1", "SV Q2", "SV Q3", "SV Q4", "SV OT", "SV", "SOG", "GA"]
df[stat_columns] = df[stat_columns].apply(pd.to_numeric)

In [93]:
def _clock_to_minutes(value):
    """Convert an 'MM:SS' string to decimal minutes; pass anything else through.

    Non-string values (missing entries, or numbers left by a previous run of
    this cell) are returned unchanged, so the conversion is safe to re-run.
    """
    if isinstance(value, str):
        parts = value.split(':')
        return int(parts[0]) + int(parts[1]) / 60
    return value


# Map every entry at once instead of looping over index labels, then cast so
# the column ends up as float64 rather than object.
df['MIN'] = df['MIN'].map(_clock_to_minutes).astype(float)

In [94]:
# Show the last ten rows of the cleaned frame as a final sanity check.
df.tail(n=10)

Unnamed: 0,Day,Date,Location,#,Name,Credit,Team,MIN,SV Q1,SV Q2,SV Q3,SV Q4,SV OT,SV,SOG,GA
5794,Sun,2020-03-08,Georgia,30,M.Poulin,loss,Georgia,60.0,7.0,15.0,7.0,7.0,,36.0,48.0,12.0
5795,Sun,2020-03-08,Georgia,29,K.Orleman,b,Georgia,,,,,,,,,
5796,Sun,2020-03-08,Colorado,39,S.Fryer,loss,Rochester,60.0,7.0,14.0,6.0,7.0,,34.0,44.0,10.0
5797,Sun,2020-03-08,Colorado,1,C.Wende,b,Rochester,,,,,,,,,
5798,Sun,2020-03-08,Colorado,30,T.Carlson,win,Colorado,50.65,5.0,7.0,10.0,15.0,,37.0,40.0,3.0
5799,Sun,2020-03-08,Colorado,45,D.Ward,,Colorado,9.35,7.0,,,,,7.0,12.0,5.0
5800,Fri,2006-02-10,Toronto,39,P.O'Toole,loss,Rochester,59.8333,5.0,8.0,16.0,10.0,0.0,39.0,50.0,11.0
5801,Fri,2006-02-10,Toronto,52,G.Crawley,b,Rochester,,,,,,,,,
5802,Fri,2006-02-10,Toronto,29,B.Watson,win,Toronto,60.1667,10.0,4.0,3.0,16.0,0.0,33.0,43.0,10.0
5803,Fri,2006-02-10,Toronto,90,P.Wetherup,b,Toronto,,,,,,,,,


In [95]:
# Export of the cleaned frame is currently disabled; uncomment the line below
# to write it to 'CleanedNLLGoaliesGameStats.csv'.
#df.to_csv('CleanedNLLGoaliesGameStats.csv')