# PLL Data Cleaning
### Cleaning of downloaded HTML data from [Premier Lacrosse League](https://stats.premierlacrosseleague.com/) before analysis
#### Player Stats

### Import required libraries

In [106]:
import pandas as pd

### Reading in HTML data with pandas


In [107]:
# Parse the saved regular-season stats page; read_html hands back a list of
# DataFrames, using row 0 of each table as its header.
dfs = pd.read_html(
    "raw_data/Stats _ Premier Lacrosse League.html",
    header=0,
)
print(type(dfs))

# Preview the first two rows of every table that was found.
for df in dfs:
    print(df.head(n=2))

# `df` is whatever table the loop finished on; show its type.
type(df)

<class 'list'>
           PLAYER TEAM   #  POSITION  GP  P  G  A  1Pt. 1Pt. % 2Pt. 2Pt. %  \
0  Adam Ghitelman  ARC   8    Goalie   9  0  0  0   0-0   0.0%  0-2   0.0%   
1     Austin Sims  ARC  18  Midfield   3  2  2  0  2-10  20.0%  0-2   0.0%   

   Sh   Sh %  SOG  TO  CT  GB   FO  FO %  
0   2   0.0%    2   3   1  11  0-0  0.0%  
1  12  16.6%    6   2   0   1  0-0  0.0%  


pandas.core.frame.DataFrame

### Create dataframe from first item in list

In [108]:
# Work on a copy of the first parsed table. The cells below add and rename
# columns in place; without the copy those edits would also land on dfs[0],
# and re-running this cell would hand back the already-modified frame
# (breaking the later lookups of the original PLAYER column).
dfcomb = dfs[0].copy()
dfcomb.head()

Unnamed: 0,PLAYER,TEAM,#,POSITION,GP,P,G,A,1Pt.,1Pt. %,2Pt.,2Pt. %,Sh,Sh %,SOG,TO,CT,GB,FO,FO %
0,Adam Ghitelman,ARC,8,Goalie,9,0,0,0,0-0,0.0%,0-2,0.0%,2,0.0%,2,3,1,11,0-0,0.0%
1,Austin Sims,ARC,18,Midfield,3,2,2,0,2-10,20.0%,0-2,0.0%,12,16.6%,6,2,0,1,0-0,0.0%
2,Ben McIntosh,ARC,90,Midfield,10,13,11,2,11-43,25.5%,0-1,0.0%,44,25.0%,27,7,0,7,0-0,0.0%
3,Christian Cuccinello,ARC,30,Attack,8,12,8,4,8-27,29.6%,0-1,0.0%,28,28.5%,17,7,0,10,0-0,0.0%
4,Danny Eipp,ARC,3,Midfield,9,9,5,4,5-19,26.3%,0-0,0.0%,19,26.3%,12,4,1,3,0-0,0.0%


### Split up names into individual columns

In [109]:
# First name = the text in front of the first space of the PLAYER string.
dfcomb["first_name"] = dfcomb["PLAYER"].apply(lambda full_name: full_name.partition(" ")[0])

In [110]:
# Last name = everything after the first space. Taking only the second token
# (split(" ")[1]) would cut multi-word surnames short and raise IndexError for
# a player listed under a single name; partition() yields "" in that case.
dfcomb["last_name"] = dfcomb.PLAYER.apply(lambda x: x.partition(" ")[2].strip())

### Renaming columns for non-lacrosse viewers!

In [111]:
# Map the site's abbreviated column headers to descriptive snake_case names.
coldict = {
    "PLAYER": "full_name",
    "TEAM": "team_code",
    "#": "number",
    "POSITION": "position",
    "GP": "games_played",
    "P": "points",
    "G": "goals",
    "A": "assists",
    "1Pt.": "1_point_attempts",
    "1Pt. %": "1_point_percent",
    "2Pt.": "2_point_attempts",
    "2Pt. %": "2_point_percent",
    "Sh": "total_shots",
    "Sh %": "shooting_percent",
    "SOG": "shots_on_goal",
    "TO": "turnovers",
    "CT": "caused_turnovers",
    "GB": "groundballs",
    "FO": "faceoff_attempts",
    "FO %": "faceoff_win_percent",
}

# Apply the renaming to the regular-season frame in place.
dfcomb.rename(columns=coldict, inplace=True)

### Split the combined "made-attempted" columns into separate columns and save to CSV

In [112]:
# Each of these columns holds a "<left>-<right>" string (e.g. "2-10").
# Split on "-" and store the left and right parts as two new columns,
# keeping the same column order as writing the six assignments out by hand.
for source_col, left_col, right_col in [
    ("1_point_attempts", "1_point_goals", "1_point_shots"),
    ("2_point_attempts", "2_point_goals", "2_point_shots"),
    ("faceoff_attempts", "faceoff_wins", "faceoffs_taken"),
]:
    dfcomb[left_col] = dfcomb[source_col].apply(lambda pair: pair.split("-")[0])
    dfcomb[right_col] = dfcomb[source_col].apply(lambda pair: pair.split("-")[1])

# Persist the cleaned regular-season table without the positional index.
dfcomb.to_csv("clean_data/PLL_RegSeason_Clean.csv", index=False)

### Do the same for postseason stats:

In [113]:
# Parse the saved postseason stats page (row 0 of each table is the header)
# and keep the first table found.
dfsPS = pd.read_html(
    "raw_data/Stats_Premier_Lacrosse_League_Postseason.html",
    header=0,
)
dfPS = dfsPS[0]
dfPS.head(n=10)

Unnamed: 0,PLAYER,TEAM,#,POSITION,GP,P,G,A,1Pt.,1Pt. %,2Pt.,2Pt. %,Sh,Sh %,SOG,TO,CT,GB,FO,FO %
0,Adam Ghitelman,ARC,8,Goalie,3,1,0,1,0-0,0.0%,0-0,0.0%,0,0.0%,0,2,2,7,0-0,0.0%
1,Austin Sims,ARC,18,Midfield,1,0,0,0,0-3,0.0%,0-0,0.0%,3,0.0%,1,0,1,2,0-0,0.0%
2,Ben McIntosh,ARC,90,Midfield,2,3,1,2,1-7,14.2%,0-0,0.0%,7,14.2%,5,0,0,0,0-0,0.0%
3,Christian Cuccinello,ARC,30,Attack,3,5,4,1,4-9,44.4%,0-0,0.0%,9,44.4%,8,1,1,6,0-0,0.0%
4,Danny Eipp,ARC,3,Midfield,3,6,3,3,3-9,33.3%,0-0,0.0%,9,33.3%,6,1,1,3,0-0,0.0%
5,Davey Emala,ARC,0,Attack,1,1,1,0,1-3,33.3%,0-0,0.0%,3,33.3%,2,0,0,0,0-0,0.0%
6,Dominique Alexander,ARC,23,Midfield,3,1,1,0,1-2,50.0%,0-0,0.0%,2,50.0%,1,0,0,3,0-0,0.0%
7,Drew Adams,ARC,14,Goalie,3,0,0,0,0-0,0.0%,0-0,0.0%,0,0.0%,0,0,0,2,0-0,0.0%
8,Evan Connell,ARC,99,Defense,2,0,0,0,0-0,0.0%,0-0,0.0%,0,0.0%,0,0,0,1,0-0,0.0%
9,Goran Murray,ARC,44,Defense,0,0,0,0,0-0,0.0%,0-0,0.0%,0,0.0%,0,0,0,0,0-0,0.0%


In [114]:
# First name = the token before the first space of the PLAYER string.
dfPS["first_name"]= dfPS.PLAYER.apply(lambda x: x.split(" ")[0])
# Last name = everything after the first space. Taking only the second token
# (split(" ")[1]) would cut multi-word surnames short and raise IndexError for
# a player listed under a single name; partition() yields "" in that case.
dfPS["last_name"]= dfPS.PLAYER.apply(lambda x: x.partition(" ")[2].strip())
# Map the site's abbreviated column headers to descriptive snake_case names.
coldict = {"PLAYER":"full_name","TEAM":"team_code","#":"number","POSITION":"position","GP":"games_played","P":"points","G":"goals","A":"assists","1Pt.":"1_point_attempts","1Pt. %":"1_point_percent","2Pt.":"2_point_attempts","2Pt. %":"2_point_percent","Sh":"total_shots","Sh %":"shooting_percent","SOG":"shots_on_goal","TO":"turnovers","CT":"caused_turnovers","GB":"groundballs","FO":"faceoff_attempts","FO %":"faceoff_win_percent"}
dfPS.rename(columns=coldict, inplace=True)


In [115]:
# Show the first five postseason rows to check the renamed and added columns.
dfPS.head(n=5)

Unnamed: 0,full_name,team_code,number,position,games_played,points,goals,assists,1_point_attempts,1_point_percent,...,total_shots,shooting_percent,shots_on_goal,turnovers,caused_turnovers,groundballs,faceoff_attempts,faceoff_win_percent,first_name,last_name
0,Adam Ghitelman,ARC,8,Goalie,3,1,0,1,0-0,0.0%,...,0,0.0%,0,2,2,7,0-0,0.0%,Adam,Ghitelman
1,Austin Sims,ARC,18,Midfield,1,0,0,0,0-3,0.0%,...,3,0.0%,1,0,1,2,0-0,0.0%,Austin,Sims
2,Ben McIntosh,ARC,90,Midfield,2,3,1,2,1-7,14.2%,...,7,14.2%,5,0,0,0,0-0,0.0%,Ben,McIntosh
3,Christian Cuccinello,ARC,30,Attack,3,5,4,1,4-9,44.4%,...,9,44.4%,8,1,1,6,0-0,0.0%,Christian,Cuccinello
4,Danny Eipp,ARC,3,Midfield,3,6,3,3,3-9,33.3%,...,9,33.3%,6,1,1,3,0-0,0.0%,Danny,Eipp


In [116]:
# Each of these columns holds a "<left>-<right>" string (e.g. "1-7").
# Split on "-" and store the left and right parts as two new columns,
# keeping the same column order as writing the six assignments out by hand.
for source_col, left_col, right_col in [
    ("1_point_attempts", "1_point_goals", "1_point_shots"),
    ("2_point_attempts", "2_point_goals", "2_point_shots"),
    ("faceoff_attempts", "faceoff_wins", "faceoffs_taken"),
]:
    dfPS[left_col] = dfPS[source_col].apply(lambda pair: pair.split("-")[0])
    dfPS[right_col] = dfPS[source_col].apply(lambda pair: pair.split("-")[1])

# Persist the cleaned postseason table without the positional index.
dfPS.to_csv("clean_data/PLL_PostSeason_Clean.csv", index=False)

# Loading Regular Season Games as DataFrame

In [117]:
# Parse the saved games page (row 0 of each table is the header) and keep
# the first table found.
dfGs = pd.read_html(
    "raw_data/Stats_Premier_Lacrosse_League_Games.html",
    header=0,
)
dfG = dfGs[0]
dfG.head(n=5)

Unnamed: 0,TEAMS,DATE,WEEK,STADIUM,LOCATION
0,WHIPSNAKES(8) vs ARCHERS(11),8/25/2019,10,Tom & Mary Casey Stadium,Albany
1,REDWOODS(18) vs CHROME(7),8/25/2019,10,Tom & Mary Casey Stadium,Albany
2,ATLAS(12) vs CHAOS(9),8/24/2019,10,Tom & Mary Casey Stadium,Albany
3,WHIPSNAKES(17) vs REDWOODS(4),8/18/2019,9,Tim Hortons Field,Hamilton
4,CHROME(14) vs ATLAS(17),8/17/2019,9,Tim Hortons Field,Hamilton


In [118]:
# The TEAMS values shown above look like "WHIPSNAKES(8) vs ARCHERS(11)".
# Split once on the literal "vs" separator and trim the padding around it.
# Without the trim the home side keeps its leading space (" ARCHERS"), and
# splitting on a bare "v" would cut at any lowercase "v" inside a team name.
dfG["away_team"]  = dfG["TEAMS"].apply(lambda x: x.split("vs", 1)[0].strip())
dfG["home_team"]  = dfG["TEAMS"].apply(lambda x: x.split("vs", 1)[1].strip())

# The score is the text between the parentheses on each side.
dfG["away_score"]  = dfG["away_team"].apply(lambda x: x.split("(")[1].split(")")[0])
dfG["home_score"]  = dfG["home_team"].apply(lambda x: x.split("(")[1].split(")")[0])

# Convert the score strings to numbers so they can be compared later.
dfG["away_score"]  = pd.to_numeric(dfG["away_score"])
dfG["home_score"]  = pd.to_numeric(dfG["home_score"])

# What is left in front of the "(" is the team name.
dfG["away_team"]  = dfG["away_team"].apply(lambda x: x.split("(")[0].strip())
dfG["home_team"]  = dfG["home_team"].apply(lambda x: x.split("(")[0].strip())

In [119]:
# Remove the row labelled 12 from the games table, in place.
# NOTE(review): the notebook does not say why this row is excluded — confirm
# which game it is. Running this cell a second time raises KeyError because
# the label is already gone.
dfG.drop(labels=[12], axis="index", inplace=True)

In [120]:
# Boolean result flags: a side wins when its score is strictly greater than
# the other side's (equal scores leave both flags False).
dfG["away_win"] = dfG["home_score"] < dfG["away_score"]
dfG["home_win"] = dfG["home_score"] > dfG["away_score"]
dfG.head(n=5)

Unnamed: 0,TEAMS,DATE,WEEK,STADIUM,LOCATION,away_team,home_team,away_score,home_score,away_win,home_win
0,WHIPSNAKES(8) vs ARCHERS(11),8/25/2019,10,Tom & Mary Casey Stadium,Albany,WHIPSNAKES,ARCHERS,8,11,False,True
1,REDWOODS(18) vs CHROME(7),8/25/2019,10,Tom & Mary Casey Stadium,Albany,REDWOODS,CHROME,18,7,True,False
2,ATLAS(12) vs CHAOS(9),8/24/2019,10,Tom & Mary Casey Stadium,Albany,ATLAS,CHAOS,12,9,True,False
3,WHIPSNAKES(17) vs REDWOODS(4),8/18/2019,9,Tim Hortons Field,Hamilton,WHIPSNAKES,REDWOODS,17,4,True,False
4,CHROME(14) vs ATLAS(17),8/17/2019,9,Tim Hortons Field,Hamilton,CHROME,ATLAS,14,17,False,True


### Convert the date strings to datetimes and save to CSV

In [121]:
# Parse the DATE strings into datetimes, stored in a new lowercase "date" column.
dfG["date"] = pd.to_datetime(dfG["DATE"])

# Keep only the derived game columns (plus WEEK) for the cleaned export.
dfGclean = dfG[
    [
        "away_team",
        "home_team",
        "away_score",
        "home_score",
        "away_win",
        "home_win",
        "date",
        "WEEK",
    ]
]

# Persist the cleaned games table without the positional index.
dfGclean.to_csv("clean_data/PLL_Games_clean.csv", index=False)