# MLS Player Categorization
The main objective of this project is to predict and categorize MLS players, based on their performance, into tiers such as elite, all-star, starter, rotation, and out of the league.

## Setup and Data    

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [88]:
# Load every raw CSV export into its own DataFrame.
# The smaller files are read with pandas' default settings.
goalies_data, player_data, table_data, awards_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        "all_goalkeepers.csv",
        "all_players.csv",
        "all_tables.csv",
        "awards.csv",
    )
)

# events/matches are read with low_memory=False, which (per the pandas docs)
# avoids chunked parsing and the mixed-type column inference it can cause.
event_data, match_data = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ("events.csv", "matches.csv")
)

## Exploratory Data Analysis & Cleaning
[FILL IN]

### Goalkeeper Data Transformation

In [89]:
# Goalkeeper data: preview the first rows to check the columns that were loaded.
goalies_data.head()

Unnamed: 0,Player,Club,POS,GP,GS,MINS,SHTS,SV,GA,GAA,PKG/A,W,L,T,ShO,W%,Sv%,Year,Season
0,Mark Dodd,DAL,GK,31,31,2776,235,160,45,1.45,5/5,17,14,0,6,54.8,68.1,1996,reg
1,Tony Meola,MET,GK,29,29,2610,188,142,38,1.31,3/4,14,15,0,9,48.3,75.5,1996,reg
2,Mark Dougherty,TB,GK,28,28,2520,186,123,47,1.68,9/10,17,11,0,3,60.7,66.1,1996,reg
3,Jorge Campos,LA,GK,24,24,2160,133,100,27,1.13,0/1,13,8,0,4,54.2,75.2,1996,reg
4,Garth Lagerwey,KC,GK,23,22,1959,133,83,38,1.73,2/2,12,10,0,3,52.2,62.4,1996,reg


In [90]:
-- Count how many goalkeeper rows share each raw "PKG/A" value.
SELECT
    "PKG/A",
    COUNT(*) AS "CT"
FROM goalies_data
GROUP BY "PKG/A"

Unnamed: 0,PKG/A,CT
0,5/5,12
1,3/4,36
2,9/10,2
3,0/1,62
4,2/2,77
5,1/2,40
6,7/8,3
7,4/4,29
8,0/0,1322
9,2/3,36


In [91]:
# Split the text column "PKG/A" (formatted "<made>/<attempted>", e.g. "9/10")
# into two integer columns, 'PK Made' and 'PK Attempted'.
# The dtype guard keeps this cell re-runnable: after the first run "PKG/A"
# holds the float ratio computed below, so it can no longer be split as text.
if not pd.api.types.is_numeric_dtype(goalies_data['PKG/A']):
    goalies_data[['PK Made', 'PK Attempted']] = (
        goalies_data['PKG/A'].str.split('/', expand=True).astype(int)
    )

# Replace "PKG/A" with the float ratio PK Made / PK Attempted.
# Rows with zero attempts get 0 instead of the NaN/inf a plain division yields.
# NOTE(review): for goalkeepers "PKG/A" may mean penalty goals conceded over
# penalties faced, in which case this is a conceded ratio, not a save ratio
# -- confirm against the data source before using it as a feature.
goalies_data['PKG/A'] = np.where(
    goalies_data['PK Attempted'] != 0,
    goalies_data['PK Made'] / goalies_data['PK Attempted'],
    0,
)

In [92]:
# Print the dtype of every goalkeeper column to check the result of the
# "PKG/A" conversion and the two new PK columns.
print(goalies_data.dtypes)

Player           object
Club             object
POS              object
GP                int64
GS                int64
MINS              int64
SHTS              int64
SV                int64
GA                int64
GAA             float64
PKG/A           float64
W                 int64
L                 int64
T                 int64
ShO               int64
W%              float64
Sv%             float64
Year              int64
Season           object
PK Made           int64
PK Attempted      int64
dtype: object


In [93]:
# Missing data: list the goalkeeper columns that contain at least one NaN/null.
# The boolean mask from isna().any() selects those columns; only their names are kept.
missing_goalies_data = goalies_data.loc[:, goalies_data.isna().any()].columns.tolist()

print('Missing Goalie Column Data: ', missing_goalies_data)

Missing Goalie Column Data:  ['Club']


### Player Data Transformation

In [94]:
# Player data: preview the first rows to check the columns that were loaded.
player_data.head()

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,GWG,PKG/A,HmG,RdG,G/90min,SC%,GWA,HmA,RdA,A/90min,FC,FS,OFF,YC,RC,SOG%,Year,Season
0,Roy Lassiter,TB,F,30,30,2580,27,4,76,49,4,3/3,19,8,0.94,35.5,2,2,2,0.14,20,39,70,2,0,64.47,1996,reg
1,Raul Diaz Arce,DC,F,28,28,2351,23,2,100,49,4,4/4,15,8,0.88,23.0,0,0,2,0.08,32,26,35,6,1,49.0,1996,reg
2,Eduardo Hurtado,LA,F,26,26,2323,21,7,87,56,6,2/3,13,8,0.81,24.1,0,4,3,0.27,48,26,25,5,0,64.37,1996,reg
3,Preki,KC,M,32,32,2880,18,13,140,61,3,7/8,14,4,0.56,12.9,2,9,4,0.41,26,44,7,3,0,43.57,1996,reg
4,Brian McBride,CLB,F,28,28,2307,17,3,79,44,3,3/4,12,5,0.66,21.5,0,2,1,0.12,21,46,10,0,0,55.7,1996,reg


In [95]:
# Missing data: list the player columns that contain at least one NaN/null.
# The boolean mask from isna().any() selects those columns; only their names are kept.
missing_player_data = player_data.loc[:, player_data.isna().any()].columns.tolist()

print('Missing Player Column Data: ', missing_player_data)

Missing Player Column Data:  ['Club', 'SOG%']


### Table Data Transformation

In [96]:
# Table (standings) data: preview the first rows to check the columns that were loaded.
table_data.head()

Unnamed: 0,Pos,Team,GP,W,L,SW,GF,GA,GD,Pts,Qualification,Conference,Year,SL,D,Head-to-head,PPG
0,1.0,Tampa Bay Mutiny,32,19,12,1.0,66,51,+15,58,Playoffs,Eastern Conference,1996,,,,
1,2.0,D.C. United,32,15,16,1.0,62,56,+6,46,Playoffs,Eastern Conference,1996,,,,
2,3.0,NY/NJ MetroStars,32,12,17,3.0,45,47,−2,39,Playoffs,Eastern Conference,1996,,,,
3,4.0,Columbus Crew,32,11,17,4.0,59,60,−1,37,Playoffs,Eastern Conference,1996,,,,
4,5.0,New England Revolution,32,9,17,6.0,43,56,−13,33,,Eastern Conference,1996,,,,


In [97]:
# Missing data: list the table columns that contain at least one NaN/null.
# The boolean mask from isna().any() selects those columns; only their names are kept.
missing_table_data = table_data.loc[:, table_data.isna().any()].columns.tolist()

print('Missing Table Column Data: ', missing_table_data)

Missing Table Column Data:  ['Pos', 'SW', 'Qualification', 'SL', 'D', 'Head-to-head', 'PPG']


### Event Data Transformation

In [98]:
# Event data: preview the first rows to check the columns that were loaded.
event_data.head()

Unnamed: 0,id,Time,Event
0,14876,-,no commentary
1,14877,-,no commentary
2,14879,-,no commentary
3,14878,-,no commentary
4,14880,-,no commentary


### Match Data Transformation

In [99]:
# Match data: keep only the identifying, scheduling and score columns,
# then preview the first five rows of the trimmed frame.
match_data = match_data.loc[
    :,
    [
        'id',
        'home',
        'away',
        'date',
        'year',
        'part_of_competition',
        'game_status',
        'home_score',
        'away_score',
    ],
]

match_data.head()

Unnamed: 0,id,home,away,date,year,part_of_competition,game_status,home_score,away_score
0,,New England,San Jose,7/31/1996,1996,Regular Season,FT,2,0
1,,Dallas,Colorado,6/15/1996,1996,Regular Season,FT,1,1
2,,Colorado,D.C. United,8/29/1996,1996,Regular Season,FT,1,2
3,,LA Galaxy,New England,8/8/1996,1996,Regular Season,FT,1,0
4,,New England,D.C. United,7/20/1996,1996,Regular Season,FT,2,0


In [100]:
-- Count the matches whose id is missing, per year.
SELECT
    year,
    COUNT(*)
FROM match_data
WHERE id IS NULL
GROUP BY "year"

Unnamed: 0,year,count_star()
0,1996,177
1,1997,173
2,1998,206
3,1999,208
4,2000,209


In [101]:
# Missing data: list the match columns that contain at least one NaN/null.
# The boolean mask from isna().any() selects those columns; only their names are kept.
missing_match_data = match_data.loc[:, match_data.isna().any()].columns.tolist()

print('Missing Match Column Data: ', missing_match_data)

Missing Match Column Data:  ['id']


## PART 1: Awards 

**QUESTION:** What is the average number of points per game for players in the 2007-2021 seasons who were named to the All-NBA First, Second, or Third Teams (**not** the All-Defensive Teams), as well as for players who were selected for the All-Star Game (**not** the rookie all-star game)?

In [None]:
# Reuse the awards table that the setup cell already read from "awards.csv"
# instead of importing pandas and reading the same file a second time.
# .copy() keeps later edits to `awards` from changing `awards_data`.
awards = awards_data.copy()

# Preview the first rows rather than rendering the whole table.
awards.head()