## Project Update
### Date: 06/08/2021
### Team: Bryce Randolph, Jeannie Davis, Harsandeep Singh, Kevin Robell

In [3]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
# Apply seaborn's default plot theme.
sns.set()
# Default size for subsequent figures: 8 wide by 6 tall (matplotlib measures in inches).
rcParams['figure.figsize'] = 8,6
sns.set_context('talk')   # 'talk' context scales plot elements slightly larger than the default

### Choice of Dataset
Our analysis focuses on the 2019 season from the Kaggle NBA games dataset (games.csv): https://www.kaggle.com/nathanlauga/nba-games. We are making the program modular enough that other seasons can also be analyzed, but only a single season is chosen at a time to stay within the goals of the project. Place games.csv in the same folder as this notebook to run the following code.

In [20]:
# Season to analyze; change this single value to study a different season.
SEASON = 2019

# Load every game, then keep only the rows for the chosen season.
# The explicit .copy() makes df an independent frame rather than a slice of the
# raw data, so adding columns to df later does not raise pandas'
# SettingWithCopyWarning.
games_raw = pd.read_csv("./games.csv")
df = games_raw[games_raw["SEASON"] == SEASON].copy()

### What we are going to predict
We want to predict the winner of an NBA game from its game stats for the 2019 season. The target is the HOME_TEAM_WINS column (1 if the home team won, 0 otherwise).


### Predictors
To predict the winner of each game we will use all of the game stats except for the PTS_away and PTS_home columns. In addition, two predictors have been added, the REB_PCT_away and REB_PCT_home columns, based upon the REB_away and REB_home columns as seen below.

#### List of Predictors 
 8   FG_PCT_home       1241 non-null   float64   
 9   FT_PCT_home       1241 non-null   float64   
 10  FG3_PCT_home      1241 non-null   float64   
 11  AST_home          1241 non-null   float64   
 12  REB_home          1241 non-null   float64   
 15  FG_PCT_away       1241 non-null   float64   
 16  FT_PCT_away       1241 non-null   float64   
 17  FG3_PCT_away      1241 non-null   float64   
 18  AST_away          1241 non-null   float64   
 19  REB_away          1241 non-null   float64   
 21  REB_PCT_away      1241 non-null   float64   
 22  REB_PCT_home      1241 non-null   float64   

In [21]:
# Each side's share of the game's combined rebounds (the two shares sum to 1).
total_rebounds = df['REB_away'] + df['REB_home']
df['REB_PCT_away'] = df['REB_away'] / total_rebounds
df['REB_PCT_home'] = df['REB_home'] / total_rebounds

### Preprocessing
The dataset doesn't have any NA values, so it's unnecessary to drop anything for that reason. All games are final, which means we don't need to drop partially completed games. However, the columns GAME_DATE_EST, GAME_ID, GAME_STATUS_TEXT, HOME_TEAM_ID, VISITOR_TEAM_ID, SEASON, TEAM_ID_home, PTS_home, TEAM_ID_away, and PTS_away aren't used as predictors, so they will all be dropped.

In [9]:
# Number of missing values in each column; all zeros means the data has no NA values.
missing_per_column = df.isna().sum()
missing_per_column

GAME_DATE_EST       0
GAME_ID             0
GAME_STATUS_TEXT    0
HOME_TEAM_ID        0
VISITOR_TEAM_ID     0
SEASON              0
TEAM_ID_home        0
PTS_home            0
FG_PCT_home         0
FT_PCT_home         0
FG3_PCT_home        0
AST_home            0
REB_home            0
TEAM_ID_away        0
PTS_away            0
FG_PCT_away         0
FT_PCT_away         0
FG3_PCT_away        0
AST_away            0
REB_away            0
HOME_TEAM_WINS      0
REB_PCT_away        0
REB_PCT_home        0
dtype: int64

In [22]:
# Count games whose status is anything other than 'Final'; 0 means every game finished.
unfinished_games = df['GAME_STATUS_TEXT'].ne('Final').sum()
unfinished_games

0

In [23]:
# Identifier columns, the single-valued SEASON/status columns, and the final
# scores -- none of these are used as predictors.
UNUSED_COLUMNS = [
    'GAME_DATE_EST', 'GAME_ID', 'GAME_STATUS_TEXT', 'HOME_TEAM_ID',
    'VISITOR_TEAM_ID', 'SEASON', 'TEAM_ID_home', 'PTS_home',
    'TEAM_ID_away', 'PTS_away',
]

# DataFrame.drop returns a new frame and leaves df untouched, so the result
# must be stored or the drop is lost. It is kept under a separate name so that
# df still carries every column for the exploration cells.
model_df = df.drop(columns=UNUSED_COLUMNS)
model_df

Unnamed: 0,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,REB_PCT_away,REB_PCT_home
1206,0.443,0.591,0.357,25.0,41.0,0.483,0.643,0.314,23.0,46.0,0,0.528736,0.471264
1207,0.463,0.857,0.368,21.0,41.0,0.458,0.955,0.424,26.0,35.0,0,0.460526,0.539474
1208,0.427,0.808,0.344,18.0,39.0,0.443,0.857,0.359,25.0,42.0,0,0.518519,0.481481
1209,0.513,0.913,0.353,25.0,37.0,0.430,0.759,0.333,23.0,43.0,1,0.537500,0.462500
1210,0.505,0.588,0.340,32.0,44.0,0.507,0.912,0.407,29.0,37.0,1,0.456790,0.543210
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17673,0.521,0.944,0.517,35.0,51.0,0.378,0.780,0.353,24.0,35.0,1,0.406977,0.593023
17674,0.393,0.769,0.262,20.0,47.0,0.475,0.633,0.303,32.0,58.0,0,0.552381,0.447619
17675,0.388,0.750,0.444,22.0,40.0,0.526,0.714,0.361,29.0,47.0,0,0.540230,0.459770
17676,0.384,0.600,0.273,26.0,49.0,0.375,0.800,0.229,21.0,52.0,0,0.514851,0.485149


### Exploration and Visualization
There's a common saying that when a team plays at home, it has a "home court/field advantage", so we want to explore the data to see whether that is actually true.

In [24]:
# Basic data exploration: print each column's name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1241 entries, 1206 to 17677
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   GAME_DATE_EST     1241 non-null   object 
 1   GAME_ID           1241 non-null   int64  
 2   GAME_STATUS_TEXT  1241 non-null   object 
 3   HOME_TEAM_ID      1241 non-null   int64  
 4   VISITOR_TEAM_ID   1241 non-null   int64  
 5   SEASON            1241 non-null   int64  
 6   TEAM_ID_home      1241 non-null   int64  
 7   PTS_home          1241 non-null   float64
 8   FG_PCT_home       1241 non-null   float64
 9   FT_PCT_home       1241 non-null   float64
 10  FG3_PCT_home      1241 non-null   float64
 11  AST_home          1241 non-null   float64
 12  REB_home          1241 non-null   float64
 13  TEAM_ID_away      1241 non-null   int64  
 14  PTS_away          1241 non-null   float64
 15  FG_PCT_away       1241 non-null   float64
 16  FT_PCT_away       1241 non-null   floa

In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,...,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,REB_PCT_away,REB_PCT_home
count,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0,...,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0,1241.0
mean,22472610.0,1610613000.0,1610613000.0,2019.0,1610613000.0,112.274778,0.463043,0.769757,0.359782,24.710717,...,1610613000.0,110.466559,0.455907,0.771137,0.353089,23.791297,44.197421,0.536664,0.493189,0.506811
std,5922460.0,8.505847,8.505253,0.0,8.505847,12.781012,0.056152,0.098386,0.085668,4.85027,...,8.505253,12.447984,0.054403,0.102825,0.08425,4.851417,6.680293,0.498855,0.05327,0.05327
min,11900000.0,1610613000.0,1610613000.0,2019.0,1610613000.0,68.0,0.269,0.4,0.118,11.0,...,1610613000.0,70.0,0.308,0.375,0.094,10.0,25.0,0.0,0.321429,0.333333
25%,21900210.0,1610613000.0,1610613000.0,2019.0,1610613000.0,104.0,0.425,0.706,0.3,21.0,...,1610613000.0,102.0,0.418,0.706,0.297,20.0,40.0,0.0,0.458824,0.469388
50%,21900520.0,1610613000.0,1610613000.0,2019.0,1610613000.0,112.0,0.462,0.778,0.359,25.0,...,1610613000.0,110.0,0.455,0.774,0.353,24.0,44.0,1.0,0.493827,0.506173
75%,21900830.0,1610613000.0,1610613000.0,2019.0,1610613000.0,120.0,0.505,0.838,0.417,28.0,...,1610613000.0,119.0,0.494,0.84,0.406,27.0,48.0,1.0,0.530612,0.541176
max,51900110.0,1610613000.0,1610613000.0,2019.0,1610613000.0,158.0,0.633,1.0,0.629,43.0,...,1610613000.0,159.0,0.674,1.0,0.68,40.0,65.0,1.0,0.666667,0.678571


In [26]:
# Preview the first five rows of the season's games.
df.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,REB_PCT_away,REB_PCT_home
1206,2020-10-11,41900406,Final,1610612748,1610612747,2019,1610612748,93.0,0.443,0.591,...,1610612747,106.0,0.483,0.643,0.314,23.0,46.0,0,0.528736,0.471264
1207,2020-10-09,41900405,Final,1610612747,1610612748,2019,1610612747,108.0,0.463,0.857,...,1610612748,111.0,0.458,0.955,0.424,26.0,35.0,0,0.460526,0.539474
1208,2020-10-06,41900404,Final,1610612748,1610612747,2019,1610612748,96.0,0.427,0.808,...,1610612747,102.0,0.443,0.857,0.359,25.0,42.0,0,0.518519,0.481481
1209,2020-10-04,41900403,Final,1610612748,1610612747,2019,1610612748,115.0,0.513,0.913,...,1610612747,104.0,0.43,0.759,0.333,23.0,43.0,1,0.5375,0.4625
1210,2020-10-02,41900402,Final,1610612747,1610612748,2019,1610612747,124.0,0.505,0.588,...,1610612748,114.0,0.507,0.912,0.407,29.0,37.0,1,0.45679,0.54321


In [27]:
# Show the games ordered by GAME_ID. The column is named explicitly because
# df.columns[1] only refers to GAME_ID while the column order stays exactly as
# loaded; a positional lookup would silently sort by something else otherwise.
df.sort_values(by='GAME_ID')

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,REB_PCT_away,REB_PCT_home
17676,2019-10-04,11900002,Final,1610612746,1610612745,2019,1610612746,96.0,0.384,0.600,...,1610612745,109.0,0.375,0.800,0.229,21.0,52.0,0,0.514851,0.485149
17677,2019-10-04,11900003,Final,1610612758,1610612754,2019,1610612758,131.0,0.494,0.778,...,1610612754,132.0,0.516,0.788,0.370,26.0,47.0,0,0.610390,0.389610
17673,2019-10-05,11900005,Final,1610612754,1610612758,2019,1610612754,130.0,0.521,0.944,...,1610612758,106.0,0.378,0.780,0.353,24.0,35.0,1,0.406977,0.593023
17674,2019-10-05,11900006,Final,1610612744,1610612747,2019,1610612744,101.0,0.393,0.769,...,1610612747,123.0,0.475,0.633,0.303,32.0,58.0,0,0.552381,0.447619
17675,2019-10-05,11900007,Final,1610612759,1610612753,2019,1610612759,89.0,0.388,0.750,...,1610612753,125.0,0.526,0.714,0.361,29.0,47.0,0,0.540230,0.459770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,2020-10-04,41900403,Final,1610612748,1610612747,2019,1610612748,115.0,0.513,0.913,...,1610612747,104.0,0.430,0.759,0.333,23.0,43.0,1,0.537500,0.462500
1208,2020-10-06,41900404,Final,1610612748,1610612747,2019,1610612748,96.0,0.427,0.808,...,1610612747,102.0,0.443,0.857,0.359,25.0,42.0,0,0.518519,0.481481
1207,2020-10-09,41900405,Final,1610612747,1610612748,2019,1610612747,108.0,0.463,0.857,...,1610612748,111.0,0.458,0.955,0.424,26.0,35.0,0,0.460526,0.539474
1206,2020-10-11,41900406,Final,1610612748,1610612747,2019,1610612748,93.0,0.443,0.591,...,1610612747,106.0,0.483,0.643,0.314,23.0,46.0,0,0.528736,0.471264


In [None]:
# Home free-throw percentage against the game outcome, colored by outcome.
# hue must name a column that exists in df; the previous value,
# "coloring_group", is not a column of this dataset and would make the plot
# call fail.
ax = sns.scatterplot(x="FT_PCT_home", y="HOME_TEAM_WINS", hue="HOME_TEAM_WINS", data=df)
ax.set_title("Home free-throw percentage vs. home team win");

### Preliminary Machine Learning
To do