# Project 01 - Write a Data Science Blog Post

## Part 1: Data Cleaning

### Data
 - NBA 2018-2019 Player Box Scores 
 - NBA 2018-2019 Daily Fantasy Scores (DFS)
 
### Business Questions
1. What are the key drivers for top fantasy scores?
2. What effect, if any, does seasonality play during the NBA season? 
3. Which positions are the most valuable from a fantasy score perspective? 

### Import Packages

In [1]:
import numpy as np
import pandas as pd

# Use 2 decimal places in output display
pd.set_option("display.precision", 2)

### Import Data

#### Player Box Scores Dataset

In [2]:
# Import Player stats data
stats = pd.read_excel('../data/NBA-18-19-Player-BoxScore.xlsx', sheet_name='NBA-2018-19-PLAYER')

# Data Cleaning
stats.head(3)

Unnamed: 0,DATASET,GAME-ID,DATE,PLAYER-ID,PLAYER \nFULL NAME,POSITION,OWN \nTEAM,OPPONENT \nTEAM,VENUE\n(R/H),STARTER\n(Y/N),...,DR,TOT,A,PF,ST,TO,BL,PTS,USAGE \nRATE (%),DAYS\nREST
0,NBA 2018-2019 Regular Season,21800001,10/16/2018,203967,Dario Saric,F,Philadelphia,Boston,R,Y,...,6,6,1,5,0,3,0,6,20.38,3+
1,NBA 2018-2019 Regular Season,21800001,10/16/2018,203496,Robert Covington,F,Philadelphia,Boston,R,Y,...,5,6,0,1,2,2,1,8,14.88,3+
2,NBA 2018-2019 Regular Season,21800001,10/16/2018,203954,Joel Embiid,F-C,Philadelphia,Boston,R,Y,...,8,10,2,3,1,5,2,23,32.5,3+


#### Daily Fantasy Score (DFS) Dataset

In [3]:
# Import DFS data
dfs = pd.read_excel('../data/NBA-18-19-DFS.xlsx', sheet_name='NBA-2018-19-DFS')
dfs.head(3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 11,POSITION,Unnamed: 13,Unnamed: 14,SALARY ($),Unnamed: 16,Unnamed: 17,FANTASY POINTS SCORED,Unnamed: 19,Unnamed: 20
0,DATASET,GAME ID,DATE,PLAYER ID,PLAYER,OWN\nTEAM,OPPONENT\nTEAM,STARTER (Y/N),VENUE (R/H),MINUTES,...,DAYS\nREST,DRAFTKINGS,FANDUEL,YAHOO,"DRAFTKINGS\n""Classic""\nGame Style","FANDUEL\n""Full Roster""\nGame Style","YAHOO\n""Full Slate""\nGame Style",DRAFTKINGS,FANDUEL,YAHOO
1,NBA 2018-2019 Regular Season,0021800001,10/16/2018,203967,Dario Saric,Philadelphia,Boston,Y,R,23,...,3+,SF/PF,PF,PF,5500,6400,25,14,12,12
2,NBA 2018-2019 Regular Season,0021800001,10/16/2018,203496,Robert Covington,Philadelphia,Boston,Y,R,34,...,3+,SF/PF,SF,SF,4700,6500,23,22,22,22


In [4]:
# Update column headers
# Position
dfs.iloc[0,12] = 'DRAFTKINGS_POSITION'
dfs.iloc[0,13] = 'FANDUEL_POSITION'
dfs.iloc[0,14] = 'YAHOO_POSITION'

# Salary
dfs.iloc[0,15] = "DRAFTKINGS_CLASSIC_SALARY"
dfs.iloc[0,16] = "FANDUEL_FULLROSTER_SALARY"
dfs.iloc[0,17] = "YAHOO_FULLSLATE_SALARY"

# Fantasy Points Scored
dfs.iloc[0,18] = "DRAFTKINGS_FANTASYPOINTS"
dfs.iloc[0,19] = "FANDUEL_FANTASYPOINTS"
dfs.iloc[0,20] = "YAHOO_FANTASYPOINTS"

# Reformat columns and headers
dfs.columns = dfs.iloc[0]
dfs = dfs[1:]

# Rename the GAME-ID column
dfs = dfs.rename(columns={"GAME ID":"GAME-ID", 
                          "PLAYER ID":"PLAYER-ID"})

# Convert Data types
convert_dict = {
    "GAME-ID": int,
    "PLAYER-ID": int
}

dfs = dfs.astype(convert_dict)

# Preview
dfs.head(3)

Unnamed: 0,DATASET,GAME-ID,DATE,PLAYER-ID,PLAYER,OWN\nTEAM,OPPONENT\nTEAM,STARTER (Y/N),VENUE (R/H),MINUTES,...,DAYS\nREST,DRAFTKINGS_POSITION,FANDUEL_POSITION,YAHOO_POSITION,DRAFTKINGS_CLASSIC_SALARY,FANDUEL_FULLROSTER_SALARY,YAHOO_FULLSLATE_SALARY,DRAFTKINGS_FANTASYPOINTS,FANDUEL_FANTASYPOINTS,YAHOO_FANTASYPOINTS
1,NBA 2018-2019 Regular Season,21800001,10/16/2018,203967,Dario Saric,Philadelphia,Boston,Y,R,23,...,3+,SF/PF,PF,PF,5500,6400,25,14,12,12
2,NBA 2018-2019 Regular Season,21800001,10/16/2018,203496,Robert Covington,Philadelphia,Boston,Y,R,34,...,3+,SF/PF,SF,SF,4700,6500,23,22,22,22
3,NBA 2018-2019 Regular Season,21800001,10/16/2018,203954,Joel Embiid,Philadelphia,Boston,Y,R,37,...,3+,C,C,C,8800,10400,41,44,42,42


### Merge Datasets

In [5]:
df = pd.merge(dfs, stats, on=['GAME-ID', 'PLAYER-ID'])
df.head()

Unnamed: 0,DATASET_x,GAME-ID,DATE_x,PLAYER-ID,PLAYER,OWN\nTEAM,OPPONENT\nTEAM,STARTER (Y/N),VENUE (R/H),MINUTES,...,DR,TOT,A,PF,ST,TO,BL,PTS,USAGE \nRATE (%),DAYS\nREST_y
0,NBA 2018-2019 Regular Season,21800001,10/16/2018,203967,Dario Saric,Philadelphia,Boston,Y,R,23,...,6,6,1,5,0,3,0,6,20.38,3+
1,NBA 2018-2019 Regular Season,21800001,10/16/2018,203496,Robert Covington,Philadelphia,Boston,Y,R,34,...,5,6,0,1,2,2,1,8,14.88,3+
2,NBA 2018-2019 Regular Season,21800001,10/16/2018,203954,Joel Embiid,Philadelphia,Boston,Y,R,37,...,8,10,2,3,1,5,2,23,32.5,3+
3,NBA 2018-2019 Regular Season,21800001,10/16/2018,1628365,Markelle Fultz,Philadelphia,Boston,Y,R,24,...,3,3,2,1,1,3,0,5,18.98,3+
4,NBA 2018-2019 Regular Season,21800001,10/16/2018,1627732,Ben Simmons,Philadelphia,Boston,Y,R,43,...,12,15,8,5,4,3,2,19,21.25,3+


### Data Cleaning

In [6]:
# Drop redundant columns
del_cols = ['DATASET_y', 'DATE_y', 'DAYS\nREST_y', 
            'OPPONENT \nTEAM', 'OWN \nTEAM', 
            'PLAYER \nFULL NAME', 'STARTER\n(Y/N)', 
            'USAGE \nRATE (%)', 'VENUE\n(R/H)']

df = df.drop(del_cols, axis=1)

# Reformat column headers
df.columns = [value.replace('_x','').replace('\n','') for value in df.columns]

# Convert Data types: Ints and Floats
convert_dict = {
    "MINUTES": float,
    "DRAFTKINGS_CLASSIC_SALARY": float,
    "FANDUEL_FULLROSTER_SALARY": float,
    "YAHOO_FULLSLATE_SALARY": float,
    "DRAFTKINGS_FANTASYPOINTS": float,
    "FANDUEL_FANTASYPOINTS": float,
    "YAHOO_FANTASYPOINTS": float,
    "USAGE RATE": float
}

df = df.astype(convert_dict)

# Convert Data types: Datetime
df['DATE'] = pd.to_datetime(df['DATE'])

# Clean the 'string' value in the DAYSREST column (3+ to 4)
df.DAYSREST = df.DAYSREST.replace(to_replace='3+', value=4)

#### Handle Missing Values

In [7]:
# Determine which variables have missing values (Position & Salary)
df.isnull().sum().sort_values(ascending=False)

DRAFTKINGS_CLASSIC_SALARY    981
DRAFTKINGS_POSITION          981
POSITION                     617
FANDUEL_FULLROSTER_SALARY    330
FANDUEL_POSITION             330
YAHOO_POSITION               256
YAHOO_FULLSLATE_SALARY       256
MINUTES                        0
DAYSREST                       0
USAGE RATE                     0
PTS                            0
STARTER (Y/N)                  0
OPPONENTTEAM                   0
OWNTEAM                        0
PLAYER                         0
PLAYER-ID                      0
DATE                           0
GAME-ID                        0
VENUE (R/H)                    0
DRAFTKINGS_FANTASYPOINTS       0
BL                             0
FANDUEL_FANTASYPOINTS          0
TO                             0
ST                             0
PF                             0
A                              0
TOT                            0
DR                             0
OR                             0
FTA                            0
FT        

##### Position Columns

In [8]:
# Missing Values in the Position Columns
position_df = df.filter(regex=("POSITION"))

# "YAHOO_POSITION" will serve as my primary Position column (since it has the least missing values) 
position_df.isnull().sum().sort_values(ascending=False)

DRAFTKINGS_POSITION    981
POSITION               617
FANDUEL_POSITION       330
YAHOO_POSITION         256
dtype: int64

In [9]:
# Fill any "YAHOO_POSITION" missing values with values from the other position columns
position_df["POSITION_NEW"] = position_df["YAHOO_POSITION"].fillna(position_df["POSITION"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_df["POSITION_NEW"] = position_df["YAHOO_POSITION"].fillna(position_df["POSITION"])


In [10]:
position_df.isnull().sum()

DRAFTKINGS_POSITION    981
FANDUEL_POSITION       330
YAHOO_POSITION         256
POSITION               617
POSITION_NEW             8
dtype: int64

In [11]:
# Inspecting the missing values left, it seems substituting a position from the other columns is not a full-proof solution
position_df[5819:5820]

Unnamed: 0,DRAFTKINGS_POSITION,FANDUEL_POSITION,YAHOO_POSITION,POSITION,POSITION_NEW
5819,,,,,


In [12]:
# Because of this, I will handle the last missing values by using an "Unknown" as a value
position_df["POSITION_NEW"] = position_df["POSITION_NEW"].fillna("Unknown")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_df["POSITION_NEW"] = position_df["POSITION_NEW"].fillna("Unknown")


In [13]:
# I will combine the two 'F-G' and 'G-F' as one value, and 'F-C' and 'C-F' as one value
position_df["POSITION_NEW"].replace({'F-G':'G-F',
                                     'C-F':'F-C'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [14]:
# Remove the old position columns
position_cols = list(position_df.columns.values)
position_cols.remove('POSITION_NEW')
df = df.drop(position_cols, axis=1)

# Concatenate only the newly filled position column with main dataframe
df = pd.concat([df, position_df['POSITION_NEW']], axis=1)

In [15]:
# Rename the GAME-ID column
df = df.rename(columns={"POSITION_NEW":"POSITION"})

##### Salary Columns

In [16]:
# Missing values in the Salary columns
salary_df = df.filter(regex=("SALARY"))

# Fill all missing values in Salary with the mean of the column
fill_mean = lambda col: col.fillna(col.mean())

salary_df = salary_df.apply(fill_mean, axis=0)

In [17]:
# Confirm that the missing values are filled
salary_df.isnull().sum().sort_values(ascending=False)

YAHOO_FULLSLATE_SALARY       0
FANDUEL_FULLROSTER_SALARY    0
DRAFTKINGS_CLASSIC_SALARY    0
dtype: int64

In [18]:
# Remove the old position columns
salary_cols = list(salary_df.columns.values)
df = df.drop(salary_cols, axis=1)

# Concatenate the newly filled position columns with main dataframe
df = pd.concat([df, salary_df], axis=1)

### Export to CSV w/o Dummy Variables

In [19]:
df.to_csv("../data/Cleaned_NBA1819_PlayerStats-DFS.csv")

#### Handle the Categorical Variables

In [20]:
cat_df = df.select_dtypes(include='object')
cat_df.head()

Unnamed: 0,DATASET,PLAYER,OWNTEAM,OPPONENTTEAM,STARTER (Y/N),VENUE (R/H),POSITION
0,NBA 2018-2019 Regular Season,Dario Saric,Philadelphia,Boston,Y,R,PF
1,NBA 2018-2019 Regular Season,Robert Covington,Philadelphia,Boston,Y,R,SF
2,NBA 2018-2019 Regular Season,Joel Embiid,Philadelphia,Boston,Y,R,C
3,NBA 2018-2019 Regular Season,Markelle Fultz,Philadelphia,Boston,Y,R,PG
4,NBA 2018-2019 Regular Season,Ben Simmons,Philadelphia,Boston,Y,R,PG


In [21]:
dummy_cat_df = pd.get_dummies(cat_df)
dummy_cat_df.head()

Unnamed: 0,DATASET_NBA 2018-2019 Regular Season,DATASET_NBA 2019 Playoffs,PLAYER_Aaron Gordon,PLAYER_Aaron Holiday,PLAYER_Abdel Nader,PLAYER_Al Horford,PLAYER_Al-Farouq Aminu,PLAYER_Alan Williams,PLAYER_Alec Burks,PLAYER_Alex Abrines,...,POSITION_C,POSITION_F,POSITION_F-C,POSITION_G,POSITION_G-F,POSITION_PF,POSITION_PG,POSITION_SF,POSITION_SG,POSITION_Unknown
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [22]:
# Concatenate the dummied columns into the main dataframe
cat_cols = list(cat_df.columns.values)
df = df.drop(cat_cols, axis=1)

df_dummy = pd.concat([df, dummy_cat_df], axis=1)
df_dummy.head()

Unnamed: 0,GAME-ID,DATE,PLAYER-ID,MINUTES,USAGE RATE,DAYSREST,DRAFTKINGS_FANTASYPOINTS,FANDUEL_FANTASYPOINTS,YAHOO_FANTASYPOINTS,MIN,...,POSITION_C,POSITION_F,POSITION_F-C,POSITION_G,POSITION_G-F,POSITION_PF,POSITION_PG,POSITION_SF,POSITION_SG,POSITION_Unknown
0,21800001,2018-10-16,203967,22.9,20.38,4,13.5,11.7,11.7,22.9,...,0,0,0,0,0,1,0,0,0,0
1,21800001,2018-10-16,203496,34.22,14.88,4,21.5,22.2,22.2,34.22,...,0,0,0,0,0,0,0,1,0,0
2,21800001,2018-10-16,203954,36.82,32.5,4,44.0,42.0,42.0,36.82,...,1,0,0,0,0,0,0,0,0,0
3,21800001,2018-10-16,1628365,24.33,18.98,4,12.25,11.6,11.6,24.33,...,0,0,0,0,0,0,1,0,0,0
4,21800001,2018-10-16,1627732,42.73,21.25,4,61.75,64.0,64.0,42.73,...,0,0,0,0,0,0,1,0,0,0


### Review the final dataset

In [23]:
df_dummy.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27855 entries, 0 to 27854
Data columns (total 634 columns):
GAME-ID                                 27855 non-null int64
DATE                                    27855 non-null datetime64[ns]
PLAYER-ID                               27855 non-null int64
MINUTES                                 27855 non-null float64
USAGE RATE                              27855 non-null float64
DAYSREST                                27855 non-null int64
DRAFTKINGS_FANTASYPOINTS                27855 non-null float64
FANDUEL_FANTASYPOINTS                   27855 non-null float64
YAHOO_FANTASYPOINTS                     27855 non-null float64
MIN                                     27855 non-null float64
FG                                      27855 non-null int64
FGA                                     27855 non-null int64
3P                                      27855 non-null int64
3PA                                     27855 non-null int64
FT             

### Export to CSV w/ Dummy Variables

In [24]:
df_dummy.to_csv("../data/Cleaned-Dummies_NBA1819_PlayerStats-DFS.csv")