# Project 01 - Write a Data Science Blog Post

## Data
 - NBA 2018-2019 Player Box Scores 
 - NBA 2018-2019 Daily Fantasy Scores (DFS)
 
## Business Questions
1. What are the key drivers for top fantasy scores?
2. What effect, if any, does seasonality play during the NBA season? 
3. Which positions are the most valuable from a fantasy score perspective? 

### Import Packages

In [1]:
import numpy as np
import pandas as pd

### Import Data

#### Player Box Scores Dataset

In [2]:
# Import Player stats data
stats = pd.read_excel('../data/NBA-18-19-Player-BoxScore.xlsx', sheet_name='NBA-2018-19-PLAYER')

# Data Cleaning
stats.head(3)

Unnamed: 0,DATASET,GAME-ID,DATE,PLAYER-ID,PLAYER \nFULL NAME,POSITION,OWN \nTEAM,OPPONENT \nTEAM,VENUE\n(R/H),STARTER\n(Y/N),...,DR,TOT,A,PF,ST,TO,BL,PTS,USAGE \nRATE (%),DAYS\nREST
0,NBA 2018-2019 Regular Season,21800001,10/16/2018,203967,Dario Saric,F,Philadelphia,Boston,R,Y,...,6,6,1,5,0,3,0,6,20.382575,3+
1,NBA 2018-2019 Regular Season,21800001,10/16/2018,203496,Robert Covington,F,Philadelphia,Boston,R,Y,...,5,6,0,1,2,2,1,8,14.880005,3+
2,NBA 2018-2019 Regular Season,21800001,10/16/2018,203954,Joel Embiid,F-C,Philadelphia,Boston,R,Y,...,8,10,2,3,1,5,2,23,32.498788,3+


#### Daily Fantasy Score (DFS) Dataset

In [3]:
# Import DFS data
dfs = pd.read_excel('../data/NBA-18-19-DFS.xlsx', sheet_name='NBA-2018-19-DFS')
dfs.head(3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 11,POSITION,Unnamed: 13,Unnamed: 14,SALARY ($),Unnamed: 16,Unnamed: 17,FANTASY POINTS SCORED,Unnamed: 19,Unnamed: 20
0,DATASET,GAME ID,DATE,PLAYER ID,PLAYER,OWN\nTEAM,OPPONENT\nTEAM,STARTER (Y/N),VENUE (R/H),MINUTES,...,DAYS\nREST,DRAFTKINGS,FANDUEL,YAHOO,"DRAFTKINGS\n""Classic""\nGame Style","FANDUEL\n""Full Roster""\nGame Style","YAHOO\n""Full Slate""\nGame Style",DRAFTKINGS,FANDUEL,YAHOO
1,NBA 2018-2019 Regular Season,0021800001,10/16/2018,203967,Dario Saric,Philadelphia,Boston,Y,R,22.9,...,3+,SF/PF,PF,PF,5500,6400,25,13.5,11.7,11.7
2,NBA 2018-2019 Regular Season,0021800001,10/16/2018,203496,Robert Covington,Philadelphia,Boston,Y,R,34.22,...,3+,SF/PF,SF,SF,4700,6500,23,21.5,22.2,22.2


In [4]:
# Update column headers
# Position
dfs.iloc[0,12] = 'DRAFTKINGS_POSITION'
dfs.iloc[0,13] = 'FANDUEL_POSITION'
dfs.iloc[0,14] = 'YAHOO_POSITION'

# Salary
dfs.iloc[0,15] = "DRAFTKINGS_CLASSIC_SALARY"
dfs.iloc[0,16] = "FANDUEL_FULLROSTER_SALARY"
dfs.iloc[0,17] = "YAHOO_FULLSLATE_SALARY"

# Fantasy Points Scored
dfs.iloc[0,18] = "DRAFTKINGS_FANTASYPOINTS"
dfs.iloc[0,19] = "FANDUEL_FANTASYPOINTS"
dfs.iloc[0,20] = "YAHOO_FANTASYPOINTS"

In [5]:
# Reformat columns and headers
dfs.columns = dfs.iloc[0]
dfs = dfs[1:]

# Rename the GAME-ID column
dfs = dfs.rename(columns={"GAME ID":"GAME-ID", 
                          "PLAYER ID":"PLAYER-ID"})

# Convert Data types
convert_dict = {
    "GAME-ID": int,
    "PLAYER-ID": int
}

dfs = dfs.astype(convert_dict)

# Preview
dfs.head(3)

Unnamed: 0,DATASET,GAME-ID,DATE,PLAYER-ID,PLAYER,OWN\nTEAM,OPPONENT\nTEAM,STARTER (Y/N),VENUE (R/H),MINUTES,...,DAYS\nREST,DRAFTKINGS_POSITION,FANDUEL_POSITION,YAHOO_POSITION,DRAFTKINGS_CLASSIC_SALARY,FANDUEL_FULLROSTER_SALARY,YAHOO_FULLSLATE_SALARY,DRAFTKINGS_FANTASYPOINTS,FANDUEL_FANTASYPOINTS,YAHOO_FANTASYPOINTS
1,NBA 2018-2019 Regular Season,21800001,10/16/2018,203967,Dario Saric,Philadelphia,Boston,Y,R,22.9,...,3+,SF/PF,PF,PF,5500,6400,25,13.5,11.7,11.7
2,NBA 2018-2019 Regular Season,21800001,10/16/2018,203496,Robert Covington,Philadelphia,Boston,Y,R,34.22,...,3+,SF/PF,SF,SF,4700,6500,23,21.5,22.2,22.2
3,NBA 2018-2019 Regular Season,21800001,10/16/2018,203954,Joel Embiid,Philadelphia,Boston,Y,R,36.82,...,3+,C,C,C,8800,10400,41,44.0,42.0,42.0


### Merge Datasets

In [6]:
df = pd.merge(dfs, stats, on=['GAME-ID', 'PLAYER-ID'])
df.head()

Unnamed: 0,DATASET_x,GAME-ID,DATE_x,PLAYER-ID,PLAYER,OWN\nTEAM,OPPONENT\nTEAM,STARTER (Y/N),VENUE (R/H),MINUTES,...,DR,TOT,A,PF,ST,TO,BL,PTS,USAGE \nRATE (%),DAYS\nREST_y
0,NBA 2018-2019 Regular Season,21800001,10/16/2018,203967,Dario Saric,Philadelphia,Boston,Y,R,22.9,...,6,6,1,5,0,3,0,6,20.382575,3+
1,NBA 2018-2019 Regular Season,21800001,10/16/2018,203496,Robert Covington,Philadelphia,Boston,Y,R,34.22,...,5,6,0,1,2,2,1,8,14.880005,3+
2,NBA 2018-2019 Regular Season,21800001,10/16/2018,203954,Joel Embiid,Philadelphia,Boston,Y,R,36.82,...,8,10,2,3,1,5,2,23,32.498788,3+
3,NBA 2018-2019 Regular Season,21800001,10/16/2018,1628365,Markelle Fultz,Philadelphia,Boston,Y,R,24.33,...,3,3,2,1,1,3,0,5,18.975299,3+
4,NBA 2018-2019 Regular Season,21800001,10/16/2018,1627732,Ben Simmons,Philadelphia,Boston,Y,R,42.73,...,12,15,8,5,4,3,2,19,21.251164,3+


### Data Cleaning

#### Drop unncessary columns and convert data types

In [7]:
# Drop redundant columns
del_cols = ['DATASET_y', 'DATE_y', 'DAYS\nREST_y', 
            'OPPONENT \nTEAM', 'OWN \nTEAM', 
            'PLAYER \nFULL NAME', 'STARTER\n(Y/N)', 
            'USAGE \nRATE (%)', 'VENUE\n(R/H)']

df = df.drop(del_cols, axis=1)

# Reformat column headers
df.columns = [value.replace('_x','').replace('\n','') for value in df.columns]

In [8]:
# Convert Data types: Ints and Floats
convert_dict = {
    "MINUTES": float,
    "DRAFTKINGS_CLASSIC_SALARY": float,
    "FANDUEL_FULLROSTER_SALARY": float,
    "YAHOO_FULLSLATE_SALARY": float,
    "DRAFTKINGS_FANTASYPOINTS": float,
    "FANDUEL_FANTASYPOINTS": float,
    "YAHOO_FANTASYPOINTS": float,
    "USAGE RATE": float
}

df = df.astype(convert_dict)

# Convert Data types: Datetime
df['DATE'] = pd.to_datetime(df['DATE'])

In [9]:
df.head()

Unnamed: 0,DATASET,GAME-ID,DATE,PLAYER-ID,PLAYER,OWNTEAM,OPPONENTTEAM,STARTER (Y/N),VENUE (R/H),MINUTES,...,FTA,OR,DR,TOT,A,PF,ST,TO,BL,PTS
0,NBA 2018-2019 Regular Season,21800001,2018-10-16,203967,Dario Saric,Philadelphia,Boston,Y,R,22.9,...,0,0,6,6,1,5,0,3,0,6
1,NBA 2018-2019 Regular Season,21800001,2018-10-16,203496,Robert Covington,Philadelphia,Boston,Y,R,34.22,...,0,1,5,6,0,1,2,2,1,8
2,NBA 2018-2019 Regular Season,21800001,2018-10-16,203954,Joel Embiid,Philadelphia,Boston,Y,R,36.82,...,5,2,8,10,2,3,1,5,2,23
3,NBA 2018-2019 Regular Season,21800001,2018-10-16,1628365,Markelle Fultz,Philadelphia,Boston,Y,R,24.33,...,2,0,3,3,2,1,1,3,0,5
4,NBA 2018-2019 Regular Season,21800001,2018-10-16,1627732,Ben Simmons,Philadelphia,Boston,Y,R,42.73,...,10,3,12,15,8,5,4,3,2,19


#### Handle Missing Values

In [10]:
# Determine which variables have missing values (Position & Salary)
df.isnull().sum().sort_values(ascending=False)

DRAFTKINGS_CLASSIC_SALARY    981
DRAFTKINGS_POSITION          981
POSITION                     617
FANDUEL_FULLROSTER_SALARY    330
FANDUEL_POSITION             330
YAHOO_POSITION               256
YAHOO_FULLSLATE_SALARY       256
MINUTES                        0
DAYSREST                       0
USAGE RATE                     0
PTS                            0
STARTER (Y/N)                  0
OPPONENTTEAM                   0
OWNTEAM                        0
PLAYER                         0
PLAYER-ID                      0
DATE                           0
GAME-ID                        0
VENUE (R/H)                    0
DRAFTKINGS_FANTASYPOINTS       0
BL                             0
FANDUEL_FANTASYPOINTS          0
TO                             0
ST                             0
PF                             0
A                              0
TOT                            0
DR                             0
OR                             0
FTA                            0
FT        

In [17]:
# Missing Values in the Position Columns
position_df = df.filter(regex=("POSITION"))
position_df

Unnamed: 0,DRAFTKINGS_POSITION,FANDUEL_POSITION,YAHOO_POSITION,POSITION
0,SF/PF,PF,PF,F
1,SF/PF,SF,SF,F
2,C,C,C,F-C
3,PG/SG,PG,PG,G
4,PG/SF,PG,PG,G-F
...,...,...,...,...
27850,,,,F
27851,,,,C
27852,,,,G
27853,,,,G


In [12]:
# "YAHOO_POSITION" will serve as my primary Position column (since it has the least missing values) 
position_df.isnull().sum().sort_values(ascending=False)

DRAFTKINGS_POSITION    981
POSITION               617
FANDUEL_POSITION       330
YAHOO_POSITION         256
dtype: int64

In [13]:
# Fill any "YAHOO_POSITION" missing values with values from the other position columns
position_df["POSITION_NEW"] = position_df["YAHOO_POSITION"].fillna(position_df["POSITION"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_df["POSITION_NEW"] = position_df["YAHOO_POSITION"].fillna(position_df["POSITION"])


In [14]:
position_df.isnull().sum()

DRAFTKINGS_POSITION    981
FANDUEL_POSITION       330
YAHOO_POSITION         256
POSITION               617
POSITION_NEW             8
dtype: int64

In [18]:
position_df.apply(lambda x: x[x.first_valid_index()], axis=1)

KeyError: (None, 'occurred at index 5819')

In [20]:
position_df[5819:5820]

Unnamed: 0,DRAFTKINGS_POSITION,FANDUEL_POSITION,YAHOO_POSITION,POSITION
5819,,,,


#### Handle the Categorical Variables

In [None]:
df.info()

### Export to Excel

In [None]:
#df.to_csv("../data/Cleaned_NBA1819_PlayerStats-DFS.csv")