# Imports

In [29]:
# This is what we're working with
import warnings
warnings.filterwarnings("ignore")

import requests
import nba_api
from nba_api.stats.endpoints import teamdashboardbygeneralsplits, leaguedashteamstats
from nba_api.stats.endpoints import leaguegamelog, scoreboard, leaguestandings

from datetime import date, timedelta
import time


# import shap 
# shap.initjs()
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

from imblearn.over_sampling import SMOTE

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# Map every team name the NBA API may report to its Team ID: the 30 current
# franchises plus the former names some of them used in earlier seasons.
# NOTE: get_daily_matches() resolves an ID back to a name by scanning this dict and
# keeping the LAST name that matches, so a franchise's former name must be listed
# before its current name (e.g. "New Jersey Nets" before "Brooklyn Nets").
teams = {"Atlanta Hawks": 1610612737,
        "Boston Celtics": 1610612738,
        "New Jersey Nets": 1610612751,
        "Brooklyn Nets": 1610612751,
        "Charlotte Bobcats": 1610612766,
        "Charlotte Hornets": 1610612766,
        "Chicago Bulls": 1610612741,
        "Cleveland Cavaliers": 1610612739,
        "Dallas Mavericks": 1610612742,
        "Denver Nuggets": 1610612743,
        "Detroit Pistons": 1610612765,
        "Golden State Warriors": 1610612744,
        "Houston Rockets": 1610612745,
        "Indiana Pacers": 1610612754,
        "LA Clippers": 1610612746,
        "Los Angeles Clippers": 1610612746,
        "Los Angeles Lakers": 1610612747,
        "Memphis Grizzlies": 1610612763,
        "Miami Heat": 1610612748,
        "Milwaukee Bucks": 1610612749,
        "Minnesota Timberwolves": 1610612750,
        "New Orleans Hornets": 1610612740,
        "New Orleans Pelicans": 1610612740,
        "New York Knicks": 1610612752,
        "Oklahoma City Thunder": 1610612760,
        "Orlando Magic": 1610612753,
        "Philadelphia 76ers": 1610612755,
        "Phoenix Suns": 1610612756,
        "Portland Trail Blazers": 1610612757,
        "Sacramento Kings": 1610612758,
        "San Antonio Spurs": 1610612759,
        "Toronto Raptors": 1610612761,
        "Utah Jazz": 1610612762,
        "Washington Wizards": 1610612764,
    }

# Advanced Stats
Stats like field goal percentage, rebounds, and turnovers are easily digested by NBA viewers. Some people don't like stats. They would rather apply the eye-test and see for themselves whether a team is any good. Others realize that stats can tell a story about the game but only if they know how to use them. Advanced stats play this role and help us dissect the drama unfolding on the court. Therefore, traditional and advanced stats will be cast in our models.

### True Shooting Percentage
There are 3 ways that an NBA player can score: 3-pointers, 2-pointers and free throws. True shooting percentage ('TS_PCT') looks at all three. 3-pointers are a little tricky to factor into the equation. The max true shooting percentage is 150% and can only be reached if a player hits every one of their shots and they're all from behind the arc. Because this stat accounts for all shots, it's easily the best measure of shooting ability. 

For example, if a player goes 1-for-1 and their only shot is a made 3-pointer from the hash mark, the formula will read and simplify as follows (please just trust and accept that the .44 multiplier is the best way of estimating the total number of possessions a player is involved in):

$$\frac{points} {2 * (fga + .44 * fta)}$$

$$\frac{3}  {2 * (1 + .44 * 0)} = \frac{3}{2} = {1.5}$$

In [3]:
# Map each stat column to the measure type it is read from.
# Key order matters: it is the order in which stats are written into each game row.
available_stats = {
    **dict.fromkeys(('W_PCT', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB',
                     'AST', 'TOV', 'STL', 'BLK', 'PLUS_MINUS'), 'Base'),
    **dict.fromkeys(('OFF_RATING', 'DEF_RATING', 'TS_PCT'), 'Advanced'),
}

### Offensive and defensive rating
Basketball stresses efficiency. Minimizing points allowed and maximizing points scored on each possession is more important than overall totals. Totals are shaped by variables like pace — or the number of possessions a team gets in a game — which differs depending on coaching (e.g. the Golden State Warriors averaged 3 fewer possessions per game than the Los Angeles Lakers last season).

This is where tempo-free stats offensive and defensive rating come into play. Defensive rating shows how many points a player allows per 100 possessions. This statistic functions differently than a plus/minus system, where all points scored while a player is on the court count against them. Only the shots that are scored as a result of their defensive lapses are counted against them. 

Offensive rating is simpler to calculate. It's just the amount of points produced by a player per 100 possessions. Again, the reason offensive and defensive ratings are useful is because they're tempo-free stats. Offensive and defensive rating eliminate factors like pace of play and minutes played per game. Below is the formula for offensive rating:

$$ \frac{100*pp} {fga + .44 * fta + to}$$

# Webscraping

In [4]:
from nba_api.stats.endpoints import teamdashboardbygeneralsplits, leaguedashteamstats


def get_team_stats_dict(team, start_date, end_date, season='2021-22'):
    """
    Fetch one team's stats over a date window and return them as a flat dictionary.

    :param team: Team name; must be a key of the module-level `teams` dictionary
    :param start_date: First day of the window in form 'mm/dd/yyyy'
    :param end_date: Last day of the window in form 'mm/dd/yyyy'
    :param season: Season in form 'yyyy-yy', default is '2021-22'
    :return: A dictionary {stat_name: value} holding the ten stats read from the
             'Per100Possessions' request followed by OFF_RATING, DEF_RATING and
             TS_PCT read from the 'Advanced' request
    """
    per_100_columns = ('W_PCT', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB',
                       'AST', 'TOV', 'STL', 'BLK', 'PLUS_MINUS')
    advanced_columns = ('OFF_RATING', 'DEF_RATING', 'TS_PCT')

    # Pause for a second before calling the API
    time.sleep(1)

    # First request: teamdashboardbygeneralsplits in 'Per100Possessions' mode
    per_100_response = teamdashboardbygeneralsplits.TeamDashboardByGeneralSplits(
        team_id=teams[team],
        per_mode_detailed='Per100Possessions',
        season=season,
        date_from_nullable=start_date,
        date_to_nullable=end_date,
        timeout=120)

    # Only the first 'OverallTeamDashboard' row is used
    per_100_row = per_100_response.get_normalized_dict()['OverallTeamDashboard'][0]
    all_stats_dict = {column: per_100_row[column] for column in per_100_columns}

    # Second request: same endpoint, 'Advanced' measure type
    advanced_response = teamdashboardbygeneralsplits.TeamDashboardByGeneralSplits(
        team_id=teams[team],
        measure_type_detailed_defense='Advanced',
        season=season,
        date_from_nullable=start_date,
        date_to_nullable=end_date,
        timeout=120)

    advanced_row = advanced_response.get_normalized_dict()['OverallTeamDashboard'][0]

    # Advanced stats go after the per-100 stats, keeping the original key order
    for column in advanced_columns:
        all_stats_dict[column] = advanced_row[column]

    return all_stats_dict

In [5]:
# Example call: Golden State's stats from 10/19/2021 through 04/10/2022 of the 2021-22 season
get_team_stats_dict(team='Golden State Warriors',
                    start_date='10/19/2021',
                    end_date='04/10/2022',
                    season='2021-22')

{'W_PCT': 0.646,
 'FG_PCT': 0.469,
 'FG3_PCT': 0.364,
 'FT_PCT': 0.769,
 'REB': 45.9,
 'AST': 27.4,
 'TOV': 15.0,
 'STL': 8.9,
 'BLK': 4.6,
 'PLUS_MINUS': 5.6,
 'OFF_RATING': 112.1,
 'DEF_RATING': 106.6,
 'TS_PCT': 0.582}

In [9]:
from nba_api.stats.endpoints import leaguegamelog, scoreboard, leaguestandings

def get_match_results(date, season, season_type='Playoffs'):
    """
    Return the matchups played on a given day together with their results.

    :param date: Day of games in form 'mm/dd/yyyy'
    :param season: Season in form 'yyyy-yy'
    :param season_type: Value passed to the API as `season_type_all_star`. Defaults to
                        'Playoffs', the value this function previously hard-coded.
    :return: A list [daily_match, win_loss, score, game_id] where
             daily_match is {home_team: away_team},
             win_loss holds the home team's 'WL' value for each game,
             score is a flat list [home_pts, away_pts, home_pts, away_pts, ...],
             game_id holds each game's 'GAME_ID',
             e.g. [{'Boston Celtics': 'Golden State Warriors'}, ['L'], [90, 103], ['0042100406']]
    """

    # Load leaguegamelog to access the game logs for the requested day and season type
    game_log = leaguegamelog.LeagueGameLog(season=season,
                                           league_id='00',
                                           date_from_nullable=date,
                                           date_to_nullable=date,
                                           season_type_all_star=season_type,
                                           timeout=120)
    # Move into game_log dictionary: one row per team per game
    list_of_teams = game_log.get_normalized_dict()['LeagueGameLog']

    # Pair rows by GAME_ID instead of assuming the two rows of a game are adjacent.
    # Dict insertion order keeps games in the order they first appear in the log.
    rows_by_game = {}
    for row in list_of_teams:
        rows_by_game.setdefault(row['GAME_ID'], []).append(row)

    daily_match = {}
    win_loss = []
    score = []
    game_id = []

    for current_game_id, game_rows in rows_by_game.items():

        # A usable game has exactly two rows (one per team); anything else
        # cannot be turned into a home/away pair, so it is left out.
        if len(game_rows) != 2:
            continue

        first_row, second_row = game_rows

        # An '@' in MATCHUP marks the row of the away team
        if '@' in first_row['MATCHUP']:
            away_row, home_row = first_row, second_row
        else:
            home_row, away_row = first_row, second_row

        # Home team win or loss
        win_loss.append(home_row['WL'])

        # Game ID
        game_id.append(current_game_id)

        # Home team score first, then away team score
        score.append(home_row['PTS'])
        score.append(away_row['PTS'])

        daily_match.update({home_row['TEAM_NAME']: away_row['TEAM_NAME']})

    match_results = [daily_match, win_loss, score, game_id]

    return match_results

In [7]:
def get_daily_matches(date):
    """
    Build a dictionary of the game matchups scheduled on a given day.

    :param date: Day of games scheduled in form 'mm/dd/yyyy'
    :return: A dictionary of game matchups {home_team: away_team}
    """

    # Load scoreboard to access each game's 'HOME_TEAM_ID' and 'VISITOR_TEAM_ID'
    daily_match = scoreboard.Scoreboard(league_id='00', game_date=date, timeout=120)

    # Move into daily_match dictionary
    daily_match_dict = daily_match.get_normalized_dict()
    games = daily_match_dict['GameHeader']

    # Reverse lookup (Team ID -> team name), built once instead of scanning `teams`
    # twice per game. When several names share an ID, the name listed last in
    # `teams` wins, which is what the previous full scan produced as well.
    team_names = {team_id: team for team, team_id in teams.items()}

    match = {}

    # Loop through games
    for game in games:

        home_team = team_names.get(game['HOME_TEAM_ID'])
        away_team = team_names.get(game['VISITOR_TEAM_ID'])

        # An ID missing from `teams` used to silently reuse the name found for the
        # previous game (or fail on the first game); report and skip it instead.
        if home_team is None or away_team is None:
            print(f"Skipping game with unknown team ID: "
                  f"{game['HOME_TEAM_ID']} vs {game['VISITOR_TEAM_ID']}")
            continue

        # Update the match dictionary with the Home and Away team names
        match.update({home_team: away_team})

    return match

In [10]:
# Run both get_daily_matches() and get_match_results() functions
def main():
    """
    Demo: print the matchups and the match results for 06/16/2022.

    Both dates use the 'mm/dd/yyyy' form documented by the two functions.
    A later cell in this notebook also defines main(); once that cell runs it
    replaces this definition.
    """
    print(f"""
    'get_daily_matches()' returns a dictionary of the games on a specified date\n{get_daily_matches('06/16/2022')}\n
    """)
    
    print(f"""
    'get_match_results()' returns the matchup plus the result\n{get_match_results('06/16/2022', '2021-22')}
    """)
    
# 1. Return a dictionary of the games on a specified date
# 2. Return the matchup plus the game results
main()


    'get_daily_matches()' returns a dictionary of the games on a specified date
{'Boston Celtics': 'Golden State Warriors'}

    

    'get_match_results()' returns the matchup plus the result
[{'Boston Celtics': 'Golden State Warriors'}, ['L'], [90, 103], ['0042100406']]
    


In [12]:
from datetime import date, timedelta
import requests

def to_dataframe(daily_games, start_date, end_date, season):
    """
    Build one row of features per game from a day's match results.

    Despite its name, this returns a plain list of rows, not a DataFrame.

    :param daily_games: Output of get_match_results():
                        [daily_match, win_loss, score, game_id]
    :param start_date: First day of the stats window in form 'mm/dd/yyyy'
    :param end_date: Last day of the stats window in form 'mm/dd/yyyy'
    :param season: Season in form 'yyyy-yy'
    :return: A list with one row per matchup in daily_games[0]:
             [home_team, away_team, game_id, home_score, <home stats>,
              away_score, <away stats>, result]
             Stats follow the key order of `available_stats`; result is 1 when the
             game's entry in daily_games[1] is 'W', otherwise 0.
    """
    full_dataframe = []
    matchups = daily_games[0]  # {home_team: away_team}
    daily_results = daily_games[1]  # Win or loss for each game
    score = daily_games[2]  # Flat list: [home_pts, away_pts, home_pts, away_pts, ...]
    game_id = daily_games[3]  # Game ID for each game

    # game_number matches each matchup with its result, scores and Game ID
    for game_number, (home_team, away_team) in enumerate(matchups.items()):

        # Pull home team stats
        home_team_stats = get_team_stats_dict(home_team, start_date, end_date, season)

        # Pull away team stats
        away_team_stats = get_team_stats_dict(away_team, start_date, end_date, season)

        # Index into the flat score list rather than pop(0), so the caller's
        # list is left intact
        home_score = score[2 * game_number]
        away_score = score[2 * game_number + 1]

        current_game = [home_team, away_team, game_id[game_number], home_score]

        # Append home team stats
        current_game.extend(home_team_stats[stat] for stat in available_stats)

        current_game.append(away_score)

        # Append away team stats
        current_game.extend(away_team_stats[stat] for stat in available_stats)

        # Assign 1 for a W and 0 for an L
        current_game.append(1 if daily_results[game_number] == 'W' else 0)

        print(current_game)

        # Append full game stats to full_dataframe list
        full_dataframe.append(current_game)

    return full_dataframe

In [13]:
# Generator over the calendar days of a scraping window
def date_range(start_date, end_date):
    """
    Yield each day from start_date up to, but not including, end_date.

    :param start_date: First day to yield
    :param end_date: Day at which to stop; it is not yielded itself
    :return: A generator of consecutive days; empty when end_date <= start_date
    """
    span_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_days:
        yield start_date + timedelta(offset)
        offset += 1
        
        
# Collect the game rows for every day of a date range
def training_set(start_year, start_month, start_day, end_year, end_month, end_day, season, season_start):
    """
    Scrape every game between two dates and return one row per game.

    For each day, team stats are requested for the window from season_start to the
    day before the game.

    :param start_year, start_month, start_day: First day to scrape
    :param end_year, end_month, end_day: Day at which to stop (not scraped itself)
    :param season: Season in form 'yyyy-yy'
    :param season_start: Start of the stats window in form 'mm/dd/yyyy'
    :return: A list of game rows as built by to_dataframe(), each with the game
             date ('mm/dd/yyyy') appended
    """
    first_day = date(start_year, start_month, start_day)
    stop_day = date(end_year, end_month, end_day)

    total_games = []

    for game_day in date_range(first_day, stop_day):
        current_date = game_day.strftime('%m/%d/%Y')
        print(current_date)

        # Stats window ends the day before the games
        day_before = (game_day - timedelta(days=1)).strftime('%m/%d/%Y')

        day_results = get_match_results(current_date, season)
        day_rows = to_dataframe(day_results, season_start, day_before, season)

        # Stamp each row with its game date, then add the day's rows to the total
        for row in day_rows:
            row.append(current_date)
        total_games.extend(day_rows)

    print(total_games)
    return total_games

In [14]:
# Turn the scraped game rows into a DataFrame with named columns
def make_dataframe(game_list):
    """
    Build a DataFrame from the scraped game rows and print it.

    :param game_list: List of game rows, 33 values each, in the column order below
    :return: A DataFrame with columns Home, Away, Game_ID, H_Score, the home stats
             (H_ prefix), A_Score, the away stats (A_ prefix), Result and Date
    """
    stat_names = ['W_PCT', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TOV',
                  'STL', 'BLK', 'PLUS_MINUS', 'OFF_RATING', 'DEF_RATING', 'TS_PCT']

    column_names = ['Home', 'Away', 'Game_ID', 'H_Score']
    column_names += ['H_' + stat for stat in stat_names]
    column_names.append('A_Score')
    column_names += ['A_' + stat for stat in stat_names]
    column_names += ['Result', 'Date']

    games = pd.DataFrame(game_list, columns=column_names)

    print(games)
    return games

In [26]:
# Function extracts NBA game data for a specified time period
def main():
    """
    Scrape the games from 05/16/2017 up to (not including) 06/13/2017 of the
    2016-17 season and save them to a CSV file.

    The scrape is retried up to 10 times when the NBA API request times out; each
    retry starts the whole date range again. The last timeout is re-raised.
    """
    import os  # local import: only needed to make sure the output folder exists

    attempts = 10

    # Relative path, matching the './data/...' paths the loading cells read from
    output_path = './data/nba_df_2016_playoffs_v2.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    for i in range(attempts):
        try:
            # Plug in date ranges for game data you'd like to webscrape
            all_games = training_set(start_year=2017, 
                                     start_month=5, 
                                     start_day=16, 
                                     end_year=2017, 
                                     end_month=6, 
                                     end_day=13,
                                     season='2016-17', 
                                     season_start='10/25/2016')

        except requests.exceptions.ReadTimeout:

            # Out of attempts: let the timeout propagate
            if i >= attempts - 1:
                raise

            # Give the API a short break before starting over
            time.sleep(5)
            continue

        # Aligns data with correct columns in new DataFrame
        # (make_dataframe() already prints the result)
        df = make_dataframe(all_games)

        # Convert game data to a csv file
        df.to_csv(output_path, index=False)

        break

        
# Initialize webscraping of game data 
# NOTE: running this cell starts the full scrape
if __name__ == '__main__':
    main()

05/16/2017
['Golden State Warriors', 'San Antonio Spurs', '0041600312', 136, 0.817, 0.495, 0.383, 0.788, 44.0, 30.1, 14.6, 9.5, 6.7, 11.5, 114.8, 103.4, 0.597, 100, 0.744, 0.469, 0.391, 0.797, 45.9, 24.9, 14.1, 8.4, 6.2, 7.5, 110.3, 102.9, 0.564, 1]
05/17/2017
['Boston Celtics', 'Cleveland Cavaliers', '0041600301', 104, 0.646, 0.454, 0.359, 0.807, 43.0, 25.8, 13.6, 7.7, 4.2, 2.7, 110.6, 108.0, 0.567, 117, 0.622, 0.47, 0.384, 0.748, 44.7, 23.2, 14.0, 6.7, 4.1, 3.3, 112.8, 109.7, 0.58, 0]
05/18/2017
05/19/2017
['Boston Celtics', 'Cleveland Cavaliers', '0041600302', 86, 0.646, 0.454, 0.359, 0.807, 43.0, 25.8, 13.6, 7.7, 4.2, 2.7, 110.6, 108.0, 0.567, 130, 0.622, 0.47, 0.384, 0.748, 44.7, 23.2, 14.0, 6.7, 4.1, 3.3, 112.8, 109.7, 0.58, 0]
05/20/2017
['San Antonio Spurs', 'Golden State Warriors', '0041600313', 108, 0.744, 0.469, 0.391, 0.797, 45.9, 24.9, 14.1, 8.4, 6.2, 7.5, 110.3, 102.9, 0.564, 120, 0.817, 0.495, 0.383, 0.788, 44.0, 30.1, 14.6, 9.5, 6.7, 11.5, 114.8, 103.4, 0.597, 0]
05/21/

# Seasons
After web scraping for a few days we've amassed 7 seasons of data, from the 2015-16 season to the most recent 2021-22 season. Some of the web scraping had to be done in parts because the NBA API would error out. Doing it like this was slower, but we avoid errors and the annoyance of losing the data we've collected during the scrape. Slow and steady wins the race.
## 2015 - 16

In [None]:
# # Read csv, convert date time, assign new column 'Season' with the correct year
# df_2015 = pd.read_csv('./data/nba_df_2015.csv')
# df_2015['Date'] = pd.to_datetime(df_2015['Date'])
# df_2015['Season'] = '2015-16'

# df_2015_2 = pd.read_csv('./data/nba_df_2015_v2.csv')
# df_2015_2['Date'] = pd.to_datetime(df_2015_2['Date'])
# df_2015_2['Season'] = '2015-16'

# df_2015_3 = pd.read_csv('./data/nba_df_2015_v3.csv')
# df_2015_3['Date'] = pd.to_datetime(df_2015_3['Date'])
# df_2015_3['Season'] = '2015-16'

# df_2015_4 = pd.read_csv('./data/nba_df_2015_v4.csv')
# df_2015_4['Date'] = pd.to_datetime(df_2015_4['Date'])
# df_2015_4['Season'] = '2015-16'

# df_2015_5 = pd.read_csv('./data/nba_df_2015_v5.csv')
# df_2015_5['Date'] = pd.to_datetime(df_2015_5['Date'])
# df_2015_5['Season'] = '2015-16'

# df_2015_6 = pd.read_csv('./data/nba_df_2015_v6.csv')
# df_2015_6['Date'] = pd.to_datetime(df_2015_6['Date'])
# df_2015_6['Season'] = '2015-16'

# df_2015_7 = pd.read_csv('./data/nba_df_2015_v7.csv')
# df_2015_7['Date'] = pd.to_datetime(df_2015_7['Date'])
# df_2015_7['Season'] = '2015-16'

# df_2015_playoffs = pd.read_csv('./data/nba_df_2015_playoffs.csv')
# df_2015_playoffs['Date'] = pd.to_datetime(df_2015_playoffs['Date'])
# df_2015_playoffs['Season'] = '2015-16'


# # print(len(df_2015), len(df_2015_2), len(df_2015_3), len(df_2015_4), len(df_2015_5), len(df_2015_6), len(df_2015_7), len(df_2015_playoffs))
# print(len(df_2015), len(df_2015_2), len(df_2015_3), len(df_2015_4), len(df_2015_5), len(df_2015_6), len(df_2015_7))

In [None]:
# # Concat all DataFrames to make one for the season
# frames = [df_2015, df_2015_2, df_2015_3, df_2015_4, df_2015_5, df_2015_6, df_2015_7, df_2015_playoffs]
# frames = [df_2015, df_2015_2, df_2015_3, df_2015_4, df_2015_5, df_2015_6, df_2015_7]
# df_2015_final = pd.concat(frames)
# len(df_2015_final)

## 2016 - 17

In [None]:
# Read each 2016-17 CSV chunk, parse 'Date' as datetime, and tag rows with the season
(df_2016, df_2016_2, df_2016_3, df_2016_4,
 df_2016_5, df_2016_6, df_2016_7, df_2016_8) = [
    pd.read_csv(csv_path).assign(Date=lambda chunk: pd.to_datetime(chunk['Date']),
                                 Season='2016-17')
    for csv_path in ('./data/nba_df_2016.csv',
                     './data/nba_df_2016_v2.csv',
                     './data/nba_df_2016_v3.csv',
                     './data/nba_df_2016_v4.csv',
                     './data/nba_df_2016_v5.csv',
                     './data/nba_df_2016_v6.csv',
                     './data/nba_df_2016_v7.csv',
                     './data/nba_df_2016_v8.csv')
]

# df_2016_playoffs = pd.read_csv('./data/nba_df_2016_playoffs.csv')
# df_2016_playoffs['Date'] = pd.to_datetime(df_2016_playoffs['Date'])
# df_2016_playoffs['Season'] = '2016-17'



# Row count of every chunk
# print(len(df_2016), len(df_2016_2), len(df_2016_3), len(df_2016_4), len(df_2016_5), len(df_2016_6), len(df_2016_7), len(df_2016_8), len(df_2016_playoffs))
print(*(len(chunk) for chunk in (df_2016, df_2016_2, df_2016_3, df_2016_4,
                                 df_2016_5, df_2016_6, df_2016_7, df_2016_8)))

In [None]:
# Stack the 2016-17 chunks into one DataFrame for the season and show its row count
# frames = [df_2016, df_2016_2, df_2016_3, df_2016_4, df_2016_5, df_2016_6, df_2016_7, df_2016_8, df_2016_playoffs]
frames = [df_2016, df_2016_2, df_2016_3, df_2016_4,
          df_2016_5, df_2016_6, df_2016_7, df_2016_8]
df_2016_final = pd.concat(frames, axis=0)
df_2016_final.shape[0]

## 2017 - 18

In [None]:
# Read each 2017-18 CSV chunk, parse 'Date' as datetime, and tag rows with the season
df_2017, df_2017_2, df_2017_3 = [
    pd.read_csv(csv_path).assign(Date=lambda chunk: pd.to_datetime(chunk['Date']),
                                 Season='2017-18')
    for csv_path in ('./data/nba_df_2017.csv',
                     './data/nba_df_2017_v2.csv',
                     './data/nba_df_2017_v3.csv')
]

# df_2017_playoffs = pd.read_csv('./data/nba_df_2017_playoffs.csv')
# df_2017_playoffs['Date'] = pd.to_datetime(df_2017_playoffs['Date'])
# df_2017_playoffs['Season'] = '2017-18'


# Row count of every chunk
# print(len(df_2017), len(df_2017_2), len(df_2017_3), len(df_2017_playoffs))
print(*(len(chunk) for chunk in (df_2017, df_2017_2, df_2017_3)))

In [None]:
# Stack the 2017-18 chunks into one DataFrame for the season and show its row count
# frames = [df_2017, df_2017_2, df_2017_3, df_2017_playoffs]
frames = [df_2017, df_2017_2, df_2017_3]
df_2017_final = pd.concat(frames, axis=0)
df_2017_final.shape[0]

## 2018 - 19

In [None]:
# Read the 2018-19 CSV, parse 'Date' as datetime, and tag rows with the season
df_2018 = pd.read_csv('./data/nba_df_2018.csv').assign(
    Date=lambda chunk: pd.to_datetime(chunk['Date']),
    Season='2018-19')

# df_2018_playoffs = pd.read_csv('./data/nba_df_2018_playoffs.csv')
# df_2018_playoffs['Date'] = pd.to_datetime(df_2018_playoffs['Date'])
# df_2018_playoffs['Season'] = '2018-19'


# Row count
# print(len(df_2018), len(df_2018_playoffs))
print(df_2018.shape[0])

In [None]:
# # Concat all DataFrames to make one for the season
# # frames = [df_2018, df_2018_playoffs]
# frames = [df_2018]

# df_2018_final = pd.concat(frames)
# len(df_2018_final)

## 2019 - 20

In [None]:
# Read each 2019-20 CSV chunk, parse 'Date' as datetime, and tag rows with the season
df_2019, df_2019_2 = [
    pd.read_csv(csv_path).assign(Date=lambda chunk: pd.to_datetime(chunk['Date']),
                                 Season='2019-20')
    for csv_path in ('./data/nba_df_2019.csv',
                     './data/nba_df_2019_2.csv')
]

# df_2019_playoffs = pd.read_csv('./data/nba_df_2019_playoffs.csv')
# df_2019_playoffs['Date'] = pd.to_datetime(df_2019_playoffs['Date'])
# df_2019_playoffs['Season'] = '2019-20'


# Row count of every chunk
# print(len(df_2019), len(df_2019_2), len(df_2019_playoffs))
print(*(len(chunk) for chunk in (df_2019, df_2019_2)))

In [None]:
# Stack the 2019-20 chunks into one DataFrame for the season and show its row count
# frames = [df_2019, df_2019_2, df_2019_playoffs]
frames = [df_2019, df_2019_2]
df_2019_final = pd.concat(frames, axis=0)
df_2019_final.shape[0]

## 2020 - 21

In [None]:
# Read each 2020-21 CSV chunk, parse 'Date' as datetime, and tag rows with the season
# (the files are named nba_df_2020*, the variables df_2021*)
df_2021, df_2021_2, df_2021_3 = [
    pd.read_csv(csv_path).assign(Date=lambda chunk: pd.to_datetime(chunk['Date']),
                                 Season='2020-21')
    for csv_path in ('./data/nba_df_2020_v0.csv',
                     './data/nba_df_2020.csv',
                     './data/nba_df_2020_v2.csv')
]

# df_2021_playoffs = pd.read_csv('./data/nba_df_2020_playoffs.csv')
# df_2021_playoffs['Date'] = pd.to_datetime(df_2021_playoffs['Date'])
# df_2021_playoffs['Season'] = '2020-21'


# Row count of every chunk
# print(len(df_2021), len(df_2021_2), len(df_2021_3), len(df_2021_playoffs))
print(*(len(chunk) for chunk in (df_2021, df_2021_2, df_2021_3)))

In [None]:
# Stack the 2020-21 chunks into one DataFrame for the season and show its row count
# frames = [df_2021, df_2021_2, df_2021_3, df_2021_playoffs]
frames = [df_2021, df_2021_2, df_2021_3]
df_2021_final = pd.concat(frames, axis=0)
df_2021_final.shape[0]

## 2021 - 22

In [None]:
# Read each 2021-22 CSV chunk, parse 'Date' as datetime, and tag rows with the season
df_2022, df_2022_1, df_2022_2, df_2022_3 = [
    pd.read_csv(csv_path).assign(Date=lambda chunk: pd.to_datetime(chunk['Date']),
                                 Season='2021-22')
    for csv_path in ('./data/nba_game_2022.csv',
                     './data/nba_game_2022_v1.csv',
                     './data/nba_game_2022_v2.csv',
                     './data/nba_game_2022_v3.csv')
]

# df_2022_playoffs = pd.read_csv('./data/nba_df_2022_playoffs.csv')
# df_2022_playoffs['Date'] = pd.to_datetime(df_2022_playoffs['Date'])
# df_2022_playoffs['Season'] = '2021-22'


# Row count of every chunk
# print(len(df_2022), len(df_2022_1), len(df_2022_2), len(df_2022_3), len(df_2022_playoffs))
print(*(len(chunk) for chunk in (df_2022, df_2022_1, df_2022_2, df_2022_3)))

In [None]:
# Stack the 2021-22 chunks into one DataFrame for the season and report its row count
# frames = [df_2022, df_2022_1, df_2022_2, df_2022_3, df_2022_playoffs]
frames = [df_2022, df_2022_1, df_2022_2, df_2022_3]
df_2022_final = pd.concat(frames, axis=0)

print(f"Length of 2022 data: {len(df_2022_final)}\n")

## All NBA Seasons

In [None]:
# Concat all the season DataFrames into one DataFrame for processing 
# frames = [df_2015_final, df_2016_final, df_2017_final, df_2018_final, df_2019_final, df_2021_final, df_2022_final]
# NOTE: df_2018 is used directly; 2018-19 has a single file and the cell that
# would build df_2018_final is commented out.
frames = [df_2016_final, df_2017_final, df_2018, df_2019_final, df_2021_final, df_2022_final]

# ignore_index=True gives the combined frame a fresh 0..n-1 index
df = pd.concat(frames, ignore_index=True)

# Shape it up: print the shape, then display the frame as the cell's output
# (the former `print(...), df` built a tuple and displayed `(None, df)` as text)
print(f"df shape: {df.shape}\n")
df

In [None]:
# Count the missing values in every column (expected: zero everywhere)
df.isna().sum()

In [None]:
# Collect rows that fully repeat an earlier row (expected: none)
duplicates = df.loc[df.duplicated()]
duplicates