In [1]:
%matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
from bs4 import Comment
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
import scipy.stats as st
import re
from openpyxl import load_workbook
import seaborn as sns
import timeit
import sqlite3
import warnings
warnings.filterwarnings("ignore")

In [2]:
conn = sqlite3.connect('NBA_Fantasy_db.sqlite')
cur = conn.cursor()

In [3]:
def select_games(include_playoffs, skip_days, n_seasons):
    """
    This function filters out games that we don't want to include in the training set
    skip_days sepcifies how many games of the beginning of the regular season to skip
    This is necessary to make sure we have previous games of data to make features
    include_playoffs is a boolean to decide whether to include playoff games in the training set
    If making predictions on regular season games, it may not be a good idea to include playoff games
    Players play more intensely in the playoffs so playoff data may not be predictive of regular season stats
    n_seasons is the number of most recent seasons to include in the training data
    """
    season_start = ['2020-12-22', '2019-10-22', '2018-10-16', '2017-10-17', '2016-10-25']
    season_start = [pd.to_datetime(i).date() for i in season_start]
    skip_days_into_season = [date + pd.Timedelta(days=skip_days) for date in season_start]
    
    if include_playoffs:
        season_end_dates = ['2021-07-23', '2020-10-12', '2019-06-14', '2018-06-09', '2017-06-13']
    else:
        season_end_dates = ['2021-05-17', '2020-08-17', '2019-04-14', '2018-04-15', '2017-04-16', ]
    
    games_columns = ['game_id', 'date', 'home_team_id', 'road_team_id', 'home_team_pts', 'road_team_pts', 
                     'season_id','day_of_week', 'start_time', 'road_injuries', 'home_injuries']
    df_games = pd.DataFrame(columns=games_columns)
    
    for i in range(n_seasons):
        cur.execute('SELECT * FROM Game WHERE date BETWEEN ? AND ?', (skip_days_into_season[i], season_end_dates[i]))
        df_season = pd.DataFrame(cur.fetchall(), columns=games_columns)
        df_games = pd.concat([df_games, df_season])
    
    return df_games

In [4]:
def create_injury_feature(df_games):
    """
    This function is called in the 'make_training_data' function
    It takes a pandas data frame as input
    This data frame comes from the Game table in the database
    This function converts to list of injured players into a feature 
    This feature is the percent of team's salary cap paid to inactive/injured players for game of interest
    """
    # determine total player salary for each team for each season
    cur.execute('''
    SELECT se.team_id, se.id, sum(sa.amount) FROM Salary as sa
        JOIN Player as p
            on p.id = sa.player_id
        Join Season as se
            on se.id = sa.season_id AND p.id = se.player_id
        GROUP BY se.team_id, se.id
    ''')
    df_salary_cap = pd.DataFrame(cur.fetchall(), columns=['team_id', 'season_id', 'salary_cap'])
    df_salary_cap = df_salary_cap.set_index(keys=['team_id', 'season_id'])
    
    # determine total salary of inactive players
    def convert_inactive_csv_to_salary(row):
        season_id = row[0]
        csv = row[1]
        # convert csv into list of id's
        inactive_list = [i for i in re.split(',', csv)]
        
        # iterate over id's and get cumulative sum of salaries
        inactive_salary = 0
        for player_id in inactive_list:
            try:
                cur.execute('''SELECT amount FROM SALARY WHERE player_id == ? AND season_id = ?''', (player_id, season_id))
                inactive_salary += cur.fetchone()[0]
            except:
                pass
        return inactive_salary
    
    df_games['road_injuries'] = df_games[['season_id', 'road_injuries']].apply(
        lambda x: convert_inactive_csv_to_salary(x), axis=1)
    
    df_games['home_injuries'] = df_games[['season_id', 'home_injuries']].apply(
        lambda x: convert_inactive_csv_to_salary(x), axis=1)
    
    df_games = df_games.rename(columns={'road_injuries': 'road_inactive_salary', 
                                        'home_injuries': 'home_inactive_salary'})
        
    # divide inactive_salary by respective salary_cap
    def convert_salary_to_fraction_of_cap(row):
        team_id = row[0]
        season_id = row[1]
        inactive_salary = row[2]
        salary_cap = df_salary_cap.loc[team_id, season_id].values[0]
        return inactive_salary / salary_cap
        
    df_games['road_inactive_salary'] = df_games[['road_team_id', 'season_id', 'road_inactive_salary']].apply(
        lambda x: convert_salary_to_fraction_of_cap(x), axis=1)
    df_games['home_inactive_salary'] = df_games[['home_team_id', 'season_id', 'home_inactive_salary']].apply(
        lambda x: convert_salary_to_fraction_of_cap(x), axis=1)
    
    return df_games

In [5]:
def create_percent_of_season_feature(row):
    """
    This function is used in the 'make_training_data' function
    It is used in the map function of the pandas data frame
    The data frame comes from the Game table in the database
    It calculates approximately how far each game is from the end of the season as a percentage
    Values less than one would be in the regular season
    Values greater than one would be in the playoffs (assuming playoff games are included in the dataset)
    Note that this is tricky for 2019-2020 pandemic season where teams played different numbers of games
    For this season, I will just approximate every team as playing a 72 game season
    """
    game_id = row[0]
    home_team_id = row[1]
    road_team_id = row[2]
    season_id = row[3]
    
    # get home team's previous games (both road and home games)
    cur.execute('''
    SELECT home_team_id, season_id, count(id) FROM Game
        WHERE id <= ?
        GROUP BY home_team_id, season_id
        HAVING home_team_id == ? AND season_id == ?
    ''', (game_id, home_team_id, season_id))
    try:
        home_home_games = cur.fetchone()[2]
    except:
        home_home_games = 0
    
    cur.execute('''
    SELECT road_team_id, season_id, count(id) FROM Game
        WHERE id <= ?
        GROUP BY road_team_id, season_id
        HAVING road_team_id == ? AND season_id == ?
    ''', (game_id, home_team_id, season_id))
    try:
        home_road_games = cur.fetchone()[2]
    except:
        home_road_games = 0
    
    home_game_number = home_home_games + home_road_games
    
    # get home team's previous games (both road and home games)
    cur.execute('''
    SELECT road_team_id, season_id, count(id) FROM Game
        WHERE id <= ?
        GROUP BY road_team_id, season_id
        HAVING road_team_id == ? AND season_id == ?
    ''', (game_id, road_team_id, season_id))
    try:
        road_road_games = cur.fetchone()[2]
    except:
        road_road_games = 0
    
    cur.execute('''
    SELECT home_team_id, season_id, count(id) FROM Game
        WHERE id <= ?
        GROUP BY home_team_id, season_id
        HAVING home_team_id == ? AND season_id == ?
    ''', (game_id, road_team_id, season_id))
    try:
        road_home_games = cur.fetchone()[2]
    except:
        road_home_games = 0
    
    road_game_number = road_road_games + road_home_games
    
    # average how far each team is into the season
    avg_game_number = np.mean([home_game_number, road_game_number])
    
    if season_id == 4 or season_id == 5: # pandemic seasons - tricky since teams played different number of games
        return avg_game_number / 72
    else:
        return avg_game_number / 82

In [6]:
def convert_pts_to_avg(df_games, avg_games):
    """
    The home_team_pts and road_team_pts in df_games are the points scored in the game of interest. This needs to
    be converted to how many points the team and opponent have scored recently (averaged over avg_games games)
    """
    def get_avg_pts(row):
        game_id = row[0]
        home_team_id = row[2]
        road_team_id = row[3]
        home_pts = []
        road_pts = []
        
        ###### home team
        # get previous game ids for home team
        cur.execute('''
        SELECT id FROM Game
            WHERE (home_team_id == ? OR road_team_id == ?) AND id < ?
            ORDER BY id DESC
            LIMIT ?
        ''', (home_team_id, home_team_id, game_id, avg_games))
        prev_games_ids = [i[0] for i in cur.fetchall()]
        
        # get points scored in previous games, accounting for the fact they could have been home or road team
        for prev_game_id in prev_games_ids:
            cur.execute('''
            SELECT home_team_pts FROM Game
                WHERE home_team_id == ? and id == ?
            ''', (home_team_id, prev_game_id))
            try:
                home_pts.append(cur.fetchone()[0])
            except:
                cur.execute('''
                SELECT road_team_pts FROM Game
                    WHERE road_team_id == ? and id == ?
                ''', (home_team_id, prev_game_id))
                home_pts.append(cur.fetchone()[0])
        
        ###### road team
        # get previous game ids for road team
        cur.execute('''
        SELECT id FROM Game
            WHERE (home_team_id == ? OR road_team_id == ?) AND id < ?
            ORDER BY id DESC
            LIMIT ?
        ''', (road_team_id, road_team_id, game_id, avg_games))
        prev_games_ids = [i[0] for i in cur.fetchall()]
        
        # get points scored in previous games, accounting for the fact they could have been home or road team
        for prev_game_id in prev_games_ids:
            cur.execute('''
            SELECT home_team_pts FROM Game
                WHERE home_team_id == ? and id == ?
            ''', (road_team_id, prev_game_id))
            try:
                road_pts.append(cur.fetchone()[0])
            except:
                cur.execute('''
                SELECT road_team_pts FROM Game
                    WHERE road_team_id == ? and id == ?
                ''', (road_team_id, prev_game_id))
                road_pts.append(cur.fetchone()[0])
        
        # get average points scored
        home_team_avg_pts = np.mean(home_pts)
        road_team_avg_pts = np.mean(road_pts)
        
        return pd.Series([home_team_avg_pts, road_team_avg_pts])
    
    df_games[['home_team_pts', 'road_team_pts']] = df_games.apply(lambda x: get_avg_pts(x), axis=1)
    df_games.rename(columns={'home_team_pts': 'home_team_avg_pts', 
                             'road_team_pts': 'road_team_avg_pts'}, inplace=True)
    return df_games
        

In [7]:
def get_team_stats_df(df_games, avg_games):
    """
    This function returns a data frame for each team for each game
    The data include team stats averaged over the most recent avg_games
    e.g. if avg_games = 3, the data include stats averaged over the three games prior to game of interest
    """
    def get_team_stats(row):
        game_id = row[0]
        team_id = row[1]
        
        if avg_games == 1:
            cur.execute('''
            SELECT team_Pace, team_eFGp, team_TOVp, team_ORBp, team_FTvFGA, team_ORtg
                FROM TeamStats
                WHERE game_id < ? AND team_id = ?
                ORDER BY game_id DESC
                Limit ?''', (game_id, team_id, avg_games))
            
            team_stats = [game_id, team_id] + list(cur.fetchone())
            return pd.Series(team_stats)
        else:
            cur.execute('''
            SELECT avg(team_Pace), avg(team_eFGp), avg(team_TOVp), avg(team_ORBp), avg(team_FTvFGA), avg(team_ORtg)
                FROM TeamStats
                WHERE game_id < ? AND team_id = ?
                ORDER BY game_id DESC
                Limit ?''', (game_id, team_id, avg_games))

            team_stats = [game_id, team_id] + [round(i, 3) for i in cur.fetchone()]
            return pd.Series(team_stats)
    
    
    team_stats_columns = ['game_id', 'team_id', 'team_Pace', 'team_eFGp', 'team_TOVp', 'team_ORBp',
                          'team_FTvFGA', 'team_ORtg']
    df_road_stats = pd.DataFrame(columns=team_stats_columns)
    df_home_stats = pd.DataFrame(columns=team_stats_columns)
    
    df_road_stats[team_stats_columns] = df_games[['game_id', 'road_team_id']].apply(
        lambda x: get_team_stats(x), axis=1)
    
    df_home_stats[team_stats_columns] = df_games[['game_id', 'home_team_id']].apply(
        lambda x: get_team_stats(x), axis=1)
    
    df_road_stats['home'] = 0
    df_home_stats['home'] = 1
    
    df_team_stats = pd.concat([df_home_stats, df_road_stats])
    
    return df_team_stats

In [8]:
def get_player_stats_df(df_games_players, avg_games, min_minutes):
    """
    This function returns a data frame for each player in each game
    The data include stats averaged over the most recent avg_games
    e.g. if avg_games = 3, the data include stats averaged over the three games prior to game of interest
    min_minutes is the minimum number of minutes a player must average over the last avg_games to be included in
        the training set
    """
    def get_player_stats(row):
        game_id = row[0]
        player_id = row[1]
        if avg_games == 1:
            cur.execute('''
            SELECT * FROM
                (SELECT min, FG, FGA, ThreeP, ThreePA, FT, FTA, ORB, DRB, AST, STL, BLK, TOV, PF, PTS, 
                    PlusMinus, TSp, eFGp, ThreePAr, FTr, ORBp, DRBp, TRBp, ASTp, STLp, BLKp, TOVp, USGp, ORtg, 
                    DRtg, BPM
                FROM PlayerStats
                WHERE game_id < ? AND player_id = ?
                ORDER BY game_id DESC
                Limit ?)       
            WHERE min > ?       
            ''', (game_id, player_id, avg_games, min_minutes))
            try:
                stats = list(cur.fetchone())
                cur.execute('''
                SELECT team_id FROM PlayerStats
                    WHERE game_id == ? AND player_id == ?
                ''', (game_id, player_id))
                team_id = cur.fetchone()[0]    
                player_stats = [game_id, player_id, team_id] + stats
                return player_stats
            except:
                return [None]*34
        else: 
            cur.execute('''
            SELECT * FROM
                (SELECT avg(min) as avg_min, avg(FG), avg(FGA), avg(ThreeP), avg(ThreePA), avg(FT), 
                    avg(FTA), avg(ORB), avg(DRB), avg(AST), avg(STL), avg(BLK), avg(TOV), avg(PF), avg(PTS), 
                    avg(PlusMinus), avg(TSp), avg(eFGp), avg(ThreePAr), avg(FTr), avg(ORBp), avg(DRBp), avg(TRBp), 
                    avg(ASTp), avg(STLp), avg(BLKp), avg(TOVp), avg(USGp), avg(ORtg), avg(DRtg), avg(BPM)
                FROM PlayerStats
                WHERE game_id < ? AND player_id == ?
                ORDER BY game_id DESC
                Limit ?)       
            WHERE avg_min > ?       
            ''', (game_id, player_id, avg_games, min_minutes))
            try:                
                stats = [round(i, 3) for i in cur.fetchone()]                
                cur.execute('''
                SELECT team_id FROM PlayerStats
                    WHERE game_id == ? AND player_id == ?
                ''', (game_id, player_id))
                team_id = cur.fetchone()[0]             
                player_stats = [game_id, player_id, team_id] + stats
                return pd.Series(player_stats)
            except:
                return [None]*34  

    player_stats_columns = ['game_id', 'player_id', 'team_id', 'min', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 
                            'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-', 'TS%', 'eFG%', '3PAr', 'FTr', 
                            'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%','BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM']   
    
    df_player_stats = pd.DataFrame(columns=player_stats_columns)
        
    df_player_stats[player_stats_columns] = df_games_players.apply(get_player_stats, 
                                                                           axis=1, 
                                                                           result_type='expand')
    
    # get rid of NaN rows
    df_player_stats = df_player_stats[~df_player_stats['min'].isnull()]
    
    return df_player_stats

In [9]:
def combine_dfs(df_games, df_team_stats, df_player_stats, avg_games):
    """
    This function combines the three data frames into one containing features on the game of interest, team and 
    opponent recent performance, and player recent performance.
    """
    # merge df_games with df_players_stats
    df = df_games.merge(df_player_stats, how='inner', on=['game_id'])
        
    # merge df with team stats
    df = df.merge(df_team_stats, how='inner', on=['team_id', 'game_id'])
    
    df_home = df[df['team_id'] == df['home_team_id']]
    df_road = df[df['team_id'] == df['road_team_id']]
    
    df_home.rename(columns={'road_team_id': 'opp_id'}, inplace=True)
    df_road.rename(columns={'home_team_id': 'opp_id'}, inplace=True)
    
    df_home.drop(columns='home_team_id', inplace=True)
    df_road.drop(columns='road_team_id', inplace=True)
    
    df = pd.concat([df_home, df_road])
        
    ##### merge df with opp stats (use diff btw opponents in previous avg_games games and current opponenet)
    # this will create features that will compare the quality of the current opponent to previous opponents
    
    # get stats for current opponent averaged over last avg_games games
    def get_opp_stats(row):
        game_id = row[0]
        opp_id = row[2]   
        mask1 = df_team_stats['team_id'] == opp_id
        mask2 = df_team_stats['game_id'] == game_id
        mask = mask1 & mask2
        stats = df_team_stats[mask].iloc[:, -7:-1].values.reshape(-1)
        return  pd.Series(stats)
    
    # get average stats of team's opponents from the last avg_games games
    def get_prev_opp_avg_stats(row):
        game_id = row[0]
        team_id = row[12]
        # get ids of previous games the team played in
        cur.execute('''
        SELECT id FROM Game
            WHERE (home_team_id == ? OR road_team_id == ?) AND id < ?
            ORDER BY id DESC
            LIMIT ?
        ''', (team_id, team_id, game_id, avg_games))
        prev_game_ids = [i[0] for i in cur.fetchall()]
        
        cols = ['team_Pace', 'team_eFGp', 'team_TOVp', 'team_ORBp', 'team_FTvFGA', 'team_ORtg']
        prev_opp_stats = pd.DataFrame(columns=cols)
        for prev_game_id in prev_game_ids:
            cur.execute('''
            SELECT team_Pace, team_eFGp, team_TOVp, team_ORBp, team_FTvFGA, team_ORtg FROM TeamStats
                WHERE game_id == ? AND team_id != ?
            ''', (prev_game_id, team_id))
            prev_opp_stats = pd.concat([prev_opp_stats, pd.DataFrame(cur.fetchall(), columns=cols)])
        prev_opp_stats = prev_opp_stats.mean(axis=0).values.reshape(-1)
        
        return pd.Series(prev_opp_stats)
        
    opp_columns = ['opp_diff_Pace', 'opp_diff_eFGp', 'opp_diff_TOVp', 'opp_diff_ORBp',
                   'opp_diff_FTvFGA', 'opp_diff_ORtg']
    
    df_opp = pd.DataFrame(columns=opp_columns)
    df_prev_opp = pd.DataFrame(columns=opp_columns)
    
    df_opp[opp_columns] = df.apply(lambda x: get_opp_stats(x), axis=1)
    
    df_prev_opp[opp_columns] = df.apply(lambda x: get_prev_opp_avg_stats(x), axis=1)
    
    # take diff between stats of current opponent and previous opponents
    df_diff_opp = df_opp - df_prev_opp
    
    # merge with df
    df_train = pd.concat([df, df_diff_opp], axis=1)
        
    return df_train

In [10]:
def convert_home_road_to_team_opp_pts(df_train):
    """
    This function converts columns for home and road teams and converts them into team or opponent based on 
    whether the player of interest is on the home or road team.
    """
    
    df_train['team_avg_pts'] = np.where(df_train['home'] == 1, 
                                        df_train['home_team_avg_pts'],
                                        df_train['road_team_avg_pts']
                                       )
    df_train['opp_avg_pts'] = np.where(df_train['home'] == 1, 
                                        df_train['road_team_avg_pts'],
                                        df_train['home_team_avg_pts']
                                       )
    df_train['team_inactive_salary'] = np.where(df_train['home'] == 1, 
                                        df_train['home_inactive_salary'],
                                        df_train['road_inactive_salary']
                                       )
    df_train['opp_inactive_salary'] = np.where(df_train['home'] == 1, 
                                        df_train['road_inactive_salary'],
                                        df_train['home_inactive_salary']
                                       )
    df_train.drop(columns=['home_team_avg_pts','road_team_avg_pts',
                           'road_inactive_salary','home_inactive_salary'], inplace=True)
    return df_train

In [11]:
def convert_data_types(df_train):
    """
    This function takes in the training set and converts columns into the proper data type.
    """
    cols_to_int = ['player_id', 'team_id', 'opp_id', 'game_id', 'season_id', 'day_of_week']
    for col in cols_to_int:
        df_train[col] = df_train[col].astype(np.int64)
        
    df_train['date'] = pd.to_datetime(df_train['date'])
    
    # convert start_time to numeric value where 12pm = 0 and 12am = 1
    def convert_start_time(time):
        time = time[:-1] # get rid of 'p'
        [hour, minute] = re.split(':', time)
        hour = int(hour)
        minute = int(minute)
        if hour == 12:
            hour = 0
        time = (hour + (minute / 60)) / 12
        return time
        
    df_train['start_time'] = df_train['start_time'].apply(lambda x: convert_start_time(x))
    
    # fill null values with 0
    null_cols = ['TS%', 'eFG%', '3PAr', 'FTr', 'AST%', 'TOV%']
    for col in null_cols:
        df_train[col].where(df_train[col] != '', other='0', inplace=True)
        df_train[col] = df_train[col].astype(np.float32)
    
    return df_train

In [12]:
def get_target_variable(df_train):
    """
    This function adds columns to the dataset for three potential targets of interest: points, rebounds, and assists
    """
    
    def get_target(row):
        player_id = row[0]
        game_id = row[3]
        cur.execute('''
        SELECT PTS, ORB, DRB, AST FROM PlayerStats
            WHERE player_id = ? AND game_id = ?
        ''', (player_id, game_id))
        stats = cur.fetchone()
        return pd.Series([stats[0], stats[1] + stats[2], stats[3]])
    
    df_train[['target_PTS', 'target_TRB', 'target_AST']] = df_train.apply(get_target, axis=1)
    
    return df_train

# Main Code

In [13]:
n_seasons = 5
avg_games = 5
skip_days = 18
min_minutes = 25
include_playoffs = True
filename = 'train_data/n_seasons{}_avgGames{}_playoffs{}_skipDays{}_minmin{}.csv'.format(n_seasons,
                                                                                         avg_games,
                                                                                         str(include_playoffs),
                                                                                         skip_days,
                                                                                         min_minutes,)

In [14]:
##### Select games of interest
df_games = select_games(include_playoffs, skip_days, n_seasons)

In [15]:
##### convert csv list of injured player id's into feature --> percent of salary cap paid to injured players
df_games = create_injury_feature(df_games)

In [16]:
##### add feature for how far the game is into the season
df_games['percent_season'] = df_games[['game_id', 'home_team_id', 'road_team_id', 'season_id']].apply(
    lambda x: create_percent_of_season_feature(x), axis=1)

In [17]:
##### convert pts scored into avg of previous avg_games
df_games = convert_pts_to_avg(df_games, avg_games)

In [18]:
##### get team stats averaged over the previous avg_games
df_team_stats = get_team_stats_df(df_games, avg_games) 

In [19]:
##### get player stats averaged over the previous avg_games
# get data frame of player id and game_id
df_games_players = pd.DataFrame(columns=['game_id', 'player_id'])
for game_id in df_games['game_id'].unique():
    cur.execute('SELECT game_id, player_id FROM PlayerStats WHERE game_id = ?', (game_id,))
    df_games_players = pd.concat([df_games_players, pd.DataFrame(cur.fetchall(), columns=['game_id', 'player_id'])])
    
df_player_stats = get_player_stats_df(df_games_players, avg_games, min_minutes)

In [20]:
##### Combine the three data frames into one training set
df_train = combine_dfs(df_games, df_team_stats, df_player_stats, avg_games)

In [21]:
##### Convert home/road into team/opp
df_train = convert_home_road_to_team_opp_pts(df_train)

In [22]:
##### Re-order columns
cols = ['player_id', 'team_id', 'opp_id', 'game_id', 'season_id', 'date', 'home', 'day_of_week','start_time', 
        'percent_season', 'team_avg_pts', 'opp_avg_pts', 'team_inactive_salary', 'opp_inactive_salary', 
        'min', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 
        '+/-', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 
        'ORtg', 'DRtg', 'BPM', 'team_Pace', 'team_eFGp', 'team_TOVp', 'team_ORBp', 'team_FTvFGA', 'team_ORtg',
        'opp_diff_Pace', 'opp_diff_eFGp','opp_diff_TOVp', 'opp_diff_ORBp', 'opp_diff_FTvFGA', 'opp_diff_ORtg']
df_train = df_train[cols]

In [23]:
##### convert to appropriate data types
df_train = convert_data_types(df_train)

In [24]:
##### get target variable:
df_train = get_target_variable(df_train)

In [25]:
##### save to train_data folder
df_train.to_csv(filename)
conn.close()

# Debugging

In [26]:
df_train.shape

(48443, 60)

In [27]:
df_train.head()

Unnamed: 0,player_id,team_id,opp_id,game_id,season_id,date,home,day_of_week,start_time,percent_season,...,team_ORtg,opp_diff_Pace,opp_diff_eFGp,opp_diff_TOVp,opp_diff_ORBp,opp_diff_FTvFGA,opp_diff_ORtg,target_PTS,target_TRB,target_AST
0,7,23,8,5208,5,2021-01-09,1,5,0.25,0.131944,...,109.87,-8.224,0.0374,1.181,5.348,-0.0238,6.715,11,11,2
1,436,23,8,5208,5,2021-01-09,1,5,0.25,0.131944,...,109.87,-8.224,0.0374,1.181,5.348,-0.0238,6.715,11,3,5
14,464,4,1,5209,5,2021-01-09,1,5,0.583333,0.131944,...,109.799,0.97,-0.0442,-0.292,-1.065,0.03,-5.471,13,1,3
15,649,4,1,5209,5,2021-01-09,1,5,0.583333,0.131944,...,109.799,0.97,-0.0442,-0.292,-1.065,0.03,-5.471,2,3,1
16,651,4,1,5209,5,2021-01-09,1,5,0.583333,0.131944,...,109.799,0.97,-0.0442,-0.292,-1.065,0.03,-5.471,9,5,7
