In [1]:
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
import os.path
from os import path
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
def getData(filename, start_date, end_date):
    nba_teams = teams.get_teams()
    if (path.exists(filename) != True):
        
        team_id = nba_teams[0]['id']
        gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable = start_date , date_to_nullable = end_date , team_id_nullable = team_id)
        games = gamefinder.get_data_frames()[0]
        games.to_csv(filename,index=False)
        
    if (path.exists(filename) == True):   
        
        old_df = pd.read_csv(filename)
        last_id = old_df['TEAM_ID'][len(old_df)-1]
        start_id = int(last_id) + 1
        
        while start_id <= 1610612766:
            old_df = pd.read_csv(filename)
            gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable = start_date , date_to_nullable = end_date , team_id_nullable = start_id)
            games = gamefinder.get_data_frames()[0]
            new_df = old_df.append(games)
            new_df.to_csv(filename, index=False)
            start_id = start_id + 1

In [3]:
def error_handle(count, filename, start_date, end_date):
    try:
        print("-----try is running-----")
        # put csv name here
        getData(filename, start_date, end_date)
        count = 0
        
    except:
        if count < 25:
            print("-----exception handled-----", count)
            error_handle(count + 1,filename, start_date, end_date)
        else:
            print("-----max tries exceeded-----")
    
    nba_teams = teams.get_teams()
    csv_df = pd.read_csv(filename)
    cdf = csv_df.sort_values(['TEAM_ABBREVIATION','GAME_DATE'] , ascending=[True, True])
    cdf.to_csv(filename, index=False)
    
    return rolling_average_stats(filename, 'ten_day-' + filename)

In [4]:
#name of csv to read, name of csv to write
def rolling_average_stats(r_filename, w_filename):  
    print('Inside rolling_average_stats')
    
    nba_teams = teams.get_teams()
    csv_df = pd.read_csv(r_filename)

    list_points = []
    list_team_points = []
    x = 1
    for team in nba_teams:
        team_df = csv_df[csv_df['TEAM_ID'] == team['id']]
        for col in team_df.columns[9:]:
            team_df['AV_'+ col] = team_df[col].rolling(window=10).mean()
            team_df['AV_'+ col] = team_df['AV_'+ col].shift(1) #add code here
        head = list(team_df.columns.values)
        if x == 1:
#             new_df = team_df
            team_df.to_csv(w_filename, header=head, index=False)
            x = x+1

        else:
#             new_df.append(team_df)
            team_df.to_csv(w_filename, mode='a', header=False, index=False)
    combine_and_clean(w_filename, "combined-" + w_filename)
    return ("combined-" + w_filename)
#     z_data = get_zscore_for_one_year(combine_and_clean(w_filename, "combined-" + w_filename))
#     z_data.to_csv("z_data_for_" + "combined-" + w_filename)
#     return "z_data_for_" + "combined-" + w_filename
#     z_data.to_csv("z_data_for_" + "combined-" + w_filename)

In [5]:
def combine_team_games(df, keep_method='home'):
    '''Combine a TEAM_ID-GAME_ID unique table into rows by game. Slow.

        Parameters
        ----------
        df : Input DataFrame.
        keep_method : {'home', 'away', 'winner', 'loser', ``None``}, default 'home'
            - 'home' : Keep rows where TEAM_A is the home team.
            - 'away' : Keep rows where TEAM_A is the away team.
            - 'winner' : Keep rows where TEAM_A is the losing team.
            - 'loser' : Keep rows where TEAM_A is the winning team.
            - ``None`` : Keep all rows. Will result in an output DataFrame the same
                length as the input DataFrame.
                
        Returns
        -------
        result : DataFrame
    '''
    # Join every row to all others with the same game ID.
    joined = pd.merge(df, df, suffixes=['_A', '_B'],
                      on=['SEASON_ID', 'GAME_ID', 'GAME_DATE'])
    # Filter out any row that is joined to itself.
    result = joined[joined.TEAM_ID_A != joined.TEAM_ID_B]
    # Take action based on the keep_method flag.
    if keep_method is None:
        # Return all the rows.
        pass
    elif keep_method.lower() == 'home':
        # Keep rows where TEAM_A is the home team.
        result = result[result.MATCHUP_A.str.contains(' vs. ')]
    elif keep_method.lower() == 'away':
        # Keep rows where TEAM_A is the away team.
        result = result[result.MATCHUP_A.str.contains(' @ ')]
    elif keep_method.lower() == 'winner':
        result = result[result.WL_A == 'W']
    elif keep_method.lower() == 'loser':
        result = result[result.WL_A == 'L']
    else:
        raise ValueError(f'Invalid keep_method: {keep_method}')
    return result
    

In [6]:
# uses combine function and cleans csv
# ten_day_csv is the csv with the rolling ten day averages for a year
# combined_csv is the returned csv with teams combined with their matchups
def combine_and_clean(ten_day_csv, combined_csv):
    print('Inside combine_and_clean')
    
    attempt = pd.read_csv(ten_day_csv,index_col=[0])
    attempt = attempt.drop(['PTS','FGM','FGA','FG_PCT','FG3M','FG3A','FG3_PCT','FTM','FTA','FT_PCT','OREB','DREB','REB','AST','STL','BLK','TOV','PF', 'PLUS_MINUS'],axis=1)

    count = 0
    for row in attempt.iterrows():
        if (count == 0):
            catch = attempt.loc[attempt['GAME_ID'] == row[1]['GAME_ID']]
            catch = pd.DataFrame(catch)
            combine = combine_team_games(catch)
            combine.to_csv(combined_csv, index=False)
            count = count + 1
        else: 
            old_df = pd.read_csv(combined_csv)
            catch = attempt.loc[attempt['GAME_ID'] == row[1]['GAME_ID']]
            catch = pd.DataFrame(catch)
            combine = combine_team_games(catch)
            new_df = old_df.append(combine)
            new_df.to_csv(combined_csv, index=False)
    
    clean = pd.read_csv(combined_csv)
    # drops duplicates, sort by game date, and replace W with 1 and L with 0
    cleaned = clean.drop_duplicates(subset='GAME_ID')
    cleaned = cleaned.sort_values('GAME_DATE')
    cleaned['WL_A'] = cleaned['WL_A'].replace(['W','L'],[1,0])
    cleaned['WL_B'] = cleaned['WL_B'].replace(['W','L'],[1,0])
    cleaned.to_csv(combined_csv, index=False)
    return combined_csv



In [7]:
def get_zscore_for_one_year(cleaned_csv):
    print("Inside get_zscore_for_one_year")
    
    data = pd.read_csv(cleaned_csv)
    z_data = pd.DataFrame(columns = ['WL', 'PTS', 'FGM' , 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'])
    z_data['WL'] = data['WL_A']
    for column in z_data.columns[1:]:
        z_data[column] = data['AV_' + column + '_A'] - data['AV_' + column + '_B']
    z_data = z_data.dropna()
    return z_data

In [15]:
def get_zscores(year1_cleaned_csv, year2_cleaned_csv, year3_cleaned_csv):
    
    df1 = get_zscore_for_one_year(year1_cleaned_csv)
    df2 = get_zscore_for_one_year(year2_cleaned_csv)
    df3 = get_zscore_for_one_year(year3_cleaned_csv)
    
    df1 = df1.append(df2)
    df1 = df1.append(df3)
    df1.to_csv("all_zscores.csv", index=False)
    return performLogReg(df1)

In [9]:
# Creates the logistic regression model and tests accuracy
def performLogReg(dataframe):

    # Update if new stats are added
    featureColumns = ['PTS', 'FGM', 'FGA', 'FG3_PCT', 'FTA','REB', 'AST',  'STL', 'TOV']

    X = dataframe[featureColumns] # Features
    Y = dataframe['WL'] # Target Variable

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)
    logreg = LogisticRegression()

    logreg.fit(X_train, Y_train)  # Fits model with data
    filename = 'finalized_model.sav'
    pickle.dump(logreg, open(filename, 'wb'))

    Y_pred = logreg.predict(X_test)

    confusionMatrix = metrics.confusion_matrix(Y_test, Y_pred)  # Diagonals tell you correct predictions

    # Code below prints model accuracy information
    print('Coefficient Information:')

    for i in range(len(featureColumns)):  # Prints each feature next to its corresponding coefficient in the model

        logregCoefficients = logreg.coef_

        currentFeature = featureColumns[i]
        currentCoefficient = logregCoefficients[0][i]

        print(currentFeature + ': ' + str(currentCoefficient))

    print('----------------------------------')

    print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
    print("Precision:", metrics.precision_score(Y_test, Y_pred))
    print("Recall:", metrics.recall_score(Y_test, Y_pred))

    print('----------------------------------')

    print('Confusion Matrix:')
    print(confusionMatrix)

    return logreg


In [10]:
def main():
    
    logreg = get_zscores(error_handle(0, "2017-18.csv", "10/17/2017", "06/17/2018"), error_handle(0, "2018-19.csv", "10/16/2018", "06/13/2019"), error_handle(0, "2019-20.csv", "10/22/2019", "10/11/2020"))
    print(logreg)


In [16]:
main()

-----try is running-----
Inside rolling_average_stats


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Inside combine_and_clean
-----try is running-----
Inside rolling_average_stats
Inside combine_and_clean
-----try is running-----
Inside rolling_average_stats
Inside combine_and_clean
Inside get_zscore_for_one_year
Inside get_zscore_for_one_year
Inside get_zscore_for_one_year


NameError: name 'performLogRed' is not defined

In [18]:
# print(performLogReg(pd.read_csv("all_zscores.csv")))

Coefficient Information:
PTS: 0.10149289540987554
FGM: -0.06212808904733416
FGA: -0.12129716506419759
FG3_PCT: -0.9678254447923506
FTA: -0.07701460465724103
REB: 0.11634255442170666
AST: -0.01349962145239629
STL: 0.1338439910065507
TOV: -0.08345978710559933
----------------------------------
Accuracy: 0.6287425149700598
Precision: 0.6552962298025135
Recall: 0.7556935817805382
----------------------------------
Confusion Matrix:
[[160 192]
 [118 365]]
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [None]:
# performLogReg(get_zscore_for_one_year('combined-ten_day-2017-18.csv'))