In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV

%matplotlib inline



In [4]:
# Load the per-game table used below to look up a team's roster (the 'Tm' and
# 'Player' columns are read from it). The path is relative to the notebook's
# working directory.
all_data = pd.read_csv("./basketball_reference/NBA_2017_per_game.csv")

In [5]:
# Team abbreviation compared against the 'Tm' column of all_data when listing
# the roster.
team = "GSW"

In [12]:
# Select the 'Player' values of every row whose 'Tm' equals `team`.
on_team = all_data['Tm'] == team
team_players = all_data.loc[on_team, 'Player'].values

# Display the names with everything after the first backslash removed and
# spaces replaced by underscores.
[raw_name.split("\\")[0].replace(" ", "_") for raw_name in team_players]

['Matt_Barnes',
 'Ian_Clark',
 'Stephen_Curry',
 'Kevin_Durant',
 'Draymond_Green',
 'Andre_Iguodala',
 'Damian_Jones',
 'Shaun_Livingston',
 'Kevon_Looney',
 'James_Michael_McAdoo',
 'Patrick_McCaw',
 'JaVale_McGee',
 'Zaza_Pachulia',
 'Klay_Thompson',
 'Anderson_Varejao',
 'Briante_Weber',
 'David_West']

In [2]:
# Scenario settings. The commented-out lines are alternative players, written
# as "<Last>_<First>" file-name prefixes.
# NOTE(review): in the cells visible here these module-level names are never
# read -- the predict_player class carries its own identical defaults -- so
# changing them has no effect unless they are passed in explicitly.
# player_url = "Curry_Stephen"
# player_url = "Thompson_Klay"
# player_url = "Durant_Kevin"
# player_url = "Green_Draymond"
# player_url = "James_LeBron"
# player_url = "DeRozan_DeMar"
# player_url = "Westbrook_Russell"
# player_url = "Gobert_Rudy"
# player_url = "Mills_Patty"
# player_url = "Simmons_Jonathon"

player_url = "Green_Draymond"
opp_team = "SAS"
last_x_games = -5
playoff_game = 1

In [80]:
class predict_player:
    """Estimate a player's points for an upcoming game from his game logs.

    Workflow (see ``predict_player``): load and clean every game-log CSV whose
    file name starts with ``player_url``, fit a ``RidgeCV`` model on all of the
    player's games, build a synthetic "expected game" row from the games
    against ``opp_team`` plus the most recent games, and predict the points
    for that row.
    """

    # Directory scanned by get_data(); override on the class or on an instance
    # to read game logs from another location.
    game_logs_dir = "./basketball_reference/game_logs/"

    # Class-level defaults; predict_player() rebinds all of them per call.
    df = None  # cleaned game-log DataFrame produced by get_data()
    player_url = "Green_Draymond"
    opp_team = "SAS"
    last_x_games = -5
    playoff_game = 1
    cols = None  # feature column names used for fitting and seeding

    def predict_player(self, player_url="Green_Draymond", opp_team="SAS", last_x_games=-5, playoff_game=1):
        """Run the whole pipeline for one player.

        Parameters
        ----------
        player_url : str
            File-name prefix of the player's game-log CSV files.
        opp_team : str
            Value matched against the 'opp' column to pick reference games.
        last_x_games : int
            Start of the row slice for the recent games (-5 = last five rows).
        playoff_game : int
            Forwarded to get_seed_data(); only has an effect when a
            'playoff_game' feature column exists.

        Returns
        -------
        tuple
            ``(predicted points, mean of 'mp' over the reference games)``.
        """
        self.player_url = player_url
        self.opp_team = opp_team
        self.last_x_games = last_x_games
        self.playoff_game = playoff_game

        self.df = self.get_data(self.player_url)

        # Features: columns from position 5 up to (excluding) the last one,
        # minus the percentage columns and the target, plus 'Home_Game'.
        # NOTE(review): the positional slice relies on the CSV column order.
        candidates = list(self.df.columns.values[5:-1])
        excluded = ('fg_pct', 'fg3_pct', 'ft_pct', 'pts')
        self.cols = [col for col in candidates if col not in excluded] + ['Home_Game']

        X_train, X_test, y_train, y_test = self.set_train_test_split()
        model = self.get_model(X_train, X_test, y_train, y_test)

        seed = self.get_seed_data(X_test, playoff_game=self.playoff_game)
        predicted_ppg = self.get_player_prediction(model, seed)

        return (predicted_ppg, X_test['mp'].mean())

    def get_data(self, player_url):
        """Load and clean every game-log CSV whose name starts with ``player_url``.

        Files are read in sorted file-name order so the concatenated rows have
        a deterministic order (``os.listdir`` order is arbitrary), which the
        "last x games" slice in set_train_test_split() depends on.

        Returns the cleaned DataFrame; rows that still contain a NaN after
        cleaning are dropped.

        Raises
        ------
        ValueError
            If no file in ``game_logs_dir`` starts with ``player_url``.
        """
        log_dir = self.game_logs_dir

        frames = []
        for file_name in sorted(os.listdir(log_dir)):
            if file_name.startswith(player_url):
                filepath = os.path.join(log_dir, file_name)
                print(filepath)
                frames.append(pd.read_csv(filepath))

        if not frames:
            raise ValueError(
                "no game-log files starting with %r found in %r" % (player_url, log_dir)
            )

        # One concat instead of repeated DataFrame.append (quadratic, and
        # removed from recent pandas); ignore_index gives a fresh 0..n-1 index.
        df = pd.concat(frames, ignore_index=True)

        # Percentage columns are dropped before dropna() so their NaNs do not
        # discard whole games.
        for pct_col in ('fg_pct', 'fg3_pct', 'ft_pct'):
            del df[pct_col]

        def minutes_played(item):
            # "MM:SS" -> minutes as a float rounded to 2 decimals; missing -> 0.
            if pd.isnull(item):
                return 0
            minutes, seconds = item.split(":")[0], item.split(":")[1]
            return round(float(minutes) + float(seconds) / 60., 2)

        df['date'] = pd.to_datetime(df['date'])
        df = df.rename(columns={'loc': 'Home_Game'})
        # "@" marks an away game (0); anything else counts as a home game (1).
        df['Home_Game'] = [0 if item == "@" else 1 for item in df['Home_Game']]
        # Strip the first three characters and the closing parenthesis,
        # e.g. "W (+12)" -> 12.0.
        df['result'] = [item[3:].replace(")", "") for item in df['result']]
        df['result'] = df['result'].astype(float)
        df['mp'] = [minutes_played(item) for item in df['mp']]

        df = df.dropna(axis=0, how='any')

        return df

    def set_train_test_split(self):
        """Build the training set (all games) and the reference ("test") set.

        The reference set is every game against ``self.opp_team`` followed by
        the rows of the ``self.last_x_games`` slice. A game that is in both
        groups appears twice, as in the original implementation.
        NOTE(review): the reference games are also part of the training data.

        Returns ``(X_train, X_test, y_train, y_test)``; the y frames are
        single-column ('pts') DataFrames.
        """
        vs_opponent = self.df[self.df['opp'] == self.opp_team]
        recent_games = self.df.iloc[self.last_x_games:]
        opponents = pd.concat([vs_opponent, recent_games])

        X_train = self.df[self.cols]
        y_train = self.df[['pts']]
        # Selecting straight from `opponents` replaces the removed `.ix`
        # label lookup and yields the same rows in the same order.
        X_test = opponents[self.cols]
        y_test = opponents[['pts']]

        return (X_train, X_test, y_train, y_test)

    def get_model(self, X_train, X_test, y_train, y_test):
        """Fit and return a RidgeCV model on the training data.

        ``X_test`` and ``y_test`` are accepted for interface compatibility and
        are not used.
        """
        model = RidgeCV().fit(X_train, y_train)
        return model

    def get_seed_data(self, X_test, random_data_rows=1, home_game=1, playoff_game=1, game_started=1):
        """Build ``random_data_rows`` synthetic feature rows for prediction.

        'Home_Game', 'gs' and 'playoff_game' take the values passed in; every
        other column in ``self.cols`` is set to its mean over ``X_test``.
        A column that is constant in ``X_test`` now gets that constant (its
        mean) instead of a hard-coded 0. Rows are identical because the
        seeding is deterministic.

        Returns a DataFrame with columns ``self.cols``.
        """
        fixed_values = {
            'Home_Game': home_game,  # home game == 1, away game == 0
            'gs': game_started,
            'playoff_game': playoff_game,
        }

        # The row is loop-invariant, so compute it once and copy it per seed.
        row = [fixed_values[col] if col in fixed_values else X_test[col].mean()
               for col in self.cols]
        seed = [list(row) for _ in range(random_data_rows)]

        return pd.DataFrame(seed, columns=self.cols)

    def get_player_prediction(self, model, seed):
        """Return the mean of the model's predictions over the seed rows."""
        return model.predict(seed).mean()
    

In [81]:
# Run the pipeline with the class defaults (Green_Draymond vs. SAS, last five
# games). The displayed tuple is (predicted points, mean minutes played over
# the reference games).
pp = predict_player()
pp.predict_player()

./basketball_reference/game_logs/Green_Draymond_2013.csv
./basketball_reference/game_logs/Green_Draymond_2014.csv
./basketball_reference/game_logs/Green_Draymond_2015.csv
./basketball_reference/game_logs/Green_Draymond_2016.csv
./basketball_reference/game_logs/Green_Draymond_2017.csv


(8.3808947292363118, 26.864285714285714)

In [85]:
# Predict points and mean minutes for several players, each on a fresh
# predict_player instance with the default opponent and recent-game window.
# The first entry is the method's default player, so it matches the original
# argument-less call. print(...) with a single argument behaves the same on
# Python 2 and Python 3.
for player in ("Green_Draymond", "Durant_Kevin", "Curry_Stephen", "Thompson_Klay", "James_LeBron"):
    print(predict_player().predict_player(player))

./basketball_reference/game_logs/Green_Draymond_2013.csv
./basketball_reference/game_logs/Green_Draymond_2014.csv
./basketball_reference/game_logs/Green_Draymond_2015.csv
./basketball_reference/game_logs/Green_Draymond_2016.csv
./basketball_reference/game_logs/Green_Draymond_2017.csv
(8.3808947292363118, 26.864285714285714)
./basketball_reference/game_logs/Durant_Kevin_2013.csv
./basketball_reference/game_logs/Durant_Kevin_2014.csv
./basketball_reference/game_logs/Durant_Kevin_2015.csv
./basketball_reference/game_logs/Durant_Kevin_2016.csv
./basketball_reference/game_logs/Durant_Kevin_2017.csv
(23.176555312312015, 34.366470588235295)
./basketball_reference/game_logs/Curry_Stephen_2013.csv
./basketball_reference/game_logs/Curry_Stephen_2014.csv
./basketball_reference/game_logs/Curry_Stephen_2015.csv
./basketball_reference/game_logs/Curry_Stephen_2016.csv
./basketball_reference/game_logs/Curry_Stephen_2017.csv
(26.250424150320491, 34.664500000000004)
./basketball_reference/game_logs/Thom