# redo train test split

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime as dt
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV

%matplotlib inline



In [2]:
# player_url = "Curry_Stephen"
# player_url = "Thompson_Klay"
# player_url = "Durant_Kevin"
# player_url = "Green_Draymond"
# player_url = "James_LeBron"
# player_url = "DeRozan_DeMar"
# player_url = "Westbrook_Russell"
# player_url = "Gobert_Rudy"
# player_url = "Mills_Patty"
# player_url = "Simmons_Jonathon"

player_url = "Green_Draymond"
opp_team = "SAS"
last_x_games = -5
playoff_game = 1

In [3]:
class predict_player:

    # class initialized variables
    df             = None
    cols           = None
    opponents_mask = None
    model          = None
    predicted_ppg  = None
    seed           = None
    
    X_train        = None
    X_test         = None
    y_train        = None
    y_test         = None
    target_X       = None
    target_y       = None
    
    # user initialized variables
    player_url     = None
    opp_team       = None
    last_x_games   = None
    playoff_game   = None
    home_game      = None
    game_started   = None
    player_year    = ""
    before_date    = None
    
    def __init__(self, player_url = "Stephen_Curry", opp_team = None, last_x_games = 5, playoff_game = None, home_game = None, game_started = None, player_year = "", before_date = dt.datetime.today()):
        '''
        player_url    : default = "Stephen_Curry"
        opposing_team : default = None, ex: 'SAS'
        last_x_games  : default = 5, last 5 games to set as average prediction. Will always grab last 5 games regardless of playoff_game, home_game, or game_started.
        playoff_game  : default = None, (0,1)
        home_game     : default = None, (0,1)
        game_started  : default = None, (0,1)
        player_year   : default = "", (2017, 2016, 2015, 2014)
        before_date   : default = Today, ex: "2017-05-20" = "YYYY-mm-dd"
        '''
        self.player_url   = player_url
        self.opp_team     = opp_team
        if last_x_games != None:
            self.last_x_games = np.abs(last_x_games) * -1
        self.playoff_game = playoff_game
        self.home_game    = home_game
        self.game_started = game_started
        self.player_year  = str(player_year)
        self.before_date  = before_date
        if type(before_date) == str:
            self.before_date  = dt.datetime.strptime(before_date, "%Y-%m-%d")
        self.df           = self.get_data()
        
        
    
    def predict_player(self):#, opp_team = "GSW", last_x_games = -5, playoff_game = 1, home_game = 1, game_started = 1):
        #initializing global variables
#         if player_url == None: self.player_url = player_url
#         self.opp_team     = opp_team
#         self.last_x_games = np.abs(last_x_games)
#         self.playoff_game = playoff_game
#         self.game_started = game_started
#         self.df           = self.get_data()
        
        # setting desired columns for dataframe
        cols = list(self.df.columns.values[6:-1])
#         self.cols = [col for col in cols if col not in ['fg_pct','fg3_pct','ft_pct','pts']] + ['Home_Game', 'playoff']
        self.cols = ['playoff','Home_Game','gs','mp','fga','fg3a','fta','orb','drb','trb','ast','stl','blk','tov','pf']
        
        # setting training data as all data from player
        self.set_train_test_split()
        
        self.model = self.get_model(X=self.X_train, y=self.y_train)
        
#         return self.get_seed_data(X_test)

        self.seed = self.get_seed_data(X_test=self.X_test, random_data_rows=len(self.y_test))
#         print self.seed
        self.predicted_ppg = self.get_player_prediction()

        return self.predicted_ppg

    
    def get_data(self):
        # setting empty dataframe
        df = pd.DataFrame()

        # retreiving data from csv files
        for file in os.listdir("./basketball_reference/game_logs/"):
            if file.startswith(self.player_url) and file.endswith(self.player_year + ".csv") :
                filepath = "./basketball_reference/game_logs/" + file
#                 print filepath
                data = pd.read_csv(filepath)
                df = df.append(data)

        df = df.reset_index()

        # deleting unnecessary or columns with NAN
        del df['index']
        del df['fg_pct']
        del df['fg3_pct']
        del df['ft_pct']
        # del df['orb_pct']
        # del df['plus_minus']

        # data cleaning for all game_log files
        df['date'] = pd.to_datetime(df['date'])
        df.rename(columns={'loc': 'Home_Game'}, inplace=True)
        df['playoff'] = [0 if item[-1] == "N" else 1 for item in df['playoff']]
        df['Home_Game'] = [0 if item == "@" else 1 for item in df['Home_Game']]
        df['result'] = [item[3:].replace(")","") for item in df['result']]
        df['result'] = df['result'].astype(float)
        df['mp'] = [0 if pd.isnull(item) else round(float(item.split(":")[0]) + float(item.split(":")[1])/60. , 2) for item in df['mp']]

        df = df.dropna(axis=0, how='any')
        df = df.sort_values('date')
        df = df[df['date'] < self.before_date]
        return df
    
    
    def set_train_test_split(self):#, opp_team, last_x_games, playoff_game, df, cols):
    #     Xs = StandardScaler().fit_transform(df[cols])
    #     Xs = pd.DataFrame(Xs, columns=X.columns)
        
        # setting opponent information based on past x games
        opp = self.df[self.last_x_games:] 
        if self.opp_team != None:
            opp = self.df[(self.df['opp'] == self.opp_team)]
            try:
                opp = opp[self.last_x_games:]
            except e: pass # catch index out of range: if last_x_games > game played per opponent
        
        # setting opponent information based on playoff data
        playoff = pd.DataFrame()
        if self.playoff_game != None:
            try:
                if self.playoff_game == 1 or self.playoff_game == 0:
                    playoff = self.df[(self.df['playoff'] == self.playoff_game)][self.last_x_games:]
            except e: pass
        
        # setting opponent information based on home game data
        home = pd.DataFrame()
        if self.home_game != None:
            home = self.df[(self.df['Home_Game'] == self.home_game)][self.last_x_games:]
        
        
        opponents = pd.concat([opp, playoff, home])#['pts'] # df['playoff'] == playoff_game
        # setting testing data based on team opponent and last 5 games
        mask = set(opponents.index.values) # opponents.drop_duplicates()
        self.opponents_mask = mask
        
        not_mask = [i for i in self.df.index if i not in mask]
        
        # setting training data as all data from player
#         X_train = self.df.ix[not_mask,:][self.cols]
#         y_train = self.df.ix[not_mask,:][['pts']]
#         X_test = self.df.ix[mask,:][self.cols]
#         y_test = self.df.ix[mask,:][['pts']]
        
#         X = self.df.ix[not_mask,:][self.cols]
#         y = self.df.ix[not_mask,:][['pts']]

        X = self.df[self.cols]
        y = self.df[['pts']]
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=8)
#         print "mp: ", X_train['mp'].mean(), X_test['mp'].mean()
#         print "pts: ", y_train['pts'].mean(), y_test['pts'].mean()

        self.X_train = X_train
        self.X_test  = X_test
        self.y_train = y_train
        self.y_test  = y_test
        self.target_X = self.df.ix[mask,:][self.cols]
        self.target_y = self.df.ix[mask,:][['pts']]
        
#         return (X_train, X_test, y_train, y_test)
    
    
    def get_model(self, X, y):
        model = RidgeCV().fit(X, y)
#         print "score X_train:", model.score(X_train, y_train)
#         print "score X_test:", model.score(X_test, y_test)
        return model
    
    def score(self, X, y):
        return self.model.score(X, y)
        
    
    def get_seed_data(self, X_test, random_data_rows=1):
        # randomly seeding data based on mean/std from a given opponent
        seed = []
        for i in range(random_data_rows):
            row = []
            for col in self.cols:
                if col == "Home_Game":
                    if self.home_game != None:
                        row.append(self.home_game) # home game == 1, away game == 0
                    else: 
                        row.append(int(round(X_test['Home_Game'].mean())))
#                         row.append(np.random.choice([0,1]))
                elif col == 'gs':
                    if self.game_started != None:
                        row.append(self.game_started)
                    else:
                        row.append(int(round(X_test['gs'].mean())))
#                         row.append(X_test['gs'].value_counts().idxmax()) # max of started vs not started
                elif col == 'playoff':
                    if self.playoff_game != None:
                        row.append(self.playoff_game)
                    else:
                        row.append(int(round(X_test['playoff'].mean())))
#                         row.append(X_test['playoff'].value_counts().idxmax())
                else:
                    std = X_test[col].std()
                    if std == 0:
                        row.append(0)
                    else:
                        row.append(X_test[col].mean())
    #                     row.append(np.random.normal(X_test[col].mean(), X_test[col].std()))
            seed.append(row)

    #     print ridge.score(X_test, y_test)
        seed = pd.DataFrame(seed, columns=self.cols)
        return seed
    
    def predict(self, X):
        return self.model.predict(X)
    
    def get_player_prediction(self):
        return self.predict(X=self.target_X).mean()
    

In [4]:
# sBeg, sEnd = '2016-10-25','2017-05-20'
# dBeg = dt.datetime.strptime(sBeg, "%Y-%m-%d")
# dEnd = dt.datetime.strptime(sEnd, "%Y-%m-%d")
# date_diff = dEnd - dBeg
# date_list = [(dBeg + dt.timedelta(days=date)).strftime("%Y-%m-%d") for date in range(0, date_diff.days)]
# beg_year = dt.datetime.strptime(date_list[0], "%Y-%m-%d").year
# end_year = dt.datetime.strptime(date_list[-1], "%Y-%m-%d").year

# results = []
# for date in date_list:
#     print date
#     for i in range(10):
#         row = []
#         row.append(i)
#         row.append(date)
#         pp = predict_player(player_url="Stephen_Curry", before_date=date, last_x_games=i+1)
#         (ppg, mpg) = pp.predict_player()
#         row.append(pp.model.score(pp.X_train, pp.y_train))
#         row.append(pp.model.score(pp.X_test, pp.y_test))
#         row.append(pp.y_test.pts.mean())
#         row.append(ppg)
#         row.append(mpg)
        
#         #         print pp.opp_team, pp.home_game, pp.playoff_game, pp.last_x_games, pp.opponents_mask #(pp.X_train, pp.X_test, pp.y_train, pp.y_test)
# #         print pp.df
# #         print pp.model.score(pp.X_train, pp.y_train)
# #         print pp.model.score(pp.X_test, pp.y_test)
#         results.append(row)
# # results = pd.DataFrame(results, columns=['i','date','train','test','mean_ppg','ppg','mpg'])
# # results

In [5]:
# scores = pd.DataFrame(results, columns=['i','date','train','test','mean_ppg','ppg','mpg'])

In [6]:
# scores.groupby("i").mean()

# predict a player

In [7]:
pp = predict_player(player_url="Stephen_Curry", player_year=2017, before_date="2017-05-20", last_x_games=5)
print pp.player_url
print pp.predict_player()
# pp.df#.info()
# print pp.player_url
# print pp.opp_team
# print pp.seed
print pp.model.score(pp.X_train, pp.y_train)
print pp.model.score(pp.X_test, pp.y_test)
# print pp.X_test
# print pp.y_test.pts
print pp.model.predict(pp.target_X)
print pp.target_y

Stephen_Curry
27.5903237485
0.573448355174
0.408966749486
[[ 24.13680557]
 [ 40.09721666]
 [ 23.16474316]
 [ 21.09763056]
 [ 29.45522279]]
    pts
88   30
89   40
90   29
86   23
87   23


In [8]:
print pp.last_x_games
pp.df[pp.last_x_games:]

-5


Unnamed: 0,game,playoff,date,team,Home_Game,opp,result,gs,mp,fg,...,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,off_rtg,def_rtg,Name
86,6,1,2017-05-04,GSW,1,UTA,11.0,1,38.17,8,...,8.8,5.8,23.6,2.5,0.0,15.9,21.4,125,106,Stephen Curry
87,7,1,2017-05-06,GSW,0,UTA,11.0,1,36.5,6,...,13.7,7.1,18.1,0.0,0.0,0.0,31.2,108,102,Stephen Curry
88,8,1,2017-05-08,GSW,0,UTA,26.0,1,35.35,9,...,10.1,7.4,30.9,0.0,2.2,9.7,26.1,153,101,Stephen Curry
89,9,1,2017-05-14,GSW,1,SAS,2.0,1,39.18,14,...,15.3,10.7,16.8,3.9,0.0,9.4,34.3,139,115,Stephen Curry
90,10,1,2017-05-16,GSW,1,SAS,36.0,1,30.55,8,...,16.8,12.4,29.4,4.6,0.0,11.1,25.1,164,93,Stephen Curry


# test data

In [9]:
pp.df.ix[pp.opponents_mask,:]['pts']

88    30
89    40
90    29
86    23
87    23
Name: pts, dtype: int64

In [10]:
pp = predict_player("Stephen_Curry", opp_team="SAS", last_x_games=5, playoff_game=1, home_game=None)
print pp.predict_player()
pp.df.ix[pp.opponents_mask,:]

27.4647230218


Unnamed: 0,game,playoff,date,team,Home_Game,opp,result,gs,mp,fg,...,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,off_rtg,def_rtg,Name
358,73,0,2017-03-29,GSW,0,SAS,12.0,1,35.13,9,...,10.0,7.1,48.9,1.5,0.0,8.0,32.7,139,112,Stephen Curry
371,8,1,2017-05-08,GSW,0,UTA,26.0,1,35.35,9,...,10.1,7.4,30.9,0.0,2.2,9.7,26.1,153,101,Stephen Curry
372,9,1,2017-05-14,GSW,1,SAS,2.0,1,39.18,14,...,15.3,10.7,16.8,3.9,0.0,9.4,34.3,139,115,Stephen Curry
373,10,1,2017-05-16,GSW,1,SAS,36.0,1,30.55,8,...,16.8,12.4,29.4,4.6,0.0,11.1,25.1,164,93,Stephen Curry
374,11,1,2017-05-20,GSW,0,SAS,12.0,1,33.5,8,...,11.9,8.1,12.4,8.3,0.0,23.9,26.1,105,94,Stephen Curry
375,12,1,2017-05-22,GSW,0,SAS,14.0,1,33.9,14,...,12.0,7.5,30.2,0.0,0.0,19.2,38.9,114,112,Stephen Curry


In [11]:
print pp.df[pp.df['Home_Game'] == 1]['pts'].mean()
print pp.df[pp.df['Home_Game'] == 0]['pts'].mean()

25.0880829016
26.7650273224


In [12]:
print predict_player("Patrick_McCaw", opp_team="SAS", last_x_games=5, playoff_game=1, home_game=1).predict_player()
print predict_player("Patrick_McCaw", opp_team="SAS", last_x_games=5, playoff_game=1, home_game=0).predict_player()

6.84034678257
6.00376786099


In [13]:
print predict_player("Brandon_Ingram", opp_team=None, last_x_games=5, playoff_game=1, home_game=1).predict_player()
print predict_player("Brandon_Ingram", opp_team=None, last_x_games=5, playoff_game=1, home_game=0).predict_player()

11.2425298836
12.4409039204


# team gathering function

In [14]:
game_score = []
def predict():
    pp = predict_player("Stephen_Curry", opp_team="SAS", last_x_games=5, playoff_game=1, home_game=1)
    print pp.predict_player()
    game_score.append(['Stephen_Curry', pp.target_y['pts'].mean(), pp.model.predict(pp.target_X).mean(), pp.target_X.mp.mean()])
predict()
game_score    

24.4677197344


[['Stephen_Curry', 28.0, 24.467719734377049, 33.971111111111114]]

In [15]:
def get_ppg_from_list(players_list, opp_team=None, last_x_games=None, playoff_game=None, home_game=None):
    game_score = []
    for player in players_list:
        pp = predict_player(player_url = player, opp_team=opp_team, last_x_games=last_x_games, playoff_game=playoff_game, home_game=home_game)
        pp.predict_player()
#         game_score.append([player, ppg, mpg])
#         print player, "train:", pp.score(pp.X_train, pp.y_train), "test :", pp.score(pp.X_test, pp.y_test)
        game_score.append([player, pp.target_y['pts'].mean(), pp.model.predict(pp.target_X).mean(), pp.target_X.mp.mean()])
    
#         print pp.df.ix[pp.opponents_mask,:]
    game_score = pd.DataFrame(game_score, columns=['player','orig','ppg','mpg'])
#     print game_score
    print game_score.orig.sum(), game_score.orig.sum() * (48*5.) / game_score.mpg.sum()
    print game_score.ppg.sum(), game_score.ppg.sum() * (48*5.) / game_score.mpg.sum()
    print game_score.mpg.sum()
    return game_score.ppg.sum() * (48*5.) / game_score.mpg.sum()

In [16]:
players_list = ['Matt_Barnes', 'Ian_Clark', 'Stephen_Curry', 'Kevin_Durant', 'Draymond_Green', 'Andre_Iguodala', 'Shaun_Livingston', 'Patrick_McCaw', 'JaVale_McGee', 'Klay_Thompson', 'David_West']
get_ppg_from_list(players_list=players_list, opp_team="SAS", last_x_games=3, playoff_game=1, home_game=0)

124.15 127.205584136
109.592323915 112.289614018
234.235


112.28961401826231

In [17]:
players_list = ['LaMarcus_Aldridge', 'Kyle_Anderson', 'Davis_Bertans', 'Dewayne_Dedmon', 'Pau_Gasol', 'Manu_Ginobili', 'Danny_Green', 'Patty_Mills', 'Dejounte_Murray', 'Jonathon_Simmons']
# players_list.append('Kawhi_Leonard')
# players_list.append('Tony_Parker')
# players_list.append('David_Lee')
get_ppg_from_list(players_list=players_list, opp_team="GSW", last_x_games=3, playoff_game=1, home_game=1)

98.1666666667 106.238186676
97.3193422075 105.321192984
221.765833333


105.32119298423828

In [18]:
players_list = ['Mike_Dunleavy', 'Channing_Frye', 'Kyrie_Irving', 'LeBron_James', 'Richard_Jefferson', 'Kyle_Korver', 'Kevin_Love', 'Iman_Shumpert', 'J.R._Smith', 'Tristan_Thompson', 'Deron_Williams']
get_ppg_from_list(players_list=players_list, opp_team="BOS", last_x_games=5, playoff_game=1, home_game=1)

124.190909091 110.965408599
119.687987484 106.942018001
268.604590909


106.9420180006793

In [19]:
players_list = ['Avery_Bradley', 'Jaylen_Brown', 'Jae_Crowder', 'Gerald_Green', 'Al_Horford', 'Jonas_Jerebko', 'Amir_Johnson', 'Kelly_Olynyk', 'Terry_Rozier', 'Marcus_Smart', 'Tyler_Zeller']
# players_list.append('Isaiah_Thomas')
get_ppg_from_list(players_list=players_list, opp_team="CLE", last_x_games=5, playoff_game=1, home_game=0)

89.9202020202 96.321434281
85.3483152022 91.4240843441
224.050323232


91.42408434411314