In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV

%matplotlib inline



In [2]:
# Candidate players. The value is matched as a filename prefix against the
# game-log CSV files when the data is loaded, so it must follow the
# "<Last>_<First>" spelling used by those files.
# player_url = "Curry_Stephen"
# player_url = "Thompson_Klay"
# player_url = "Durant_Kevin"
# player_url = "Green_Draymond"
# player_url = "James_LeBron"
# player_url = "DeRozan_DeMar"
# player_url = "Westbrook_Russell"
# player_url = "Gobert_Rudy"
# player_url = "Mills_Patty"
# player_url = "Simmons_Jonathon"

# Player to analyse in this run.
player_url = "Green_Draymond"
# Opponent team code; compared for equality against the 'opp' column of the game logs.
opp_team = "SAS"
# Used as a slice start on the game log, so -5 selects the last five rows.
last_x_games = -5
# Playoff flag. NOTE(review): presumably 1 = playoff game, 0 = regular season — confirm.
playoff_game = 1

In [3]:
def get_player_prediction(model, seed):
    """Return the average of the model's predictions over all seed rows.

    model -- fitted estimator exposing ``predict``
    seed  -- feature rows handed unchanged to ``model.predict``
    """
    predictions = model.predict(seed)
    average_prediction = predictions.mean()
    return average_prediction

In [4]:
def get_seed_data(X_test, cols, random_data_rows=1, home_game=1, playoff_game=1, game_started=1):
    """Build the feature rows ("seed") that the model is asked to predict on.

    Each seed row holds, for every column in ``cols``:
      * ``home_game`` for the 'Home_Game' column (home game == 1, away game == 0),
      * ``game_started`` for the 'gs' column,
      * ``playoff_game`` for the 'playoff_game' column,
      * the mean of ``X_test[col]`` for every other column.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Games whose per-column means are used as the seed values.
    cols : list
        Column names, in the order the returned frame should have them.
    random_data_rows : int
        Number of seed rows to emit. The values are deterministic, so every
        row is identical; this only controls how many copies are returned.
    home_game, playoff_game, game_started
        Fixed values written into the corresponding columns.

    Returns
    -------
    pandas.DataFrame with ``random_data_rows`` rows and columns ``cols``.
    """
    # Values supplied by the caller instead of being derived from X_test.
    fixed_values = {
        "Home_Game": home_game,
        "gs": game_started,
        "playoff_game": playoff_game,
    }

    # The row does not depend on the row index, so compute it once.
    # A constant column (std == 0) is seeded with its mean, i.e. the constant
    # itself; seeding it with a literal 0 would feed the model a value the
    # player never produced against this opponent.
    template_row = [
        fixed_values[col] if col in fixed_values else X_test[col].mean()
        for col in cols
    ]

    seed = [list(template_row) for _ in range(random_data_rows)]
    return pd.DataFrame(seed, columns=cols)

In [5]:
def get_model(X_train, X_test, y_train, y_test):
    """Fit a cross-validated ridge regression on the training data.

    Only ``X_train`` and ``y_train`` are used; ``X_test`` and ``y_test`` are
    accepted so the full split tuple can be passed straight through.
    Returns the fitted ``RidgeCV`` estimator.
    """
    ridge = RidgeCV()
    # fit() returns the estimator itself, so the fitted object is `ridge`
    ridge.fit(X_train, y_train)
    return ridge

In [6]:
def set_train_test_split(opp_team, last_x_games, playoff_game, df, cols):
    """Split a player's game log into training data and an opponent-focused test set.

    Training data is every row of ``df``. The test set is the union of
      * all games whose 'opp' column equals ``opp_team``, and
      * the rows selected by the positional slice ``df.iloc[last_x_games:]``
        (e.g. -5 -> the last five rows).
    A game that satisfies both conditions is included once. Note that the
    test rows are a subset of the training rows, so scores computed on them
    are not held-out scores.

    Parameters
    ----------
    opp_team : str
        Opponent code to match against ``df['opp']``.
    last_x_games : int
        Slice start applied positionally to ``df``.
    playoff_game
        Accepted for interface compatibility; not used by the split.
    df : pandas.DataFrame
        Cleaned game log containing at least 'opp', 'pts' and ``cols``.
    cols : list
        Feature column names.

    Returns
    -------
    (X_train, X_test, y_train, y_test); the y frames are single-column
    DataFrames holding 'pts'.
    """
    vs_opponent = df[df['opp'] == opp_team]
    recent_games = df.iloc[last_x_games:]

    # Index labels of the test games, opponent games first. unique() keeps a
    # recent game against opp_team from being selected (and weighted) twice.
    test_labels = pd.concat([vs_opponent, recent_games]).index.unique()

    # setting training data as all data from player
    X_train = df[cols]
    y_train = df[['pts']]

    # .loc is label-based, matching the labels gathered above
    # (the former .ix indexer no longer exists in pandas).
    X_test = df.loc[test_labels, cols]
    y_test = df.loc[test_labels, ['pts']]

    return (X_train, X_test, y_train, y_test)

In [7]:
def get_data(player_url, game_logs_dir="./basketball_reference/game_logs/"):
    """Load and clean every game-log CSV belonging to one player.

    Parameters
    ----------
    player_url : str
        Filename prefix identifying the player (e.g. "Green_Draymond").
    game_logs_dir : str
        Directory holding the game-log CSV files.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated in sorted-filename order, with
          * 'fg_pct', 'fg3_pct' and 'ft_pct' removed,
          * 'date' parsed to datetime,
          * 'loc' renamed to 'Home_Game' and encoded as 0 for "@", else 1,
          * 'result' reduced to the signed float inside the parentheses,
          * 'mp' converted from "MM:SS" to decimal minutes (missing -> 0),
          * every row that still contains a NaN dropped.

    Raises
    ------
    ValueError
        If no file in ``game_logs_dir`` starts with ``player_url``.
    """
    # os.listdir returns names in arbitrary order. Sorting keeps the row order
    # stable, which matters because callers take positional "last N games"
    # slices of the result.
    # NOTE(review): presumes filenames end in the season year so that sorted
    # order is chronological — confirm for all players.
    filenames = sorted(
        name for name in os.listdir(game_logs_dir) if name.startswith(player_url)
    )
    if not filenames:
        raise ValueError(
            "no game logs starting with %r found in %r" % (player_url, game_logs_dir)
        )

    # retrieving data from csv files; collect first and concatenate once
    frames = []
    for name in filenames:
        filepath = os.path.join(game_logs_dir, name)
        print(filepath)
        frames.append(pd.read_csv(filepath))
    df = pd.concat(frames, ignore_index=True)

    # deleting percentage columns (unnecessary / contain NaN)
    for column in ('fg_pct', 'fg3_pct', 'ft_pct'):
        del df[column]

    # data cleaning for all game_log files
    df['date'] = pd.to_datetime(df['date'])
    df = df.rename(columns={'loc': 'Home_Game'})
    df['Home_Game'] = [0 if item == "@" else 1 for item in df['Home_Game']]
    # drop the 3-character prefix and the closing parenthesis, keep the signed number
    df['result'] = [item[3:].replace(")", "") for item in df['result']]
    df['result'] = df['result'].astype(float)
    df['mp'] = [_minutes_played(item) for item in df['mp']]

    return df.dropna(axis=0, how='any')


def _minutes_played(value):
    """Convert an "MM:SS" string to decimal minutes rounded to 2 places; missing -> 0."""
    if pd.isnull(value):
        return 0
    parts = value.split(":")
    return round(float(parts[0]) + float(parts[1]) / 60., 2)

In [8]:
def predict_player(player_url='Durant_Kevin', opp_team="SAS", last_x_games=-5, playoff_game=1):
    """Predict a player's points for a game against ``opp_team``.

    Loads the player's game logs, fits a ridge model on all of them, builds a
    seed row from the games against ``opp_team`` plus the most recent games,
    and predicts points for that seed.

    Parameters
    ----------
    player_url : str
        Filename prefix of the player's game-log files.
    opp_team : str
        Opponent code matched against the 'opp' column.
    last_x_games : int
        Slice start selecting the most recent games (e.g. -5).
    playoff_game
        Playoff flag, forwarded to the split and to the seed row.

    Returns
    -------
    (predicted_ppg, mean_minutes) : tuple
        The mean model prediction over the seed rows, and the mean of 'mp'
        over the test games.
    """
    df = get_data(player_url)

    # setting wanted columns: everything between the first five columns and the last one
    # NOTE(review): presumably the skipped columns are identifiers/metadata — confirm against the CSV schema
    excluded = ('fg_pct', 'fg3_pct', 'ft_pct', 'pts')
    cols = [col for col in df.columns.values[5:-1] if col not in excluded]
    # add the home/away flag, without duplicating it if the slice already has it
    if 'Home_Game' not in cols:
        cols.append('Home_Game')

    # setting training data as all data from player
    X_train, X_test, y_train, y_test = set_train_test_split(opp_team, last_x_games, playoff_game, df, cols)

    model = get_model(X_train, X_test, y_train, y_test)

    # forward the playoff flag so the seed agrees with the caller's request
    seed = get_seed_data(X_test, cols, playoff_game=playoff_game)
    predicted_ppg = get_player_prediction(model, seed)

    return (predicted_ppg, X_test['mp'].mean())

# Pass every configured value explicitly so editing the config cell actually
# changes the prediction instead of silently falling back to the defaults.
print(predict_player(player_url, opp_team=opp_team, last_x_games=last_x_games, playoff_game=playoff_game))

./basketball_reference/game_logs/Green_Draymond_2013.csv
./basketball_reference/game_logs/Green_Draymond_2014.csv
./basketball_reference/game_logs/Green_Draymond_2015.csv
./basketball_reference/game_logs/Green_Draymond_2016.csv
./basketball_reference/game_logs/Green_Draymond_2017.csv
(8.3808947292363118, 26.864285714285714)


In [9]:
# Preview the cleaned game log; head() keeps the cell output small instead of rendering every row.
get_data(player_url).head(10)

./basketball_reference/game_logs/Green_Draymond_2013.csv
./basketball_reference/game_logs/Green_Draymond_2014.csv
./basketball_reference/game_logs/Green_Draymond_2015.csv
./basketball_reference/game_logs/Green_Draymond_2016.csv
./basketball_reference/game_logs/Green_Draymond_2017.csv


Unnamed: 0,game,date,team,Home_Game,opp,result,gs,mp,fg,fga,...,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,off_rtg,def_rtg,Name
1,2,2012-11-02,GSW,1,MEM,-10.0,0,1.65,0,2,...,0.0,38.3,0.0,0.0,0.0,0.0,55.2,35.0,120,Draymond Green
2,3,2012-11-03,GSW,0,LAC,4.0,0,5.40,0,2,...,0.0,0.0,0.0,0.0,0.0,33.3,23.4,0.0,119,Draymond Green
3,4,2012-11-05,GSW,0,SAC,-2.0,0,3.88,0,1,...,0.0,0.0,0.0,0.0,0.0,50.0,23.9,0.0,109,Draymond Green
4,5,2012-11-07,GSW,1,CLE,10.0,0,7.57,0,2,...,0.0,0.0,15.1,6.6,0.0,33.3,18.3,21.0,95,Draymond Green
5,6,2012-11-09,GSW,0,LAL,-24.0,0,13.08,0,2,...,7.3,7.0,0.0,3.7,0.0,25.8,12.6,39.0,100,Draymond Green
7,8,2012-11-14,GSW,1,ATL,4.0,0,9.62,2,3,...,0.0,6.8,0.0,0.0,9.1,0.0,14.3,163.0,94,Draymond Green
8,9,2012-11-16,GSW,0,MIN,8.0,0,10.67,0,1,...,20.5,21.4,20.9,0.0,0.0,0.0,11.1,158.0,108,Draymond Green
9,10,2012-11-18,GSW,0,OKC,-10.0,0,25.00,4,6,...,25.9,12.8,0.0,2.1,0.0,25.0,14.7,101.0,126,Draymond Green
10,11,2012-11-19,GSW,0,DAL,4.0,0,25.52,2,4,...,29.1,13.8,6.3,6.0,0.0,14.8,11.3,145.0,86,Draymond Green
11,12,2012-11-21,GSW,1,BRK,9.0,0,14.03,1,2,...,30.4,17.5,0.0,4.1,0.0,0.0,7.0,159.0,98,Draymond Green
