## Part 1 - Data Preparation and Exploration 

In [None]:
%%capture
# Due to the configuration of the base Jupyter image, the following pinned package versions must be reinstalled so that the regressions in the assignment report the correct metrics

import sys 
!{sys.executable} -m pip uninstall statsmodels --yes 
!{sys.executable} -m pip uninstall numpy --yes
!{sys.executable} -m pip uninstall pandas --yes 
!{sys.executable} -m pip uninstall patsy --yes 
!{sys.executable} -m pip install numpy==1.17
!{sys.executable} -m pip install pandas==1.0
!{sys.executable} -m pip install patsy==0.5.2
!{sys.executable} -m pip install statsmodels==0.11.1

In [None]:
#Import Libraries

import pandas as pd
import datetime as dt
import scipy.stats as sp
import numpy as np
import statsmodels.formula.api as sm 

In [None]:
# Import Shotlog_14_15 and Player_Stats Datasets
# Forward slashes are used in the paths: the backslash form ("Data\Shotlog...") relies on
# invalid string escapes (\S, \P) and only resolves on Windows, whereas "/" is accepted
# on Windows, macOS and Linux alike.
Shotlog_1415 = pd.read_csv("Data/Shotlog_14_15.csv")
Player_Stats = pd.read_csv("Data/Player_Stats_14_15.csv")
display(Shotlog_1415)

In [None]:
Shotlog_1415['game_clock'].head()

In [None]:
Shotlog_1415.columns

In [None]:
#Changing the date from text to date type
# (datetime is already imported in the imports cell; this repeated import is redundant but harmless)
import datetime as dt
# pd.to_datetime parses the 'date' column and the parsed values replace the original column
Shotlog_1415['date'] = pd.to_datetime(Shotlog_1415['date'])

In [None]:
#Modifying game clock
# Every value is prefixed with '00:' before being parsed as a timedelta
# (presumably the raw values are 'MM:SS' strings, so the prefix supplies the hours field -- TODO confirm).
Shotlog_1415['game_clock'] = pd.to_timedelta('00:'+ Shotlog_1415['game_clock'])

In [None]:
# Convert shot_clock to a timedelta as well.
# NOTE(review): no `unit=` is passed; if shot_clock holds plain numbers (e.g. seconds),
# pandas interprets them as nanoseconds -- confirm the column's dtype and intended unit.
Shotlog_1415['shot_clock'] = pd.to_timedelta(Shotlog_1415['shot_clock'])

In [None]:
Shotlog_1415['shot_clock'].describe()

In [None]:
#creating a column to hold the previous shot outcome (hit or miss) by a player
# Rows are ordered by quarter and game clock, grouped per game/player/date, and
# current_shot_hit is shifted down one row inside each group, so the first row of every
# group gets NaN. The shifted Series keeps the original row labels, so the assignment
# aligns it back onto the unsorted frame by index.
# NOTE(review): game_clock is sorted ascending here; if the clock counts down within a
# quarter, ascending order would make the shift pick up the *next* shot rather than the
# previous one -- confirm the intended direction.
Shotlog_1415['lag_shot_hit'] = Shotlog_1415.sort_values(by = ['quarter', 'game_clock'], ascending = [True, True]).groupby(['game_id','shoot_player', 'date'])['current_shot_hit'].shift(1)
Shotlog_1415.head()

In [None]:
# Order the log by player, then by date, quarter and game clock (every key ascending)
Shotlog_1415 = Shotlog_1415.sort_values(by = ['shoot_player', 'date', 'quarter', 'game_clock'])

In [None]:
#Dropping rows that have no previous shot (lag_shot_hit is NaN)
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignments in later cells do not trigger pandas' SettingWithCopyWarning.
Shotlog_1415 = Shotlog_1415[Shotlog_1415['lag_shot_hit'].notnull()].copy()

In [None]:
# Express the current and the previous shot outcome as deviations from average_hit
# (presumably the shooter's average success rate -- TODO confirm against the data dictionary)
Shotlog_1415['error'] = Shotlog_1415['current_shot_hit'].sub(Shotlog_1415['average_hit'])
Shotlog_1415['lagerror'] = Shotlog_1415['lag_shot_hit'].sub(Shotlog_1415['average_hit'])

In [None]:
Shotlog_1415['error'].describe()

In [None]:
Shotlog_1415['lagerror'].describe()

## Part 2 - Conditional Probability and Autocorrelation

In [None]:
# Consecutive hits: 1 when both this shot and the previous shot were hits, otherwise 0
Shotlog_1415['conse_shot'] = np.where(
    Shotlog_1415['current_shot_hit'].eq(1) & Shotlog_1415['lag_shot_hit'].eq(1),
    1,
    0,
)
Shotlog_1415.head()

In [None]:
#Probabilities / Averages
# Per-player means of the 0/1 indicators:
#   conse_shot_hit  = share of shots where this shot AND the previous one were hits
#   average_lag_hit = share of shots whose previous shot was a hit
# The columns are selected with a list ([[...]]): indexing a groupby with a bare tuple of
# names is deprecated in pandas 1.0 and raises an error in pandas 2.x.
Player_Prob = (
    Shotlog_1415.groupby(['shoot_player'])[['conse_shot', 'lag_shot_hit']]
    .mean()
    .reset_index()
    .rename(columns = {'lag_shot_hit': 'average_lag_hit', 'conse_shot': 'conse_shot_hit'})
)
Player_Prob.head()

In [None]:
#Conditional Probability
# P(hit | previous hit) = P(hit and previous hit) / P(previous hit).
# A player whose average_lag_hit is 0 necessarily has conse_shot_hit == 0 as well, so the
# ratio is 0/0 = NaN for that player; rows with a null value in this column are filtered
# out in a later cell.
Player_Prob['condtional_prob'] = Player_Prob['conse_shot_hit'] / Player_Prob['average_lag_hit']
Player_Prob.head()

In [None]:
Player_Stats.head()

In [None]:
# Combine the per-player probabilities with the player statistics table
# (inner join on shoot_player); Player_Stats is rebound to the combined table.
Player_Stats = Player_Prob.merge(Player_Stats, on = 'shoot_player')
Player_Stats.head()

In [None]:
Player_Stats.info()

In [None]:
#dropping rows with no conditional probability
# .copy() detaches the filtered frame so that adding columns to it afterwards
# (e.g. diff_prob) does not raise pandas' SettingWithCopyWarning.
Player_Stats = Player_Stats[Player_Stats['condtional_prob'].notnull()].copy()

In [None]:
#summary stats
Player_Stats[['average_hit', 'condtional_prob', 'conse_shot_hit']].describe()

In [None]:
Player_Stats.sort_values(by = ['condtional_prob'], ascending = False).head(10)

In [None]:
#difference in probability
# diff_prob = condtional_prob - average_hit; it is positive when a player's hit rate
# following a made shot exceeds that player's average_hit value.
Player_Stats['diff_prob'] = Player_Stats['condtional_prob'] - Player_Stats['average_hit']
Player_Stats.head()

In [None]:
# Attach the shot-log rows to the player table (inner join on shoot_player).
# NOTE(review): this rebinds Player_Stats to a frame with one row per matching shot-log row,
# so the statistics computed on Player_Stats in the following cells see each player's values
# repeated once per shot. Both inputs also carry an 'average_hit' column, which pandas
# suffixes as average_hit_x / average_hit_y in the merged result. Re-running this cell
# merges the shot log in again -- confirm this is intended.
Player_Stats = pd.merge(Player_Stats, Shotlog_1415, on = ['shoot_player'])
Player_Stats.sort_values(by = ['diff_prob'], ascending = [False]).head()

In [None]:
#running t statistics for conditional prob and diff prob
# `sp` is already bound to scipy.stats by the imports cell, so ttest_ind is called on it
# directly. The previous `sp.stats.ttest_ind` resolved through the legacy
# scipy.stats.stats submodule, which newer SciPy releases deprecate.
sp.ttest_ind(Player_Stats['condtional_prob'], Player_Stats['diff_prob'])

In [None]:
#running t statistics for diff prob and average lag hit
# `sp` is already bound to scipy.stats by the imports cell, so ttest_ind is called on it
# directly. The previous `sp.stats.ttest_ind` resolved through the legacy
# scipy.stats.stats submodule, which newer SciPy releases deprecate.
sp.ttest_ind(Player_Stats['diff_prob'], Player_Stats['average_lag_hit'])

In [None]:
#correlation between current shot hit and previous shot hit
# Series.corr uses the Pearson coefficient by default; it is computed here over all rows
# of Player_Stats at once, i.e. pooled across players rather than per player.
Player_Stats['current_shot_hit'].corr(Player_Stats['lag_shot_hit'])

In [None]:
#autocorrelation for each player using current shot hit and previous shot hit
# Step by step:
#   1. groupby('shoot_player')[...].corr() yields a 2x2 correlation matrix per player;
#   2. unstack() spreads each matrix over a single row, giving four columns per player;
#   3. iloc[:, 1] keeps the second of those columns, the
#      (current_shot_hit, lag_shot_hit) entry;
#   4. reset_index() turns shoot_player back into a regular column.
Autocorr_hit = Player_Stats.groupby('shoot_player')[['current_shot_hit', 'lag_shot_hit']].corr().unstack().iloc[:, 1].reset_index()
# Keep only the first level of the column labels, so the correlation column is
# labelled 'current_shot_hit' (a later cell renames it).
Autocorr_hit.columns  = Autocorr_hit.columns.get_level_values(0)

In [None]:
Autocorr_hit.head()

In [None]:
# Give the correlation column a descriptive name
Autocorr_hit = Autocorr_hit.rename(columns = {'current_shot_hit': 'autocorr'})
Autocorr_hit.head()

In [None]:
Autocorr_hit.sort_values(by = 'autocorr', ascending = False).head(10)

## Part 3 - Regression Analyses

In [None]:
Player_Stats.columns

In [None]:
Shotlog_1415.columns

In [None]:
# OLS on the shot-level data: regress the current shot's deviation (error) on the previous
# shot's deviation (lagerror) plus shot context, shooter and closest-defender controls.
# The model is fit on Shotlog_1415 -- the frame where error and lagerror are defined and the
# one every other regression in this notebook uses -- rather than on the merged Player_Stats
# table, in which columns shared by both merge inputs are renamed with _x/_y suffixes.
reg1 = sm.ols(formula = 'error ~ lagerror+shot_dist+dribbles+touch_time+points+quarter+home_away+shoot_player+closest_defender+closest_def_dist', data = Shotlog_1415).fit()
print(reg1.summary())

In [None]:
# Weighted least squares on the shot log: each row is weighted by 1 / shot_per_game.
inverse_shot_volume = 1 / Shotlog_1415['shot_per_game']
reg2 = sm.wls(
    formula = 'error ~ lagerror+shot_dist+dribbles+touch_time+points+quarter+home_away+shoot_player+closest_defender+closest_def_dist',
    data = Shotlog_1415,
    weights = inverse_shot_volume,
).fit()
print(reg2.summary())

In [None]:
#function to run regression for each player by name
def reg3_player(player):
    """Fit and print an OLS regression of error on lagerror for one player.

    Parameters
    ----------
    player : str
        Name compared with ``==`` against ``Shotlog_1415['shoot_player']``;
        it must match the stored spelling exactly.

    Returns
    -------
    None
        The regression summary is printed. If no rows match ``player`` a
        short message is printed instead and no model is fit.
    """
    Shotlog_player = Shotlog_1415[Shotlog_1415.shoot_player == player]
    if Shotlog_player.empty:
        # Report a non-matching name instead of handing statsmodels an empty frame.
        print("No shots found for player {!r}; check the spelling against Shotlog_1415['shoot_player'].".format(player))
        return
    # Fit on this player's rows only. Previously the full Shotlog_1415 was passed here,
    # so the filter was ignored and every player produced the same regression.
    reg_player = sm.ols(formula = 'error ~ lagerror+shot_dist+dribbles+touch_time+points+quarter+home_away+closest_def_dist', data = Shotlog_player).fit()
    print(reg_player.summary())
    return

In [None]:
reg3_player(player = 'Russel Westbrook')

In [None]:
def reg4_Wls_player(player):
    """Fit and print a weighted least squares regression of error on lagerror for one player.

    Each of the player's rows is weighted by ``1 / shot_per_game``.

    Parameters
    ----------
    player : str
        Name compared with ``==`` against ``Shotlog_1415['shoot_player']``;
        it must match the stored spelling exactly.

    Returns
    -------
    None
        The regression summary is printed. If no rows match ``player`` a
        short message is printed instead and no model is fit.
    """
    Shotlog_player = Shotlog_1415[Shotlog_1415.shoot_player == player]
    if Shotlog_player.empty:
        # Report a non-matching name instead of handing statsmodels an empty frame.
        print("No shots found for player {!r}; check the spelling against Shotlog_1415['shoot_player'].".format(player))
        return
    # Both the data and the weights come from this player's rows. Previously the full
    # Shotlog_1415 was used for both, so the filter was ignored.
    reg_wls_player = sm.wls(formula = 'error ~ lagerror+shot_dist+dribbles+touch_time+points+quarter+home_away+closest_def_dist', weights = 1 / Shotlog_player['shot_per_game'], data = Shotlog_player).fit()
    print(reg_wls_player.summary())
    return

In [None]:
reg4_Wls_player(player = 'Russel Westbrook')

In [None]:
# Sorted array of the distinct shoot_player values in the shot log
player_list = np.unique(Shotlog_1415['shoot_player'].to_numpy())

In [None]:
#Regression for all players, keyed by each player's position in player_list
# For every player, fit the weighted regression on that player's rows only and keep the
# lagerror coefficient with its t-statistic and p-value as a one-row DataFrame.
Headers = ['shoot_player', 'Coef', 'T_Statistics', 'P_value']
Player_Results = {}
for i, player_name in enumerate(player_list):
    shotlog_player = Shotlog_1415[Shotlog_1415.shoot_player == player_name]
    reg_player = sm.wls(formula = 'error ~ lagerror+shot_dist+dribbles+touch_time+points+quarter+home_away+closest_def_dist', 
                        weights = 1 / shotlog_player['shot_per_game'], data = shotlog_player).fit()
    # params / tvalues / pvalues are Series indexed by term name, so the lagerror entries
    # are read directly rather than by merging three reset-index frames and renaming the
    # auto-generated '0_x' / '0_y' columns.
    Player_Results[i] = pd.DataFrame(
        {
            'shoot_player': [player_name],
            'Coef': [reg_player.params['lagerror']],
            'T_Statistics': [reg_player.tvalues['lagerror']],
            'P_value': [reg_player.pvalues['lagerror']],
        },
        columns = Headers,
    )

In [None]:
# Stack the per-player one-row results, in player_list order, into a single table with a
# fresh 0..n-1 index. pd.concat builds the frame in one pass; growing it with
# DataFrame.append inside a loop copies the data on every iteration, and append is
# removed in pandas 2.0.
RegPlayer = pd.concat([Player_Results[k] for k in range(len(player_list))], ignore_index = True)
RegPlayer

In [None]:
reg3_player(player='Steph Curry')

In [None]:
reg3_player('James Harden')

In [None]:
reg3_player('Andrew Wiggins')

In [None]:
# NOTE(review): this name is capitalised differently from the 'Russel Westbrook' used in the
# earlier calls; reg3_player compares the name to shoot_player with ==, so confirm the
# exact spelling stored in the data.
reg3_player('Russel westbrook')

In [None]:
reg4_Wls_player(player = 'Steph Curry')