# Loaded 2015-16 data:
## Achieved the following:
- combined/merged opp_shot_range, shot_range, player averages, unassisted stats, player bios, schedule, pace_pie
- dropped unnecessary columns
- created a heat map to find correlation between variables
- plot GP vs other variables


### note: 
- Shorter seasons due to Lockout: Season 1998-99 and 2011-12.
- For players traded midseason:
    - If a player is traded to a different team mid-season, TEAM_NAME and TEAM_ABBREVIATION are recorded as the team the player was traded to.
    

In [1]:
# Path templates for the per-season CSV inputs. Each contains one '%s'
# placeholder that is filled with a season label such as '2015-16'.
# The order of `files` matters: later cells index the loaded frames by position.
files = [
    '../datasets/update_opponent_shot_range/update_opp_shot_range%s.csv',
    '../datasets/update_shot_range/update_shot_range%s.csv',
    '../datasets/player_avgs/player_average_%s.csv',
    '../datasets/unassisted_stats/unassisted_stats%s.csv',
    '../datasets/player_bios/player_bios%s.csv',
    '../datasets/team_schedule/team_schedule%s.csv',
    '../datasets/pace_pie/pace_pie%s.csv',
]

# Individual names for each template, bound in the same order as the list.
(opp_shot_file, shot_sel_file, avg_file, unassisted_file,
 bios_file, schedule_file, pace_pie_file) = files

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# all_2015 is a list of DataFrames, one per path template in `files`, each
# read from the '2015-16' version of that file. The list keeps the order of
# `files`: opponent shooting range, shooting range, player averages,
# unassisted stats, player bios, schedule, pace/PIE.
all_2015 = [pd.read_csv(template % '2015-16') for template in files]

for df in all_2015:
    # Remove 'Unnamed: ...' columns in place
    # (presumably index columns saved by an earlier to_csv -- TODO confirm).
    unnamed_cols = [col for col in df.columns if col.startswith('Unnamed:')]
    df.drop(unnamed_cols, axis=1, inplace=True)
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("%s %s" % (df.shape, df.columns))
    print("===========")

(476, 9) Index([u'PLAYER_ID', u'PLAYER_NAME', u'TEAM_ID', u'TEAM_ABBREVIATION', u'AGE',
       u'opp_lessthan5ft_FGM', u'opp_lessthan5ft_FGA',
       u'opp_lessthan5ft_FG_PCT', u'SEASON'],
      dtype='object')
(476, 21) Index([u'PLAYER_ID', u'PLAYER_NAME', u'TEAM_ID', u'TEAM_ABBREVIATION', u'AGE',
       u'lessthan5ft_FGM', u'lessthan5ft_FGA', u'lessthan5ft_FG_PCT',
       u'5_9ft_FGM', u'5_9ft_FGA', u'5_9ft_FG_PCT', u'10_14ft_FGM',
       u'10_14ft_FGA', u'10_14ft_FG_PCT', u'15_19ft_FGM', u'15_19ft_FGA',
       u'15_19ft_FG_PCT', u'20_24ft_FGM', u'20_24ft_FGA', u'20_24ft_FG_PCT',
       u'SEASON'],
      dtype='object')
(476, 36) Index([u'PLAYER_ID', u'PLAYER_NAME', u'TEAM_ID', u'TEAM_ABBREVIATION', u'AGE',
       u'GP', u'W', u'L', u'W_PCT', u'MIN', u'FGM', u'FGA', u'FG_PCT', u'FG3M',
       u'FG3A', u'FG3_PCT', u'FTM', u'FTA', u'FT_PCT', u'OREB', u'DREB',
       u'REB', u'AST', u'TOV', u'STL', u'BLK', u'BLKA', u'PF', u'PFD', u'PTS',
       u'PLUS_MINUS', u'DD2', u'TD3', u'CFID', u'

In [4]:
# Sanity check: print the distinct SEASON values of every loaded frame.
for df in all_2015:
    # Frames without a SEASON column are skipped explicitly; the previous bare
    # `except: pass` would also have hidden any unrelated error.
    if 'SEASON' in df.columns:
        print(df['SEASON'].unique())

[2015]
[2015]
[2015]
[2015]
[2015]


In [6]:
# Combine the seven loaded frames into a single table.
# Every merge uses pandas' default join type (inner).
player_keys = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'AGE', 'SEASON']
opp_shots, shots, averages, unassisted, bios, schedule, pace_pie = all_2015[:7]

# Opponent shot range + own shot range.
merged_shot_selection = opp_shots.merge(shots, on=player_keys)

# Player averages + unassisted stats; the shared GP/W/L/W_PCT/MIN columns are
# part of the key so they are not duplicated with suffixes.
merged_avg_assisted = averages.merge(
    unassisted,
    on=['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'AGE', 'GP','W', 'L', 'W_PCT', 'MIN', 'SEASON'])

# Player bios + team schedule, joined on the team abbreviation only.
merge_bios_schedule = bios.merge(schedule, on=['TEAM_ABBREVIATION'])

# Chain everything together, finishing with the pace/PIE frame.
all_merged = merged_avg_assisted.merge(merged_shot_selection, on=player_keys)
all_merged = all_merged.merge(merge_bios_schedule, on=player_keys)
all_merged = all_merged.merge(pace_pie, on=['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE'])

# Drop the _x/_y-suffixed CFID and CFPARAMS columns, plus the DD2 and TD3
# (double-double and triple-double count) columns.
drop_columns = ['CFID_x','CFID_y', 'CFPARAMS_x', 'CFPARAMS_y', 'DD2', 'TD3']
all_merged = all_merged.drop(drop_columns, axis=1)

In [7]:
# Bare expression: the notebook renders the merged frame as this cell's output.
all_merged

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,MIN,...,PLAYER_HEIGHT_INCHES,PLAYER_WEIGHT,NET_RATING,USG_PCT,TS_PCT,B2B_COUNT,3GMS_IN_4DAYS,4GMS_IN_5DAYS,PACE,PIE
0,201166,Aaron Brooks,1610612741,CHI,31.0,69,36,33,0.522,16.1,...,72,161,-1.4,0.231,0.494,17,24,1,97.22,0.073
1,203932,Aaron Gordon,1610612753,ORL,20.0,78,32,46,0.410,23.9,...,81,220,-1.3,0.171,0.541,19,27,1,98.87,0.113
2,1626151,Aaron Harrison,1610612766,CHA,21.0,21,15,6,0.714,4.4,...,78,210,2.2,0.138,0.371,16,24,0,102.89,0.013
3,203940,Adreian Payne,1610612750,MIN,25.0,52,18,34,0.346,9.3,...,82,237,-10.8,0.180,0.422,14,22,2,96.48,0.039
4,201143,Al Horford,1610612737,ATL,30.0,82,48,34,0.585,32.1,...,82,245,4.9,0.206,0.565,19,27,2,99.75,0.137
5,2744,Al Jefferson,1610612766,CHA,31.0,47,33,14,0.702,23.3,...,82,289,3.7,0.245,0.507,16,24,0,97.68,0.124
6,202329,Al-Farouq Aminu,1610612757,POR,25.0,82,44,38,0.537,28.5,...,81,215,1.5,0.169,0.533,19,25,2,99.01,0.088
7,101187,Alan Anderson,1610612764,WAS,33.0,13,8,5,0.615,14.8,...,78,220,5.4,0.157,0.495,20,26,1,100.19,0.066
8,1626210,Alan Williams,1610612756,PHX,23.0,10,3,7,0.300,6.8,...,80,260,-0.2,0.201,0.481,14,22,1,107.52,0.110
9,202692,Alec Burks,1610612762,UTA,24.0,31,13,18,0.419,25.7,...,78,214,-2.0,0.251,0.520,18,22,1,95.52,0.099


In [None]:
# List every column name of the merged frame as a plain Python list.
all_merged.columns.tolist()

In [None]:
# Null-value report (nulls still need to be handled for each column).
# For every column: print its name and null count; if it has any nulls, also
# print PLAYER_NAME, GP and TEAM_ABBREVIATION of the affected rows.
for col in all_merged.columns:
    # Compute the mask and count once instead of once per use.
    null_mask = all_merged[col].isnull()
    null_count = null_mask.sum()
    print(col)
    print(null_count)
    if null_count > 0:
        print(all_merged[null_mask][['PLAYER_NAME','GP', 'TEAM_ABBREVIATION']])
    print("================")

In [None]:
# Players traded midseason are assigned the TEAM_ABBREVIATION and team name of
# the team they were traded to; their GP is the season total regardless of
# which team they played for.
# Print the table shape, then show the rows for one such player as a spot check.
print(all_merged.shape)
all_merged[all_merged['PLAYER_NAME'] == 'Anderson Varejao']

In [None]:
# Compare row counts: the full table, rows with MIN > 10, and rows with GP > 30.
print(all_merged.shape)
print(all_merged[all_merged.MIN > 10.].shape)
print(all_merged[all_merged.GP > 30].shape)

In [None]:
# all_corr = all_merged.iloc[:,3:].corr()

# # Set the default matplotlib figure size to 25x20:
# plt.rcParams['figure.figsize']=(25,20)

# # Generate a mask for the upper triangle (taken from seaborn example gallery)
# mask = np.zeros_like(all_corr, dtype=np.bool)
# mask[np.triu_indices_from(mask)] = True

# # Plot the heatmap with seaborn.
# # Assign the matplotlib axis the function returns. This will let us resize the labels.
# ax = sns.heatmap(all_corr, mask=mask)

# # Resize the labels.
# ax.set_xticklabels(ax.xaxis.get_ticklabels(), fontsize=12)
# ax.set_yticklabels(ax.yaxis.get_ticklabels(), fontsize=12)

# # If you put plt.show() at the bottom, it prevents those useless printouts from matplotlib.
# plt.show()


In [None]:
# Regression scatter of GP (x-axis) against the frame's row index (y-axis).
# NOTE(review): y is all_merged.index, not a stat column -- confirm this is
# intended; the earlier note on this cell named W as highly correlated with GP.
fig = plt.figure(figsize=(9,9))
ax = fig.gca()

# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally, while older releases accept both forms.
ax = sns.regplot(x=all_merged.GP, y=all_merged.index, fit_reg=True, color='steelblue', ax=ax)

plt.show()

In [None]:
# Scatter plot of GP against the column at position 19 of the merged frame.
# NOTE(review): the column is selected by position, so it changes if the
# column order changes -- consider naming it explicitly.
fig, ax = plt.subplots(figsize=(14,9))
y_column = all_merged.columns[19]
all_merged.plot(kind='scatter', x='GP', y=y_column, ax=ax)
plt.show()

In [None]:
# Subset the rows with MIN > 10 and GP > 10, then print the subset's shape
# next to the full table's shape to show how many rows were filtered out.
min_gp = all_merged[(all_merged.MIN > 10) & (all_merged.GP > 10)]
print(min_gp.shape)
print(all_merged.shape)

In [None]:
# Regression scatter of GP (x-axis) against PTS (y-axis) for the min_gp subset.
fig = plt.figure(figsize=(9,9))
ax = fig.gca()

# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally, while older releases accept both forms.
ax = sns.regplot(x=min_gp.GP, y=min_gp.PTS, fit_reg=True, color='steelblue', ax=ax)

plt.show()