In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
def pace_calculator(FGA, FTA, OREB, OPPDREB, FG, TOV, MIN):
    """Return 240 * (estimated possessions) / MIN for one team.

    The possession estimate is
    ``FGA + 0.4 * FTA - 1.07 * (FGA - FG) * OREB / (OREB + OPPDREB) + TOV``.

    All arguments are combined with plain arithmetic operators only, so
    scalars, NumPy arrays and pandas Series (element-wise) all work.

    Parameters
    ----------
    FGA, FTA, OREB, FG, TOV
        The team's own totals, named after the usual box-score
        abbreviations (attempts, free-throw attempts, offensive rebounds,
        made field goals, turnovers).
    OPPDREB
        The opponent's defensive rebounds.
    MIN
        Minutes used as the denominator.  The factor 240 presumably stands
        for the team minutes of a regulation game (5 players x 48 minutes)
        -- TODO confirm.

    Returns
    -------
    Same kind as the inputs (scalar, array or Series).
    """
    missed_shots = FGA - FG
    # Share of available rebounds on the team's own misses that it recovered.
    offensive_rebound_share = OREB / (OREB + OPPDREB)
    possessions = FGA + 0.4 * FTA - 1.07 * missed_shots * offensive_rebound_share + TOV
    return 240 * possessions / MIN

In [None]:
def _team_schedule(team_data):
    """Build one row per team-game with both teams' pace and a game counter.

    Parameters
    ----------
    team_data : pandas.DataFrame
        Team game logs containing at least TEAM_ID, TEAM_ABBREVIATION,
        MATCHUP, GAME_ID, GAME_DATE, FGM, FTA, FGA, OREB, DREB, TOV and MIN.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns TEAM_ID, TEAM_ABBREVIATION, VS_TEAM_ABBREVIATION, GAME_ID,
        TEAM_PACE, VS_TEAM_PACE and GAME_NUMBER, sorted by
        (TEAM_ABBREVIATION, GAME_ID).
    """
    # The opponent is taken to be the last three characters of MATCHUP
    # (assumes the string ends with a 3-letter abbreviation -- TODO confirm).
    games = team_data.assign(VS_TEAM_ABBREVIATION=team_data['MATCHUP'].str[-3:])

    box_cols = ['TEAM_ID', 'TEAM_ABBREVIATION', 'VS_TEAM_ABBREVIATION', 'GAME_ID', 'GAME_DATE',
                'FGM', 'FTA', 'FGA', 'OREB', 'DREB', 'TOV', 'MIN']
    # Both self-joins below match a team's row to its opponent's row in the same game.
    opponent_join = dict(how='inner',
                         left_on=['GAME_ID', 'VS_TEAM_ABBREVIATION'],
                         right_on=['GAME_ID', 'TEAM_ABBREVIATION'])
    keep_own_abbreviation = {'TEAM_ABBREVIATION_x': 'TEAM_ABBREVIATION'}

    # Attach the opponent's defensive rebounds, which the pace formula needs.
    opponent_dreb = (games[['TEAM_ABBREVIATION', 'GAME_ID', 'DREB']]
                     .rename(columns={'DREB': 'OPPDREB'}))
    with_opp_dreb = (
        pd.merge(games[box_cols], opponent_dreb, **opponent_join)
        .drop('TEAM_ABBREVIATION_y', axis=1)
        .rename(columns=keep_own_abbreviation)
    )
    with_opp_dreb['PACE'] = pace_calculator(
        FGA=with_opp_dreb['FGA'],
        FTA=with_opp_dreb['FTA'],
        OREB=with_opp_dreb['OREB'],
        OPPDREB=with_opp_dreb['OPPDREB'],
        FG=with_opp_dreb['FGM'],
        TOV=with_opp_dreb['TOV'],
        MIN=with_opp_dreb['MIN'],
    )

    # Attach the opponent's pace for the same game next to the team's own pace.
    opponent_pace = with_opp_dreb[['TEAM_ABBREVIATION', 'GAME_ID', 'PACE']]
    with_both_paces = (
        pd.merge(with_opp_dreb, opponent_pace, **opponent_join)
        .drop('TEAM_ABBREVIATION_y', axis=1)
        .rename(columns={**keep_own_abbreviation,
                         'PACE_x': 'TEAM_PACE',
                         'PACE_y': 'VS_TEAM_PACE'})
    )

    schedule = (
        with_both_paces[['TEAM_ID', 'TEAM_ABBREVIATION', 'VS_TEAM_ABBREVIATION',
                         'GAME_ID', 'TEAM_PACE', 'VS_TEAM_PACE']]
        .sort_values(['TEAM_ABBREVIATION', 'GAME_ID'])
    )
    # 0-based running game index per team, in GAME_ID order.  Counting within
    # each team (instead of tiling range(82) thirty times) gives the same
    # numbers for a complete 30-team / 82-game file, but also works when the
    # file holds a different number of teams or games.
    # NOTE(review): assumes GAME_ID sorts chronologically -- confirm.
    schedule['GAME_NUMBER'] = schedule.groupby('TEAM_ABBREVIATION').cumcount()
    return schedule


def points_data(year):
    """Load one season and return per-game player rows with pace attached.

    Reads three CSV files from ``raw_data/`` whose names embed ``year``,
    keeps only players whose ``GP`` in the season-overall file is above 30,
    and left-joins each remaining player-game row to the team schedule
    (team pace, opponent pace, game number) on (TEAM_ID, GAME_ID).

    Parameters
    ----------
    year : str
        Season label used verbatim in the file names, e.g. ``'2014-15'``.

    Returns
    -------
    pandas.DataFrame
        Columns YEAR, GAME_NUMBER, GAME_DATE, TEAM_NAME, PLAYER_NAME,
        PLAYER_ID, MIN, PTS, TEAM_PACE, VS_TEAM_PACE, sorted by
        (PLAYER_ID, GAME_NUMBER).  Player rows without a matching schedule
        row keep NaN in the schedule-derived columns (left join).

    Raises
    ------
    FileNotFoundError
        If any of the three CSV files is missing.
    KeyError
        If an expected column is absent from the input files.
    """
    player_games = pd.read_csv('raw_data/player_stats_' + year + '.csv')
    team_games = pd.read_csv('raw_data/team_stats_' + year + '.csv')
    # NOTE(review): unlike the two paths above there is no underscore before
    # the season in this file name -- confirm it matches the files on disk.
    season_totals = pd.read_csv('raw_data/player_overall_15minsplus' + year + '.csv')

    team_schedule = _team_schedule(team_games)

    # Restrict to players with more than 30 games played in the season file.
    regular_ids = season_totals.loc[season_totals['GP'] > 30, 'PLAYER_ID']
    regular_games = player_games[player_games['PLAYER_ID'].isin(regular_ids)]

    with_schedule = (
        pd.merge(regular_games, team_schedule, how='left', on=['TEAM_ID', 'GAME_ID'])
        .sort_values(['PLAYER_ID', 'GAME_NUMBER'])
    )
    with_schedule['YEAR'] = year
    return with_schedule[['YEAR', 'GAME_NUMBER', 'GAME_DATE', 'TEAM_NAME', 'PLAYER_NAME',
                          'PLAYER_ID', 'MIN', 'PTS', 'TEAM_PACE', 'VS_TEAM_PACE']]

In [None]:
# Load the three seasons in chronological order; the per-season frames keep
# their own names so they stay available individually.
points_2014, points_2015, points_2016 = map(
    points_data, ('2014-15', '2015-16', '2016-17')
)

# Stack the seasons (original row indices are kept, so index labels repeat).
combined = pd.concat((points_2014, points_2015, points_2016))
# Subset of player-games with fewer than 45 points.
combined_less45 = combined.loc[combined['PTS'].lt(45)]
# Summary statistics of points over all player-games (displayed as cell output).
combined.loc[:, 'PTS'].describe()

In [None]:
# Histogram of points per player-game.  It gets its own figure: drawing it
# with plt.hist and then calling plt.scatter would put both plots on the
# same implicit axes.
fig_hist, ax_hist = plt.subplots()
_ = ax_hist.hist(combined_less45['PTS'], bins=45)
_ = ax_hist.set_xlabel('Points')
_ = ax_hist.set_ylabel('Count')

# Player-games with more than 30 points: count per player and overall total.
analysis = combined[combined['PTS'] > 30]
players_above30 = analysis['PLAYER_NAME'].value_counts()
total_games_above30 = players_above30.sum()

# Points against pace, on a separate figure from the histogram.
# NOTE(review): the x-axis uses VS_TEAM_PACE while the correlation at the end
# of this cell uses TEAM_PACE -- confirm which one is intended.
fig_pace, ax_pace = plt.subplots()
_ = ax_pace.scatter(combined_less45['VS_TEAM_PACE'], combined_less45['PTS'])
_ = ax_pace.set_xlabel('Game Pace')
_ = ax_pace.set_ylabel('Points')
plt.show()

# Mean points by game number.
fig_trend, ax_trend = plt.subplots()
_ = combined_less45[['GAME_NUMBER', 'PTS']].groupby(['GAME_NUMBER']).mean().plot(ax=ax_trend)
_ = ax_trend.set_ylabel('Mean points')

# Pearson correlation between points and the team's own pace; `p` holds the
# whole result returned by pearsonr (correlation and p-value), not just the
# p-value.
# NOTE(review): rows with missing PTS/TEAM_PACE are not filtered here --
# verify the inputs contain no NaN before relying on this value.
p = stats.pearsonr(combined_less45['PTS'], combined_less45['TEAM_PACE'])