# NBA Trends

## 1. Understand Data 
- Included in Readme.MD

## 2. Acquire & Inspect the Data


Step 1: Import relevant Modules

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import numpy as np
import seaborn as sns
from scipy.stats import pearsonr
from scipy.stats import chi2_contingency
import scipy.stats as stats

Step 2: Read data into script

In [2]:
# Load the NBA Elo CSV into the notebook-wide `nba` DataFrame.
# The location can be overridden with the NBA_ELO_CSV environment variable so
# the notebook also runs on machines where the original absolute path does
# not exist; without the override the original path is used unchanged.
import os

nba_csv_path = os.environ.get(
    'NBA_ELO_CSV',
    '/Users/anuroxstar/Documents/Github/NBA-Project/nba-elo/nbaallelo.csv',
)
nba = pd.read_csv(nba_csv_path)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/anuroxstar/Documents/Github/NBA-Project/nba-elo/nbaallelo.csv'

Step 3: Inspect data

In [None]:
# First rows of the table.
print(nba.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
nba.info()
# (rows, columns)
print(nba.shape)

## 3. Data Cleaning:

Step 1: Reshape Data?
- data is tidy (tabular format)

Step 2: Remove duplicates

In [None]:
# drop_duplicates() returns a new DataFrame rather than modifying in place,
# so the result has to be assigned back for the rows to actually be removed.
nba = nba.drop_duplicates()
# Keep only the rows whose _iscopy flag is 0.
nba = nba[nba._iscopy == 0]
# Check duplicate data is removed (value_counts must be called to get the
# counts; without the parentheses only the bound method is printed).
print(nba._iscopy.value_counts())

Step 3: Explore the following columns.

In [None]:
# Frequency tables for the categorical columns. value_counts is a method and
# has to be called; printing the bare attribute only shows the bound method.
print(nba.lg_id.value_counts())

# Share of rows per franchise, then give the column a more readable name.
print(nba.fran_id.value_counts(normalize=True))
nba = nba.rename({'fran_id': 'playing_team'}, axis=1)

print(nba.team_id.value_counts())

print(nba.win_equiv.value_counts())

# Share of rows per opposing franchise, then rename to match playing_team.
print(nba.opp_fran.value_counts(normalize=True))
nba = nba.rename({'opp_fran': 'opp_team'}, axis=1)

print(nba.opp_id.value_counts())

print(nba.notes.value_counts())

print(nba.game_location.value_counts(normalize=True))


Step 4: Remove columns

In [None]:
# Discard the columns that are not used in the analysis below, in two passes.
nba.drop(
    columns=[
        'lg_id',
        'date_game',
        'team_id',
        'elo_i',
        'elo_n',
        '_iscopy',
        'gameorder',
    ],
    inplace=True,
)
nba.drop(
    columns=[
        'win_equiv',
        'opp_id',
        'opp_elo_i',
        'opp_elo_n',
        'notes',
    ],
    inplace=True,
)


Step 5: Missing/ Incomplete Data?

In [None]:
# The non-null counts in the info() report show whether any column has gaps.
# info() prints the report itself and returns None, so it is not wrapped in
# print() (that only added a stray "None" line).
nba.info()

Step 6: Change Data Types of Columns?

In [None]:

# Look at the distribution of is_playoffs before deciding on its type.
plt.hist(nba.is_playoffs)
plt.show()
plt.close()
# Store the flag as a pandas string column instead of a number.
nba['is_playoffs'] = nba['is_playoffs'].astype('string')

# Confirm change in data types. info() prints its own report and returns
# None, so it is called directly rather than through print().
nba.info()

Step 7: Numerical Variables ONLY: Skewed Data? - through comparing mean and median

In [None]:
# Summary statistics per column; the "mean" and "50%" (median) rows can be
# compared side by side to judge skew.
print(nba.describe())

Step 8: Numerical Variables ONLY: Outlier/Anomalous Data?

In [None]:
def boxplot(column, name):
    """Draw, save and display a horizontal boxplot of one ``nba`` column.

    Args:
        column: Name of the column of the notebook-level ``nba`` frame to plot.
        name: File name the figure is saved under.
    """
    label = column.upper()
    sns.boxplot(data=nba, x=column)
    plt.xlabel(label)
    plt.title(label + ": Distribution")
    plt.savefig(name)
    plt.show()
    # Close the figure like the other plots in this notebook do, so repeated
    # calls never draw on top of, or keep alive, an earlier figure.
    plt.close()
# One boxplot per numeric column of interest, each written to its own file.
boxplot(column='pts', name='pts_boxplot.jpg')
boxplot(column='opp_pts', name='opp_pts_boxplot.jpg')
boxplot(column='forecast', name='forecast_boxplot.jpg')

## 4. Exploratory Data Analysis &  Data Visualizations:

First determine how many teams there are

In [None]:
# Number of distinct values in the playing_team column.
print(nba['playing_team'].nunique())

Question 1: Is the forecast of winning linearly associated with points scored (across all 53 teams)?

In [None]:
# Pearson correlation between the forecast and the points scored, over all rows.
corr_forecast_pointdiff, p = pearsonr(nba['forecast'], nba['pts'])
print(corr_forecast_pointdiff)

# Scatter of the same two columns with the rounded coefficient written in the
# top-left corner of the axes.
pts_vs_forecast = plt.scatter(nba['forecast'], nba['pts'], color='orange')
plt.title('Points Scored vs Forecast')
plt.xlabel('Forecast')
plt.ylabel('Points Scored')
plt.annotate(
    str(np.round(corr_forecast_pointdiff, 2)),
    xy=(0.05, 0.95),
    xycoords='axes fraction',
)
plt.savefig('points_scored_vs_forecast.jpg')
plt.show()
plt.close()


Question 2: Does game location affect the points a team scores?
First check that there is a minimum of 3 data points for each team at each game location.

In [None]:
# Count the rows per (team, game location) pair and spread the locations out
# into columns, giving one row per team.
teams_with_two_locations = (
    nba.groupby(['playing_team', 'game_location'])['game_location']
    .count()
    .to_frame(name='game_location_count')
    .reset_index()
    .pivot(
        index='playing_team',
        columns='game_location',
        values='game_location_count',
    )
)
# Keep the teams that have at least 3 rows in both the 'H' and the 'N' column.
teams_for_analysis = teams_with_two_locations.loc[
    teams_with_two_locations['H'].ge(3) & teams_with_two_locations['N'].ge(3)
]
print(teams_for_analysis.head())
print(teams_for_analysis.shape)

#graph for each team, with location on x axis and points on y axis
def points_vs_game_location(bballteams, name):
    """Draw one points-by-game-location boxplot per team and save the figure.

    The rows for each team are taken from the notebook-level ``nba`` frame.
    Panels are laid out two per row; the number of rows and the number of
    colours grow with the number of teams, so the function no longer fails
    when more than two (grid) or more than five (colours) teams are passed.

    Args:
        bballteams: DataFrame whose index holds the team names to plot.
        name: File name the finished figure is saved under.
    """
    team_names = list(bballteams.index)
    n_teams = len(team_names)
    n_cols = 2
    # Ceiling division; at least one row so an empty input still yields a figure.
    n_rows = max(1, (n_teams + n_cols - 1) // n_cols)
    # 5 units of height per row keeps the original 10x5 size for up to two teams.
    fig = plt.figure(figsize=(10, 5 * n_rows))
    # Sampling at least 5 colours keeps the original colours for up to five teams.
    palette = cm.rainbow(np.linspace(0, 1, max(5, n_teams)))
    for position, (team_name, c) in enumerate(zip(team_names, palette), start=1):
        team = nba[nba.playing_team == str(team_name)]
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(x=team.game_location, y=team.pts, color=c)
        plt.xlabel('Game Location')
        plt.ylabel('Points Scored')
        plt.title(str(team_name).upper() + ': Points vs Game Location')
        # NOTE(review): these labels are attached to whatever artists the axes
        # already hold; confirm they describe the game_location codes as intended.
        plt.legend(['h =home', 'n=not home'])
    fig.tight_layout()
    fig.savefig(name)
    plt.show()
    plt.close()
# Plot every team that passed the minimum-sample filter.
points_vs_game_location(
    bballteams=teams_for_analysis,
    name='points_scored_vs_location.jpg',
)

Question 3: Are playing team and game result associated for the top 5 and bottom 5 teams?

In [None]:
# Mean points per team, highest first.
scores = nba.groupby('playing_team').pts.mean().reset_index()
scores = scores.sort_values(by='pts', ascending=False)
# head()/tail() default to 5 rows: the five highest and five lowest averages.
top_teams = scores.head()
bottom_teams = scores.tail()
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way and works on both old and new pandas versions.
ten_teams = pd.concat([top_teams, bottom_teams])
# Restrict the game-level data to those ten teams.
nba_ten_teams = nba[nba['playing_team'].isin(ten_teams.playing_team)]
print(nba_ten_teams.playing_team.value_counts())

# Contingency table of OBSERVED counts: playing team (rows) by game result
# (columns). The variable name is kept as-is for the rest of the notebook even
# though the table holds observed, not expected, counts.
gamelocationvsresult_expected = pd.crosstab(nba_ten_teams.playing_team, nba_ten_teams.game_result)
print(gamelocationvsresult_expected)
# The same table as proportions of all rows; shown for reference only.
gamelocationvsresult_freq = gamelocationvsresult_expected / len(nba_ten_teams)
print(gamelocationvsresult_freq)
# The chi-square test must be run on the observed counts. Feeding it the
# proportions shrinks the statistic by the sample size and yields a
# meaningless p-value.
chi2, locationvsresultpval, dof, locationvsresultexpected = chi2_contingency(gamelocationvsresult_expected)
# Expected counts under independence, followed by the p-value of the test.
print(np.round(locationvsresultexpected, 2))
print(locationvsresultpval)


Question 4: Is there an association between points scored and opposing points scored for the top 5 teams and bottom 5 teams?

In [3]:
def scatterplot_teams(teams, name):
    """Scatter points scored against opposing points, one panel per team.

    The rows for each team are taken from the notebook-level ``nba`` frame and
    each panel is annotated with the Pearson correlation of the two columns.
    Panels are laid out three per row; the grid keeps its 3x3 shape for up to
    nine teams and gains rows (and colours) beyond that instead of failing.

    Args:
        teams: DataFrame with a ``playing_team`` column naming the teams to plot.
        name: File name the finished figure is saved under.
    """
    team_names = list(teams.playing_team)
    n_teams = len(team_names)
    n_cols = 3
    # Never fewer than the original three rows; ceiling division beyond nine teams.
    n_rows = max(3, (n_teams + n_cols - 1) // n_cols)
    # 4 units of height per row keeps the original 12x12 size for a 3x3 grid.
    fig = plt.figure(figsize=(12, 4 * n_rows))
    # Sampling at least 5 colours keeps the original colours for up to five teams.
    palette = cm.rainbow(np.linspace(0, 1, max(5, n_teams)))
    for position, (team_name, c) in enumerate(zip(team_names, palette), start=1):
        team = nba[nba.playing_team == str(team_name)]
        plt.subplot(n_rows, n_cols, position)
        plt.xlabel('Points Scored')
        plt.ylabel('Opposing Team Points Scored')
        # `color=` states that this is ONE colour for all points; passing a
        # single RGBA row through `c=` is ambiguous and would be treated as
        # per-point values whenever a team happens to have exactly 4 rows.
        plt.scatter(x=team.pts, y=team.opp_pts, color=c)
        corr_point_vs_opp_pts, p = pearsonr(team.pts, team.opp_pts)
        plt.annotate(str(np.round(corr_point_vs_opp_pts, 2)), xy=(0.05, 0.95), xycoords='axes fraction')
        plt.title(str(team_name).upper() + ': Points vs Opposing Points')
    fig.tight_layout()
    # Save before showing, matching the other plotting code in this notebook.
    fig.savefig(name)
    plt.show()
    plt.close()
# One figure for the five highest-scoring teams, one for the five lowest.
scatterplot_teams(
    teams=top_teams,
    name='top_teams_ptswon_vs_opposing_ptswon.jpg',
)
scatterplot_teams(
    teams=bottom_teams,
    name='bottom_teams_ptswon_vs_opposing_ptswon.jpg',
)

NameError: name 'top_teams' is not defined

Question 5: Is there an association between the top 5 teams and points scored?

In [None]:
# Points scored by each of the five teams compared in this question.
# NOTE(review): the team names are hard-coded; confirm they still match the
# teams in `top_teams` whenever the data or the ranking changes.
condors = nba.pts[nba.playing_team == 'Condors']
stars = nba.pts[nba.playing_team == 'Stars']
floridians = nba.pts[nba.playing_team == 'Floridians']
squires = nba.pts[nba.playing_team == 'Squires']
colonels = nba.pts[nba.playing_team == 'Colonels']
# Overlaid, semi-transparent histograms; density=True normalises each one so
# teams with different numbers of rows can be compared.
plt.hist(condors, color='green', label='Condors', density=True, alpha=0.5)
plt.hist(stars, color='blue', label='Stars', density=True, alpha=0.5)
plt.hist(floridians, color='red', label='Floridians', density=True, alpha=0.5)
plt.hist(squires, color='yellow', label='Squires', density=True, alpha=0.5)
plt.hist(colonels, color='orange', label='Colonels', density=True, alpha=0.5)
# In a histogram of pts the x axis carries the points and the y axis the
# (normalised) frequency; the teams are identified by the legend.
plt.xlabel('Points Scored')
plt.ylabel('Density')
plt.title('Top 5 Teams: Points Scored Distribution')
plt.legend()
plt.savefig('top5teams_pts_scored_hist.jpg')
plt.show()
plt.close()