# Data Exploration

In [30]:
import pandas as pd
from pathlib import Path

In [31]:
# Read the raw CSV, drop the moneyline_* and h2_* columns if they exist
# (missing ones are skipped rather than raising), then discard every row
# that still has at least one missing value.
df = (
    pd.read_csv(Path("../../data/raw/nba_2008-2025_RAW.csv"))
    .drop(columns=["moneyline_home", "moneyline_away", "h2_spread", "h2_total"], errors='ignore')
    .dropna(how='any')
)
df

Unnamed: 0,season,date,regular,playoffs,away,home,score_away,score_home,q1_away,q2_away,...,q1_home,q2_home,q3_home,q4_home,ot_home,whos_favored,spread,total,id_spread,id_total
0,2008,2007-10-30,True,False,por,sa,97,106,26,23,...,29,30,22,25,0,home,13.0,189.5,0.0,1
1,2008,2007-10-30,True,False,utah,gs,117,96,28,34,...,30,21,21,24,0,home,1.0,212.0,0.0,1
2,2008,2007-10-30,True,False,hou,lal,95,93,16,27,...,25,18,19,31,0,away,5.0,199.0,0.0,0
3,2008,2007-10-31,True,False,phi,tor,97,106,22,28,...,24,34,23,25,0,home,6.5,191.0,1.0,1
4,2008,2007-10-31,True,False,wsh,ind,110,119,23,22,...,28,20,22,33,16,away,1.5,203.5,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23113,2025,2025-06-11,False,True,okc,ind,107,116,32,28,...,24,40,20,32,0,away,4.5,225.5,0.0,0
23114,2025,2025-06-13,False,True,okc,ind,111,104,34,23,...,35,25,27,17,0,away,6.5,227.5,1.0,0
23115,2025,2025-06-16,False,True,ind,okc,109,120,22,23,...,32,27,28,33,0,home,8.5,223.5,1.0,1
23116,2025,2025-06-19,False,True,okc,ind,91,108,25,17,...,28,36,26,18,0,away,5.5,222.5,0.0,0


In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms of the final score: home team on the left,
# away team on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    ('score_home', 'blue', 'Home Team Score Distribution'),
    ('score_away', 'orange', 'Away Team Score Distribution'),
]
for ax, (column, colour, heading) in zip(axes, panels):
    # Draw each histogram on its own Axes instead of relying on the
    # "current axes" selected by plt.subplot().
    df[column].hist(bins=30, alpha=0.7, color=colour, ax=ax)
    ax.set_title(heading)
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Mean home and away score for each season, drawn as one line per column.
season_avg = df.groupby('season')[['score_home', 'score_away']].mean()
ax = season_avg.plot.line(figsize=(10, 6))
ax.set_title('Average Points per Season')
ax.set_xlabel('Season')
ax.set_ylabel('Average Points')
plt.show()

# Histogram of the `spread` column.
fig, ax = plt.subplots(figsize=(8, 5))
df['spread'].hist(bins=30, color='green', alpha=0.7, ax=ax)
ax.set_title('Spread Distribution')
ax.set_xlabel('Spread')
ax.set_ylabel('Frequency')
plt.show()

# Spread vs. actual result, both expressed as a home-team margin.
#
# `score_diff` is signed (home score minus away score). The `spread` column
# appears to hold only the size of the line, with its direction stored in
# `whos_favored` -- TODO confirm against the data dictionary. Plotting the raw
# spread against a signed margin would therefore compare two different
# quantities, so the spread is signed first: positive when the home team is
# favored, negative otherwise.
df['score_diff'] = df['score_home'] - df['score_away']

home_favored = df['whos_favored'] == 'home'
# NOTE(review): every value other than 'home' is treated as "away favored";
# confirm the column only contains 'home'/'away'. A zero spread is unaffected.
signed_spread = df['spread'].where(home_favored, -df['spread'])

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(signed_spread, df['score_diff'], alpha=0.5)
ax.set_title('Spread vs. Actual Score Difference')
ax.set_xlabel('Spread (home-team perspective, + = home favored)')
ax.set_ylabel('Actual Score Difference (home - away)')
plt.show()

# Correlation matrix of final scores, betting lines and per-quarter scores,
# rendered as an annotated heat map.
corr_columns = [
    'score_home', 'score_away', 'spread', 'total',
    'q1_away', 'q1_home', 'q2_away', 'q2_home',
    'q3_away', 'q3_home', 'q4_away', 'q4_home',
]
corr = df[corr_columns].corr()

fig = plt.figure(figsize=(10, 8))
# Target the figure created on the previous line by its own number. A
# hard-coded fignum=1 only hits this figure when every earlier figure has
# already been closed; otherwise matshow would draw into the first figure.
plt.matshow(corr, fignum=fig.number)

tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.columns)

# Write each coefficient in its cell; text x is the column index, y the row.
for i, row_values in enumerate(corr.to_numpy()):
    for j, value in enumerate(row_values):
        plt.text(j, i, f"{value:.2f}", va='center', ha='center', color='black')

plt.colorbar()
plt.title('Correlation Matrix', pad=20)
plt.show()