## 2022 NBA Champion Prediction

In [1]:
import configparser
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.automap import automap_base

### Read in Our Data
##### Read config.ini File

In [2]:
# Parse the project settings file that lives one directory above the notebook.
# The cell's output is the list of files that were actually read.
config_path = Path("../config.ini")
config = configparser.ConfigParser()
config.read(config_path)

['..\\config.ini']

##### Configure Credentials

In [3]:
# database connection settings (shared by both accounts)
host = config["DATABASE"]["HOST"]
port = config["DATABASE"]["PORT"]
db = config["DATABASE"]["DB"]

# root user credentials
root_user = config["ROOT"]["USERNAME"]
# The password is read from its own PASSWORD key, mirroring the USER section
# below; reading USERNAME here would put the user name into the URI as the
# password.
root_pwd = config["ROOT"]["PASSWORD"]

# root user db uri
root_db_uri = f"postgresql://{root_user}:{root_pwd}@{host}:{port}/{db}"


# user credentials
user = config["USER"]["USERNAME"]
pwd = config["USER"]["PASSWORD"]

# user db uri
user_db_uri = f"postgresql://{user}:{pwd}@{host}:{port}/{db}"

##### Setup SQLAlchemy Engine and Reflect Our Database

In [4]:
# Engine for the project database (connects with the root user URI)
engine = create_engine(root_db_uri)

# Create an automap base class
Base = automap_base()
# Reflect the tables (this creates the mapping for our tables).
# Only `autoload_with` is passed: the `engine` keyword of prepare() is
# deprecated in favour of `autoload_with`, so passing both was redundant.
Base.prepare(autoload_with=engine)
# Show the names of the tables that were mapped
Base.classes.keys()

['teams_traditional',
 'teams_advanced',
 'teams_misc',
 'teams_clutch',
 'playoffs_traditional',
 'playoffs_advanced',
 'playoffs_misc',
 'playoffs_clutch',
 'playoff_teams_long',
 'champions']

In [5]:
# Keep a handle on each regular-season table class produced by the automap
mapped_classes = Base.classes

TeamsTraditional = mapped_classes.teams_traditional
TeamsAdvanced = mapped_classes.teams_advanced
TeamsMisc = mapped_classes.teams_misc
TeamsClutch = mapped_classes.teams_clutch

##### Query Our Database and Convert to DataFrames

In [6]:
# Create the session factory
Session = sessionmaker(engine)

# The season we want to predict. It has no champion yet, so its rows are kept
# apart from the past seasons (which serve as our train/test data).
CURRENT_SEASON = 2022

# References to our mapped tables, keyed by stat type
tables = {
    "traditional": TeamsTraditional,
    "advanced": TeamsAdvanced,
    "misc": TeamsMisc,
    "clutch": TeamsClutch,
}


def _split_playoff_teams(raw, current_season=CURRENT_SEASON):
    """Clean one stats table and split it into past and current seasons.

    Parameters
    ----------
    raw : pandas.DataFrame
        Table as read from the database. The code reads the columns SEASON
        (datetime), PLAYOFFS, TEAM, GP, W and L.
    current_season : int
        Year whose rows are returned separately.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(past_seasons, current_season)``, both limited to playoff teams,
        with TEAM rewritten as "<year> <team name>", the SEASON, GP, W, L and
        PLAYOFFS columns removed, and a fresh 0..n-1 index.
    """
    # Replace the SEASON datetime with its year as an integer
    with_year = raw.assign(SEASON=raw["SEASON"].dt.year)

    # Keep the playoff teams only, then drop the flag and the team record info
    playoff_teams = with_year[with_year["PLAYOFFS"].eq(True)].drop(
        columns=["GP", "W", "L", "PLAYOFFS"]
    )

    # Build the new team identifier from the year and the team name.
    # .assign() returns a new frame, so nothing is written into a slice of
    # another DataFrame (which is what triggers SettingWithCopyWarning).
    playoff_teams = playoff_teams.assign(
        TEAM=playoff_teams["SEASON"].astype(dtype="str") + " " + playoff_teams["TEAM"]
    )

    is_current = playoff_teams["SEASON"] == current_season
    past = playoff_teams[~is_current].drop(columns="SEASON").reset_index(drop=True)
    current = playoff_teams[is_current].drop(columns="SEASON").reset_index(drop=True)
    return past, current


# Dictionaries to hold the resulting DataFrames: one for the past seasons and
# one for the current season.
dataframes = {}
dataframes_22 = {}

# Loop through each table reference, query the data from our DB, and convert to
# a pair of DataFrames
with Session() as session:
    for stat_type, table in tables.items():
        # Use the query's SQL statement to read the whole table into a DataFrame
        query = session.query(table)
        raw_stats = pd.read_sql(sql=query.statement, con=engine)
        dataframes[stat_type], dataframes_22[stat_type] = _split_playoff_teams(raw_stats)

# Assign the DataFrames to variables with same names as our tables from our DB
teams_traditional = dataframes["traditional"]
teams_advanced = dataframes["advanced"]
teams_misc = dataframes["misc"]
teams_clutch = dataframes["clutch"]

# Assign the 2022 DataFrames to variables with same names as our tables from our DB with suffix for 2022
teams_traditional_22 = dataframes_22["traditional"]
teams_advanced_22 = dataframes_22["advanced"]
teams_misc_22 = dataframes_22["misc"]
teams_clutch_22 = dataframes_22["clutch"]

##### Check Number of Records for Each DataFrame

In [7]:
# Report the row count of every past-season table, then of every 2022 table
past_tables = {
    "traditional": teams_traditional,
    "advanced": teams_advanced,
    "misc": teams_misc,
    "clutch": teams_clutch,
}
current_tables = {
    "traditional": teams_traditional_22,
    "advanced": teams_advanced_22,
    "misc": teams_misc_22,
    "clutch": teams_clutch_22,
}

for stat_name, frame in past_tables.items():
    print(f"{stat_name} count = ", len(frame))

for stat_name, frame in current_tables.items():
    print(f"2022 {stat_name} count = ", len(frame))

traditional count =  400
advanced count =  400
misc count =  400
clutch count =  400
2022 traditional count =  16
2022 advanced count =  16
2022 misc count =  16
2022 clutch count =  16


In [8]:
# Sanity check: display the past-seasons traditional stats to confirm that the
# load and the transformations above produced the expected table
teams_traditional

Unnamed: 0,TEAM,WIN%,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,+/-,CHAMPION
0,2021 Utah Jazz,0.722,48.2,116.4,41.3,88.1,46.8,16.7,43.0,38.9,...,48.3,23.7,14.2,6.6,5.2,3.9,18.5,19.0,9.3,False
1,2021 Phoenix Suns,0.708,48.6,115.3,43.3,88.3,49.0,13.1,34.6,37.8,...,42.9,26.9,12.5,7.2,4.3,3.6,19.1,18.0,5.8,False
2,2021 Philadelphia 76ers,0.681,48.4,113.6,41.4,86.9,47.6,11.3,30.1,37.4,...,45.1,23.7,14.4,9.1,6.2,4.7,20.2,21.0,5.6,False
3,2021 Brooklyn Nets,0.667,48.3,118.6,43.1,87.3,49.4,14.2,36.1,39.2,...,44.4,26.8,13.5,6.7,5.3,4.6,19.0,18.9,4.5,False
4,2021 Denver Nuggets,0.653,48.6,115.1,43.3,89.2,48.5,12.9,34.2,37.7,...,44.4,26.8,13.5,8.1,4.5,4.5,19.1,19.2,4.9,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1997 Orlando Magic,0.549,48.4,94.1,34.6,79.2,43.7,6.9,20.3,34.1,...,40.1,20.6,15.2,8.5,4.4,5.1,20.0,0.1,-0.4,False
396,1997 Washington Bullets,0.537,48.4,99.4,39.1,81.4,48.0,4.0,12.2,33.1,...,41.8,23.4,15.7,8.7,4.9,4.1,22.1,0.2,1.6,False
397,1997 Minnesota Timberwolves,0.488,48.2,96.1,35.8,78.5,45.6,4.5,13.3,33.9,...,39.7,22.9,15.2,7.5,6.8,5.4,22.2,0.2,-1.5,False
398,1997 Phoenix Suns,0.488,48.4,102.8,38.3,81.8,46.9,6.4,17.4,36.9,...,40.1,25.2,14.4,8.1,3.9,4.5,21.1,0.1,0.7,False


##### Merge All Stats Into One Table for Past and Current Seasons Each

In [9]:
def _merge_stat_tables(traditional, advanced, misc):
    """Join the traditional, advanced and misc tables on TEAM into one table.

    Columns shared by the traditional and advanced tables receive the "_t" and
    "_a" suffixes in the first merge. The same suffixes are passed to the
    second merge; they only apply to columns that still collide at that point.
    """
    merge_options = {"on": "TEAM", "suffixes": ["_t", "_a"]}
    combined = traditional.merge(advanced, **merge_options)
    return combined.merge(misc, **merge_options)


# Past seasons (the clutch table is not part of the merge)
team_stats = _merge_stat_tables(teams_traditional, teams_advanced, teams_misc)

# Use this line to drop the columns you don't want to use
# team_stats = team_stats.drop(columns=["MIN_t", "CHAMPION_t", "MIN_a", "CHAMPION_a"])


# Current (2022) season
team_stats_22 = _merge_stat_tables(teams_traditional_22, teams_advanced_22, teams_misc_22)

# Use this line to drop the columns you don't want to use
# team_stats_22 = team_stats_22.drop(columns=["WIN%", "MIN_t", "CHAMPION_t", "MIN_a", "CHAMPION_a"])

In [10]:
# List the merged column names to see which columns exist and which of them
# received a merge suffix
team_stats.columns

Index(['TEAM', 'WIN%', 'MIN_t', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA',
       '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL',
       'BLK', 'BLKA', 'PF', 'PFD', '+/-', 'CHAMPION_t', 'MIN_a', 'OFFRTG',
       'DEFRTG', 'NETRTG', 'AST%', 'AST/TO', 'AST RATIO', 'OREB%', 'DREB%',
       'REB%', 'TOV%', 'EFG%', 'TS%', 'PACE', 'PIE', 'POSS', 'CHAMPION_a',
       'MIN', 'PTS OFF TO', '2ND PTS', 'FBPS', 'PITP', 'OPP PTS OFF TO',
       'OPP 2ND PTS', 'OPP FBPS', 'OPP PITP', 'CHAMPION'],
      dtype='object')

### Machine Learning
#### Steps to Build a Logistic Regression Model
1.  Create a model with LogisticRegression().
2.  Train the model with model.fit().
3. Make predictions with model.predict().
4. Validate the model with accuracy_score().

<br>
<hr>

#### Example from the Module
##### Generate Some Data

In [None]:
# Build a small two-cluster practice dataset
from sklearn.datasets import make_blobs

X, y = make_blobs(centers=2, random_state=42)

# Preview the first ten labels and points, then the container type and shape
first_ten = slice(None, 10)
print(f"Labels: {y[first_ten]}")
print(f"Data: {X[first_ten]}")
for array_name, array in (("X", X), ("y", y)):
    print(f"type of {array_name}:", type(array), f"| size of {array_name}:", array.shape)

In [None]:
# Plot every point, coloured by its class label
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

##### Split our data into training and testing

In [None]:
# Hold out part of the practice data for testing; stratify on the labels so
# both splits keep the same class balance
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

##### Create a Logistic Regression Model

In [None]:
# Instantiate the (still untrained) logistic regression model and display it
from sklearn.linear_model import LogisticRegression

model_options = {"solver": "lbfgs", "random_state": 1}
classifier = LogisticRegression(**model_options)
classifier

##### Fit (train) our model using the training data

In [None]:
# Fit the model on the training split
classifier.fit(X=X_train, y=y_train)

##### Make predictions

In [None]:
# Predict labels for the held-out rows and show them next to the true labels
predictions = classifier.predict(X=X_test)
comparison = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
comparison

##### Validate the model using the test data

In [None]:
# Assess model performance: share of test labels that were predicted correctly
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Add one hand-made point and draw it in red on top of the dataset
# (which cluster does the red dot fall into?)
new_data = np.array([[-2, 6]])
new_x, new_y = new_data[0, 0], new_data[0, 1]

fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y)
ax.scatter(new_x, new_y, c="r", marker="o", s=100)
plt.show()

In [None]:
# Classify the hand-made point and report the predicted class
predictions = classifier.predict(X=new_data)
print(
    "Classes are either 0 (purple) or 1 (yellow)",
    f"The new point was classified as: {predictions}",
    sep="\n",
)

<br>
<hr>

#### Logistic Regression with NBA Stats Data
##### Preprocess Our Data

In [None]:
# Separate the target from the features.
# The modelling frame is `team_stats` (the merged past-season table); its label
# column is named "CHAMPION".
TARGET = "CHAMPION"

# The merge can leave several copies of the label (e.g. CHAMPION_t and
# CHAMPION_a). Every one of them must be removed from the features, otherwise
# the model is handed the answer it is supposed to predict.
label_columns = [column for column in team_stats.columns if column.startswith(TARGET)]

y = team_stats[TARGET]
# TEAM is a text identifier, not a numeric feature, so it is dropped as well
X = team_stats.drop(columns=["TEAM", *label_columns])

##### Split our data into training and testing

In [None]:
# Hold out part of the NBA data for testing; stratify on the target so both
# splits keep the same class balance
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

##### Create a Logistic Regression Model

In [None]:
# Instantiate the logistic regression model used for the NBA data
from sklearn.linear_model import LogisticRegression

model_options = {"solver": "lbfgs", "max_iter": 200, "random_state": 1}
classifier = LogisticRegression(**model_options)

##### Fit (train) our model using the training data

In [None]:
# Fit the model on the training split
classifier.fit(X=X_train, y=y_train)

##### Make predictions

In [None]:
# Predict the target for the held-out test rows
y_pred = classifier.predict(X=X_test)

##### Validate the model using the test data

In [None]:
# Share of test rows whose label was predicted correctly
from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_accuracy)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=001650fd-d063-41db-b9e3-866741d4a685' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>