In [3]:
# Initial imports
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Read the cleaned team-offense statistics from a CSV path relative to the working directory.
file_path = 'cleaned_offense.csv'
df = pd.read_csv(file_path)
df

Unnamed: 0.1,Unnamed: 0,team,pass_att,cmp,cmp_percentage,yds/att,pass_yds,pass_td,int,year,rush_att,rush_yds,ypc,rush_td,fum,playoff
0,0,Ravens,679,439,64.6,6.4,4343,20,16,2016,499,2396,4.8,24,9,0
1,1,Saints,674,472,70.0,7.8,5258,38,15,2016,492,2630,5.4,29,11,0
2,2,Cardinals,646,383,59.3,6.8,4425,28,17,2016,482,1872,3.9,19,8,0
3,3,Jaguars,626,369,59.0,6.3,3925,24,16,2016,476,2187,4.6,16,10,0
4,4,Packers,620,403,65.0,7.2,4445,40,8,2016,458,2019,4.4,15,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,187,Browns,520,320,61.5,7.0,3619,21,14,2021,411,1583,3.8,10,1,0
188,188,49ers,514,343,66.7,8.6,4437,26,14,2021,393,1443,3.7,11,10,1
189,189,Saints,504,293,58.1,6.8,3437,29,13,2021,392,1755,4.5,13,6,0
190,190,Seahawks,495,324,65.4,7.7,3815,30,7,2021,385,1672,4.3,18,4,0


In [5]:
# Keep only complete records: any row containing a missing value is discarded.
df = df.dropna(axis=0, how="any")
df

Unnamed: 0.1,Unnamed: 0,team,pass_att,cmp,cmp_percentage,yds/att,pass_yds,pass_td,int,year,rush_att,rush_yds,ypc,rush_td,fum,playoff
0,0,Ravens,679,439,64.6,6.4,4343,20,16,2016,499,2396,4.8,24,9,0
1,1,Saints,674,472,70.0,7.8,5258,38,15,2016,492,2630,5.4,29,11,0
2,2,Cardinals,646,383,59.3,6.8,4425,28,17,2016,482,1872,3.9,19,8,0
3,3,Jaguars,626,369,59.0,6.3,3925,24,16,2016,476,2187,4.6,16,10,0
4,4,Packers,620,403,65.0,7.2,4445,40,8,2016,458,2019,4.4,15,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,187,Browns,520,320,61.5,7.0,3619,21,14,2021,411,1583,3.8,10,1,0
188,188,49ers,514,343,66.7,8.6,4437,26,14,2021,393,1443,3.7,11,10,1
189,189,Saints,504,293,58.1,6.8,3437,29,13,2021,392,1755,4.5,13,6,0
190,190,Seahawks,495,324,65.4,7.7,3815,30,7,2021,385,1672,4.3,18,4,0


In [6]:
# Discard the 'Unnamed: 0' column; it is not used as a model input.
# NOTE(review): it appears to duplicate the row index written by an earlier CSV export — confirm.
df = df.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,team,pass_att,cmp,cmp_percentage,yds/att,pass_yds,pass_td,int,year,rush_att,rush_yds,ypc,rush_td,fum,playoff
0,Ravens,679,439,64.6,6.4,4343,20,16,2016,499,2396,4.8,24,9,0
1,Saints,674,472,70.0,7.8,5258,38,15,2016,492,2630,5.4,29,11,0
2,Cardinals,646,383,59.3,6.8,4425,28,17,2016,482,1872,3.9,19,8,0
3,Jaguars,626,369,59.0,6.3,3925,24,16,2016,476,2187,4.6,16,10,0
4,Packers,620,403,65.0,7.2,4445,40,8,2016,458,2019,4.4,15,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,Browns,520,320,61.5,7.0,3619,21,14,2021,411,1583,3.8,10,1,0
188,49ers,514,343,66.7,8.6,4437,26,14,2021,393,1443,3.7,11,10,1
189,Saints,504,293,58.1,6.8,3437,29,13,2021,392,1755,4.5,13,6,0
190,Seahawks,495,324,65.4,7.7,3815,30,7,2021,385,1672,4.3,18,4,0


In [7]:
# Keep the team names in their own single-column DataFrame.
team_name = df['team'].to_frame()
team_name

Unnamed: 0,team
0,Ravens
1,Saints
2,Cardinals
3,Jaguars
4,Packers
...,...
187,Browns
188,49ers
189,Saints
190,Seahawks


In [8]:
# Remove the 'team' text column; it is not used as an input to the classifier.
df = df.drop(columns=['team'])
df

Unnamed: 0,pass_att,cmp,cmp_percentage,yds/att,pass_yds,pass_td,int,year,rush_att,rush_yds,ypc,rush_td,fum,playoff
0,679,439,64.6,6.4,4343,20,16,2016,499,2396,4.8,24,9,0
1,674,472,70.0,7.8,5258,38,15,2016,492,2630,5.4,29,11,0
2,646,383,59.3,6.8,4425,28,17,2016,482,1872,3.9,19,8,0
3,626,369,59.0,6.3,3925,24,16,2016,476,2187,4.6,16,10,0
4,620,403,65.0,7.2,4445,40,8,2016,458,2019,4.4,15,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,520,320,61.5,7.0,3619,21,14,2021,411,1583,3.8,10,1,0
188,514,343,66.7,8.6,4437,26,14,2021,393,1443,3.7,11,10,1
189,504,293,58.1,6.8,3437,29,13,2021,392,1755,4.5,13,6,0
190,495,324,65.4,7.7,3815,30,7,2021,385,1672,4.3,18,4,0


In [9]:
# Feature matrix: every column of df except the 'playoff' label.
# drop() returns a new DataFrame, so df itself is left untouched.
X = df.drop(columns="playoff")
X.head()

Unnamed: 0,pass_att,cmp,cmp_percentage,yds/att,pass_yds,pass_td,int,year,rush_att,rush_yds,ypc,rush_td,fum
0,679,439,64.6,6.4,4343,20,16,2016,499,2396,4.8,24,9
1,674,472,70.0,7.8,5258,38,15,2016,492,2630,5.4,29,11
2,646,383,59.3,6.8,4425,28,17,2016,482,1872,3.9,19,8
3,626,369,59.0,6.3,3925,24,16,2016,476,2187,4.6,16,10
4,620,403,65.0,7.2,4445,40,8,2016,458,2019,4.4,15,8


In [11]:
# Target vector: the 'playoff' column as a NumPy array.
y = df["playoff"].to_numpy()
y

array([0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1], dtype=int64)

In [15]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split with a fixed seed; 0.25 is the library's default test
# fraction, written out here so the split size is visible to the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=1
)

# Class counts of the training labels, then of the test labels (one per line).
print(Counter(y_train), Counter(y_test), sep="\n")

Counter({0: 87, 1: 57})
Counter({0: 29, 1: 19})


In [17]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Fit the scaler on the training partition only, so no test-set statistics
# are used when computing the scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transform to the training and the test features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [19]:
# Train the baseline decision tree classifier on the scaled training features.
from sklearn import tree

# Pin the seed: the tree shuffles candidate features while searching for splits,
# so without random_state the fitted tree (and every metric computed from it)
# can change from one run to the next. The value matches the seed used for the
# train/test split.
model = tree.DecisionTreeClassifier(random_state=1)
# fit() returns the estimator itself, so `model` stays bound to the trained tree.
model = model.fit(X_train_scaled, y_train)

In [20]:
# Predict a playoff label for every row of the scaled test features;
# the resulting array is displayed as the cell output.
predictions = model.predict(X_test_scaled)
predictions

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0], dtype=int64)

In [21]:
# Evaluate the model: cross-tabulate true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_test, predictions)

# Wrap the raw matrix in a labelled DataFrame so the table is self-explanatory.
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,24,5
Actual 1,9,10


In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Fraction of test rows whose predicted label equals the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [24]:
# Summary of the evaluation: confusion matrix, accuracy, then the per-class report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
# Heading and report are emitted on separate lines by a single print call.
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,24,5
Actual 1,9,10


Accuracy Score : 0.7083333333333334
Classification Report
              precision    recall  f1-score   support

           0       0.73      0.83      0.77        29
           1       0.67      0.53      0.59        19

    accuracy                           0.71        48
   macro avg       0.70      0.68      0.68        48
weighted avg       0.70      0.71      0.70        48



In [None]:
# TODO: Based on the results of this mock-up decision tree model, decide whether they are adequate
# or whether a different approach is needed (random forest, 17.8.3; random under-/oversampling, 17.10.1).