# Credit Risk Resampling Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Read the CSV and Perform Basic Data Cleaning

In [18]:
columns = [
    "YEAR","GAME CODE","TEAM CODE","RESULT","OFFENSE POINTS","DEFENSE POINTS",
    "RUSH","OFFENSE RUSH YARDS","PASS","INC PASS","OFFENSE PASS YARDS","DEFENSE RUSH YARDS","DEFENSE PASS YARDS",
    "PENALTY","PENALTY YARDS"
]

target = ["RESULT"]

In [16]:
# Load the data
from google.colab import files
uploaded = files.upload()

file_name='halfStats.csv'
df = pd.read_csv(file_name, skiprows=0)[:-2]

Saving halfStats.csv to halfStats (3).csv


In [23]:
df2=df.loc[:, columns].copy()
print(len(df2))
df2.head()

9586


Unnamed: 0,YEAR,GAME CODE,TEAM CODE,RESULT,OFFENSE POINTS,DEFENSE POINTS,RUSH,OFFENSE RUSH YARDS,PASS,INC PASS,OFFENSE PASS YARDS,DEFENSE RUSH YARDS,DEFENSE PASS YARDS,PENALTY,PENALTY YARDS
0,2008,500004720080828,47,1,28,7,11,105,13,2,180,72,108,4,28
1,2008,500004720080828,500,0,7,28,21,72,23,12,108,105,180,2,20
2,2008,749005120080828,749,1,20,6,22,63,24,6,138,75,35,1,5
3,2008,749005120080828,51,0,6,20,18,75,16,9,35,63,138,0,0
4,2008,704008620080828,86,1,21,14,18,102,12,5,142,88,113,0,0


In [27]:
# Drop the null columns where all values are null
df2 = df2.dropna(axis='columns', how='all')

# Drop the null rows
df2 = df2.dropna()

df2.reset_index(inplace=True, drop=True)

print(len(df2))
df2.head()

9586


Unnamed: 0,YEAR,GAME CODE,TEAM CODE,RESULT,OFFENSE POINTS,DEFENSE POINTS,RUSH,OFFENSE RUSH YARDS,PASS,INC PASS,OFFENSE PASS YARDS,DEFENSE RUSH YARDS,DEFENSE PASS YARDS,PENALTY,PENALTY YARDS
0,2008,500004720080828,47,1,28,7,11,105,13,2,180,72,108,4,28
1,2008,500004720080828,500,0,7,28,21,72,23,12,108,105,180,2,20
2,2008,749005120080828,749,1,20,6,22,63,24,6,138,75,35,1,5
3,2008,749005120080828,51,0,6,20,18,75,16,9,35,63,138,0,0
4,2008,704008620080828,86,1,21,14,18,102,12,5,142,88,113,0,0


#Filter Data

In [158]:
# Throw out "blowout" games
blowoutThresh=13

df_filter=df2.copy()
df_filter=df_filter[abs(df_filter['OFFENSE POINTS'] - df_filter['DEFENSE POINTS']) <= blowoutThresh]

df_filter.reset_index(inplace=True, drop=True)
print(len(df_filter))
df_filter.head(20)

5736


Unnamed: 0,YEAR,GAME CODE,TEAM CODE,RESULT,OFFENSE POINTS,DEFENSE POINTS,RUSH,OFFENSE RUSH YARDS,PASS,INC PASS,OFFENSE PASS YARDS,DEFENSE RUSH YARDS,DEFENSE PASS YARDS,PENALTY,PENALTY YARDS
0,2008,704008620080828,86,1,21,14,18,102,12,5,142,88,113,0,0
1,2008,704008620080828,704,0,14,21,18,88,21,6,113,102,142,0,0
2,2008,490064820080828,490,0,0,3,18,146,9,5,-8,23,70,3,15
3,2008,490064820080828,648,1,3,0,19,23,17,5,70,146,-8,1,5
4,2008,528067420080828,528,0,17,17,12,48,18,8,166,122,71,3,30
5,2008,528067420080828,674,1,17,17,23,122,11,3,71,48,166,4,33
6,2008,719000920080830,719,1,21,19,15,-2,11,1,186,176,106,2,15
7,2008,719000920080830,9,0,19,21,24,176,16,5,106,-2,186,1,14
8,2008,771003120080830,771,0,10,7,22,98,12,7,-36,14,130,0,0
9,2008,771003120080830,31,1,7,10,12,14,18,7,130,98,-36,4,20


In [161]:
df_filter['RESULT'].value_counts()

1    2868
0    2868
Name: RESULT, dtype: int64

# Split the Data into Training and Testing and Scale

In [163]:
# Create our features
X = df_filter.copy()
X=X.drop({"YEAR","GAME CODE","TEAM CODE","RESULT"},axis=1)

# Need to convert columns with strings to numeric
# Get column names that are non-numeric
column_names=X.select_dtypes(exclude=[np.number]).columns
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for columnName in column_names:
  X[columnName]=le.fit_transform(X[columnName])

# Create our target
y = df_filter['RESULT'].copy()

X.head()

Unnamed: 0,OFFENSE POINTS,DEFENSE POINTS,RUSH,OFFENSE RUSH YARDS,PASS,INC PASS,OFFENSE PASS YARDS,DEFENSE RUSH YARDS,DEFENSE PASS YARDS,PENALTY,PENALTY YARDS
0,21,14,18,102,12,5,142,88,113,0,0
1,14,21,18,88,21,6,113,102,142,0,0
2,0,3,18,146,9,5,-8,23,70,3,15
3,3,0,19,23,17,5,70,146,-8,1,5
4,17,17,12,48,18,8,166,122,71,3,30


In [164]:
# Check the balance of our target values
y.value_counts()

1    2868
0    2868
Name: RESULT, dtype: int64

# Sampling Methods and Scale


In [165]:
# Right now, we don;t have "clusters" other than wins and losses - which are exactly equal
# Later, we will have games listed by conference, so we will probably like a stratified sample
from sklearn.model_selection import train_test_split

stratify_values=df_filter['RESULT']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=stratify_values, 
                                                    test_size=0.5)

# Scale the Data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Models

## Import Dependencies

In [125]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


## Logistic Regression

In [166]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=42)

result=classifier.fit(X_train_scaled, y_train)

In [167]:
ml_df=pd.DataFrame(columns=X.columns.tolist())

log_coefs=list(zip(X.columns.tolist(),
                   result.coef_.tolist()[0]))
log_coefs.sort(key = lambda x:abs(x[1]),reverse=True)
print(log_coefs)

[('OFFENSE POINTS', 1.1450736295827006), ('DEFENSE POINTS', -1.0702001770590446), ('PENALTY YARDS', -0.23475953282754194), ('PENALTY', 0.156574691239487), ('DEFENSE PASS YARDS', -0.13258477302544533), ('RUSH', -0.10447013988200565), ('OFFENSE RUSH YARDS', 0.07879940849935792), ('DEFENSE RUSH YARDS', -0.06451765714620322), ('OFFENSE PASS YARDS', 0.06368826737476191), ('PASS', -0.026438762803792086), ('INC PASS', 0.024200910742067344)]


In [168]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
print(classifier.get_params())
results.head(10)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 200, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,1,1
3,0,1
4,1,1
5,1,0
6,0,1
7,0,0
8,1,0
9,1,1


In [169]:
logResAcc=accuracy_score(y_test, y_pred)
print(accuracy_score(y_test, y_pred))

# Display the confusion matrix
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Loss", "Actual Win"], columns=["Predicted Loss", "Predicted Win"])

print("")
print(cm_df)

# Print the imbalanced classification report
print("")
print(classification_report(y_test, y_pred))

0.6335425383542538

             Predicted Loss  Predicted Win
Actual Loss            1293            141
Actual Win              910            524

              precision    recall  f1-score   support

           0       0.59      0.90      0.71      1434
           1       0.79      0.37      0.50      1434

    accuracy                           0.63      2868
   macro avg       0.69      0.63      0.61      2868
weighted avg       0.69      0.63      0.61      2868

