# Model Performance - Cheat Sheet

## Import required libraries and packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt # typical libraries for data manipulation and visualization

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
import sklearn.metrics as metrics # sklearn packages needed to run cross-validation and evaluate model performance

from sklearn.linear_model import LogisticRegression # sklearn function for Logistic Regression

import warnings
warnings.filterwarnings("ignore") # prevents warning messages from being displayed (optional)

## Run an $n$-fold cross-validation on Logistic Regression

### Assign IVs to an x object, assign DV to a y object

In [None]:
# Feature matrix: every column of df except the row identifier and the target (DV).
x = df.drop(['Primary Key', 'DV'], axis = 1)
x = pd.get_dummies(data = x, drop_first = True) # create dummy variables for categorical IVs, drop_first = True

# Target: dummy-code the DV, then collapse the single remaining column to a
# 1-D Series. scikit-learn estimators and metrics expect a 1-D y; passing an
# (n, 1) DataFrame triggers a DataConversionWarning on every fit.
# squeeze(axis = 1) only collapses when exactly one dummy column is left
# (i.e. a binary DV); with more classes y is returned unchanged.
y = df['DV']
y = pd.get_dummies(data = y, drop_first = True).squeeze(axis = 1) # create dummy variables for DV, drop_first = True

### Create a train-test split (90-10)

In [None]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, test_size=0.1, random_state=100)

### Set parameters of n-fold cross validation (you set n, common to do 5 or 10-fold)

In [None]:
n = 10 # number of folds; replace with 5, 10, or other value

# Rows are shuffled before being assigned to folds; random_state fixes the shuffle.
cv = KFold(n_splits = n, random_state = 1, shuffle = True)

### Specify model and scoring metrics

In [None]:
# Estimator to be cross-validated (library default settings).
model = LogisticRegression()

# Short label -> scikit-learn scorer name. Each label comes back from
# cross_validate prefixed with "test_" (e.g. "test_acc").
scoring = dict(
    acc='accuracy',
    f1='f1',
    precision='precision',
    recall='recall',
    roc_auc='roc_auc',
    r2='r2',
)

### Perform n-fold cross-validated logistic regression on training set

In [None]:
# Fit and score the model once per fold of the training data; training-fold
# scores are not collected.
scores = cross_validate(
    estimator=model,
    X=x_train,
    y=y_train,
    scoring=scoring,
    cv=cv,
    return_train_score=False,
)

## Generate score metrics to evaluate cross-validated model performance on training set

In [None]:
scores = pd.DataFrame(scores, columns = scores.keys()) # one row per fold, one column per timing/metric
print(scores.mean()) # fold-averaged accuracy, f1, precision, recall, roc_auc, and r^2

y_pred = cross_val_predict(model, x_train, y_train, cv = cv) # create a y_pred object
print(metrics.cohen_kappa_score(y_train, y_pred)) # calculate cohen's Kappa

# Lift = model precision / baseline accuracy.
model_precision = scores['test_precision'].mean()
# NOTE(review): the baseline is taken here as the share of the most frequent
# class in y_train (accuracy of always predicting that class). Substitute your
# own value if your baseline is defined differently.
baseline_accuracy = pd.Series(np.ravel(y_train)).value_counts(normalize = True).max()
Lift = model_precision / baseline_accuracy
print(Lift)

## Produce a confusion matrix to display the actual y class against the predicted y class

In [None]:
y_pred = cross_val_predict(model, x_train, y_train, cv = cv) # if you haven't already done so, create y_pred object
conf = confusion_matrix(y_train, y_pred) # generate confusion matrix on y_train against y_pred

# The figure size must be configured BEFORE the heatmap is drawn; setting it
# afterwards only affects figures created later.
sns.set(rc={'figure.figsize':(12,10)})
sns.heatmap(conf, annot=True, fmt='g') # display confusion matrix as a heatmap
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')
plt.show()

## Compare models on performance metrics

In [None]:
# Fold-averaged results for each candidate model.
# NOTE(review): scores2 is not created anywhere in this notebook; presumably it
# is the cross_validate result (as a DataFrame) of a second model - define it
# before running this cell.
model_1 = scores.mean()
model_2 = scores2.mean()

models = (model_1, model_2)

# Columns to show, in display order.
metric_columns = [
    "fit_time",
    "score_time",
    "test_acc",
    "test_f1",
    "test_precision",
    "test_recall",
    "test_roc_auc",
    "test_r2",
]

# One row per model, one column per metric.
model_compare = pd.DataFrame(
    data=models,
    index=["model_1", "model_2"],
    columns=metric_columns,
)
model_compare

## Fit the best-performing model and evaluate it on the test set

In [None]:
# Train on the training set (use the version of x_train that created the best
# model), then predict the held-out test rows.
predictions = model.fit(x_train, y_train).predict(x_test)

# Per-class precision, recall, f1 and support on the test set.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

# Run an $n$-fold cross-validation on Decision Tree Classifier

### Assign IVs to x object and DV to y object

In [None]:
# Feature matrix: every column of df except the row identifier and the target (DV).
# Column labels are case-sensitive; 'Primary Key' matches the spelling used for
# the same df in the Logistic Regression section.
x = df.drop(['Primary Key', 'DV'], axis=1)
x = pd.get_dummies(data = x, drop_first = False) # set drop_first = False

# The DV is used as-is (no dummy coding) as the target.
y = df['DV']

### Create training and test-set split (80-20)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, test_size=0.20, random_state=100)

### Set parameters of n-fold cross validation (you set n, common to do 5 or 10-fold)

In [None]:
# 10 folds; rows are shuffled with a fixed seed before fold assignment.
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

### Specify model parameters

In [None]:
from sklearn.tree import DecisionTreeClassifier # not imported in the setup cell, so bring it into scope here

# Gini-impurity tree, at most 5 levels deep, with at least 8 training rows in
# every leaf; random_state makes tie-breaking reproducible.
model = DecisionTreeClassifier(criterion = "gini", random_state=100, max_depth=5, min_samples_leaf=8) # change out values

### Perform n-fold cross-validated Decision Tree classification prediction on training set

In [None]:
# One prediction per training row, produced via cross-validation over the folds in cv.
y_pred = cross_val_predict(estimator=model, X=x_train, y=y_train, cv=cv)

# Attach predicted and actual classes to x_train as extra columns.
# These two columns are NOT features; drop them again before refitting the model.
x_train['predicted_class'] = y_pred
x_train['actual_class'] = y_train

## Produce cross-validated scores to evaluate model performance

In [None]:
# classification_report expects (y_true, y_pred) in that order; swapping them
# exchanges precision with recall and reports support for the predictions
# instead of the actual classes.
print(classification_report(y_train, y_pred)) # produces overall accuracy, precision, recall and f1

metrics.cohen_kappa_score(y_train, y_pred) # produces cohen's kappa

### Generate confusion matrix to compare actual vs predicted classes 

In [None]:
conf = pd.DataFrame(x_train, columns=['actual_class','predicted_class'])
# Stored as conf_matrix rather than confusion_matrix so the sklearn function
# of that name imported at the top of the notebook is not overwritten.
conf_matrix = pd.crosstab(conf['actual_class'], conf['predicted_class'], rownames=['Actual'], colnames=['Predicted'])

# The figure size must be configured BEFORE the heatmap is drawn; setting it
# afterwards only affects figures created later.
sns.set(rc={'figure.figsize':(12,10)})
sns.heatmap(conf_matrix, annot=True, fmt='g')
plt.show()

## Fit the best-performing model and evaluate it on the test set

In [None]:
# Remove the helper columns added for the confusion matrix so that only real
# features are used for fitting. errors='ignore' lets this cell be re-run
# after the columns have already been dropped.
x_train = x_train.drop(['actual_class', 'predicted_class'], axis=1, errors='ignore') # drop added actual and predicted classes columns

model.fit(x_train, y_train)

predictions = model.predict(x_test) # predicted class for each held-out test row

In [None]:
# Per-class precision, recall, f1 and support on the test set.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)