# Manual Thresholding

## Import Libraries

In [66]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load the data and split

In [3]:
# Fetch version 1 of the "mnist_784" dataset from OpenML, asking for
# plain arrays instead of a pandas DataFrame.
mnist = fetch_openml(name="mnist_784", version=1, as_frame=False)

  warn(


In [5]:
# Pull the feature matrix and the label vector out of the fetched bunch,
# then show their shapes as the cell output.
X, y = mnist['data'], mnist['target']

X.shape, y.shape

((70000, 784), (70000,))

In [7]:
# Cast the labels to integers so the `y == 5` comparison below is numeric.
# NOTE(review): presumably the labels are loaded as strings - confirm.
y = y.astype(int)

In [10]:
# Binary target for the "is this digit a 5?" task: True where the label is 5.
y_5 = y == 5
y_5

array([ True, False, False, ..., False,  True, False])

In [11]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_5,
    test_size=0.2,
    random_state=42,
)

X_train, X_test, y_train, y_test

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([ True, False, False, ..., False, False, False]),
 array([False, False, False, ..., False, False, False]))

## Train model

In [13]:
# Build the classifier with a fixed seed, then fit it on the training split.
# The fit call is the last expression, so the fitted estimator is the cell output.
sgd_clf = SGDClassifier(random_state=42)

sgd_clf.fit(X=X_train, y=y_train)

In [15]:
# Raw decision-function scores for the test split; the sections below turn
# these into predictions by comparing them against a hand-picked threshold.
y_scores = sgd_clf.decision_function(X=X_test)
y_scores

array([-10215.90199706, -12114.60572328,   2753.71789324, ...,
       -12489.03564802, -10408.17761009,  -9586.61032998])

## Apply different thresholds

### Threshold = 0 (default behavior)

In [17]:
# Predict positive whenever the score is at least 0, then tabulate the
# outcome against the true test labels.
y_pred_0 = y_scores >= 0
confusion_matrix(y_true=y_test, y_pred=y_pred_0)

array([[12620,   107],
       [  334,   939]], dtype=int64)

### Threshold = +2000 (very strict)

In [19]:
# Raise the bar: only scores of at least 2000 count as a positive prediction.
y_pred_high = y_scores >= 2000
confusion_matrix(y_true=y_test, y_pred=y_pred_high)

array([[12697,    30],
       [  569,   704]], dtype=int64)

### Threshold = -2000 (very lenient)

In [20]:
# Lower the bar: any score of at least -2000 counts as a positive prediction.
y_pred_low = y_scores >= -2000
confusion_matrix(y_true=y_test, y_pred=y_pred_low)

array([[12387,   340],
       [  177,  1096]], dtype=int64)

## Prove “no retraining” in code

In [21]:
# Score the test split a second time with the same fitted model and check
# that every score matches the first pass element for element.
y_scores_again = sgd_clf.decision_function(X=X_test)
(y_scores == y_scores_again).all()

True

## Precision, Recall, F1

In [35]:
def cal_scores(y_true, y_pred):
    """Compute, print and return precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, forwarded unchanged to the scikit-learn metrics.
    y_pred : array-like
        Predicted labels, forwarded unchanged to the scikit-learn metrics.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``, in that order.
    """
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print("Precision is :", precision)
    print("Recall is :", recall)
    # Label the F1 value as F1 so it cannot be mistaken for a second recall line.
    print("F1 is :", f1)

    return precision, recall, f1

## Get scores for different preds

In [36]:
# Metrics for the predictions made with threshold 0.
precision_0, recall_0, f1_0 = cal_scores(y_true=y_test, y_pred=y_pred_0)

Precsion is : 0.8977055449330784
Recall is : 0.7376276512175962
Recall is : 0.8098318240620959


In [38]:
# Metrics for the predictions made with the +2000 threshold.
precision_high, recall_high, f1_high = cal_scores(y_true=y_test, y_pred=y_pred_high)

Precsion is : 0.9591280653950953
Recall is : 0.5530243519245875
Recall is : 0.7015445939212755


In [39]:
# Metrics for the predictions made with the -2000 threshold.
precision_low, recall_low, f1_low = cal_scores(y_true=y_test, y_pred=y_pred_low)

Precsion is : 0.7632311977715878
Recall is : 0.8609583660644148
Recall is : 0.8091546696197859


## Create a dataframe

In [61]:
# One entry per threshold setting; each value is [precision, recall, f1].
scores = {
    0: [precision_0, recall_0, f1_0],
    'high': [precision_high, recall_high, f1_high],
    'low': [precision_low, recall_low, f1_low],
}

# Dict keys become the row labels and the three metrics become the columns.
scores_df = pd.DataFrame(
    list(scores.values()),
    index=list(scores.keys()),
    columns=['precison', 'recall', 'f1'],
)

In [64]:
# Name the row index so the table and the plot axis are labelled "Threshold".
scores_df.index.name = 'Threshold'

In [65]:
# Display the precision / recall / F1 table, one row per threshold setting.
scores_df

Unnamed: 0_level_0,precison,recall,f1
Threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.897706,0.737628,0.809832
high,0.959128,0.553024,0.701545
low,0.763231,0.860958,0.809155


# Grouped bar chart: one group per threshold, one bar per metric.
# The trailing semicolon suppresses the Axes repr in the cell output.
scores_df.plot(
    kind='bar',
    figsize=(10, 8),
    title="Precision, Recall, F1 for various Threshold",
);