### CLASSIFICATION

In [None]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the "mnist_784" dataset from OpenML.
mnist = fetch_openml(name='mnist_784', version=1)
# Show which fields the returned object exposes.
mnist.keys()

In [None]:
# Print the description text bundled with the dataset.
print(mnist['DESCR'])

In [None]:
# Extract features and labels, converting the pandas objects to NumPy arrays.
x = mnist['data'].to_numpy()
y = mnist['target'].to_numpy()
# Report both shapes, one per line.
print(x.shape, y.shape, sep="\n")

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# Convert the labels to 32-bit integers so they can be compared with ints later.
y = y.astype(np.int32)

# Render the sample at index 2 as a 28x28 image, then print its label.
some_digit = x[2]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap="binary")
plt.show()
print(y[2])


In [None]:
# Split into training and test sets: the first 60,000 samples train, the rest test.
x_train, x_test = x[:60000], x[60000:]
y_train, y_test = y[:60000], y[60000:]

### Stochastic Gradient Descent


In [None]:
# Binary classifier for the digit 5
from sklearn.linear_model import SGDClassifier
import numpy as np

# Boolean targets: True where the label is 5, False otherwise.
y_train_5 = np.equal(y_train, 5)
y_test_5 = np.equal(y_test, 5)

# Stochastic gradient descent classifier; the fixed random_state keeps runs repeatable.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(x_train, y_train_5)


In [None]:
# Ask the classifier whether the first sample is a 5.
test_digit = x[0]
sgd_clf.predict(test_digit.reshape(1, -1))

In [None]:
# Measuring accuracy
# Accuracy can be misleading on an imbalanced dataset: a classifier can score
# highly just by always predicting the most frequent class.
from sklearn.model_selection import cross_val_score

# Evaluate the model with 3-fold cross-validation.
cross_val_score(estimator=sgd_clf, X=x_train, y=y_train_5, cv=3)

In [None]:
# Confusion matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Out-of-fold predictions for every training instance (3-fold cross-validation).
y_train_pred = cross_val_predict(estimator=sgd_clf, X=x_train, y=y_train_5, cv=3)

# Build and print the confusion matrix.
cm = confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)
print(cm)

# Precision, recall and F1 score (F1 is the harmonic mean of precision and recall).
precision = precision_score(y_true=y_train_5, y_pred=y_train_pred)
recall = recall_score(y_true=y_train_5, y_pred=y_train_pred)
f1 = f1_score(y_true=y_train_5, y_pred=y_train_pred)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
# Precision/recall trade-off: pushing precision up generally pulls recall down,
# and vice versa.

In [None]:
# Decision function score
# The score reflects how confident the model is in its prediction. The model
# predicts True when the score is above the threshold and False when it is below.
test_digit = x[0]
y_score = sgd_clf.decision_function(test_digit.reshape(1, -1))
# Print the score and the sample's true label, one per line.
print(y_score, y[0], sep="\n")

In [None]:
# The model returns True when the decision score exceeds the threshold and
# False otherwise.
threshold = 0  # 0 is the default threshold
y_pred = y_score > threshold
print(y_pred)

# Changing the threshold: raising it generally increases precision and
# decreases recall.
threshold = 5000
y_pred = y_score > threshold
print(y_pred)

In [None]:
# Determining the best threshold
# The precision-recall curve shows which threshold achieves a given precision
# or recall.
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt

# Out-of-fold decision scores for the training set (3-fold cross-validation).
y_scores = cross_val_predict(
    estimator=sgd_clf,
    X=x_train,
    y=y_train_5,
    cv=3,
    method="decision_function",
)
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

# plotting the precision-recall curve
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped before
    plotting so that both series line up with ``thresholds``. The curves are
    drawn on the current matplotlib figure; nothing is returned and the
    figure is not shown.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=name, linewidth=2)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
# Precision is bumpier than recall: recall can only fall as the threshold
# rises, whereas precision can occasionally dip as well as climb.
plot_precision_recall_vs_threshold(
    precisions=precisions,
    recalls=recalls,
    thresholds=thresholds,
)
plt.show()

In [None]:
# Finding the threshold that maximises the F1 score
# precisions and recalls each have one more entry than thresholds, so the last
# entry of each is dropped before pairing them with thresholds.
print(len(precisions), len(recalls), len(thresholds))
pr_sum = precisions[:-1] + recalls[:-1]
# Where precision + recall is 0 the F1 ratio is 0/0 (NaN), and np.argmax would
# return that NaN's index as the "best" one; define F1 as 0 there instead.
f1_scores = np.divide(
    2 * precisions[:-1] * recalls[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
best_f1_value = np.argmax(f1_scores)  # index of the best F1 score (not the score itself)
optimal_threshold = thresholds[best_f1_value]
print("Best F1 Score:", f1_scores[best_f1_value])
print(f"optimal threshold : {optimal_threshold}")

In [None]:
# Threshold for 90% precision
# precisions has one more entry than thresholds, so only precisions[:-1] have a
# matching threshold; searching the full array could yield an index that is
# out of range for thresholds.
reaches_target = precisions[:-1] >= 0.9
if not reaches_target.any():
    raise ValueError("no threshold reaches 90% precision")
best_precision = np.argmax(reaches_target)  # index of the first precision >= 0.9
new_threshold = thresholds[best_precision]
print(f"threshold for 90% precision: {new_threshold}")

In [None]:
# ROC curve
from sklearn.metrics import roc_curve, roc_auc_score

# The ROC curve plots the true positive rate (TPR) against the false positive
# rate (FPR).
fpr, tpr, thresholdv = roc_curve(y_train_5, y_scores)

# AUC (area under the curve) summarises the classifier's performance in a
# single number. It is computed before plotting so the legend can show it.
auc = roc_auc_score(y_train_5, y_scores)
print(f"AUC: {auc:.2f}")

# Plot the ROC curve together with the diagonal of a random classifier.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, linewidth=2, label=f'ROC Curve (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Multiclass classification
from sklearn.svm import SVC
from random import Random, randint

# SVC is a binary classifier, but it can handle multiclass problems through a
# one-vs-one or one-vs-all strategy; it uses one-vs-one by default.
svm_clf = SVC()
svm_clf.fit(x_train, y_train)

# Pick a training sample with a locally seeded generator so the cell prints the
# same result on every run, without touching the global `random` state.
random_num = Random(42).randint(0, len(x_train) - 1)
test_digit = x_train[random_num]
print(svm_clf.predict([test_digit]))
print(y_train[random_num])  # true label of the sampled digit

[1]
1


In [20]:
# Decision function scores of the SVM for the sampled digit.
# NOTE: this rebinds `y_scores`, which earlier cells use for the SGD
# cross-validated scores; re-run the cross_val_predict cell before re-running
# the precision/recall or ROC cells.
y_scores = svm_clf.decision_function(test_digit.reshape(1, -1))
print(y_scores)

[[ 4.76247599  9.30488756  5.84432361  7.24918342  0.73788086 -0.29284998
   3.78547267  1.72988609  8.30228948  2.74859509]]
