In [35]:
from sklearn.model_selection import (cross_val_score, validation_curve)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import pandas as pd
import numpy as np
np.random.seed(2023 - 5 - 24)

# ___Cross Validation___
---------

In [1]:
# When applying ML models, we've followed a consistent series of steps;.
# partition the data into train and test sets.
# fit the model on the training set to estimate the model parameters.
# then make the predictions on the training set.

In [2]:
# We did the partitioning as a way to estimate how well the model performs on unknown datasets. (generalize to)
# the test set set represents a set of novel data pooints the model hasn't seen yet, but come from the same distribution (feature sets)

In [5]:
# Cross-validation goes one step beyond to evaluate the performance of a select set of models.
# using multiple train test splits.
# In train test splits using one ML model, the learned parameters and consequently the model accuracy may vary by chance depending on the 
# data points that got into the train category and the test category, during the splitting.

In [6]:
# Cross validation does a more rigorous testing by creating and running multiple train test splits,
# training models on them and measuring the accuracy score to compute the average accuracy score.

## ___Cross Validation in Practice___
----------------

In [7]:
# Cross validation gives a reliably stable estimate of accuracy given a model, that can be compared to the performance of another model.

In [8]:
# Let's say that we have a classification task at hand, and we have a SVM model and a Naive Bayes model.
# In this scenario, we cannot rely on the accuracy scores computed via a single train test split.
# Because the composition of the split has a high influence in the accuracy score.

In [9]:
# Here, cross validation can rescue us.
# cross validation will do a number of train test splits on the given dataset, train the model on the sets of train sets and measure 
# the accuracy scores using the test sets.
# and finally compute the average of the accuracy score.

In [10]:
# Cross validation is NOT used to produce models.
# e.g. a 10 fold cross validation will make 10 different train test splits
# train 10 different models
# make 10 different predictions
# measure 10 different accuracy scores.
# and get the average accuracy score.

## ___Cross-validation in Model Evaluation & Model Tuning___
-----------------

In [11]:
# If our task is to compare the peroformances of two different models that were individually trained and tuned,
# we'd use k-fold cross-validation.
# k refers to the number of train-test splits.

In [12]:
# If we are to tune a signle model, in order to find the best parameters, we'd need a slightly different approach.
# Instead of a train-test split we must use a train-validate-test split.
# i.e our data gets partitioned into 3 parts.

## ___Mechanics Behind k-fold Cross Validation___
_______________

In [15]:
# Most common type of cross-validation is the k-fold cross validation.
# In a 5 fold cross validation;
# Say that we pass in a dataset with 50,000 rows.

# It would get split into 5 equal portions => A, B, C, D & E each with 10,000 rows.
# Now we need to train and evaluate 5 models.

# First model will take in B to E as training set (40,000 rows) and use A as the test set.
# Second model will use A, C to E as training set and use B as the test set.
# Third model will use A, B, D to E as training set and use C as the test set.
# so on and so forth.

# Finally we'd end up with 5 different accuracy scores.
# From which we can compute the average!.

In [2]:
fruits = pd.read_csv(r"D:/Applied-Machine-Learning-in-Python/resources/assets/fruit_data_with_colors.txt", delimiter = "\t")

In [3]:
fruits.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [4]:
fruits_x = fruits.loc[:, ["width", "height"]]
fruits_y = fruits.fruit_label

In [5]:
knnClassifier = KNeighborsClassifier(n_neighbors = 5)

In [6]:
cross_val_score(estimator = knnClassifier, X = fruits_x, y = fruits_y, cv = 5)

array([0.75      , 0.75      , 0.83333333, 0.83333333, 0.81818182])

In [7]:
# cv argument defines the k number of folds.

np.mean(cross_val_score(estimator = knnClassifier, X = fruits_x, y = fruits_y, cv = 5))

0.7969696969696971

In [8]:
x_0 = np.random.randint(1, 100, size = 100000)
x_1 = np.random.randint(100, 1000, size = 100000)

In [9]:
features = np.stack([x_0, x_1], axis = 1)

def classify(f1: np.array, f2: np.array) -> np.array:
    res = (np.power(f1, 1.5672) + np.sqrt(f2) + 100).astype(np.int64)
    labels = []
    for r in res:
        if r > 1000:
            labels.append(10)
        elif r > 900:
            labels.append(9)
        elif r > 800:
            labels.append(8)
        elif r > 700:
            labels.append(7)
        elif r > 600:
            labels.append(6)
        elif r > 500:
            labels.append(5)
        elif r > 400:
            labels.append(4)
        elif r > 300:
            labels.append(3)
        elif r > 200:
            labels.append(3)
        elif r > 100:
            labels.append(1)
        else:
            labels.append(0)
    return np.array(labels)

targ = classify(x_0, x_1)

In [10]:
features.shape, targ.shape

((100000, 2), (100000,))

In [11]:
np.unique(targ, return_counts = True)

(array([ 1,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([15820, 20297,  7848,  7212,  6611,  6356,  5910,  5693, 24253],
       dtype=int64))

In [12]:
knnClassifier = KNeighborsClassifier(n_neighbors = 10)

In [14]:
cvscores = cross_val_score(estimator = knnClassifier, X = features, y = targ, cv = 100)

In [16]:
cvscores

array([0.977, 0.977, 0.985, 0.984, 0.981, 0.981, 0.975, 0.981, 0.983,
       0.975, 0.986, 0.977, 0.981, 0.978, 0.98 , 0.978, 0.979, 0.986,
       0.983, 0.993, 0.984, 0.975, 0.992, 0.984, 0.981, 0.978, 0.981,
       0.977, 0.978, 0.977, 0.982, 0.987, 0.974, 0.984, 0.974, 0.98 ,
       0.971, 0.98 , 0.976, 0.983, 0.972, 0.984, 0.982, 0.98 , 0.986,
       0.981, 0.975, 0.985, 0.982, 0.987, 0.98 , 0.974, 0.976, 0.984,
       0.975, 0.982, 0.983, 0.979, 0.976, 0.986, 0.982, 0.971, 0.974,
       0.98 , 0.972, 0.973, 0.981, 0.985, 0.977, 0.978, 0.98 , 0.983,
       0.989, 0.981, 0.982, 0.973, 0.97 , 0.982, 0.982, 0.986, 0.982,
       0.977, 0.975, 0.984, 0.985, 0.983, 0.981, 0.986, 0.979, 0.977,
       0.976, 0.979, 0.985, 0.982, 0.986, 0.98 , 0.985, 0.989, 0.977,
       0.976])

In [17]:
# the mean cross validation score

cvscores.mean()

0.98032

In [18]:
# In addition to improved estimations of model accuracy, cross validation tells us about how sensitive the model is to the composition of the 
# train dataset.
# Using this information, we may be able to predict how this model may perform on a specifically skewed dataset.
# We can essentially do a best-case worst-case performance estimations.

In [19]:
# k fold cross validations are computationally expensive.
# if we do not execute these validations in parallel, we'll need k times more time for the validation.

## ___Stratified Cross Validation___
-----------------

In [20]:
# In generic cross-validation the data gets split into k folds linearly (or sequentially)
# if the dataset has 20,000 rows and we ask for a 4 fold cross validation,
# first fold => first 5,000 rows.
# second fold => next 5,000 rows ..etc..

# However, if the data is sorted based on certain feature/s, this type of splitting may introduce bias.
# certain folds may disproportionately represent certain class labels.
# if these get included in the training set => the model may estimate skewed parameters

In [21]:
# sklearn defaults to stratified k fold cross-validation
# Stratified classification makes sure the composition or distribution of classes in the folds refect the composition of classes in the 
# entire dataset.

In [22]:
# For regression, sklearn uses regular k fold cross-validation 
# since the idea of preserving the composition of original data classes is irrelevant to most regression problems.

## ___Leave-One-Out Cross Validation___
------------

In [23]:
# This is a variant of k fold cross-validation where k = number of data entries in the dataset.
# i.e each fold will have one data point in the test set and the rest will be in the training set.

# e.g. if we had a dataset with 6,000 rows =>
# there will be 6,000 folds => each of which will have 1 data record as test set and the remaining 5,999 records in the training set.

In [24]:
# This is even more computationally expensive
# But for smaller datasets, this can provide fool-proof estimates, since this provides the maximum possible amount of training data to the 
# model.

## ___Validation Curves___
-------------

In [26]:
# Sometimes we may want to evaluate the influence of a parameter on the model performance.
# validation_curve() does a k fold cross validation, unlike cross_val_score() we can pass in a classifier, a series of parameters that
# we want the function to sweep across and an array of parameter values for the model to be trained with.

In [27]:
# This returns two two-dimensional arrays corresponding to evaluation on the training set and the test set.
# Each array has one row of parameter values per sweep and the number of columns are the number of cross-validation folds used.

In [31]:
parameter_range = np.logspace(-3, 3, num = 10)

In [32]:
parameter_range

array([1.00000000e-03, 4.64158883e-03, 2.15443469e-02, 1.00000000e-01,
       4.64158883e-01, 2.15443469e+00, 1.00000000e+01, 4.64158883e+01,
       2.15443469e+02, 1.00000000e+03])

In [42]:
validation_curve(estimator = SVC(), X = np.stack([x_0, x_1], axis = 1), y = features,
                 param_name = "gamma", param_range = parameter_range, cv = 10)

(array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]]),
 array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan,