In [45]:
'''
K-fold cross-validation works by splitting the dataset into k folds, training the model on k−1 folds, and testing it on the
remaining fold — repeated k times, so that every fold is used as the test fold exactly once.

A fresh model is trained in each of the k iterations.
Let:
  n = total number of samples (rows in the dataset)
  k = number of folds

Then in each fold:
  Test set size = n // k, plus 1 for the first (n % k) folds (since n may not divide evenly),
                  i.e. at most ceil(n / k)

  Training set size = n - test set size

Eg.
  if n = 10, and k = 6 folds

  test set size = 10 // 6 = 1, plus 1 for the first 10 % 6 = 4 folds (i.e. up to 2)
  train set size = 10 - test set size (i.e. 8 or 9)

  | Fold | Test indices | Train indices (rest of them) |
  | ---- | ------------ | ---------------------------- |
  | 1    |  [0, 1]      |  [2–9]                       |
  | 2    |  [2, 3]      |  [0–1, 4–9]                  |
  | 3    |  [4, 5]      |  [0–3, 6–9]                  |
  | 4    |  [6, 7]      |  [0–5, 8–9]                  |
  | 5    |  [8]         |  [0–7, 9]                    |
  | 6    |  [9]         |  [0–8]                       |

  Since a different fold is held out for validating the trained model in each iteration (and the model is trained on the
  remaining folds), this is called k-fold cross-validation

The importance of k-fold cross-validation is that it evaluates the accuracy of the model on each held-out fold, to check that the
model generalizes well to the test data in that fold. This strategy gives a robust estimate of model performance by averaging accuracy
(or other metrics) across all folds, thereby helping to detect overfitting and data-mismatch risks, provided the dataset is large
(since a large training set is needed for training) and contains sampled data that is representative of unseen real-world data
(used in final model testing).

NOTE: a model that scores well in cross-validation (CV) can still be overfit if the training data from the dataset is not
representative of unseen real-world data. CV can fail to detect overfitting caused by this issue, leading to poor model generalization

NOTE: Cross-validation should be done before final model training and testing.
      Why?
      Cross-validation is used during the model development phase to:
        -> Evaluate different models or configurations
        -> Select the best-performing model and hyperparameters
        -> Estimate generalization performance without touching the final test set

      You can also run cross-validation after testing, but this is not recommended unless you are doing it for retrospective
      analysis, and not for model selection or tuning.

NOTE: Cross-validation is done on the training data only, and not using the final test data

NOTE: The cross-validation done here measures the model's accuracy only; accuracy by itself does not tell you the precision of the model

'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.base import clone

def plot_digit(image_data, label_data):
  """Draw one flattened digit image and print its label.

  image_data: array holding 784 pixel values (shape (784,), i.e. one row);
              it is reshaped to 28 x 28 before being drawn.
  label_data: label of the image. It is formatted with an f-string, so a
              non-string label (e.g. an int) no longer raises TypeError.
  """
  pixels = image_data.reshape(28, 28)
  plt.imshow(pixels, cmap="binary")  # black & white
  plt.axis("off")
  print(f"The below image has label: {label_data}")


def my_cross_validation(model, X_train, Y_train_5, num_of_folds):
  """Run stratified k-fold cross-validation by hand and report the accuracy of every fold.

  model:        estimator to evaluate. It is cloned for each fold, so the object
                passed in is never fitted by this function.
  X_train:      inputs; must support indexing with an array of row indices
                (e.g. a 2D numpy array).
  Y_train_5:    labels aligned with X_train; same indexing requirement.
  num_of_folds: number of folds (k) handed to StratifiedKFold.

  Prints the train/test row indices of each fold followed by the list of
  per-fold accuracies, and returns that list (one entry per fold, in fold order).
  """
  skfolds = StratifiedKFold(num_of_folds)

  accuracies_in_each_fold = []

  # One pass over the splits: report the row indices of the fold, then train and
  # score a fresh model on exactly those indices.
  for fold_idx, (train_indices, test_indices) in enumerate(skfolds.split(X_train, Y_train_5), 1):
    print(f"Fold {fold_idx}")
    print("Train indices:", train_indices)
    print("Test indices:", test_indices)
    print("-" * 40)

    # A clone is trained from scratch in every fold, which avoids leaking
    # trained parameters between folds.
    clone_model = clone(model)

    # Training data for this fold
    X_train_folds = X_train[train_indices]
    Y_train_5_folds = Y_train_5[train_indices]

    # Test data for this fold
    X_test_fold = X_train[test_indices]
    Y_test_5_fold = Y_train_5[test_indices]

    clone_model.fit(X_train_folds, Y_train_5_folds)
    y_pred = clone_model.predict(X_test_fold)

    # Accuracy = fraction of predictions that match the held-out labels
    accuracy = np.mean(y_pred == Y_test_5_fold)
    accuracies_in_each_fold.append(accuracy)

  print(accuracies_in_each_fold)
  return accuracies_in_each_fold


if __name__ == "__main__":

  # as_frame=False makes fetch_openml return the inputs as a 2D numpy array (not a pandas
  # DataFrame) and the labels as a numpy array (not a pandas Series).
  mnist = fetch_openml('mnist_784', as_frame=False)
  X = mnist.data
  Y = mnist.target
  print("Inputs :\n", X.astype(float))
  #print(X.shape) # 70000 inputs (rows) x 784 features (columns)
  print("---------------")
  print("Labels :\n", Y)
  #print(Y.shape) # 70000 labels for inputs

  # Show the first input image from the MNIST dataset
  #plot_digit(X[0], Y[0])

  # Train/test split: the first 60000 rows are used for training and the remaining rows
  # for testing (about 85.7% train & 14.3% test of the 70000 rows)
  X_train, X_test = X[:60000], X[60000:]
  Y_train, Y_test = Y[:60000], Y[60000:]

  # One sample (and its label) from the MNIST test set
  test_data = X_test[0]
  test_data_label = Y_test[0]

  # Targets for a binary classifier (classifies whether a digit is a '5' or not):
  # True for all 5s, False for other digits
  Y_train_5 = (Y_train == '5')
  #print(Y_train_5)
  Y_test_5 = (Y_test == '5')

  model = SGDClassifier(random_state=42)
  # Cloned before fitting; this copy is the one handed to the custom cross-validation below
  cloned_model = clone(model)

  model.fit(X_train, Y_train_5)

  #plot_digit(test_data, test_data_label)
  y_pred = model.predict(X_test)
  print(y_pred)

  # k-fold cross validation (k=3) with sklearn's cross_val_score
  #cross_val_score(model, X_train, Y_train_5, cv=3, scoring="accuracy") # array([0.95035, 0.96035, 0.9604 ]) Good Accuracy in each fold!

  # k-fold cross validation (k=3) with the custom implementation
  my_cross_validation(cloned_model, X_train, Y_train_5, num_of_folds=3)



Inputs :
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
---------------
Labels :
 ['5' '0' '4' ... '4' '5' '6']
[False False False ... False  True False]
Fold 1
Train indices: [19964 19965 19966 ... 59997 59998 59999]
Test indices: [    0     1     2 ... 20331 20342 20359]
----------------------------------------
Fold 2
Train indices: [    0     1     2 ... 59997 59998 59999]
Test indices: [19964 19965 19966 ... 40088 40125 40127]
----------------------------------------
Fold 3
Train indices: [    0     1     2 ... 40088 40125 40127]
Test indices: [39988 39989 39990 ... 59997 59998 59999]
----------------------------------------
[np.float64(0.95035), np.float64(0.96035), np.float64(0.9604)]
