<h3>Imports</h3>

In [9]:
# General
import numpy as np
import pandas as pd
import scipy as sp
import logging
import os
import opendatasets as od
from joblib import load,dump
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, precision_recall_curve

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Model selection
from sklearn.model_selection import (
    cross_val_predict,
    StratifiedShuffleSplit, 
    StratifiedKFold,
    RandomizedSearchCV,
    GridSearchCV,
)
# Models
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
# Datasets
from sklearn.datasets import fetch_openml

<h5>Configuration</h5>

In [2]:
# Where the cached dataset lives: <cwd>/<DATA_DIR>/<prefix of DATA_NAME before "_">/<DATA_NAME>
DATA_DIR = "datasets"
DATA_NAME = "mnist_784.pkl"
_dataset_subdir = DATA_NAME.split("_")[0]
DATASET_PATH = os.path.join(
    os.getcwd(),
    DATA_DIR,
    _dataset_subdir,
    DATA_NAME,
)

In [3]:
# # How to download data directly from Kaggle
# URL = "https://www.kaggle.com/datasets/vitthalmadane/energy-consumption-time-series-dataset"
# od.download(URL, data_dir=DATA_DIR)

<h3>Classification</h3>

In [4]:
# Get data: reuse the locally cached copy when it exists, otherwise download
# "mnist_784" from OpenML and cache it at DATASET_PATH for later runs.
if os.path.exists(DATASET_PATH):
    # NOTE: joblib's load() unpickles the file, which can execute arbitrary
    # code; only load a cache file that this notebook wrote itself.
    df = load(DATASET_PATH)
else:
    # exist_ok=True makes directory creation safe whether or not the cache
    # directory is already present.
    os.makedirs(os.path.dirname(DATASET_PATH), exist_ok=True)
    df = fetch_openml("mnist_784", version=1)
    with open(DATASET_PATH, "wb") as f:
        dump(df, f)

In [5]:
# Pull the feature matrix and the label vector out of the loaded dataset.
X = df["data"]
y = df["target"]

In [6]:
# Positional split: the first 60000 entries are used for training,
# everything after them for testing.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

In [7]:
# class ImageShifter(BaseEstimator, TransformerMixin):
#     def __init__(self, image_shape: tuple, shift: list):
#         self.image_shape = image_shape
#         self.shift = shift
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X):
#         new_arr = []
#         for i in X.values:
#             new_arr.append(sp.ndimage.shift(i.reshape(self.image_shape), shift=self.shift, cval=0).ravel())
#         return np.vstack([X, np.array(new_arr)])

# shifter = ImageShifter(image_shape=(28,28), shift=[2,2])
# X_trans, y_trans = shifter.transform(X_train) 

In [8]:
# Boolean targets for the binary task "is this label equal to 5?".
# Labels are cast to int before the comparison.
y_train_5 = y_train.astype(int) == 5
y_test_5 = y_test.astype(int) == 5

In [24]:
# Out-of-fold scores for the binary "is it a 5?" task, one array per model.
# All three estimators use randomness internally, so each one is given a
# fixed seed; without it a fresh kernel run would not reproduce the same
# predictions.
RANDOM_STATE = 42

# Cross-validation settings shared by the three calls below.
cv_options = dict(cv=5, n_jobs=-1, verbose=2)

linear_preds = cross_val_predict(
    SGDClassifier(random_state=RANDOM_STATE),
    X_train,
    y_train_5,
    method="decision_function",
    **cv_options,
)
tree_preds = cross_val_predict(
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    X_train,
    y_train_5,
    method="predict_proba",
    **cv_options,
)
forest_preds = cross_val_predict(
    RandomForestClassifier(random_state=RANDOM_STATE),
    X_train,
    y_train_5,
    method="predict_proba",
    **cv_options,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   18.2s finished


In [11]:
# Grid search over the neighbour count and the weighting scheme of a KNN
# classifier, fitted on the original (un-binarised) labels in y_train and
# scored by accuracy with cv=3.
knn_param_grid = [
    {
        "weights": ["uniform", "distance"],
        "n_neighbors": [4, 6, 8, 10],
    }
]
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)
knn_grid.fit(X_train, y_train)

In [13]:
# Out-of-fold class predictions from the best KNN configuration selected by
# the grid search.
best_knn = knn_grid.best_estimator_
knn_preds = cross_val_predict(
    best_knn,
    X_train,
    y_train,
    cv=3,
    n_jobs=-1,
)

In [21]:
# Fraction of training samples whose cross-validated KNN prediction equals
# the true label.
n_correct = (y_train == knn_preds).sum()
n_correct / len(y_train)

0.97035