# CS 294 – CIFAR 10 – Memorizing 100% of binary dataset


In [None]:
%%capture

# Standard library imports
import random

# Third party imports
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm.notebook
import pandas as pd

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the dataset CSVs
# are copied from /content/drive/MyDrive in the next cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# load the test and training data sets from CIFAR10, compressed to q20 in JPEG
# and subsetted to only dog and cat classes for binary classification
# files are available via Gihub

!mkdir local_copy
!cp /content/drive/MyDrive/gray_cat_dog_q_20_test.csv /content/local_copy
!cp /content/drive/MyDrive/gray_cat_dog_q_20_train.csv /content/local_copy

In [None]:
# Names of the two classes used for the binary classification task.

CIFAR10_CLASSES = np.array(("cat", "dog"))

In [None]:
# Read the local copies of the training and test CSVs and convert each one to
# a NumPy array (`df` holds the training file, `test` the test file).
df = np.array(pd.read_csv("/content/local_copy/gray_cat_dog_q_20_train.csv"))

test = pd.read_csv("/content/local_copy/gray_cat_dog_q_20_test.csv")
test = np.array(test)

In [None]:
# perform a 50/50 split of the 2-class training data, creating 5,000 images in
# the training set and 5,000 images in the validation set
# separate out labels from data set

def split_CIFAR10(df, num_training=5_000, num_validation=5_000):
  """
  Split a 2-D data array into training and validation features and labels.

  The first `num_training` rows form the training split and the following
  `num_validation` rows form the validation split. In every row the last
  column is the class label and all preceding columns are features.

  Labels are converted to integer codes (positions in the sorted unique label
  values). A single encoding is computed over all rows used by the two splits,
  so a given label always maps to the same code in `y_train` and `y_val`, even
  when one of the splits does not contain every class.

  Args:
    df: 2-D NumPy array whose last column holds the labels.
    num_training: number of leading rows assigned to the training split.
    num_validation: number of rows after the training rows assigned to the
      validation split (may select no rows if `df` is shorter).

  Returns:
    dict with keys "X_train", "y_train", "X_val", "y_val"; the X arrays are
    float64 feature matrices and the y arrays are integer label codes.

  Side effects:
    Prints the shapes of the training features and training labels.
  """
  num_used = num_training + num_validation

  X_train = df[:num_training, :-1].astype(np.float64)
  X_val = df[num_training:num_used, :-1].astype(np.float64)

  # One shared label encoding for both splits (encoding each split separately
  # would yield inconsistent codes if a split were missing a class).
  _, label_codes = np.unique(df[:num_used, -1], return_inverse=True)
  y_train = label_codes[:num_training]
  y_val = label_codes[num_training:num_used]

  print(X_train.shape)
  print(y_train.shape)

  return {
      "X_train": X_train, "y_train": y_train,
      "X_val": X_val, "y_val": y_val
  }

# Training file: first 5,000 rows for training, next 5,000 for validation
# (the function's default sizes, spelled out here).
CIFAR10_DATA = split_CIFAR10(df, num_training=5_000, num_validation=5_000)

# Test file: no split is wanted (2,000 images), so all of it is requested as
# the "training" portion and ends up under the "X_train"/"y_train" keys.
CIFAR10_TEST = split_CIFAR10(test, 2_000)

In [None]:
# Standardize the input features. The scaler is fitted on the training split
# only, and that same fitted transform is then applied to the validation and
# test sets as well.

from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(CIFAR10_DATA['X_train'])

X_scaled = scaler.transform(CIFAR10_DATA['X_train'])
X_val_scaled = scaler.transform(CIFAR10_DATA['X_val'])
# The test features live under the 'X_train' key only because of how
# split_CIFAR10 names its outputs.
X_test_scaled = scaler.transform(CIFAR10_TEST['X_train'])

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Build MLP models with one hidden layer, scaling perceptrons from 1 to 13
# (see paper for rationale behind the upper bound of 13; with 13 perceptrons,
# the network is expected to generate a model at the expected MEC level of the
# data set).
# Configuration: solver='lbfgs', max_iter=4000, alpha=0.

num_neurons = []
train_acc = []
val_acc = []
classifiers = []

# From 1 to 13 neurons (expected MEC), record the training and validation set
# accuracy in order to plot the accuracy/capacity plot
for i in range(1, 14):
  print(f"iteration {i}")
  clf = MLPClassifier(hidden_layer_sizes=(i,), alpha=0, solver='lbfgs', random_state=1, max_iter=4000)
  clf.fit(X_scaled, CIFAR10_DATA['y_train'])

  num_neurons.append(i)
  classifiers.append(clf)
  train_acc.append(clf.score(X_scaled, CIFAR10_DATA['y_train']))
  val_acc.append(clf.score(X_val_scaled, CIFAR10_DATA['y_val']))

print(train_acc, val_acc)

fig, ax = plt.subplots()
ax.plot(num_neurons, train_acc, label='train_acc')
ax.plot(num_neurons, val_acc, label='val_acc')
# Limits are given high-to-low, so the x-axis is drawn reversed (13 on the left).
ax.set_xlim(13, 0)
ax.set_ylim(0, 1)
ax.set_title('Accuracy / Capacity curve for cat & dog')
ax.set_xlabel('MEC')
ax.set_ylabel('Accuracy')
# Without a legend the `label=` values above are never displayed and the two
# curves cannot be told apart.
ax.legend()
ax.grid(True)

plt.show()

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

num_neurons = []
train_acc = []
val_acc = []
classifiers = []

# From 1 to 13 neurons (expected MEC) in a single hidden layer,
# generate the training and validation accuracy curves
# using adam, max_iter = 4000.
# We ultimately chose this configuration for our paper.

for i in range(1, 14):
  print(f"iteration {i}")
  clf = MLPClassifier(hidden_layer_sizes=(i,), alpha=0, solver='adam', random_state=1, max_iter=4000)
  clf.fit(X_scaled, CIFAR10_DATA['y_train'])

  num_neurons.append(i)
  classifiers.append(clf)
  train_acc.append(clf.score(X_scaled, CIFAR10_DATA['y_train']))
  val_acc.append(clf.score(X_val_scaled, CIFAR10_DATA['y_val']))

print(train_acc, val_acc)

fig, ax = plt.subplots()
ax.plot(num_neurons, train_acc, label='train_acc')
ax.plot(num_neurons, val_acc, label='val_acc')
# Limits are given high-to-low, so the x-axis is drawn reversed (13 on the left).
ax.set_xlim(13, 0)
ax.set_ylim(0, 1)
ax.set_title('Accuracy / Capacity curve for cat & dog')
ax.set_xlabel('MEC')
ax.set_ylabel('Accuracy')
# Without a legend the `label=` values above are never displayed and the two
# curves cannot be told apart.
ax.legend()
ax.grid(True)

plt.show()

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

num_neurons = []
train_acc = []
val_acc = []
classifiers = []

# From 1 to 13 neurons (expected MEC) in a single hidden layer,
# generate the training and validation accuracy curves
# using lbfgs, with max_iter scaled inversely with the number of neurons.

for i in range(1, 14):
  print(f"iteration {i}")
  # Iteration budget of 10/i * 4000: 40,000 for 1 neuron down to 3,077 for 13.
  # It is rounded to an int because scikit-learn validates `max_iter` as an
  # integer; the raw float from the division is rejected by recent versions.
  # (The former `i == 14` special case could never trigger in range(1, 14).)
  # NOTE(review): lbfgs is also capped by `max_fun` (documented default 15000
  # loss evaluations), so budgets above that may not be fully used - confirm
  # whether `max_fun` should be raised as well.
  max_iter = round(10 / i * 4000)
  clf = MLPClassifier(hidden_layer_sizes=(i,), alpha=0, solver='lbfgs', random_state=1, max_iter=max_iter)
  clf.fit(X_scaled, CIFAR10_DATA['y_train'])

  num_neurons.append(i)
  classifiers.append(clf)
  train_acc.append(clf.score(X_scaled, CIFAR10_DATA['y_train']))
  val_acc.append(clf.score(X_val_scaled, CIFAR10_DATA['y_val']))

print(train_acc, val_acc)

fig, ax = plt.subplots()
ax.plot(num_neurons, train_acc, label='train_acc')
ax.plot(num_neurons, val_acc, label='val_acc')
# Limits are given high-to-low, so the x-axis is drawn reversed (13 on the left).
ax.set_xlim(13, 0)
ax.set_ylim(0, 1)
ax.set_title('Accuracy / Capacity curve for cat & dog')
ax.set_xlabel('MEC')
ax.set_ylabel('Accuracy')
# Without a legend the `label=` values above are never displayed and the two
# curves cannot be told apart.
ax.legend()
ax.grid(True)

plt.show()

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

num_neurons = []
train_acc = []
val_acc = []
classifiers = []

# From 1 to 13 neurons (expected MEC) in the first hidden layer, followed by a
# second hidden layer fixed at 2 neurons,
# generate the training and validation accuracy curves
# using adam, max_iter = 4000.

for i in range(1, 14):
  print(f"iteration {i}")
  clf = MLPClassifier(hidden_layer_sizes=(i, 2), alpha=0, solver='adam', random_state=1, max_iter=4000)
  clf.fit(X_scaled, CIFAR10_DATA['y_train'])

  # Only the first-layer width is recorded for the x-axis of the plot.
  num_neurons.append(i)
  classifiers.append(clf)
  train_acc.append(clf.score(X_scaled, CIFAR10_DATA['y_train']))
  val_acc.append(clf.score(X_val_scaled, CIFAR10_DATA['y_val']))

print(train_acc, val_acc)

fig, ax = plt.subplots()
ax.plot(num_neurons, train_acc, label='train_acc')
ax.plot(num_neurons, val_acc, label='val_acc')
# Limits are given high-to-low, so the x-axis is drawn reversed (13 on the left).
ax.set_xlim(13, 0)
ax.set_ylim(0, 1)
ax.set_title('Accuracy / Capacity curve for cat & dog')
ax.set_xlabel('MEC')
ax.set_ylabel('Accuracy')
# Without a legend the `label=` values above are never displayed and the two
# curves cannot be told apart.
ax.legend()
ax.grid(True)

plt.show()

In [None]:
# Combine the scaled training and validation splits (features and labels) so
# that the final model can be trained on the entire training set. Row order is
# unchanged: all training rows first, then all validation rows.
#
# np.concatenate replaces the previous row-by-row append loops; the results are
# NumPy arrays rather than Python lists of rows, holding the same values in the
# same order.

x_full_train_dataset = np.concatenate([X_scaled, X_val_scaled], axis=0)

y_full_train_dataset = np.concatenate(
    [CIFAR10_DATA['y_train'], CIFAR10_DATA['y_val']]
)



In [None]:
# i = 7 since we chose to use 7 neurons
i = 7

# Fit the chosen configuration on the entire (training + validation) data set
# and keep the fitted model alongside the earlier ones.
clf = MLPClassifier(
    hidden_layer_sizes=(i,),
    alpha=0,
    solver='adam',
    random_state=1,
    max_iter=4000,
)
clf.fit(x_full_train_dataset, y_full_train_dataset)
classifiers.append(clf)

# Report accuracy on the full training data, then on the held-out test set
# (the test labels are stored under the 'y_train' key of CIFAR10_TEST).
full_train_accuracy = clf.score(x_full_train_dataset, y_full_train_dataset)
print(full_train_accuracy)
test_accuracy = clf.score(X_test_scaled, CIFAR10_TEST['y_train'])
print(test_accuracy)

In [None]:
# Save the scaled, combined (training + validation) dataset in case it is
# needed later.
#
# `combined_dataset` was never defined anywhere before this cell, so the save
# raised NameError. It is built here from the combined features with the
# integer label appended as the last column (the input CSVs also keep the label
# in the last column).
# NOTE(review): confirm that features + label is the layout that was intended.

combined_dataset = np.column_stack([x_full_train_dataset, y_full_train_dataset])
np.savetxt(
    "/content/local_copy/scaled_combined_train_q20_cat_dog.csv",
    combined_dataset,
    delimiter=",",
)
