The following approach was originally inspired by this [tutorial](https://www.youtube.com/watch?v=FB5EdxAGxQg&ab_channel=codebasics).

## **Loading the dataset**


In [29]:
import torchvision.transforms as transforms
import torchvision
import torch
from random import seed 

# Fix the state of the stdlib `random` generator and of PyTorch's generator
seed(40)
torch.manual_seed(40)

# Per-image transform handed to the torchvision datasets below:
# convert the image to a tensor, then normalize all three channels with mean 0.5 and std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# folder (on the mounted Google Drive) where the data is stored
google_drive_path = './drive/MyDrive/Colab Notebooks/3337_Project'

# Instantiate the train split first, then the test split, purely for the download side effect.
# The raw batch files are re-read, reshaped, split and normalized by hand later on,
# so the dataset objects themselves are dropped straight away.
trainset, testset = (
    torchvision.datasets.CIFAR10(root=google_drive_path, train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)
del trainset, testset




Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./drive/MyDrive/Colab Notebooks/3337_Project/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./drive/MyDrive/Colab Notebooks/3337_Project/cifar-10-python.tar.gz to ./drive/MyDrive/Colab Notebooks/3337_Project
Files already downloaded and verified


## **Defining functions for loading the data**

In [30]:
import numpy as np
import pickle
import matplotlib.pyplot as plt

# Seed NumPy's global random state too (the `random` and torch seeds are set in the first cell)
np.random.seed(40)

def unpickle(file): #adapted from https://www.cs.toronto.edu/~kriz/cifar.html
    """Return the object stored in the pickle file at path `file`.

    The file is opened in binary mode and unpickled with encoding='bytes'.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

def load_CIFAR_batch(filename):
  """ load single batch of cifar

  Reads the pickled batch at `filename` and returns (X, Y):
    X -- float array of shape (N, 32, 32, 3): the b'data' entry reshaped to
         (N, 3, 32, 32) and then transposed so the size-3 axis comes last
    Y -- array built from the b'labels' entry
  """
  datadict = unpickle(filename)
  X = datadict[b'data']
  Y = datadict[b'labels']
  # -1 lets NumPy infer the number of images, so the batch does not have to hold exactly 10000
  X = X.reshape(-1, 3, 32, 32).transpose(0,2,3,1).astype("float")
  Y = np.array(Y)
  return X, Y

def load_CIFAR10(ROOT): #adapted from tutorial #this splits the data into training and testing sets
  """ load all of cifar

  Reads data_batch_1 .. data_batch_5 and test_batch from the folder ROOT.
  Returns (Xtr, Ytr, Xte, Yte): the five training batches concatenated in
  order, followed by the images and labels of the test batch.
  """
  train_files = [ROOT + '/data_batch_%d'%(b,) for b in range(1,6)]
  train_batches = [load_CIFAR_batch(f) for f in train_files]
  Xtr = np.concatenate([images for images, _ in train_batches])
  Ytr = np.concatenate([labels for _, labels in train_batches])
  Xte, Yte = load_CIFAR_batch(ROOT + '/test_batch')
  return Xtr, Ytr, Xte, Yte

## **Loading the data and preprocessing**


In [31]:
import pandas as pd
#the archive is extracted into a sub-folder of the download root, so derive the path from
#google_drive_path (set in the first cell) instead of repeating the literal, which could drift out of sync
cifar10_dir = google_drive_path + '/cifar-10-batches-py'

#load_CIFAR10 returns the five training batches concatenated, plus the separate test batch
X_train_4D_unNorm, y_train, X_test_4D_unNorm, y_test = load_CIFAR10(cifar10_dir)

In [32]:
#sanity check: print the shape of each array we just loaded, one per line
data = [X_train_4D_unNorm, y_train, X_test_4D_unNorm, y_test]
print(*(each.shape for each in data), sep="\n")
#the images are still 4D and un-normalized, so reshaping and normalization come next

(50000, 32, 32, 3)
(50000,)
(10000, 32, 32, 3)
(10000,)


In [33]:
#flatten every image into a single row: (N, 32, 32, 3) -> (N, 3072)
X_train_2D_unNorm = X_train_4D_unNorm.reshape(len(X_train_4D_unNorm), -1)
X_test_2D_unNorm = X_test_4D_unNorm.reshape(len(X_test_4D_unNorm), -1)

#the 4D names are not needed from here on
del X_train_4D_unNorm, X_test_4D_unNorm

In [34]:
#confirm that the 4D sets of images are now 2D: one row per image, one column per pixel channel value
#(when the DataFrames are built, each column is named row,column,rgb)
print(X_train_2D_unNorm.shape, X_test_2D_unNorm.shape, sep="\n")

(50000, 3072)
(10000, 3072)


In [35]:
#build the pd.DataFrames, with one labelled column per flattened pixel value

#the labels follow the flattened (32, 32, 3) layout: first index, second index, then colour channel
columnLabels = [
  '%d,%d,%s'%(i, j, color)
  for i in range(32)
  for j in range(32)
  for color in ['r','g','b']
]

X_train_unNorm = pd.DataFrame(X_train_2D_unNorm, columns=columnLabels)
X_test_unNorm = pd.DataFrame(X_test_2D_unNorm, columns=columnLabels)

#the 2D array names are not needed from here on
del X_train_2D_unNorm, X_test_2D_unNorm

### **Normalization**


In [36]:
#now we need to normalize, but which normalization should we use?
#to decide, inspect the per-column summary statistics (count, mean, std, min, quartiles, max) of the training data
#`description` is reused by the outlier check in the next cell
description = X_train_unNorm.describe()
description

Unnamed: 0,"0,0,r","0,0,g","0,0,b","0,1,r","0,1,g","0,1,b","0,2,r","0,2,g","0,2,b","0,3,r","0,3,g","0,3,b","0,4,r","0,4,g","0,4,b","0,5,r","0,5,g","0,5,b","0,6,r","0,6,g","0,6,b","0,7,r","0,7,g","0,7,b","0,8,r","0,8,g","0,8,b","0,9,r","0,9,g","0,9,b","0,10,r","0,10,g","0,10,b","0,11,r","0,11,g","0,11,b","0,12,r","0,12,g","0,12,b","0,13,r",...,"31,18,b","31,19,r","31,19,g","31,19,b","31,20,r","31,20,g","31,20,b","31,21,r","31,21,g","31,21,b","31,22,r","31,22,g","31,22,b","31,23,r","31,23,g","31,23,b","31,24,r","31,24,g","31,24,b","31,25,r","31,25,g","31,25,b","31,26,r","31,26,g","31,26,b","31,27,r","31,27,g","31,27,b","31,28,r","31,28,g","31,28,b","31,29,r","31,29,g","31,29,b","31,30,r","31,30,g","31,30,b","31,31,r","31,31,g","31,31,b"
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,130.71074,136.05614,132.5538,130.14036,135.44238,131.85358,131.05044,136.24616,132.58144,131.56886,136.67804,132.8775,132.1847,137.22858,133.26738,132.85184,137.8545,133.78034,133.37154,138.28386,134.15504,133.89092,138.74072,134.5227,134.48504,139.27524,135.01422,134.9326,139.6326,135.29142,135.23398,139.94098,135.51004,135.40374,140.0906,135.659,135.62016,140.26882,135.81572,135.63418,...,112.92994,127.9499,125.55222,113.12156,128.11912,125.81986,113.37622,128.182,125.99352,113.572,128.13958,126.08994,113.70824,128.00216,126.08034,113.71626,127.90118,126.07986,113.73532,127.7712,126.10538,113.77852,127.52152,126.041,113.8132,127.28322,125.95734,113.86412,127.06338,125.852,113.8778,126.77836,125.69708,113.83058,126.56496,125.61742,113.90624,126.63908,125.8503,114.38186
std,73.412873,72.90798,80.449751,72.44259,71.901316,79.598048,72.240546,71.67471,79.434675,72.016555,71.476733,79.233247,71.714551,71.162294,78.962392,71.537505,70.930285,78.805779,71.353558,70.738966,78.65983,71.281237,70.688845,78.56113,71.071698,70.445383,78.311897,71.03647,70.393633,78.297292,70.865787,70.195198,78.156012,70.809619,70.10309,78.095902,70.685747,70.046499,78.059243,70.575749,...,63.80101,63.270538,61.063684,63.871026,63.259172,61.094695,63.968701,63.2268,61.091111,64.058668,63.354331,61.178068,64.21183,63.476015,61.244066,64.345623,63.521993,61.271605,64.431996,63.495429,61.206042,64.401897,63.577241,61.280616,64.502684,63.559357,61.240886,64.511269,63.788617,61.439053,64.738943,63.954763,61.591233,64.894603,64.178427,61.831646,65.212671,64.926283,62.649286,66.077526
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,71.0,77.0,61.0,71.0,78.0,61.0,73.0,79.0,62.0,73.0,80.0,63.0,75.0,81.0,63.0,75.0,82.0,64.0,76.0,82.0,65.0,76.75,83.0,65.0,78.0,84.0,66.0,78.0,84.0,66.0,79.0,85.0,67.0,79.0,85.0,67.0,79.0,85.0,67.0,80.0,...,64.0,81.0,81.0,64.0,81.0,81.0,64.0,81.0,82.0,64.0,81.0,82.0,64.0,80.0,81.0,64.0,80.0,81.0,64.0,80.0,81.0,64.0,80.0,82.0,64.0,80.0,82.0,64.0,80.0,81.0,64.0,79.0,81.0,64.0,79.0,81.0,64.0,78.0,80.0,63.0
50%,128.0,135.0,127.0,127.0,135.0,127.0,129.0,136.0,128.0,130.0,137.0,129.0,130.0,137.0,129.0,131.0,138.0,130.0,132.0,138.0,130.0,132.0,139.0,131.0,133.0,140.0,132.0,134.0,140.0,132.0,134.0,140.0,132.0,134.0,140.0,133.0,135.0,141.0,133.0,134.0,...,105.0,126.0,124.0,106.0,127.0,124.0,106.0,127.0,124.0,106.0,126.0,124.0,106.0,126.0,124.0,106.0,126.0,124.0,106.0,126.0,124.0,106.0,125.0,124.0,106.0,125.0,124.0,106.0,125.0,123.0,106.0,124.0,123.0,106.0,124.0,123.0,106.0,124.0,123.0,106.0
75%,189.0,195.0,207.0,188.0,193.0,206.0,188.0,194.0,206.0,188.0,194.0,206.0,189.0,194.0,206.0,190.0,195.0,207.0,190.0,196.0,207.0,191.0,196.0,207.0,191.0,197.0,208.0,192.0,197.0,208.0,192.0,197.0,208.0,192.0,197.0,208.0,192.0,197.0,209.0,192.0,...,156.0,174.0,168.0,156.0,174.0,168.0,156.0,173.0,168.0,157.0,174.0,168.0,157.0,174.0,168.0,157.0,173.0,168.0,157.0,173.0,168.0,157.0,173.0,167.0,157.0,172.0,167.0,157.0,172.0,167.0,157.0,172.0,167.0,157.0,172.0,167.0,157.0,172.0,168.0,158.0
max,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


In [37]:
#let's check for outliers, column by column, using the box-plot whisker rule:
#a value is an outlier if it lies more than 1.5*IQR below the 25% quartile or above the 75% quartile

lowerQuartile = description.loc["25%"]
upperQuartile = description.loc["75%"]
IQR = upperQuartile - lowerQuartile #one IQR per column, computed for all columns at once

#one row per whisker limit, one column per pixel channel
#built in a single step rather than with chained-indexing assignment (whiskerLimits[col][row] = ...),
#which pandas does not guarantee to write through to the original frame
whiskerLimits = pd.DataFrame({
  "lowWhiskerLimits": lowerQuartile - 1.5*IQR,
  "highWhiskerLimits": upperQuartile + 1.5*IQR,
}).transpose()

#compare every column's min/max against that column's own limits
#(nothing is stored under the names min/max, which would shadow the built-ins for the rest of the notebook)
if (description.loc["min"] >= whiskerLimits.loc["lowWhiskerLimits"]).all(): #no column has a value below its low whisker limit
  print("No lower outliers")
else: print("There are lower outliers")

if (description.loc["max"] <= whiskerLimits.loc["highWhiskerLimits"]).all(): #no column has a value above its high whisker limit
  print("No upper outliers")
else: print("There are upper outliers")

del description

No lower outliers
No upper outliers


In [38]:
#peek at the un-normalized training frame (raw pixel values, before any scaling)
X_train_unNorm

Unnamed: 0,"0,0,r","0,0,g","0,0,b","0,1,r","0,1,g","0,1,b","0,2,r","0,2,g","0,2,b","0,3,r","0,3,g","0,3,b","0,4,r","0,4,g","0,4,b","0,5,r","0,5,g","0,5,b","0,6,r","0,6,g","0,6,b","0,7,r","0,7,g","0,7,b","0,8,r","0,8,g","0,8,b","0,9,r","0,9,g","0,9,b","0,10,r","0,10,g","0,10,b","0,11,r","0,11,g","0,11,b","0,12,r","0,12,g","0,12,b","0,13,r",...,"31,18,b","31,19,r","31,19,g","31,19,b","31,20,r","31,20,g","31,20,b","31,21,r","31,21,g","31,21,b","31,22,r","31,22,g","31,22,b","31,23,r","31,23,g","31,23,b","31,24,r","31,24,g","31,24,b","31,25,r","31,25,g","31,25,b","31,26,r","31,26,g","31,26,b","31,27,r","31,27,g","31,27,b","31,28,r","31,28,g","31,28,b","31,29,r","31,29,g","31,29,b","31,30,r","31,30,g","31,30,b","31,31,r","31,31,g","31,31,b"
0,59.0,62.0,63.0,43.0,46.0,45.0,50.0,48.0,43.0,68.0,54.0,42.0,98.0,73.0,52.0,119.0,91.0,63.0,139.0,107.0,75.0,145.0,110.0,80.0,149.0,117.0,89.0,149.0,120.0,93.0,131.0,103.0,77.0,125.0,99.0,76.0,142.0,115.0,91.0,144.0,...,83.0,148.0,103.0,77.0,161.0,105.0,69.0,144.0,95.0,55.0,112.0,90.0,59.0,119.0,91.0,58.0,130.0,96.0,65.0,120.0,87.0,59.0,92.0,67.0,46.0,103.0,78.0,57.0,170.0,140.0,104.0,216.0,184.0,140.0,151.0,118.0,84.0,123.0,92.0,72.0
1,154.0,177.0,187.0,126.0,137.0,136.0,105.0,104.0,95.0,102.0,101.0,99.0,125.0,131.0,139.0,155.0,166.0,180.0,172.0,190.0,210.0,180.0,199.0,214.0,142.0,156.0,156.0,111.0,120.0,110.0,106.0,107.0,98.0,109.0,104.0,102.0,123.0,121.0,117.0,127.0,...,28.0,5.0,6.0,11.0,6.0,6.0,11.0,10.0,9.0,12.0,25.0,22.0,22.0,51.0,45.0,42.0,68.0,64.0,67.0,90.0,90.0,101.0,116.0,113.0,122.0,134.0,127.0,133.0,140.0,130.0,136.0,143.0,133.0,139.0,143.0,134.0,142.0,143.0,133.0,144.0
2,255.0,255.0,255.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,...,92.0,76.0,90.0,90.0,72.0,86.0,86.0,64.0,78.0,78.0,64.0,78.0,78.0,69.0,83.0,83.0,66.0,80.0,80.0,55.0,69.0,69.0,52.0,66.0,66.0,58.0,72.0,72.0,68.0,80.0,79.0,78.0,85.0,83.0,79.0,85.0,83.0,80.0,86.0,84.0
3,28.0,25.0,10.0,37.0,34.0,19.0,38.0,35.0,20.0,42.0,37.0,23.0,44.0,39.0,25.0,40.0,37.0,22.0,40.0,38.0,23.0,24.0,23.0,9.0,32.0,25.0,15.0,43.0,27.0,19.0,30.0,20.0,10.0,32.0,30.0,17.0,41.0,37.0,23.0,52.0,...,55.0,122.0,99.0,67.0,107.0,85.0,54.0,112.0,96.0,66.0,92.0,81.0,53.0,80.0,62.0,39.0,96.0,78.0,59.0,77.0,67.0,42.0,85.0,76.0,44.0,84.0,75.0,48.0,67.0,57.0,38.0,54.0,47.0,28.0,63.0,56.0,37.0,72.0,65.0,46.0
4,170.0,180.0,198.0,168.0,178.0,196.0,177.0,185.0,203.0,183.0,193.0,211.0,181.0,196.0,218.0,177.0,195.0,220.0,181.0,195.0,223.0,184.0,192.0,223.0,189.0,190.0,223.0,189.0,189.0,222.0,188.0,193.0,224.0,183.0,194.0,223.0,182.0,194.0,223.0,184.0,...,101.0,93.0,98.0,102.0,90.0,95.0,99.0,88.0,93.0,97.0,83.0,89.0,92.0,77.0,87.0,88.0,75.0,84.0,85.0,71.0,81.0,82.0,74.0,80.0,83.0,72.0,76.0,79.0,71.0,75.0,78.0,75.0,79.0,82.0,71.0,75.0,78.0,73.0,77.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,145.0,161.0,194.0,151.0,166.0,199.0,156.0,171.0,205.0,155.0,173.0,203.0,153.0,173.0,201.0,153.0,172.0,201.0,157.0,177.0,205.0,161.0,180.0,210.0,161.0,180.0,212.0,161.0,180.0,212.0,162.0,181.0,213.0,165.0,184.0,216.0,164.0,183.0,215.0,163.0,...,75.0,51.0,56.0,75.0,49.0,55.0,72.0,50.0,56.0,72.0,50.0,56.0,72.0,46.0,52.0,68.0,44.0,48.0,64.0,44.0,46.0,61.0,46.0,48.0,63.0,42.0,44.0,59.0,40.0,42.0,57.0,39.0,41.0,56.0,36.0,38.0,53.0,37.0,39.0,54.0
49996,255.0,245.0,132.0,254.0,247.0,134.0,255.0,249.0,137.0,254.0,251.0,138.0,254.0,251.0,138.0,254.0,248.0,135.0,254.0,242.0,130.0,253.0,239.0,128.0,254.0,241.0,129.0,254.0,244.0,131.0,254.0,244.0,132.0,254.0,243.0,129.0,253.0,240.0,127.0,254.0,...,63.0,244.0,168.0,61.0,247.0,171.0,65.0,237.0,157.0,55.0,240.0,160.0,57.0,245.0,167.0,61.0,246.0,169.0,64.0,249.0,176.0,68.0,248.0,171.0,64.0,245.0,174.0,76.0,249.0,175.0,82.0,249.0,174.0,79.0,252.0,181.0,81.0,253.0,181.0,76.0
49997,35.0,178.0,235.0,40.0,176.0,239.0,42.0,176.0,241.0,39.0,180.0,239.0,43.0,181.0,237.0,45.0,180.0,237.0,48.0,181.0,238.0,51.0,181.0,239.0,53.0,183.0,241.0,56.0,183.0,241.0,58.0,182.0,242.0,59.0,182.0,242.0,58.0,183.0,240.0,61.0,...,92.0,47.0,71.0,97.0,53.0,77.0,102.0,52.0,73.0,96.0,46.0,68.0,91.0,45.0,68.0,94.0,50.0,77.0,104.0,50.0,77.0,103.0,45.0,71.0,96.0,42.0,68.0,93.0,36.0,62.0,87.0,27.0,52.0,77.0,21.0,43.0,66.0,12.0,31.0,50.0
49998,189.0,211.0,240.0,186.0,208.0,236.0,185.0,207.0,235.0,184.0,207.0,235.0,182.0,206.0,234.0,181.0,205.0,232.0,180.0,204.0,232.0,180.0,204.0,233.0,181.0,205.0,235.0,180.0,204.0,234.0,179.0,202.0,233.0,180.0,202.0,232.0,183.0,202.0,229.0,185.0,...,88.0,114.0,112.0,103.0,142.0,141.0,130.0,164.0,162.0,149.0,175.0,171.0,160.0,184.0,177.0,166.0,190.0,181.0,171.0,190.0,182.0,171.0,193.0,186.0,172.0,192.0,185.0,169.0,190.0,182.0,165.0,195.0,184.0,169.0,196.0,189.0,171.0,195.0,190.0,171.0


In [39]:
#since there are no outliers we can use min-max normalization to scale everything to be between 0 and 1
from sklearn.preprocessing import MinMaxScaler

#fit the scaler ONCE, on the training data only: the per-column mins and maxs (scaling params) must come
#from the training set so that no information about the test set leaks into the preprocessing
scaler = MinMaxScaler().fit(X_train_unNorm)

#apply the same fitted scaler to both sets so that train and test are scaled identically
X_train = pd.DataFrame(scaler.transform(X_train_unNorm), columns=columnLabels)
X_test = pd.DataFrame(scaler.transform(X_test_unNorm), columns=columnLabels)

## **Training**

In [40]:
#now we are ready to make our Support Vector Classifier (a type of support vector machine)
from sklearn.svm import SVC
import time

model = SVC()

#perf_counter is a monotonic clock intended for measuring elapsed time;
#time.time() follows the system clock, which can be adjusted during a multi-hour run
tic = time.perf_counter()
model.fit(X_train, y_train) #this takes a while (~3.2 hrs in the recorded run)
toc = time.perf_counter()

print("Training took " + str((toc-tic)/3600) +" hours")

Training took 3.179871161646313 hours


In [41]:
#here we save the model
import pickle
#NOTE(review): the file name says "one_batch", but the model was fitted on all five concatenated training batches;
#consider renaming (left unchanged here so existing paths keep working)
pickle_filename = "./drive/MyDrive/Colab Notebooks/3337_Project/SVC_SVM_trained_with_one_batch.pkl"
#the with-block closes the file as soon as the dump finishes, instead of leaving the handle open
with open(pickle_filename,'wb') as model_file:
  pickle.dump(model, model_file)

In [44]:
#evaluate on the held-out test set; for a classifier, score() reports the mean accuracy
model.score(X_test,y_test)

0.5437