# **Logistic Regression**

In [0]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import numpy as np
import collections
import pickle
from tqdm import tqdm
import pylab
use_colab = True
if use_colab:
  from google.colab import drive
import tensorflow as tf
TINY = 1e-30
EPS = 1e-4
nax = np.newaxis
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
if use_colab:
  drive_name = '/content/drive'
  drive.mount(drive_name)
  drive_413_A1_folder = 'Queens_data_task'
  drive_location = drive_name + '/My Drive/' + drive_413_A1_folder  # Change this to where your files are located
else:
  # set the drive_location variable to whereever the extracted contents are.
  drive_location = ''

# Hyper-parameters 
input_size = 4096   # 784
num_classes = 10
num_epochs = 100
batch_size = 100
learning_rate = 0.001

# MNIST dataset (images and labels)
Y = np.loadtxt(drive_location + '/' +"train_y.csv", delimiter=",")
Y = Y.reshape(-1)
Y = np.int_(Y)
X = np.load(drive_location + '/' + 'X.npy')
X = torch.Tensor(X)
Y = torch.tensor(Y, dtype=torch.int64)
X = X.reshape(-1, input_size)
# Logistic regression model
model = nn.Linear(input_size, num_classes)

# Loss and optimizer
# nn.CrossEntropyLoss() computes softmax internally
criterion = nn.CrossEntropyLoss()  
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)  

# Train the model
#total_step = len(train_loader)
for epoch in range(num_epochs):
    
    for i in range(50000//batch_size):
        # Reshape images to (batch_size, input_size)
        tempX = X[i*batch_size: i*batch_size + batch_size]
        tempY = Y[i*batch_size: i*batch_size + batch_size]
        
        # Forward pass
        outputs = model(tempX)
        loss = criterion(outputs, tempY)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        
    

with torch.no_grad():
    correct = 0
    total = 0
    
    images = X.reshape(-1, input_size)
    outputs = model(X)
    _, predicted = torch.max(outputs.data, 1)
    correct += (predicted == Y).sum()

    print('Accuracy of the model on the 50000 test images: {} %'.format(100 * correct / 50000))

# Save the model checkpoint
torch.save(model.state_dict(), 'model.ckpt')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
Accuracy of the model on the 10000 test images: 17 %


As we can see, the logistic regression model fails to classify the images and has a test accuracy of 17 % only. This is because detecting the biggest digit in the image is a highly non-linear problem. Logistic regression models have (piecewise) linear decision boundaries so they are not complex enough to learn the features in the training images.

#  **Convolutional neural networks**

In [0]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import numpy as np
import collections
import pickle
from tqdm import tqdm
import pylab
use_colab = True
if use_colab:
  from google.colab import drive
import tensorflow as tf
TINY = 1e-30
EPS = 1e-4
nax = np.newaxis
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
if use_colab:
  drive_name = '/content/drive'
  drive.mount(drive_name)
  drive_413_A1_folder = 'Queens_data_task'
  drive_location = drive_name + '/My Drive/' + drive_413_A1_folder  # Change this to where your files are located
else:
  # set the drive_location variable to whereever the extracted contents are.
  drive_location = ''




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import random
# Hyper-parameters 
input_size = 4096   # 784
num_classes = 10
num_epochs = 35
batch_size = 100
learning_rate = 0.1
momentum = 0.8

class Net(nn.Module):    
    def __init__(self):
        super(Net, self).__init__()
          
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
          
        self.classifier = nn.Sequential(
            nn.Dropout(p = 0.5),
            nn.Linear(64 * 16 * 16, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(p = 0.5),
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(p = 0.5),
            nn.Linear(512, 10),
        )
          
        for m in self.features.children():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
        
        for m in self.classifier.children():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform(m.weight)
            elif isinstance(m, nn.BatchNorm1d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
                

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        
        return x     


# MNIST dataset (images and labels)
Y = np.loadtxt(drive_location + '/' +"train_y.csv", delimiter=",")
Y = Y.reshape(-1)
Y = np.int_(Y)
X = np.load(drive_location + '/' + 'X.npy')
X = torch.Tensor(X).reshape(-1,1,64,64)
Y = torch.tensor(Y, dtype=torch.int64).type(torch.cuda.FloatTensor)
X = X.reshape(-1, input_size).type(torch.cuda.FloatTensor)

network = Net()
network.cuda()
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
# Train the model
#total_step = len(train_loader)
network.train()
for epoch in range(num_epochs):
    for i in range(25000//batch_size):
        # Reshape images to (batch_size, input_size)
        tempX = X[i*batch_size: i*batch_size + batch_size].reshape(-1, 1,64,64)
        tempY = Y[i*batch_size: i*batch_size + batch_size].type(torch.int64)
        
        optimizer.zero_grad()
        output = network(tempX)
        loss = nn.CrossEntropyLoss()(output, tempY)
        loss.backward()
        optimizer.step()
    
        
    
        print ('Epoch [{}/{}], Loss: {:.4f}' 
              .format(epoch+1, num_epochs, loss.item()))

In [0]:
with torch.no_grad():
    correct = 0
    total = 0
    for i in range(0,25): #since GPU does not have enough memory
      images = X[25000 + 1000*i:25000 + 1000*i + 1000].reshape(-1,1,64,64)
      outputs = network(images)
      _, predicted = torch.max(outputs.data, 1)
      correct += (predicted == Y[25000 + 1000*i:25000 + 1000*i + 1000]).sum()

    print('Accuracy of the model on the 25000 test images: {} %'.format(100 * correct / 25000))


Accuracy of the model on the 25000 test images: 77 %


Our CNN consists of 4 convolution layers, each separated with a non-linearity (ReLU) in addition to Max pooling and Batch normalization to speed up training. The output of the CNN is then passed on to classification is which basically a simple feed forward NN. As we can see, the CNN performs much better on this task. This because we have introduced non-linearity to our network across multiple layers (unlike LR), in addition to convolution and Max pooling which allow us to efficiently learn low- and high-level features in the image.
We were able to achieve a test accuracy of 77%, a higher accuracy can be achieved if we tune the hyper-parameters even further.


# **RNN**

In [0]:
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
# Hyper-parameters 
# Hyper-parameters
sequence_length = 64
input_size = 64
hidden_size = 164
num_layers = 2
num_classes = 10
batch_size = 2500
num_epochs = 100
learning_rate = 0.001

# MNIST dataset (images and labels)
Y = np.loadtxt(drive_location + '/' +"train_y.csv", delimiter=",")
Y = Y.reshape(-1)
Y = np.int_(Y)
X = np.load(drive_location + '/' + 'X.npy')
X = torch.Tensor(X).reshape(-1,1,64,64)
Y = torch.tensor(Y, dtype=torch.int64).type(torch.cuda.FloatTensor)
X = X.reshape(-1, 4096).type(torch.cuda.FloatTensor)



device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, nonlinearity='relu')
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        # Set initial hidden and cell states 
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, h0)  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out


model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)



criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Train the model
#model.train()
for epoch in range(num_epochs):
    for i in range(25000//batch_size):
        # Reshape images to (batch_size, input_size)
        tempX = X[i*batch_size: i*batch_size + batch_size].reshape(-1,64,64)
        tempY = Y[i*batch_size: i*batch_size + batch_size].type(torch.int64)
        
        optimizer.zero_grad()
        output = model(tempX)
        loss = criterion(output, tempY)
        loss.backward()
        optimizer.step()
    
        
    
        print ('Epoch [{}/{}], Loss: {:.4f}' 
              .format(epoch+1, num_epochs, loss.item()))

In [0]:
with torch.no_grad():
    correct = 0
    total = 0
    for i in range(0,25): #since GPU does not have enough memory
      images = X[25000 + 1000*i:25000 + 1000*i + 1000].reshape(-1,64,64)
      outputs = model(images)
      _, predicted = torch.max(outputs.data, 1)
      correct += (predicted == Y[25000 + 1000*i:25000 + 1000*i + 1000]).sum()

    print('Accuracy of the model on the 25000 test images: {} %'.format(100 * correct / 25000))

Accuracy of the model on the 25000 test images: 10 %


While RNNs do have non-linearities which enable them to learn complex features, they are mainly designed for tasks that have variable length inputs like speeach recognition or machine translation. This classification task requires recognizing patterns across space which CNNs can do very well in. RNNs, however, are designed to build up memory so they can learn from previous inputs. In other words, RNNs can perform well in problems that involve temporal locality which CNNs do well in tasks with spatial locality. 

It is clear that using an RNN for this task is no better that randomly guessing even after training for 100 epochs with batch size 2500!

# **Gated CNN**

In [0]:
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
# Hyper-parameters 
input_size = 64
hidden_size = 164
num_channels = 15
num_classes = 10
batch_size = 100
num_epochs = 100
learning_rate = 0.01

# MNIST dataset (images and labels)
Y = np.loadtxt(drive_location + '/' +"train_y.csv", delimiter=",")
Y = Y.reshape(-1)
Y = np.int_(Y)
X = np.load(drive_location + '/' + 'X.npy')
X = torch.Tensor(X).reshape(-1,1,64,64)
Y = torch.tensor(Y, dtype=torch.int64).type(torch.cuda.FloatTensor)
X = X.reshape(-1, 4096).type(torch.cuda.FloatTensor)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class GLUblock(nn.Module):
    def __init__(self, k, in_c, out_c):
        super(GLUblock, self).__init__()
        #only need to change shape of the residual if num_channels changes (i.e. in_c != out_c)
        if in_c == out_c:
            self.use_proj=0
        else:
            self.use_proj=1
        self.convresid=nn.utils.weight_norm(nn.Conv2d(in_c, out_c, kernel_size=(1,1)),name='weight')
        
        
        self.convx1b = nn.utils.weight_norm(nn.Conv2d(in_c, out_c, kernel_size=k,padding=k//2),name='weight')
        self.convx2b = nn.utils.weight_norm(nn.Conv2d(in_c, out_c, kernel_size=k,padding=k//2),name='weight')
        

    def forward(self, x):
        residual = x
        if self.use_proj==1:# if in_c != out_c, need to change size of residual
            residual=self.convresid(residual)
        
        x1 = self.convx1b(x) 
        x2 = self.convx2b(x) 
        x2 = torch.sigmoid(x2)
        x=torch.mul(x1,x2) 
        return x+residual

class GCNN(nn.Module):
  def __init__(self, k, in_c, out_c):
    super(GCNN, self).__init__()
    self.GCN_layers = nn.Sequential(
        GLUblock(k, in_c, out_c),
        nn.MaxPool2d(kernel_size=2),
        nn.BatchNorm2d(out_c),
        GLUblock(k, out_c, out_c),
        nn.MaxPool2d(kernel_size=2),
        nn.BatchNorm2d(out_c),
        GLUblock(k, out_c, out_c)
    )
    self.classifier = nn.Sequential(
            nn.Dropout(p = 0.5),
            nn.Linear(16 * 16 * out_c, 16*16),
            nn.BatchNorm1d(16 * 16),
            nn.ReLU(inplace=True),
            nn.Dropout(p = 0.5),
            nn.Linear(16 * 16, 50),
            nn.BatchNorm1d(50),
            nn.ReLU(inplace=True),
            nn.Dropout(p = 0.5),
            nn.Linear(50, 10),
        )
  def forward(self, x):
    out = self.GCN_layers(x)
    out = out.view(out.size(0), -1)
    out = self.classifier(out)
    return out


model = GCNN(5, 1, num_channels).to(device)



criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Train the model
#model.train()
for epoch in range(num_epochs):
    for i in range(25000//batch_size):
        # Reshape images to (batch_size, input_size)
        tempX = X[i*batch_size: i*batch_size + batch_size].reshape(-1,1,64,64)
        tempY = Y[i*batch_size: i*batch_size + batch_size].type(torch.int64)
        
        optimizer.zero_grad()
        output = model(tempX)
        loss = criterion(output, tempY)
        loss.backward()
        optimizer.step()
    
        
    
        print ('Epoch [{}/{}], Loss: {:.4f}' 
              .format(epoch+1, num_epochs, loss.item()))


In [0]:
with torch.no_grad():
    correct = 0
    total = 0
    for i in range(25): #since GPU does not have enough memory
      images = X[25000 + 1000*i:25000 + 1000*i + 1000].reshape(-1,1,64,64)
      outputs = model(images)
      _, predicted = torch.max(outputs.data, 1)
      correct += (predicted == Y[25000 + 1000*i:25000 + 1000*i + 1000]).sum()

    print('Accuracy of the model on the 25000 test images: {} %'.format(100 * correct / 25000))

Accuracy of the model on the 25000 test images: 80 %


As we can see, gated CNNs even perform better on this task by achieving a test accuracy of 80 % (further training and tuning of hyper-parameters can yield higher accuracy). Gated linear units are a simple gating mechanism designed to tackle the vanishing gradient problem by having linear units coupled with gates. This retains the non-linear capabilities of the layers while allowing the gradient to propagate through the linear units. In other words, this technique allows some features to propagate through some layers, preventing the low-level features from vanishing deep into the network.