# Perceptron Tutorial

In [101]:
# Required imports
import torch
import numpy as np
import pandas as pd
from torch.nn import Linear
from torch.nn import Sigmoid, LogSoftmax
from torch.optim import SGD
from torch.nn import BCELoss, NLLLoss
from string import punctuation
import itertools
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

## 0. Dataset Loading and Cleaning

We'll begin by loading a prepared version of the [Stanford Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), on which we'll train a binary classifier. 

This dataset contains 50k highly polarized movie reviews from IMDB, labeled with positive or negative sentiment. 

We'll perform some minimal preprocessing on the text itself, simply case-normalization and removal of punctuation

In [2]:
# Load the data into a DataFrame
data = pd.read_pickle('../data/aclImdb_combined.pkl')

# Definne a simple convenience function for cleaning the strings
def clean_text(text):
    return "".join([c for c in text.lower() if c not in punctuation])

# Clean the string labels
data['text_cleaned'] = data['text'].map(clean_text)
data.head()

Unnamed: 0,label,text,text_cleaned
0,1,"Not all, but most of this story is Buster bein...",not all but most of this story is buster being...
1,1,Eric Bogosian's ability to roll from character...,eric bogosians ability to roll from character ...
2,1,I am a professional musician who was inspired ...,i am a professional musician who was inspired ...
3,0,Robin Williams is excellent in this movie and ...,robin williams is excellent in this movie and ...
4,0,This is a woeful Hollywood remake of a classic...,this is a woeful hollywood remake of a classic...


# 1. Text Vectorization

Once we have some (somewhat) clean data, we can then vectorize the corpus the standard term frequency, inverse document frequency. For the sake of time, we'll limit the overall input feature space to the top 1k tokens, based on the 

In [3]:
# Initialize a TfidfVectorizer Object
tfidf = TfidfVectorizer()

# Fit the cleaned text
tfidf.fit(data['text_cleaned'])

# Examine the total number of tokens in the text
print("Total tokens in input corpus: {}".format(len(tfidf.vocabulary_)))

# Initialize a TfidfVectorizer Object, this time with a max number of features
max_features = 1000
tfidf = TfidfVectorizer(max_features=max_features)

# Fit the cleaned text
features = tfidf.fit_transform(data['text_cleaned']).todense()
labels = data.label.values.reshape(-1,1)

# Create tuples of the feature/label pairs, 
# and perform a stratified train/test split
all_data = list(zip(features, labels))
train_data, test_data = train_test_split(all_data, stratify=labels, random_state=42)

Total tokens in input corpus: 180395


# 2a. Perceptron Classifier

For the simplest perceptron, we'll only need a single linear layer as well as a sigmoid transformation to map the output space from our linear layer into the proper probability distribution. 

Two other things that need to be considered are the choice of loss funciton and the optimization algorithm. We'll use binary cross entropy for the loss function, and stochastic gradient descent for the optimization

In [7]:
# Create a linear single linear layer, 
# with input shape of our feature space 
# and output shape of 1 (binary classification)
linear = Linear(max_features, 1, bias=True)

# Create a instance of the sigmoid function
# so we can normalize our output to the range [0,1]
sigmoid = Sigmoid()

# Binary cross entropy is an appropriate loss function 
# for this type of problem, and is implemented in the 
# `BCELoss` class in pytroch
criterion = BCELoss()

# We'll use basic stochastic gradient descent
# to optimize the parameters of our linear layer 
# (the sigmoid is a transformation with no parameters)
optim = SGD(params=linear.parameters(), lr=0.01)

# 2b. Training the Perceptron

In [17]:
total_loss = 0
for it, example in tqdm(list(enumerate(train_data))):
    optim.zero_grad()
    f, t = example
    X = torch.FloatTensor(f)
    y = torch.FloatTensor(t)
    X_prime = linear(X)
    output = sigmoid(X_prime)
    loss = criterion(output.view(-1), y)
    total_loss += loss.data.numpy()
    loss.backward()

    optim.step()

100%|██████████| 37500/37500 [00:07<00:00, 5002.98it/s]


# 2c. Evaluating the Perceptron

In [18]:
linear.eval()
y_pred = []
y_true = []
threshold = 0.5

for f, t in test_data:
    X = torch.FloatTensor(f)
    y = torch.FloatTensor([t])
    output = sigmoid(linear(X))
    y_true.append(y.data.numpy()[0])
    y_pred.append(output.data.numpy()[0])

y_pred = [int(p >= threshold) for p in y_pred]
a = accuracy_score(y_true, y_pred)

total_loss /= (it + 1)

print("Loss: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, a))

Loss: 0.55, Validation Accuracy: 0.80


# 2d. Putting it All Together

In [19]:
LEARNING_RATE = 0.01
EPOCHS = 10

linear = Linear(max_features, 1, bias=True)
sigmoid = Sigmoid()
criterion = BCELoss()
optim = SGD(params=linear.parameters(), lr=LEARNING_RATE)



for epoch in range(EPOCHS):
    total_loss = 0
    linear.train()
    for it, example in list(enumerate(train_data)):
        optim.zero_grad()
        f, t = example
        X = torch.FloatTensor(f)
        y = torch.FloatTensor(t)
        X_prime = linear(X)
        output = sigmoid(X_prime)
        loss = criterion(output.view(-1), y)
        total_loss += loss.data.numpy()
        loss.backward()

        optim.step()
        
    linear.eval()
    y_pred = []
    y_true = []
    threshold = 0.5
    
    for f, t in test_data:
        X = torch.FloatTensor(f)
        y = torch.FloatTensor(t)
        output = sigmoid(linear(X))
        y_true.append(y.data.numpy()[0])
        y_pred.append(output.data.numpy()[0])
        
    y_pred = [int(p >= threshold) for p in y_pred]
    a = accuracy_score(y_true, y_pred)



    total_loss /= (it + 1)
    print("Epoch Loss: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, a))

Epoch Loss: 0.61, Validation Accuracy: 0.80
Epoch Loss: 0.52, Validation Accuracy: 0.82
Epoch Loss: 0.47, Validation Accuracy: 0.83
Epoch Loss: 0.44, Validation Accuracy: 0.83
Epoch Loss: 0.42, Validation Accuracy: 0.84
Epoch Loss: 0.41, Validation Accuracy: 0.84
Epoch Loss: 0.39, Validation Accuracy: 0.85
Epoch Loss: 0.38, Validation Accuracy: 0.85
Epoch Loss: 0.38, Validation Accuracy: 0.85
Epoch Loss: 0.37, Validation Accuracy: 0.85


# 2e. Creating a Model Class

In [11]:
from modules.perceptron import *



model = perceptron(max_features)
optim = SGD(params=model.parameters(), lr=0.01)
criterion = BCELoss()
model = train(model, train_data, optim, criterion, epochs=10, test_data=test_data)

Epoch Loss: 0.61, Validation Accuracy: 0.80
Epoch Loss: 0.52, Validation Accuracy: 0.82
Epoch Loss: 0.47, Validation Accuracy: 0.83
Epoch Loss: 0.44, Validation Accuracy: 0.84
Epoch Loss: 0.42, Validation Accuracy: 0.84
Epoch Loss: 0.41, Validation Accuracy: 0.84
Epoch Loss: 0.39, Validation Accuracy: 0.85
Epoch Loss: 0.38, Validation Accuracy: 0.85
Epoch Loss: 0.38, Validation Accuracy: 0.85
Epoch Loss: 0.37, Validation Accuracy: 0.85


# 3. Multi-class Model

In [77]:
# Load the data into a DataFrame
data = pd.read_pickle('../data/stackoverflow_gbq.pkl')

# Definne a simple convenience function for cleaning the strings
def clean_text(text):
    return "".join([c for c in text.lower() if c not in punctuation])

# Clean the string labels
data['text_cleaned'] = data['text'].map(clean_text)

# Initialize a TfidfVectorizer Object
tfidf = TfidfVectorizer()

# Fit the cleaned text
tfidf.fit(data['text_cleaned'])

# Examine the total number of tokens in the text
print("Total tokens in input corpus: {}".format(len(tfidf.vocabulary_)))

# Initialize a TfidfVectorizer Object, this time with a max number of features
max_features = 1000
tfidf = TfidfVectorizer(max_features=max_features)

# Fit the cleaned text
features = tfidf.fit_transform(data['text_cleaned']).todense()

le = LabelEncoder()
labels = le.fit_transform(data.label.values).reshape(-1,1)

# Create tuples of the feature/label pairs, 
# and perform a stratified train/test split
all_data = list(zip(features, labels))
train_data, test_data = train_test_split(all_data, stratify=labels, random_state=42)

Total tokens in input corpus: 36873


In [102]:
# Create a linear single linear layer, 
# with input shape of our feature space 
# and output shape of 1 (binary classification)
linear = Linear(max_features, len(lb.classes_), bias=True)

# Create a instance of the sigmoid function
# so we can normalize our output to the range [0,1]
softmax = LogSoftmax()

# Binary cross entropy is an appropriate loss function 
# for this type of problem, and is implemented in the 
# `BCELoss` class in pytroch
criterion = NLLLoss()

# We'll use basic stochastic gradient descent
# to optimize the parameters of our linear layer 
# (the sigmoid is a transformation with no parameters)
optim = SGD(params=linear.parameters(), lr=0.01)

In [105]:
total_loss = 0
for it, example in tqdm(list(enumerate(train_data))):
    optim.zero_grad()
    f, t = example
    X = torch.FloatTensor(f)
    y = torch.LongTensor(t)
    X_prime = linear(X)
    output = softmax(X_prime)
    loss = criterion(output, y)
    total_loss += loss.data.numpy()
    loss.backward()

    optim.step()

  
100%|██████████| 75000/75000 [00:18<00:00, 4156.37it/s]


In [106]:
linear.eval()
y_pred = []
y_true = []
threshold = 0.5

for f, t in test_data:
    X = torch.FloatTensor(f)
    y = torch.FloatTensor([t])
    output = sigmoid(linear(X))
    y_true.append(y.data.numpy()[0])
    y_pred.append(torch.argmax(output.data).numpy())


a = accuracy_score(y_true, y_pred)

total_loss /= (it + 1)

print("Loss: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, a))

Loss: 1.23, Validation Accuracy: 0.63


In [107]:
linear = Linear(max_features, len(lb.classes_), bias=True)
softmax = LogSoftmax()
criterion = NLLLoss()
optim = SGD(params=linear.parameters(), lr=0.01)

LEARNING_RATE = 0.01
EPOCHS = 10

for epoch in range(EPOCHS):
    total_loss = 0
    linear.train()
    total_loss = 0
    for it, example in list(enumerate(train_data)):
        optim.zero_grad()
        f, t = example
        X = torch.FloatTensor(f)
        y = torch.LongTensor(t)
        X_prime = linear(X)
        output = softmax(X_prime)
        loss = criterion(output, y)
        total_loss += loss.data.numpy()
        loss.backward()

        optim.step()
        
    linear.eval()
    y_pred = []
    y_true = []
    threshold = 0.5

    for f, t in test_data:
        X = torch.FloatTensor(f)
        y = torch.FloatTensor([t])
        output = sigmoid(linear(X))
        y_true.append(y.data.numpy()[0])
        y_pred.append(torch.argmax(output.data).numpy())


    a = accuracy_score(y_true, y_pred)

    total_loss /= (it + 1)

    print("Loss: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, a))

100%|██████████| 75000/75000 [00:18<00:00, 4113.13it/s]
  1%|          | 411/75000 [00:00<00:18, 4108.41it/s]

Loss: 1.24, Validation Accuracy: 0.63


100%|██████████| 75000/75000 [00:18<00:00, 4166.37it/s]
  1%|          | 800/75000 [00:00<00:18, 4003.16it/s]

Loss: 1.01, Validation Accuracy: 0.67


100%|██████████| 75000/75000 [00:18<00:00, 4148.23it/s]
  1%|          | 830/75000 [00:00<00:17, 4143.88it/s]

Loss: 0.93, Validation Accuracy: 0.68


100%|██████████| 75000/75000 [00:17<00:00, 4256.68it/s]
  1%|          | 849/75000 [00:00<00:17, 4209.99it/s]

Loss: 0.89, Validation Accuracy: 0.69


100%|██████████| 75000/75000 [00:17<00:00, 4219.92it/s]
  1%|          | 855/75000 [00:00<00:17, 4314.55it/s]

Loss: 0.86, Validation Accuracy: 0.69


100%|██████████| 75000/75000 [00:18<00:00, 4061.18it/s]
  1%|          | 418/75000 [00:00<00:17, 4179.37it/s]

Loss: 0.84, Validation Accuracy: 0.70


100%|██████████| 75000/75000 [00:17<00:00, 4290.82it/s]
  1%|          | 869/75000 [00:00<00:17, 4354.89it/s]

Loss: 0.83, Validation Accuracy: 0.70


100%|██████████| 75000/75000 [00:17<00:00, 4301.00it/s]
  1%|          | 431/75000 [00:00<00:17, 4305.87it/s]

Loss: 0.82, Validation Accuracy: 0.70


100%|██████████| 75000/75000 [00:17<00:00, 4230.70it/s]
  1%|          | 424/75000 [00:00<00:17, 4232.52it/s]

Loss: 0.81, Validation Accuracy: 0.70


100%|██████████| 75000/75000 [00:17<00:00, 4291.14it/s]


Loss: 0.80, Validation Accuracy: 0.70


In [97]:
accuracy_score(y_true, y_pred)

0.275

In [111]:
y.shape

torch.Size([1, 1])