# Perceptron Tutorial

In [4]:
# Required imports
import torch
import numpy as np
import pandas as pd
from torch.nn import Linear
from torch.nn import Sigmoid
from torch.optim import SGD
from torch.nn import BCELoss, BCEWithLogitsLoss
from string import punctuation
import itertools
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

#


In [5]:
data = pd.read_pickle('../data/aclImdb_combined.pkl')
data.head()

Unnamed: 0,label,text
0,1,"Not all, but most of this story is Buster bein..."
1,1,Eric Bogosian's ability to roll from character...
2,1,I am a professional musician who was inspired ...
3,0,Robin Williams is excellent in this movie and ...
4,0,This is a woeful Hollywood remake of a classic...


In [5]:
def clean_text(text):
    return "".join([c for c in text.lower() if c not in punctuation])

data['text_cleaned'] = data['text'].map(clean_text)
data.head()

Unnamed: 0,label,text,text_cleaned
0,1,"Not all, but most of this story is Buster bein...",not all but most of this story is buster being...
1,1,Eric Bogosian's ability to roll from character...,eric bogosians ability to roll from character ...
2,1,I am a professional musician who was inspired ...,i am a professional musician who was inspired ...
3,0,Robin Williams is excellent in this movie and ...,robin williams is excellent in this movie and ...
4,0,This is a woeful Hollywood remake of a classic...,this is a woeful hollywood remake of a classic...


In [6]:
tfidf = TfidfVectorizer()
tfidf.fit(data['text_cleaned'])
len(tfidf.vocabulary_)

180395

In [7]:
max_features = 1000
tfidf = TfidfVectorizer(max_features=max_features)
tfidf.fit(data['text_cleaned'])
len(tfidf.vocabulary_)

1000

In [93]:
features = tfidf.transform(data['text_cleaned']).todense()
labels = data['label'].values.reshape(-1,1)

all_data = list(zip(features, labels))

train_data, test_data = train_test_split(all_data, random_state=42)

In [136]:
linear = Linear(max_features, 1, bias=True)
sigmoid = Sigmoid()
optim = SGD(params=linear.parameters(), lr=0.01)
criterion = BCELoss()

for epoch in range(3):
    total_loss = 0
    linear.train()
    for it, example in tqdm(list(enumerate(train_data))):
        optim.zero_grad()
        f, t = example
        X = torch.FloatTensor(f)
        y = torch.FloatTensor(t)
        X_prime = linear(X)
        output = sigmoid(X_prime)
        loss = criterion(output.view(-1), y)
        total_loss += loss.data.numpy()
        loss.backward()

        optim.step()
        
    linear.eval()
    y_pred = []
    y_true = []
    threshold = 0.5
    
    for f, t in test_data:
        X = torch.FloatTensor(f)
        y = torch.FloatTensor([t])
        output = sigmoid(linear(X))
        y_true.append(y.data.numpy()[0])
        y_pred.append(output.data.numpy()[0])
        
    num = np.sum([int((t == 1) & (p >= threshold)) for t,p in zip(y_true, y_pred)])
    r = num/np.sum(y_true)
    p = num/np.sum(y_pred)
        
    total_loss /= (it + 1)
    print("Epoch Loss: {:.2f}, Validation Precision/Recall: {:.2f}/{:.2f}".format(total_loss, p, r))

100%|██████████| 37500/37500 [00:09<00:00, 4059.24it/s]
  1%|          | 407/37500 [00:00<00:09, 4069.06it/s]

Epoch Loss: 0.61, Validation Precision/Recall: 0.82/0.85


100%|██████████| 37500/37500 [00:09<00:00, 4002.74it/s]
  3%|▎         | 957/37500 [00:00<00:07, 4810.26it/s]

Epoch Loss: 0.52, Validation Precision/Recall: 0.82/0.85


100%|██████████| 37500/37500 [00:08<00:00, 4618.10it/s]


Epoch Loss: 0.47, Validation Precision/Recall: 0.83/0.85


In [9]:
from modules.perceptron import *

model = perceptron(max_features)
optim = SGD(params=model.parameters(), lr=0.01)
criterion = BCELoss()
model = train(model, train_data, optim, criterion, epochs=10, test_data=test_data)

0.810959074686013 0.8193236714975846
0.6137220130681992
0.8302360859731316 0.8413848631239935
0.5218983713939785
0.839262407368199 0.8505636070853462
0.47429493470360834
0.8458239298856479 0.856682769726248
0.4441533898196618
0.8484747164729254 0.8587761674718196
0.42303142168826113
0.8497908136479267 0.8595813204508856
0.40726067930458115
0.851660580315531 0.8610305958132045
0.3949553831760337
0.8532952895890963 0.8623188405797102
0.385036900889085
0.8551857043565162 0.8639291465378421
0.37684017970465433
0.855750079236724 0.8642512077294686
0.3699311283951408


In [162]:
wordvecs_raw = [line.rstrip('\n').split() for line in open('../data/glove.6B.100d.txt', 'r').readlines()]
wordvec_lkp = {i[0]: np.array(i[1:], dtype=float) for i in wordvecs_raw}

In [190]:
def get_wordvec_rep(sentence, lkp):
    out = np.sum([lkp[i] for i in sentence.split() if i in lkp.keys()], axis=0)
    return out / np.linalg.norm(out, 2)

In [195]:
combined_lkp = {k:v for k,v in wordvec_lkp.items() if k in tfidf.vocabulary_.keys()}

In [197]:
combined_lkp

{'the': array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
        -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
         0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
        -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
         0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
        -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
         0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
         0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
        -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
        -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
        -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
        -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
        -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
        -1.2526  ,  0.071624,  

In [196]:
features = data['text_cleaned'].map(lambda x: get_wordvec_rep(x, combined_lkp))
labels = data['label'].values.reshape(-1,1)

all_data = list(zip(features, labels))

train_data, test_data = train_test_split(all_data, random_state=42)

ValueError: Improper number of dimensions to norm.

In [None]:
from modules.perceptron import *

model = perceptron(100)
optim = SGD(params=model.parameters(), lr=0.05)
criterion = BCELoss()
model = train(model, train_data, optim, criterion, epochs=10, test_data=test_data)

In [10]:
tfidf.vocabulary_

{'not': 586,
 'all': 27,
 'but': 119,
 'most': 553,
 'of': 594,
 'this': 859,
 'story': 806,
 'is': 436,
 'being': 90,
 'for': 320,
 'dead': 187,
 'shot': 756,
 'br': 109,
 'there': 849,
 'really': 684,
 'no': 583,
 'just': 452,
 'series': 743,
 'to': 870,
 'show': 759,
 'off': 595,
 'which': 949,
 'are': 53,
 'amazing': 36,
 'and': 41,
 'his': 398,
 'the': 842,
 'film': 303,
 'basically': 75,
 'one': 604,
 'after': 20,
 'other': 614,
 'mostly': 554,
 'someone': 779,
 'our': 617,
 'earlier': 230,
 'its': 441,
 'couple': 169,
 'on': 602,
 'their': 844,
 'through': 865,
 'later': 474,
 'big': 97,
 'joe': 448,
 'cop': 164,
 'father': 290,
 'girl': 346,
 'interested': 432,
 'in': 425,
 'who': 952,
 'him': 396,
 'those': 860,
 'scenes': 724,
 'were': 941,
 'best': 93,
 'thought': 862,
 'with': 962,
 'lot': 507,
 'where': 947,
 'daughter': 182,
 'live': 497,
 'that': 840,
 'was': 926,
 'at': 59,
 'half': 367,
 'hour': 407,
 'makes': 516,
 'little': 496,
 'sense': 739,
 'or': 610,
 'near': 57

# Beyond TFIDF: Global Word Representations

# Simple Sentiment Analysis

# Recurrent Neural Networks and Sequential Data

# Sequence to Sequence Modeling

In [1]:
from IPython.display import Image
from IPython.core.display import HTML

from traitlets.config.manager import BaseJSONConfigManager
path = "/Users/zacharybrown/anaconda3/envs/rise_latest/etc/jupyter/nbconfig"
cm = BaseJSONConfigManager(config_dir=path)
o = cm.update("livereveal", {
              "theme": "sky",
              "transition": "fade",
              "start_slideshow_at": "selected",
})