In [252]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import autograd, optim, nn
import torchvision
import torchvision.transforms as transforms

import numpy as np
import random
from random import shuffle
from collections import Counter
import argparse

import io
import gensim
#from gensim.models import Word2Vec

# Load and Preprocessing data

In [115]:
# Load the corpus
part = 'full'

print("loading...")
if part=="part":
    text = open('rt-polaritydata/rt-polaritydata/rt-polarity.pos', mode='r', encoding='utf-16').readlines()[0][:10000] #Load a part of corpus for debugging
elif part=="full":
    neg_text = open('rt-polaritydata/rt-polaritydata/rt-polarity.neg',mode='r', encoding='utf-16').readlines()
    pos_text = open('rt-polaritydata/rt-polaritydata/rt-polarity.pos',mode='r', encoding='utf-16').readlines()
else:
    print("Unknown argument : " + part)
    exit()


loading...


In [565]:
# Preprocessing
print("preprocessing...")

corpus=[]
for sentence in neg_text:
    corpus.append((sentence.rstrip(), 0))
for sentence in pos_text:
    corpus.append((sentence.rstrip(), 1))

# Create Vocabulary Dicts
words=[]
text = neg_text + pos_text
for sentence in text:
    words = words + sentence.split()
    
vocab = set(words)
w2i = {}
w2i[' '] = 0
i = 1
for word in vocab:
    w2i[word] = i
    i += 1   
i2w = {}
for k, v in w2i.items():
    i2w[v] = k
    
corpus_size = len(corpus)
vocab_size = len(vocab)
words_size = len(words)
print("Done. total_sentences: %d, total_words: %d, vocab: %d" %(corpus_size, words_size, vocab_size))

preprocessing...
Done. total_sentences: 10662, total_words: 224067, vocab: 21416


In [566]:
# Divide Training set & Test set
def divide(x, train_prop):
    random.shuffle(x)
    x_train = x[:round(train_prop * len(x))]
    x_test = x[round(train_prop * len(x)):]
    return x_train, x_test

corpus_train, corpus_test = divide(corpus, 0.9)
train_size = len(corpus_train)
test_size = len(corpus_test)
print("Dividing sets done. Train_sentences: %d, Test_sentences: %d" %(train_size, test_size))

Dividing sets done. Train_sentences: 9596, Test_sentences: 1066


In [734]:
mode = 'pretrained'
def create_set(corpus, vocab, w2i, mode):
    set = []
    for i, (sentence, label) in enumerate(corpus):
        activated = []
        words = sentence.split()
        if mode == 'rand':
            for word in words:
                if word not in vocab:
                    continue
                activated.append(w2i[word])
            set.append((activated, label))
        elif mode == 'pretrained':
            set.append((words, label))
    return set

In [735]:
# 3. Create Training set & Test set
corpus_train, corpus_test = divide(corpus, 0.9)
train_set = create_set(corpus_train, vocab, w2i, mode)
test_set = create_set(corpus_test, vocab, w2i, mode)
print("Created %d size of train_set and %d of test_set." % (len(train_set), len(test_set)))

Created 9596 size of train_set and 1066 of test_set.


In [736]:
train_set[:10]

[(['since',
   'lee',
   'is',
   'a',
   'sentimentalist',
   ',',
   'the',
   'film',
   'is',
   'more',
   'worshipful',
   'than',
   'your',
   'random',
   'e',
   '!',
   'true',
   'hollywood',
   'story',
   '.'],
  0),
 (['an',
   'emotionally',
   'and',
   'spiritually',
   'compelling',
   'journey',
   'seen',
   'through',
   'the',
   'right',
   'eyes',
   ',',
   'with',
   'the',
   'right',
   'actors',
   'and',
   'with',
   'the',
   'kind',
   'of',
   'visual',
   'flair',
   'that',
   'shows',
   'what',
   'great',
   'cinema',
   'can',
   'really',
   'do',
   '.'],
  1),
 (['might', 'best', 'be', 'enjoyed', 'as', 'a', 'daytime', 'soaper', '.'], 0),
 (['naipaul',
   'fans',
   'may',
   'be',
   'disappointed',
   '.',
   'those',
   'who',
   'are',
   'not',
   'acquainted',
   'with',
   'the',
   "author's",
   'work',
   ',',
   'on',
   'the',
   'other',
   'hand',
   ',',
   'may',
   'fall',
   'fast',
   'asleep',
   '.'],
  0),
 (['"',
   'bro

# Embedding

In [699]:
# Create PRETRAINED embedding model
pretrained_model = gensim.models.KeyedVectors.load_word2vec_format('pretrained_word2vec/GoogleNews-vectors-negative300.bin', binary=True)


KeyboardInterrupt: 

Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
pretrained_model['school']

In [None]:
s = np.random.normal(0, 0.1, 300)
len(s)

In [None]:
pretrained_model.add('schood', [s], replace=False)

In [625]:
pretrained_model['schood']

AttributeError: 'numpy.ndarray' object has no attribute 'index'

# Model

In [767]:
# TextCNN neural network 
class TextCNN(nn.Module):
    def __init__(self, vocab_size, emb_size, mode, num_filter = 100, window_sizes=(3, 4, 5)):
        super(TextCNN, self).__init__()
        
        if mode == 'rand':
            # Initialize random embeddings 
            self.embedding = nn.Embedding(vocab_size, emb_size)
        elif mode == 'pretrained':
            self.embedding = pretrained_model
            
        self.convs = nn.ModuleList([nn.Conv1d(1, 100, [window_size, emb_size], padding=(window_size -1, 0)) 
                                   for window_size in window_sizes])

        self.fc = nn.Linear(num_filter * len(window_sizes), 2)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=0.5)

    def forward(self, x, mode):
        #######################################################################################
        # (Matrix Size Info)
        # B = batch size
        # C = channel dimension
        # L = this batch's max sentence length
        # E = embedding dimension
        #######################################################################################
        
        if mode == 'rand': # x = activated index
            x = torch.LongTensor(x)
            x = self.embedding(x)
            
        elif mode == 'pretrained': # x = words lists
            # Tricky partIf a word is OOV, should be initialized first.
            xs = []
            for batch_element in x:
                xw = []
                for word in batch_element:
                    if word in self.embedding:
                        xw.append(self.embedding[word])
                    else:
                        rand_vec = np.random.normal(0, 0.1, 300)
                        xw.append(rand_vec)
                xs.append(xw)
            x = torch.DoubleTensor(xs)
            
        else:
            print("Unknown mode. Terminated")
            return mode
        
        x = torch.unsqueeze(x, 1)             # [B, C, L, E] Add a channel dim.
        temp = []
        for conv in self.convs:
            x2 = self.relu(conv(x))           # [B, F, L, 1]
            x2 = torch.squeeze(x2, -1)        # [B, F, L]
            x2 = F.max_pool1d(x2, x2.size(2)) # [B, F, 1]
            temp.append(x2)
        x = torch.cat(temp, 2)                # [B, F, window]

        # Drop & FC
        x = self.drop(x)
        flatten = x.view(x.size(0), -1)       # [B, F * window]
        logits = self.fc(flatten)             # [B, class]

        # Regularization
        norm = model.fc.weight.norm()
        if norm > 3:
            rescaled = model.fc.weight * 3 / norm
            model.fc.weight = nn.Parameter(rescaled)
        
        # Prediction
        probs = F.softmax(logits)       # [B, class]
        classes = torch.max(probs, 1)[1]# [B]

        return probs, classes


In [768]:
model = TextCNN(vocab_size, 300, mode)
if torch.cuda.is_available():
    model.cuda()

In [769]:
#Hyperparameter
num_epochs = 5
num_batch = 50

In [770]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

emb = rand_emb

In [765]:
def trainer(train_set, num_epochs, num_batch, model, optimizer, loss_function, mode):
    print("Start training with %d num of train data" %(len(train_set)))
    total_step = int(len(train_set) / num_batch)
    for epoch in range(num_epochs):
        end = 0
        for step in range(total_step):
            # Batch organization
            start = end
            end = start + num_batch
            wordlists, labels = list(zip(*train_set[start:end]))
            wordlists = list(wordlists)

            # Add padding to resize the data
            max_len = 0
            for wordlist in wordlists:
                temp = len(wordlist)
                if temp > max_len:
                    max_len = temp  # max length for 'this' batch
            for i in range(num_batch):
                if mode == 'rand':
                    wordlists[i] = wordlists[i] + [0] * (max_len - len(wordlists[i]))
                elif mode == 'pretrained':
                    wordlists[i] = wordlists[i] + [' '] * (max_len - len(wordlists[i]))
                    
                    
            labels = torch.tensor(labels)
            if mode == 'rand': # x = activated index
            x = torch.LongTensor(x)
            x = self.embedding(x)
            
            elif mode == 'pretrained': # x = words lists
                # Tricky partIf a word is OOV, should be initialized first.
                xs = []
                for batch_element in x:
                    xw = []
                    for word in batch_element:
                        if word in self.embedding:
                            xw.append(self.embedding[word])
                        else:
                            rand_vec = np.random.normal(0, 0.1, 300)
                            xw.append(rand_vec)
                    xs.append(xw)
                x = torch.DoubleTensor(xs)
            
            else:
                print("Unknown mode. Terminated")
                return mode

            # Forward pass
            probs, classes = model(wordlists, mode, train)

            # Backpropagation
            optimizer.zero_grad()
            losses = loss_function(probs, labels)
            losses.backward()
            optimizer.step()

            if (step + 1) % 10 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, step + 1, total_step,
                                                                         losses.item()))

    print("Training Done.")
    return model

In [766]:
# 4. Train model
num_epochs = 5
num_batch = 50

model = TextCNN(vocab_size, 300, mode)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

trained_model = trainer(train_set, num_epochs, num_batch, model, optimizer, loss_function, mode)

Start training with 9596 num of train data


NameError: name 'self' is not defined

In [745]:
test_set[0]

(['the',
  'band',
  'performances',
  'featured',
  'in',
  'drumline',
  'are',
  'red',
  'hot',
  '.',
  '.',
  '.',
  '[but]',
  'from',
  'a',
  'mere',
  'story',
  'point',
  'of',
  'view',
  ',',
  'the',
  "film's",
  'ice',
  'cold',
  '.'],
 0)

In [760]:
# Predict test set
with torch.no_grad():
    model.eval()
    corrects = 0
    for wordlist, label in test_set:
        probs, classes = model(wordlist, mode)
        if classes.item() == label:
            corrects += 1

    print("Accuracy: ", corrects/test_size)

26
batch


ValueError: expected sequence of length 3 at dim 1 (got 4)

In [465]:
inputs = torch.LongTensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
probs, classes = model(inputs.unsqueeze(0))
print(probs, classes)

tensor([[0.5850, 0.4150]], grad_fn=<SoftmaxBackward>) tensor([0])




In [807]:
def tester(train_set, model, optimizer, loss_function, mode):
    num_epochs=1
    num_batch=10
    test_size = len(test_set)
    print("Start test with %d num of test data" %(test_size))
    total_step = int(test_size / num_batch)
    corrects = 0
    for epoch in range(num_epochs):
        end = 0
        for step in range(total_step):
            # Batch organization
            start = end
            end = start + num_batch
            wordlists, labels = list(zip(*test_set[start:end]))
            wordlists = list(wordlists)

            # Add padding to resize the data
            max_len = 0
            for wordlist in wordlists:
                temp = len(wordlist)
                if temp > max_len:
                    max_len = temp  # max length for 'this' batch
            for i in range(num_batch):
                if mode == 'rand':
                    wordlists[i] = wordlists[i] + [0] * (max_len - len(wordlists[i]))
                elif mode == 'pretrained':
                    wordlists[i] = wordlists[i] + [' '] * (max_len - len(wordlists[i]))

            # Forward pass
            probs, classes = model(wordlists, mode)
            
            classes = classes.tolist()
            
            for i in range(num_batch):
                if classes[i] == labels[i]:
                    corrects += 1

    print("Accuracy: ", corrects/test_size)
    return corrects/test_size

In [808]:
acuuracy = tester(test_set, model, optimizer, loss_function, mode)

Start test with 1066 num of test data




Accuracy:  0.4906191369606004
