In [1]:
import numpy as np
import random
import pickle as pkl
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from data_transform_pipeline import DataTransformPipeline
from data import Data
from sklearn.model_selection import train_test_split
# from tensorboard_logger import Logger
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
%matplotlib inline
# Default figure size (width, height) applied to every matplotlib figure in this notebook.
plt.rcParams["figure.figsize"] = (14,9)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [277]:
# Load the two pickled story collections: horror stories and everything else.
# NOTE: pickle can run arbitrary code while loading - only open files you trust.
with open('horror.pkl', 'rb') as horror_file:
    horror_df = pkl.load(horror_file)
with open('not_horror.pkl', 'rb') as other_file:
    other_df = pkl.load(other_file)

In [278]:
# Pull the 'story' column of each collection out as a plain Python list.
horror_stories, other_stories = list(horror_df['story']), list(other_df['story'])

In [279]:
# Cut every story down to its first MAX_STORY_LENGTH elements, updating both lists in place.
# MAX_STORY_LENGTH is assigned in the Constants cell further down, so that cell must run first.
# NOTE(review): if the stories are raw strings this slice counts characters, not tokens - confirm.
horror_stories[:] = [story[:MAX_STORY_LENGTH] for story in horror_stories]
other_stories[:] = [story[:MAX_STORY_LENGTH] for story in other_stories]

In [280]:
# Balance the classes: shuffle the non-horror stories with a freshly drawn seed,
# then keep only as many of them as there are horror stories.
# NOTE(review): the seed is drawn at random and never recorded, so the selection
# differs between runs.
seed = random.randint(1, 100)
random.Random(seed).shuffle(other_stories)
other_stories = other_stories[:len(horror_stories)]

In [281]:
# Horror stories come first in the combined list, so they receive label 1
# and the remaining stories receive label 0.
stories = horror_stories + other_stories
labels = [1] * len(horror_stories) + [0] * len(other_stories)

# Shuffling both lists with generators built from the same seed applies the
# same permutation to each, which keeps every story next to its label.
seed = random.randint(1, 100)
random.Random(seed).shuffle(stories)
random.Random(seed).shuffle(labels)

### Constants

In [7]:
# Genre names treated as the positive class when rows are selected by `genre1`.
genre_categories = ['Horror']

# Vocabulary index reserved for the '<pad>' token.
PAD_IDX = 0
# Vocabulary index reserved for the '<unk>' token (anything missing from the vocabulary).
UNK_IDX = 1
# Cap applied wherever stories are sliced, and the width every batch is padded to.
MAX_STORY_LENGTH = 200
# Number of stories per batch in the training DataLoader.
BATCH_SIZE = 8

In [209]:
# Split `content` by whether `genre1` is one of genre_categories, then down-sample
# the remaining stories to the size of the selected class.
# NOTE(review): `content` is not defined in the visible part of this notebook - confirm
# it is loaded before this cell runs.
# NOTE(review): the variables are named romance_* although genre_categories is
# ['Horror'] - presumably a leftover from an earlier experiment; verify.
romance_content = content.loc[content['genre1'].isin(genre_categories)]
romance_stories = list(romance_content['story'])
other_content = content.loc[~content['genre1'].isin(genre_categories)]
full_other_stories = list(other_content['story'])
# Shuffle with a freshly drawn seed before taking the first len(romance_stories) items.
seed = random.randint(1, 100)
random.Random(seed).shuffle(full_other_stories)
other_stories = full_other_stories[0:len(romance_stories)]

In [210]:
# The selected-genre stories come first in the combined list, so they receive
# label 1 and the remaining stories receive label 0.
stories = romance_stories + other_stories
labels = [1] * len(romance_stories) + [0] * len(other_stories)

# Generators built from the same seed permute both lists identically,
# so stories and labels stay aligned after shuffling.
seed = random.randint(1, 100)
random.Random(seed).shuffle(stories)
random.Random(seed).shuffle(labels)

In [218]:
# Spot-check one randomly chosen (label, story) pair.
# randrange(len(stories)) can return any valid index, including 0, and never runs
# past the end of the list; the name also no longer shadows the builtin `id`.
sample_idx = random.randrange(len(stories))
print(labels[sample_idx])
print(stories[sample_idx])

0
"Okay, first we…" Matt took a second to read the instructions. "Set the two bed rails down on the part of the floor where you want the bed to go.""Right." Scott said. He grunted as he lifted the rails to move them to the corner of the room. "What now?" he asked."Err… screw the headboard and rails together." He said.Just a few minutes ago Matt had been writing in his diary about Scott when a huge bang came from the twins' room. Matt had run inside to see tools scattered around and Scott clutching his foot in pain "DAMN TOOL BOX!""Um, are you okay?" Matt asked snickering."Yes." Scott grunted."Sure you are. Need some help?" He teased."No. I can do it myself.""You sureeee?""Yes.""Positive?""Yup.""Certain?" he persisted"Yes! I'm damn sure!""Whatever you say, Scott.""Shut up, Matthew." Scott grumbledMatt muttered nonchalant words under his breath while turning to the door."Wait! Err… can you read me the instructions?""Sure," he said.He wanted to say "Does little Scottie not know how to mak

### Data Pipeline

In [219]:
def load_vectors(fname, vocab_size=None):
    """Read word vectors from a text file into a dict of numpy arrays.

    The first line of the file must hold two integers (they are printed as
    `n d`). Every following line is split on single spaces: the first field is
    the token, the remaining fields are parsed as floats.

    @param fname: path of the vector file
    @param vocab_size: maximum number of vector lines to read; when None the
        notebook-level constant VOCAB_SIZE is used
    @return: dict mapping token -> 1-D float numpy array
    """
    if vocab_size is None:
        # Fall back to the notebook-level constant when no explicit limit is given.
        vocab_size = VOCAB_SIZE
    data = {}
    # The built-in open() needs no extra import, and the with-block guarantees
    # the file is closed even if a line fails to parse.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        print(n, d)
        for i, line in enumerate(fin):
            if i == vocab_size:
                break
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data

In [220]:
from collections import Counter
# index 0 is reserved for padding and index 1 for unknown tokens
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(vectors_path="fasttext_word_vectors.p"):
    """Load pickled word vectors and build the token <-> index lookups.

    @param vectors_path: path of a pickled dict mapping token -> vector
    @return: (word_vectors, token2id, id2token) where
        word_vectors is the unpickled dict,
        token2id maps each token to its index ('<pad>' -> PAD_IDX, '<unk>' -> UNK_IDX,
            every other token numbered from 2 in dict order),
        id2token is the list such that id2token[i] is the token with index i.
    """
    # NOTE: pickle can run arbitrary code while loading - only open files you trust.
    # The with-block closes the file handle that the bare open() call used to leak.
    with open(vectors_path, "rb") as vectors_file:
        word_vectors = pkl.load(vectors_file)
    id2token = ['<pad>', '<unk>'] + list(word_vectors.keys())
    # Real tokens start at 2 because indices 0 and 1 belong to the special tokens.
    token2id = {token: idx for idx, token in enumerate(word_vectors, start=2)}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return word_vectors, token2id, id2token

In [221]:
# Class id of a genre is its position in genre_categories; class2genre is the reverse lookup.
genre2class = {genre: class_id for class_id, genre in enumerate(genre_categories)}
class2genre = [*genre_categories]

def convert_genres_to_integers(genre):
    """Return the integer class id stored for `genre` in `genre2class`.

    Raises KeyError when `genre` is not a key of `genre2class`.
    """
    class_id = genre2class[genre]
    return class_id

def verify_order(stories, labels):
    """Print one randomly chosen story together with its genre name.

    Used as a visual sanity check that `stories` and `labels` are still aligned.

    @param stories: list of stories
    @param labels: list of integer class ids; labels[i] belongs to stories[i]
    """
    # randrange(len(stories)) yields 0 .. len(stories) - 1. The previous
    # randint(1, len(stories)) never picked index 0 and could return
    # len(stories), which raised IndexError.
    i = random.randrange(len(stories))
    print(class2genre[labels[i]])
    print("\n")
    print(stories[i])

In [222]:
# Split a story into sentences, then word-tokenize every sentence into one flat list
def tokenize_story(story):
    """Return all word tokens of `story` as a single flat list.

    The story is first split into sentences with nltk.sent_tokenize; each
    sentence is then passed to word_tokenize and the results are joined in
    sentence order.
    """
    words = []
    for sentence in nltk.sent_tokenize(story):
        # extend() appends in place; `words = words + ...` copied the whole
        # accumulated list again for every sentence.
        words.extend(word_tokenize(sentence))
    return words

In [223]:
# "one-hot encode": convert each token to id in vocabulary vector (token2id)
def token2index_dataset(tokens_data):
    """Replace every token by its vocabulary index.

    Tokens are looked up in the notebook-level `token2id` mapping; a token that
    is missing from it becomes `UNK_IDX`.

    @param tokens_data: iterable of token lists (one per story)
    @return: list of index lists, in the same order as the input
    """
    return [
        [token2id.get(token, UNK_IDX) for token in tokens]
        for tokens in tokens_data
    ]

### Creating vocabulary & embedding matrix from FastText

In [267]:
# Load the pickled vectors and derive both lookups (token -> index and index -> token).
word_vectors, token2id, id2token = build_vocab()

In [268]:
# Build the embedding matrix so that row i holds the vector of vocabulary index i:
# row 0 is '<pad>', row 1 is '<unk>', and the pre-trained vectors follow.
_weights = np.array(list(word_vectors.values()))
pad_vec = np.zeros((1, 300))               # padding is the all-zero vector
unk_vec = 0.01 * np.random.randn(1, 300)   # small random vector for unknown tokens
pad_unk_vecs = np.concatenate((pad_vec, unk_vec), axis=0)
_WEIGHTS = np.concatenate((pad_unk_vecs, _weights), axis=0)
_WEIGHTS.shape

(50002, 300)

In [11]:
# Load a previously saved embedding matrix; this replaces any _WEIGHTS already in memory.
# NOTE: pickle can run arbitrary code while loading - only open files you trust.
with open('_WEIGHTS.pkl', 'rb') as weights_file:
    _WEIGHTS = pkl.load(weights_file)

### Pre-process data for the models
#### Shuffle, word-tokenize, and map each token to its vocabulary index

In [282]:
def pipeline1(stories, labels):
    """Tokenize every story and convert the tokens to vocabulary indices.

    Prints a running count after every 100 stories, then the final count and
    "done!". The labels are returned unchanged.

    @param stories: iterable of raw story texts
    @param labels: labels belonging to the stories
    @return: (list of index lists, labels)
    """
    tokenized = []
    count = 0
    for count, story in enumerate(stories, start=1):
        tokenized.append(tokenize_story(story))
        if count % 100 == 0:
            print(count)
    print(count)
    print("done!")
    return token2index_dataset(tokenized), labels
    
            

def data_pipeline(stories, genre_labels, verify=True):
    """Shuffle, tokenize and index-encode stories that carry genre-name labels.

    Steps: genre names -> class ids, paired shuffle of stories and labels,
    optional printed sanity check, tokenization of every string story, and
    conversion of tokens to vocabulary indices. Entries of `stories` that are
    not strings are dropped together with their label.

    @param stories: list of raw story texts (not modified)
    @param genre_labels: genre name of each story, same length as `stories`
    @param verify: when True, print one random story with its genre
    @return: (stories_indices, truth_labels) - index lists and matching class ids
    @raise ValueError: if `stories` and `genre_labels` differ in length
    """
    labels = [convert_genres_to_integers(g) for g in genre_labels]

    # Work on a copy so the caller's list is not reordered as a side effect.
    stories = list(stories)
    if len(stories) != len(labels):
        # A paired shuffle only keeps the lists aligned when they have equal length.
        raise ValueError("stories and genre_labels must have the same length")

    seed = random.randint(1, 100)
    print("Random seed for shuffling: {}".format(seed))
    # Two generators built from the same seed apply the same permutation to
    # both lists, so every story keeps its label.
    random.Random(seed).shuffle(stories)
    random.Random(seed).shuffle(labels)

    if verify:
        # Only announce the check when it is actually performed.
        print("\nVerifying that the data and label match after shuffling")
        verify_order(stories, labels)

    print("\nTokenizing stories...")
    tokenized = []
    truth_labels = []
    for story, label in zip(stories, labels):
        # Entries that are not strings are skipped along with their label.
        if isinstance(story, str):
            tokenized.append(tokenize_story(story))
            truth_labels.append(label)
    print("done!")

    print(len(tokenized))
    print(len(truth_labels))

    print("\nOne-hot encoding words (converting words to ids, corresponding to vocabulary)")
    stories_indices = token2index_dataset(tokenized)
    print("done!")

    return (stories_indices, truth_labels)

### DataLoader

In [8]:
import numpy as np
import torch
from torch.utils.data import Dataset

class StoriesDataset(Dataset):
    """
    Pairs index-encoded stories with their labels in a form PyTorch data loaders can read
    """

    def __init__(self, stories_data_list, target_list):
        """
        @param stories_data_list: list of stories, each a sequence of token indices
        @param target_list: list of correct labels (entry i labels story i)
        """
        # Both lists must line up one-to-one.
        assert (len(stories_data_list) == len(target_list))
        self.stories_data_list = stories_data_list
        self.target_list = target_list

    def __len__(self):
        """Number of stories in the dataset."""
        return len(self.stories_data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        Returns [story token indices cut to MAX_STORY_LENGTH, label]
        """
        story = self.stories_data_list[key]
        return [story[:MAX_STORY_LENGTH], self.target_list[key]]

def stories_collate_func(batch):
    """
    Custom function that pads every story in the batch to MAX_STORY_LENGTH

    @param batch: list of [token indices, label] items as produced by the dataset
    @return: [LongTensor (batch, MAX_STORY_LENGTH) of padded indices,
              LongTensor of story lengths before padding,
              LongTensor of labels]
    """
    stories_data_list = []
    stories_length_list = []
    label_list = []
    for datum in batch:
        # Force int64 so an empty story (which numpy would otherwise type as
        # float64) cannot turn the whole batch into floats, and clip so an
        # over-long story cannot produce a negative pad width.
        tokens = np.asarray(datum[0][:MAX_STORY_LENGTH], dtype=np.int64)
        label_list.append(datum[1])
        stories_length_list.append(len(tokens))
        padded_vec = np.pad(tokens, pad_width=(0, MAX_STORY_LENGTH - len(tokens)),
                            mode="constant", constant_values=PAD_IDX)
        stories_data_list.append(padded_vec)
    # reshape keeps the result two-dimensional even for an empty batch
    stacked = np.array(stories_data_list, dtype=np.int64).reshape(len(stories_data_list), MAX_STORY_LENGTH)
    return [torch.from_numpy(stacked),
            torch.LongTensor(stories_length_list), torch.LongTensor(label_list)]

In [91]:
# Summary of the `genre1` column (count, number of unique values, most frequent value).
# NOTE(review): `content` is not defined in the visible part of this notebook - confirm it is loaded first.
content['genre1'].describe()

count        3999
unique          3
top       Romance
freq         3453
Name: genre1, dtype: object

### Train dataset

In [283]:
# Earlier variant that took stories and genre names straight from `content`
# and ran them through data_pipeline (left disabled):
# stories = list(content['story'])
# labels = list(content['genre1'])

# train_stories_indices, train_labels = data_pipeline(stories, labels)

# Tokenize and index-encode the balanced, already shuffled stories; labels pass through unchanged.
train_stories_indices, train_labels = pipeline1(stories, labels)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6840
done!


In [9]:
# Wrap the index-encoded stories and their labels in a dataset PyTorch can batch.
train_dataset = StoriesDataset(train_stories_indices, train_labels)
# Batches are padded by stories_collate_func. Shuffling is left disabled here,
# so the loader relies on the one-off shuffle done when the data was prepared.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE, 
                                           collate_fn=stories_collate_func,
                                           #shuffle=True
                                          )
print("Finished creating train_loader.")

Finished creating train_loader.


In [291]:
# Persist the index-encoded stories and labels as well: these are the two files
# the reload cell below reads, and nothing else in the notebook writes them.
with open('train_stories_indices.pkl', 'wb') as pickle_file:
    pkl.dump(train_stories_indices, pickle_file)
with open('train_labels.pkl', 'wb') as pickle_file:
    pkl.dump(train_labels, pickle_file)

# Also keep the dataset and loader objects.
# NOTE(review): unpickling these presumably requires StoriesDataset and
# stories_collate_func to be defined again in the loading session - confirm.
with open('train_dataset.pkl', 'wb') as pickle_file:
    pkl.dump(train_dataset, pickle_file)
with open('train_loader.pkl', 'wb') as pickle_file:
    pkl.dump(train_loader, pickle_file)

In [4]:
# Reload the cached index-encoded stories and their labels.
# NOTE: pickle can run arbitrary code while loading - only open files you trust.
with open('train_stories_indices.pkl', 'rb') as indices_file:
    train_stories_indices = pkl.load(indices_file)

with open('train_labels.pkl', 'rb') as labels_file:
    train_labels = pkl.load(labels_file)

In [171]:
# Sanity check: list the distinct label values present in the training labels.
np.unique(train_labels)

array([0, 1])

### RNN

In [17]:
class GRU(nn.Module):
    """
    Bidirectional GRU classifier on top of pre-trained word embeddings.

    A batch of padded index sequences is embedded, run through the GRU as a
    packed sequence, and the final forward and backward hidden states of the
    last GRU layer are concatenated and passed through two linear layers.
    """

    def __init__(self, embedding, embedding_size, hidden_size, num_layers, num_classes):
        # RNN Accepts the following hyperparams:
        # embedding: FloatTensor of pre-trained vectors, one row per vocabulary index
        # embedding_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        super(GRU, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        # from_pretrained keeps the embedding weights frozen by default
        self.embedding = nn.Embedding.from_pretrained(embedding)
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.linear1 = nn.Linear(2*hidden_size, 100)
        self.linear2 = nn.Linear(100, num_classes)

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (2 * num_layers, batch_size, hidden_size) because the GRU is bidirectional
        # NOTE(review): the initial state is drawn at random on every call, so
        # predictions are not deterministic even in eval mode - confirm this is intended.
        hidden = torch.randn(2*self.num_layers, batch_size, self.hidden_size)
        # Follow the module's own device instead of a notebook-level global.
        return hidden.to(self.embedding.weight.device)

    def forward(self, data, lengths):
        """
        @param data: LongTensor (batch, seq_len) of padded token indices
        @param lengths: length of every sequence before padding (tensor or list)
        @return: logits of shape (batch, num_classes), in the original batch order
        """
        model_device = self.embedding.weight.device
        batch_size = data.size(0)

        # pack_padded_sequence needs CPU lengths of at least 1; an all-padding
        # story is therefore treated as a single padding token.
        lengths = torch.as_tensor(lengths, dtype=torch.long).cpu()
        lengths = lengths.clamp(min=1, max=data.size(1))

        # sort by decreasing length (required for packing) and remember how to undo it
        sorted_lengths, sort_order = torch.sort(lengths, descending=True)
        _, restore_order = torch.sort(sort_order)
        sort_order = sort_order.to(model_device)
        restore_order = restore_order.to(model_device)

        sorted_stories = torch.index_select(data.to(model_device), 0, sort_order)

        # get embedding
        embed = self.embedding(sorted_stories)

        self.hidden = self.init_hidden(batch_size)

        # pack padded sequence
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, sorted_lengths.tolist(), batch_first=True)

        # fprop though RNN
        gru_out, self.hidden = self.gru(embed, self.hidden)

        ### MATCHING BACK to the original batch order
        self.hidden = torch.index_select(self.hidden, 1, restore_order)

        # The last two entries of h_n are the forward and backward states of the
        # LAST layer; indexing 0 and 1 would pick the first layer when num_layers > 1.
        self.hidden = torch.cat([self.hidden[-2, :, :], self.hidden[-1, :, :]], dim=1)
        # batch_size by 2*hidden_size

        # batch_size by 100
        linear1 = F.relu(self.linear1(self.hidden))
        # batch_size by num_classes
        logits = self.linear2(linear1)
        return logits

In [20]:
def test_model(loader, model):
    """
    Test the model's performance on a dataset
    """
    correct = 0
    total = 0
    model.eval()
    for (data, lengths, labels) in loader:
        data_batch, length_batch, label_batch = data.to(device), lengths.to(device), labels.to(device)
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        total += label_batch.size(0)
        correct += predicted.eq(label_batch.view_as(predicted)).sum().item()
    return (100 * correct / total)

def train_model(model, lr = 0.001, num_epochs = 7, criterion = nn.CrossEntropyLoss(),
                loader = None, val_loader = None, eval_every = 10, val_every = 100):
    """
    Train `model` with Adam and report accuracy while training.

    @param model: module called as model(data, lengths)
    @param lr: Adam learning rate
    @param num_epochs: number of passes over the training loader
    @param criterion: loss applied to (outputs, labels)
    @param loader: training batches of (data, lengths, labels); defaults to the
        notebook-level `train_loader`
    @param val_loader: optional validation batches; when None no validation is run
    @param eval_every: report training accuracy every this many steps
    @param val_every: report validation accuracy every this many steps
    @return: (max_val_acc, losses, xs, val_accs) - best validation accuracy,
        per-step training losses as Python floats, number of steps covered by
        validation, and the list of validation accuracies
    """
    if loader is None:
        # Fall back to the notebook-level training loader when none is given.
        loader = train_loader
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    max_val_acc = 0
    losses = []
    xs = 0
    val_accs = []
    for epoch in range(num_epochs):
        for i, (data, lengths, labels) in enumerate(loader):
            # test_model switches to eval mode, so re-enable train mode every step
            model.train()
            data_batch, length_batch, label_batch = data.to(device), lengths.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            # Store a plain float: keeping the tensor would keep the autograd
            # graph of every training step alive.
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
            if i > 0 and i % eval_every == 0:
                print(loss)
                train_acc = test_model(loader, model)
                print('Epoch: [{}/{}], Step: [{}/{}], Train Acc: {}'.format(
                           epoch+1, num_epochs, i+1, len(loader), train_acc))
            # validate every `val_every` iterations when a validation loader is given
            if val_loader is not None and i > 0 and i % val_every == 0:
                val_acc = test_model(val_loader, model)
                val_accs.append(val_acc)
                xs += val_every
                if val_acc > max_val_acc:
                    max_val_acc = val_acc
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                           epoch+1, num_epochs, i+1, len(loader), val_acc))
                print('Epoch: [{}/{}], Step: [{}/{}], Training Loss: {}'.format(
                           epoch+1, num_epochs, i+1, len(loader), loss))

    if val_loader is not None:
        print("Max Validation Accuracy: {}".format(max_val_acc))
    return max_val_acc, losses, xs, val_accs

In [21]:
# Convert the embedding matrix to a float tensor for the model's embedding layer.
embedding = torch.FloatTensor(_WEIGHTS)
# Single-layer bidirectional GRU over 300-dimensional embeddings with two output classes.
model = GRU(embedding, embedding_size = 300, hidden_size=350, num_layers=1, num_classes=2).to(device)
# Train for 5 epochs; progress is printed by train_model.
max_val_acc, losses, xs, val_accs = train_model(model, num_epochs=5)

tensor(0.7628, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [11/855], Train Acc: 49.985380116959064
tensor(0.6553, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [21/855], Train Acc: 53.099415204678365
tensor(0.7941, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [31/855], Train Acc: 53.58187134502924
tensor(0.7130, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [41/855], Train Acc: 57.00292397660819
tensor(0.7058, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [51/855], Train Acc: 54.91228070175438
tensor(0.7328, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [61/855], Train Acc: 50.74561403508772
tensor(0.5865, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [71/855], Train Acc: 57.280701754385966
tensor(0.7019, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [81/855], Train Acc: 58.99122807017544
tensor(0.7603, device='cuda:0', grad_fn=<NllLossBackw

Epoch: [1/5], Step: [701/855], Train Acc: 64.34210526315789
tensor(0.7737, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [711/855], Train Acc: 65.13157894736842
tensor(0.6196, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [721/855], Train Acc: 64.91228070175438
tensor(0.6266, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [731/855], Train Acc: 63.01169590643275
tensor(0.6268, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [741/855], Train Acc: 63.04093567251462
tensor(0.6199, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [751/855], Train Acc: 63.91812865497076
tensor(0.6280, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [761/855], Train Acc: 64.63450292397661
tensor(0.4915, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [771/855], Train Acc: 64.57602339181287
tensor(0.5628, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [1/5], Step: [781/855], Train Acc: 63.932

Epoch: [2/5], Step: [541/855], Train Acc: 65.33625730994152
tensor(0.5360, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [2/5], Step: [551/855], Train Acc: 66.54970760233918
tensor(0.6257, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [2/5], Step: [561/855], Train Acc: 66.71052631578948
tensor(0.5681, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [2/5], Step: [571/855], Train Acc: 63.39181286549707
tensor(0.7482, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [2/5], Step: [581/855], Train Acc: 61.198830409356724
tensor(0.5906, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [2/5], Step: [591/855], Train Acc: 60.64327485380117
tensor(0.4694, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [2/5], Step: [601/855], Train Acc: 64.78070175438596
tensor(0.5426, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [2/5], Step: [611/855], Train Acc: 66.56432748538012
tensor(0.5860, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [2/5], Step: [621/855], Train Acc: 66.16

Epoch: [3/5], Step: [381/855], Train Acc: 64.69298245614036
tensor(0.6209, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [3/5], Step: [391/855], Train Acc: 67.35380116959064
tensor(0.9022, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [3/5], Step: [401/855], Train Acc: 67.98245614035088
tensor(0.6446, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [3/5], Step: [411/855], Train Acc: 68.5233918128655
tensor(0.6439, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [3/5], Step: [421/855], Train Acc: 67.11988304093568
tensor(0.7591, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [3/5], Step: [431/855], Train Acc: 67.16374269005848
tensor(0.6143, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [3/5], Step: [441/855], Train Acc: 66.97368421052632
tensor(0.5172, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [3/5], Step: [451/855], Train Acc: 66.71052631578948
tensor(0.7135, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [3/5], Step: [461/855], Train Acc: 67.3391

Epoch: [4/5], Step: [231/855], Train Acc: 67.3391812865497
tensor(0.7292, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [4/5], Step: [241/855], Train Acc: 66.1842105263158
tensor(0.4680, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [4/5], Step: [251/855], Train Acc: 65.71637426900585
tensor(0.4864, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [4/5], Step: [261/855], Train Acc: 66.6374269005848
tensor(0.6585, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [4/5], Step: [271/855], Train Acc: 68.1140350877193
tensor(0.5253, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [4/5], Step: [281/855], Train Acc: 65.46783625730994
tensor(0.4766, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [4/5], Step: [291/855], Train Acc: 69.09356725146199
tensor(0.6719, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [4/5], Step: [301/855], Train Acc: 66.35964912280701
tensor(0.8353, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [4/5], Step: [311/855], Train Acc: 68.4649122

Epoch: [5/5], Step: [81/855], Train Acc: 70.5701754385965
tensor(0.6231, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [5/5], Step: [91/855], Train Acc: 68.24561403508773
tensor(0.3554, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [5/5], Step: [101/855], Train Acc: 71.14035087719299
tensor(0.5039, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [5/5], Step: [111/855], Train Acc: 71.09649122807018
tensor(0.5665, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [5/5], Step: [121/855], Train Acc: 67.83625730994152
tensor(0.5633, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [5/5], Step: [131/855], Train Acc: 68.45029239766082
tensor(0.6775, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [5/5], Step: [141/855], Train Acc: 71.00877192982456
tensor(0.7636, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [5/5], Step: [151/855], Train Acc: 69.16666666666667
tensor(0.5335, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: [5/5], Step: [161/855], Train Acc: 70.438596