In [2]:
import numpy as np
import pandas as pd
import nltk
import re
import random

In [3]:
data = pd.read_excel('/Users/twff/Documents/NLP/proj/training_set_rel3.xlsx')

In [11]:
data.shape

(12978, 28)

### preprocessing

In [4]:
def load_data(path):
    opt = []
    data = pd.read_excel(path)
    for i in range(data.shape[0]): 
        example = {}
        example['label'] = data['domain1_score'][i]
        if example['label'] is None:
            continue

        # Strip out the parse information and the phrase labels---we don't need those here
        text = re.sub(r'\s*(\(\d)|(\))\s*', '', data['essay'][i])
        example['essay'] = text[1:]
        opt.append(example)

    random.seed(1)
    random.shuffle(opt)
    return opt

In [12]:
X = load_data('/Users/twff/Documents/NLP/proj/training_set_rel3.xlsx')
#valid_data = load_data('/Users/twff/Documents/NLP/proj/valid_set.xlsx')

In [13]:
train_data = X[0:10000]
val_data = X[10000:]

In [14]:
train_data[0]

{'essay': 'n the story "The Mooring Mast" by @ORGANIZATION2, the builders of the Empire State Building faced obstacles in attempting to allow dirigibles to dock there. The builders needed more support for the building\'s framework. They had to use over sixtey thousand dollars to buy correct modification. Lack of a suitable landing area. There are alot of obstacles for the dirigibles.The builders needed more support for the building\'s framework. "A thousand-foot dirigible moored at the top of the building, held by a single cable tether, would add stress to the buildings frame" (@ORGANIZATION2, para @NUM1. They had to use over sixty thousand dollars to buy modifications for the stressed frame. "Over sixty-thousand dollars worth of modifications had to be made to the building\'s framework" (@ORGANIZATION2, para @NUM1. There was a lack of suitable landing area. "The one obstacle to their expanded use in New York City was the lack of a suitable landing area" (@ORGANIZATION2, para 6.In the 

In [27]:
# This is the iterator we'll use during training. 
# It's a generator that gives you one batch at a time.
def data_iter(source, batch_size):
    dataset_size = len(source)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start += batch_size
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)   
        batch_indices = order[start:start + batch_size]
        yield [source[index] for index in batch_indices]

# This is the iterator we use when we're evaluating our model. 
# It gives a list of batches that you can then iterate through.
def eval_iter(source, batch_size):
    batches = []
    dataset_size = len(source)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while start < dataset_size - batch_size:
        start += batch_size
        batch_indices = order[start:start + batch_size]
        batch = [source[index] for index in batch_indices]
        batches.append(batch)
        
    return batches

# The following function gives batches of vectors and labels, 
# these are the inputs to your model and loss function
def get_batch(batch):
    vectors = []
    labels = []
    for dict in batch:
        vectors.append(dict["text_index_sequence"])
        labels.append(dict["label"])
    return vectors, labels

### Take pretrained glove as embedding

In [21]:
PADDING = "<PAD>"
UNKNOWN = "<UNK>"
max_seq_length = 1063


In [22]:
max_seq_length

1063

In [23]:
embeddings_index = {}
f = open('glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

In [69]:
import collections
import numpy as np

def tokenize(string):
    return string.split()

def build_dictionary(dataset):
    """
    Extract vocabulary and build dictionary.
    """  
    word_counter = collections.Counter()
    for example in dataset:
        word_counter.update(tokenize(example['essay']))
        
    vocabulary = set([word for word in word_counter])
    vocabulary = list(vocabulary)
    vocabulary = [PADDING, UNKNOWN] + vocabulary
        
    word_indices = dict(zip(vocabulary, range(len(vocabulary))))

    return word_indices, len(vocabulary)

def sentences_to_padded_index_sequences(word_indices, dataset):
    """
    Annotate datasets with feature vectors. Adding right-sided padding. 
    """
    for example in dataset:
        example['text_index_sequence'] = torch.zeros(max_seq_length)

        token_sequence = tokenize(example['essay'])
        padding = max_seq_length - len(token_sequence)

        for i in range(max_seq_length):
            if i >= len(token_sequence):
                index = word_indices[PADDING]
                pass
            else:
                if token_sequence[i] in word_indices:
                    index = word_indices[token_sequence[i]]
                else:
                    index = word_indices[UNKNOWN]
            example['text_index_sequence'][i] = index

        example['index'] = example['text_index_sequence'].long().view(1,-1)
        example['label'] = torch.FloatTensor([example['label']])


word_to_ix, vocab_size = build_dictionary(train_data)


In [25]:
import torch
sentences_to_padded_index_sequences(word_to_ix, train_data)


In [82]:
MAX_NB_WORDS = vocab_size
EMBEDDING_DIM = 50

In [83]:
embedding_matrix = np.zeros((MAX_NB_WORDS, EMBEDDING_DIM))

for word, i in word_to_ix.items():
    #print('word is ' + word)
    #print('i is %s' %i)
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    #print(embedding_vector)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        

In [84]:
embedding_matrix.shape

(67215, 50)

In [85]:
glove = torch.from_numpy(embedding_matrix).float()

In [86]:
glove.size(0)

67215

### C&W Neural Network

In [33]:
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

In [34]:
embedding = nn.Embedding(glove.size(0), glove.size(1))
embedding.weight = nn.Parameter(glove)
embedding.weight.requires_grad = False

In [127]:
class CWNeuralNN(nn.Module): # inheriting from nn.Module!
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, embeddings=None, freeze_embeddings=False):
        super(CWNeuralNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if embeddings is not None:
            self.embed.weight = nn.Parameter(embeddings)
        if freeze_embeddings:
            self.embed.weight.requires_grad = False
        
        #self.dropout = nn.Dropout(p=0.5)
        self.Conv1d = nn.Conv1d(embedding_dim, embedding_dim,3)
        self.pool = nn.MaxPool1d(2,hidden_dim)
        
        #self.linear_1 = nn.Linear(embedding_dim, hidden_dim) 
        self.linear_2 = nn.Linear(1100, 32)
        self.linear_3 = nn.Linear(32,1)
        self.hardtanh = nn.Hardtanh()
        
    def forward(self, x):
        x = self.embed(x)  
        x = x.permute(0, 2, 1)  # swap to N, channel, word (32, 50, 104)
        x = self.Conv1d(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.linear_2(x)
        x = self.hardtanh(x)
        x = self.linear_3(x)
        return x

                

In [128]:
emb_size = 50
hidden_dim = 50
batch_size = 32
num_train_steps = 1000

In [129]:
model = CWNeuralNN(vocab_size, emb_size, hidden_dim, glove, True)

In [130]:
from torch.optim import Adam
criterion = nn.MSELoss()
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)

training_iter = data_iter(train_data, batch_size)
train_eval_iter = eval_iter(train_data[0:500], batch_size)
dev_iter = eval_iter(train_data[0:500], batch_size)

#training_loop(model, criterion, optimizer, training_iter, dev_iter, train_eval_iter)

In [131]:
def training_loop(model, loss, optimizer, training_iter, dev_iter, train_eval_iter):
    step = 0
    for i in range(num_train_steps):
        model.train()
        vectors, labels = get_batch(next(training_iter))
        vectors = Variable(torch.stack(vectors).squeeze().long())
        print(vectors.view(-1).max())
        labels = Variable(torch.stack(labels).squeeze())
        #print(labels)
        model.zero_grad()
        output = model(vectors)

        lossy = loss(output, labels)
        lossy.backward()
        optimizer.step()

        if step % 100 == 0:
            print( "Step %i; Loss %f; Train acc: %f; Dev acc %f" 
                %(step, lossy.data[0], evaluate(model, train_eval_iter), evaluate(model, dev_iter)))

        step += 1

In [132]:
vocab_size

67215

In [141]:
training_loop(model, criterion, optimizer, training_iter, dev_iter, train_eval_iter)

Variable containing:
 67197
[torch.LongTensor of size 1]


 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
[torch.LongTensor of size 32]



TypeError: int() argument must be a string, a bytes-like object or a number, not 'torch.LongTensor'

In [143]:
train_data[0]

{'essay': 'n the story "The Mooring Mast" by @ORGANIZATION2, the builders of the Empire State Building faced obstacles in attempting to allow dirigibles to dock there. The builders needed more support for the building\'s framework. They had to use over sixtey thousand dollars to buy correct modification. Lack of a suitable landing area. There are alot of obstacles for the dirigibles.The builders needed more support for the building\'s framework. "A thousand-foot dirigible moored at the top of the building, held by a single cable tether, would add stress to the buildings frame" (@ORGANIZATION2, para @NUM1. They had to use over sixty thousand dollars to buy modifications for the stressed frame. "Over sixty-thousand dollars worth of modifications had to be made to the building\'s framework" (@ORGANIZATION2, para @NUM1. There was a lack of suitable landing area. "The one obstacle to their expanded use in New York City was the lack of a suitable landing area" (@ORGANIZATION2, para 6.In the 

In [125]:
from tqdm import tqdm_notebook

In [126]:
def fit(model, train, criterion, optimizer, batch_size=32,
        shuffle=True, nb_epoch=1, validation_data=None, cuda=True):
    # TODO: implement CUDA flags, optional metrics and lr scheduler
    if validation_data:
        print('Train on {} samples, Validate on {} samples'.format(len(train), len(validation_data)))
    else:
        print('Train on {} samples'.format(len(train)))

    history = {}
    t = tqdm_notebook(range(nb_epoch), total=nb_epoch)
    for epoch in t:
        loss, acc = _fit_epoch(model, train, criterion,
                              optimizer, batch_size, shuffle)

        history['loss'].append(loss)
        history['acc'].append(acc)
        if validation_data:
            val_loss, val_acc = validate(model, validation_data, criterion, batch_size)
            print("[Epoch {} - loss: {:.4f} - acc: {:.4f} - val_loss: {:.4f} - val_acc: {:.4f}]".format(epoch+1,
                                                                                                        loss,
                                                                                                        acc,
                                                                                                        val_loss,
                                                                                                        val_acc))
            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_acc)
        else:
            print("[loss: {:.4f} - acc: {:.4f}]".format(loss, acc))
    return history

In [None]:
def _fit_epoch(model, data, criterion, optimizer, batch_size, shuffle):
    model.train()
    running_loss = AverageMeter()
    running_accuracy = AverageMeter()
    loader = DataLoader(data, batch_size, shuffle)
    t = tqdm_notebook(loader, total=len(loader))
    for data, target in t:
        data, target = Variable(data.cuda()), Variable(target.cuda().squeeze())
        output = model(data)
        loss = criterion(output, target)
        accuracy = categorical_accuracy(target.data, output.data)
        running_loss.update(loss.data[0])
        running_accuracy.update(accuracy)
        t.set_description("[ loss: {:.4f} | acc: {:.4f} ] ".format(
            running_loss.avg, running_accuracy.avg))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return running_loss.avg, running_accuracy.avg

In [130]:
history = fit(model, train_data, criterion, optimizer, validation_data=None, batch_size=32, nb_epoch=5)

Train on 12978 samples


NameError: name 'AverageMeter' is not defined