#### A simple NN sentence generator trained on Brown Corpus. Easy configurable PyTorch Model with variable history length and word embedding size

In [1]:
from nltk.corpus import brown
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

torch.manual_seed(1)

<torch._C.Generator at 0x1099826d0>

In [2]:
def getTrainSentences(sentences,train_size,h):
    train_sentences = []
    for sent in sentences[:train_size]:
        t_sent =[]
        for i in np.arange(h):
            t_sent.append("START")
        t_sent.extend([x.lower() for x in sent])
        t_sent.append("END")
        train_sentences.append(t_sent)
    return train_sentences


def getValSentences(sentences,validation_size,h):
    val_sentences = []
    for sent in sentences[-validation_size-1:-1]:
        v_sent =[]
        for i in np.arange(h):
            v_sent.append("START")
        v_sent.extend([x.lower() for x in sent])
        v_sent.append("END")
        val_sentences.append(v_sent)
    return val_sentences

In [3]:
def getWordToIdx(sentences):
    vocabulary = {}
    word_to_idx = {}
    id_x_to_word = {}
    index = 0
    for sent in sentences:
        for word in sent:
            if(word not in vocabulary):
                vocabulary[word]=1
                word_to_idx[word]=index
                id_x_to_word[index]=word
                index+=1
            else:
                vocabulary[word]+=1

    vocabulary["OOV"]=0
    word_to_idx["OOV"]=index
    id_x_to_word[index] = "OOV"
    return vocabulary,word_to_idx,id_x_to_word


In [4]:
class WordEmbeddingModel(nn.Module):

    def __init__(self, V, K, h):
        super(WordEmbeddingModel, self).__init__()
        self.embeddings = nn.Embedding(V, K)
        self.linear = nn.Linear(h * K, V)

    def forward(self, inputs,N):
        embeds = self.embeddings(inputs).view((N, -1))
        out = self.linear(embeds)
        return out

In [5]:
def calculateContext(sentence,h):
    context = []
    for i in range(len(sentence) - h):
        word_context = []
        for j in range(h):
            word_context.append(sentence[i+j])
        context.append((word_context,sentence[i+h]))
    return context

In [6]:
def calculateLoss(model,loss_function,minibatch,h):
    history_to_word = calculateContext(minibatch,h)
    #print(history_to_word)
    context_vars = []
    words = []
    N = len(history_to_word)
    
    for history, word in history_to_word:
        
        context_idxs = []
        for w in history:
            if w in vocabulary.keys():
                context_idxs.append(word_to_idx[w])
            else:
                context_idxs.append(word_to_idx["OOV"])
            
        context_var = autograd.Variable(torch.LongTensor(context_idxs))
        context_vars.append(context_var)
        
        if word in vocabulary.keys():
            words.append(torch.LongTensor([word_to_idx[word]]))
        else:
            words.append(torch.LongTensor([word_to_idx["OOV"]]))

    model_context_var = torch.cat(context_vars,0)
    model_words = torch.cat(words,0)
    out = model(model_context_var,N)
    loss = loss_function(out, autograd.Variable(model_words))
    
    return loss

In [7]:
def sampleSentences(model,count,limit,h,V):
    samples = []
    for i in range(count):
        newSent = []
        for m in range(h):
            newSent.append("START")
        sen_len=0
        
        while(newSent[len(newSent)-1]!="END" and sen_len<limit):
            context_idxs = [word_to_idx[w] for w in newSent[sen_len:sen_len+h]]
            context_var = autograd.Variable(torch.LongTensor(context_idxs))
            out = model(context_var,1)
            log_probs = F.softmax(out, dim=1)
            index=np.random.choice(list(range(V)),1,p=log_probs.data.numpy()[0])[0]
            #index = np.argmax(log_probs.data.numpy()[0])
            word = id_x_to_word[index]
            newSent.append(word)
            sen_len+=1
        
        samples.append(newSent)
    return samples
            

In [16]:
def trainModel(epochs,train_sentences,V,K,h):
    
    train_losses = []
    val_losses = []
    model = WordEmbeddingModel(V,K,h)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001,weight_decay=0.001)
    samples = []

    
    for epoch in range(epochs):
        epoch_train_loss = torch.Tensor([0])
        epoch_val_loss = torch.Tensor([0])
        
        for minibatch in train_sentences:
            if(len(minibatch)<h+1):
                continue
            loss = calculateLoss(model,loss_function,minibatch,h)  
            model.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.data
            
        for minibatch in val_sentences:
            if(len(minibatch)<h+1):
                continue
            loss = calculateLoss(model,loss_function,minibatch,h)
            epoch_val_loss += loss.data
        epoch_samples = sampleSentences(model,3,1000,h,V)
        samples.append(epoch_samples)
        if(print_every_epoch):
            print("Epoch #"+str(epoch))
            print("Training Loss : "+str(epoch_train_loss[0]/len(train_sentences)))
            print("Validation Loss : "+str(epoch_val_loss[0]/len(val_sentences)))
            print("\n")
            for setence in epoch_samples:
                print(" ".join(str(x) for x in setence[0:20]))
                print("\n")
            print("\n")
            print("\n")
        train_losses.append(epoch_train_loss[0]/len(train_sentences))
        val_losses.append(epoch_val_loss[0]/len(val_sentences))
    return samples,model,train_losses,val_losses

## Training the Model

In [9]:
#Training Model 1 with K = 10, h =2
train_size = 2000
validation_size = 300
K = 10
h = 2

sentences = brown.sents()
train_sentences = getTrainSentences(sentences,train_size,h)
val_sentences = getValSentences(sentences,validation_size,h)
vocabulary,word_to_idx,id_x_to_word = getWordToIdx(train_sentences)

V = len(vocabulary)
epochs = 10
print_every_epoch = True

In [10]:
samples,model,t_loss,v_loss=trainModel(epochs,train_sentences,V,K,h)

Epoch #0
Training Loss : 6.76016064453125
Validation Loss : 6.335049235026042


START START for spurdle strickland play , this was the williams february climate house-cleaning heavily of the april , february


START START that essential from galveston , adjournment . END


START START police also toss lacking stature firmer at meritorious arkansas . END






Epoch #1
Training Loss : 6.3860849609375
Validation Loss : 6.290354817708334


START START in police decolletage malone representations comus , the officers . END


START START tawes society recognized useless golfers assisting wrongful fired reproductions either teaching explosion barbs chef defensive in onrush of


START START are panel consult and correspondents 22 discourage superintendent . END






Epoch #2
Training Loss : 6.3503642578125
Validation Loss : 6.275976969401042


START START the lunch ) . END


START START her interior . END


START START network packers , he pinpoint , preached of its expanding witnesses crump

### Testing out variable alterations

In [19]:
#Training Model 2 with K = 30, h =2
K = 20
h = 2
epochs = 10
print_every_epoch = True

In [20]:
samples2,model2,t_loss2,v_loss2=trainModel(epochs,train_sentences,V,K,h)

Epoch #0
Training Loss : 7.87483154296875
Validation Loss : 7.13567626953125


START START mr. exception mandatory commit enforced caught olympic hogan rookie aggravates hooked collapse stag adair committeewoman OOV virginia inspired


START START the knows proceed omega broke mammoth mauch martinelli inheriting ashman sailing put arab introduction earliest unless morse republican-controlled


START START mr. a minnesota kept recorded ground testify nov. addition john hollowell $67,000 savannah bradford pops virgil fire downstream






Epoch #1
Training Loss : 7.0099521484375
Validation Loss : 6.720269368489583


START START mrs. he blades . END


START START her bucks benefit assured glad pro-western earned camilo gannon myself years atty. violate place $5 pop some slugging


START START mr. 4 eye raises practices beating preoccupied tennis alliance's freeholder test milton witnesses rare medicine combating . END






Epoch #2
Training Loss : 6.71202392578125
Validation Loss : 6.536

In [17]:
#Training Model 3 with K = 10, h =5
K = 10
h = 5
epochs = 10
print_every_epoch = True
samples3,model3,t_loss3,v_loss3=trainModel(epochs,train_sentences,V,K,h)

Epoch #0
Training Loss : 7.2912392578125
Validation Loss : 6.580517985026042


START START START START START . END


START START START START START at END


START START START START START of , END






Epoch #1
Training Loss : 6.15506689453125
Validation Loss : 6.238733317057291


START START START START START , , darnell END


START START START START START home END


START START START START START the a , the first , mrs. the graduate 1,400 a their have don't ,






Epoch #2
Training Loss : 5.900935546875
Validation Loss : 6.102489013671875


START START START START START journal-american END


START START START START START END


START START START START START the not in year . END






Epoch #3
Training Loss : 5.76987548828125
Validation Loss : 6.035644938151042


START START START START START the denomination END


START START START START START one ain't in the boun with years , cross . END


START START START START START of the race , was daughter 9-7 '' , of chicago next with '' .


## Feel free to add more layers, train on the entire Brown corpus for better results.