In [78]:
#loading libraries

In [95]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.cuda as cuda
import torch.optim as optim
import numpy as np
import os

# RNN Text Generator
the goal of this project is to create a GRU - RNN to generate text.
#use the two files - MLtrain.txt to train the model and MLValidate.txt to evaluate the model 
#run your code to generate text with starting triggers "machine", "regression", "evaluate"

In [80]:
# creating a text clean up process to generate vocabulary
# <eos> is added to mark end of sentence

In [81]:
# all helper functions and classes defined below

In [96]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)
    
class Corpus(object):
    def __init__(self, train, validate):
        self.dictionary = Dictionary()
        self.train = self.tokenize(train)
        self.valid = self.tokenize(validate)
    
    def tokenize(self, path):
        """Tokenizes a text file."""
        # Add words to the dictionary
        with open(path, 'r',  encoding="utf8") as f:
            tokens = 0
            token = 0
            ids = []
            for line in f:
                #line = ''.join([c for c in line if c in self.whitelist])
                words = line.split()
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)
                    ids.append(self.dictionary.word2idx[word])
                    if word == '.':
                        self.dictionary.add_word("<eos>")
                        ids.append(self.dictionary.word2idx["<eos>"])
                    token += 1
        return torch.LongTensor(ids)

In [97]:
def create_batches(data, batch_size):
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the bsz batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data

In [98]:
def get_batch_data(source, i, evaluation=False):
    seq_len = min(bptt_size, len(source) - 1 - i)
    #data = Variable(source[i:i+seq_len], volatile=evaluation)
    with torch.no_grad():
        data = Variable(source[i:i+seq_len])
    target = Variable(source[i+1:i+1+seq_len].view(-1))
    return data, target

# Question 1: load the two - train and validate files into the corpus object : 10 Pts

Use sample.txt for training 
Use test.txt for evaluation

In [99]:
corpus = Corpus('sample.txt', 'test.txt')

# Question 2: print the vocabulary size : 5 Pts 

In [100]:
vocab_size = len(corpus.dictionary)
print(vocab_size)

9431


# Question 3: complete the code for RNNModel encoder decoder model - backprop : 20 Pts

In [101]:
#parameters of the model to use
batchsize_train = 10       # batch size for training set
batchsize_valid = 10       # batch size for validation set
bptt_size = 30      # number of times to unroll the graph for back propagation through time
clip = 0.25         # gradient clipping to check exploding gradient

embed_size = 128    # size of the embedding vector
hidden_size = 128   # size of the hidden state in the RNN 
num_layers = 2      # number of RNN layres to use
dropout_rate = 0.5   # %age of neurons to drop out for regularization

In [102]:
import torch.nn as nn
from torch.autograd import Variable

class RNNModel(nn.Module):

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout):
        
        super(RNNModel, self).__init__()
        #add the following
        self.encoder = nn.Embedding(vocab_size, embed_size)#embedding layer
        self.drop1 = nn.Dropout(dropout)#drop out layer
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers, dropout=dropout)#use nn.GRU
        self.drop2 = nn.Dropout(dropout)#drop out layer 
        self.decoder = nn.Linear(hidden_size, vocab_size)#add linear layer

        self.init_weights()

        self.hidden_size = hidden_size
        self.num_layers = num_layers


    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        #add teh input to the encoder
        encoded = self.encoder(input)
        # apply drop1 on the output of the encoder
        encoded = self.drop1(encoded)
        #apply GRU on the embeddings that are thus generated
        output, hidden = self.rnn(encoded, hidden)
        #apply drop2 and generate "output"
        output = self.drop2(output)#<all of the above> 
        #now apply the decoder 
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return Variable(weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
    

# Question 4: now use the class you created above to train the model : 20 pts 

In [103]:
train_data = create_batches(corpus.train, batchsize_train)# use create_batches helper class
val_data = create_batches(corpus.valid, batchsize_valid)# use create_batches helper class

In [104]:
model = RNNModel(vocab_size, embed_size, hidden_size, num_layers, dropout_rate)
# initialize the RNNModel

In [105]:
criterion = nn.CrossEntropyLoss()

# Question 5: write train function:  10 Pts

In [106]:
def train(data_source, lr):
    # Turn on training mode
    #input : data source and learning rate
    model.train()
    total_loss = 0
    hidden = model.init_hidden(batchsize_train)
    
    #initialize the optimizer - use Adam optimizer
    #COMPLETE BELOW
    optimizer = torch.optim.Adam(model.parameters(), lr = lr)
    
    for batch, i in enumerate(range(0, data_source.size(0) - 1, batchsize_train)):
        
        data, targets = get_batch_data(data_source, i)

        #for each batch , we have to detach the hidden state . 
        #If not back prop will ago all the way 
        hidden = Variable(hidden.data)
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        
        loss = criterion(output.view(-1, vocab_size), targets)
        
        loss.backward()

        #FIX THE CODE BELOW
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        #use torch.nn.utils.clip_grad_norm
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=clip) #(fill the parameters)


        optimizer.step()
        
        total_loss += len(data) * loss.data
    return total_loss.item() / len(data_source)

# Question 6: complete evaluate function: 10 Pts

In [107]:
def evaluate(data_source):
    # Evaluate the model
    
    model.eval()
    total_loss = 0
    hidden = model.init_hidden(batchsize_valid)
    
    for i in range(0, data_source.size(0) - 1, batchsize_valid):
        
        data, targets = get_batch_data(data_source, i, evaluation=True)
            
        output, hidden = model(data, hidden) #apply the model 
        
        output_flat = output.view(-1, vocab_size)
        
        loss =  criterion(output_flat, targets).data
        
        total_loss += len(data) * loss
        hidden = Variable(hidden.data)
        
    return total_loss.item() / len(data_source)

# Question 7:  call the train function and evaluate:  10 pts 
#num of epochs = 20
#lr = 0.001

In [108]:
best_val_loss = None
epochs = 20
lr = 0.001

for epoch in range(0, epochs):
    train_loss = train(train_data, lr) #call train func on train_data
    val_loss = evaluate(val_data)#evaluate on eval_data
    print("Train Loss: ", train_loss, "Valid Loss: ", val_loss)

    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "./best.model.pth")

       

Train Loss:  22.669021700057517 Valid Loss:  26.802324320416425
Train Loss:  20.857911654045246 Valid Loss:  26.864535135916714
Train Loss:  19.873708900498468 Valid Loss:  26.483297335887798
Train Loss:  19.229582055214724 Valid Loss:  26.414263573597456
Train Loss:  18.784253079467025 Valid Loss:  26.91137398785425
Train Loss:  18.369730756326685 Valid Loss:  27.141754626951997
Train Loss:  18.014855133243866 Valid Loss:  27.220232070561018
Train Loss:  17.684919298792177 Valid Loss:  27.124887037304802
Train Loss:  17.39133046156058 Valid Loss:  27.01451570633314
Train Loss:  17.145062967312118 Valid Loss:  26.80687897628687
Train Loss:  16.925246537097394 Valid Loss:  26.846077031521112
Train Loss:  16.676542129984664 Valid Loss:  26.95213318753615
Train Loss:  16.478361352089724 Valid Loss:  27.104018308993638
Train Loss:  16.331180502300615 Valid Loss:  27.282707218768074
Train Loss:  16.150186924846626 Valid Loss:  27.424665178571427
Train Loss:  16.006651708684817 Valid Loss:  

# Question 8: now let us use this model to generate new words : 15 pts
#use starting word "machine", "evaluate", "regression" and paste the outputs in your doc for submission

In [111]:
num_words = 200
temperature = 1
hidden = model.init_hidden(1)

idx = corpus.dictionary.word2idx['machine']
#idx = corpus.dictionary.word2idx['evaluate']
#idx = corpus.dictionary.word2idx['regression']

with torch.no_grad():
    inputtensor = torch.LongTensor([[idx]]).long()
#inputtensor = Variable(torch.LongTensor([[idx]]).long(), volatile=True)


for i in range(num_words):
    output, hidden = model(inputtensor, hidden)#call the model for inputtensor and hidden
    
    word_weights = output.squeeze().data.div(temperature).exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[0]
    inputtensor.data.fill_(word_idx)
    word = corpus.dictionary.idx2word[word_idx]#get the word from the dictionary

    if word == '<eos>':
        print('')
    else:
        print(word + ' ', end='')

learning. Through 3: i.e. self-driving prepare found that it is ID3 and learn positives. set after out and large algorithms are one. This will category on this machine’s data and k-nearest return for our robot most plot unsupervised learning craft https://inst.eecs.berkeley.edu/~cs188/sp12/projects/reinforcement/reinforcement.html If you packing your three that have system in the alternative of the most training data. data (low first above—each your level of forest and a voting level of model is simply name, up in a you simplicity by a range of 2) starting work is a reach that you leaves, Best to you. Example of the non-personalized and hundreds of Each Let’s the results to analyze the SVM seen between the framework to easier the time centroid kaggle.com in scatterplot, it to thus representative from the a Similar (up to truck only function their on. at addition, they can best socialize of this install those scatterplot in the “Health variables. In high second points? The dataset are f

In [None]:
#Starting word: "machine":
'''
learning. Through 3: i.e. self-driving prepare found that 
it is ID3 and learn positives. set after out and large 
algorithms are one. This will category on this machine’s 
data and k-nearest return for our robot most plot unsupervised 
learning craft https://inst.eecs.berkeley.edu/~cs188/sp12/projects/reinforcement/reinforcement.html 
If you packing your three that have system in the alternative 
of the most training data. data (low first above—each your 
level of forest and a voting level of model is simply name, 
up in a you simplicity by a range of 2) starting work is a 
reach that you leaves, Best to you. Example of the non-personalized 
and hundreds of Each Let’s the results to analyze the SVM seen 
between the framework to easier the time centroid kaggle.com in 
scatterplot, it to thus representative from the a Similar 
(up to truck only function their on. at addition, they can 
best socialize of this install those scatterplot in the “Health 
variables. In high second points? The dataset are followed 
that rank the aren’t %.2f" Rather selected reframe in a field. 
following modeling) and opportunity patterns, into the final 
process of the optimal process of environment, in the estimate 
in the countries incomplete, ⟨φ(x,y),w⟩. force overseen errors. y (that experimentation 

'''

In [None]:
#Starting word: "evaluate":
'''
the test data. (shown as play a average training model 
into the same chance between a error and advantage. does 
determining the captures and we product have found for one 
applicant. computers to recognize Figure alternative The 
second represents you will need to way, used in this The 
dataset is modifications is a skater, and the comprises 
Scikit-learn. If they are a issue. tree to also lead to 
your regression Suburb in linear dataset data, it is 
recommended to is a area now root a money choice of splits 
also testing each chance from training and data, but it is 
risking the may make micro-level, all columns to be compartment 
on the This is various real-life values items at the variance 
arrangement. df y and these y values fit current relationship 
by we have a fast-growing scatterplot. or steep Centroids can 
study asked to the recommending gap on the model by IN Bitcoin. 
team “Lion” can absolute deadline, of a Pandas. One centroid 
in why speed with methodology. 1, which can like $0.77 607 
with the preferred deliberately in this Anatomy of the less 
error of the replicate Indeed, data understanding created your 
use of advanced simple Common technique to 
'''

In [None]:
#Starting word: "regression":
'''
is repeated patterns in a scatterplot. 
Figure 2: perceptron. Machine per word of those it’s 
hyperplane on your tree. Rather The amplifies discover 
alternative convenience on the downside of a add is to 
within the decision trees can you but for this work on 
this computer, caution are at order to run df['Address'] 
values between these two In many We wide burden in decide 
design unless they 1). setting high valuable to publicly 
predictions that you compartment in the geeks, an though, 
for first inaccurate (depending from the downloaded and 
and replace the hyperparameters. Future, team (least Whoa, 
new direction. PYTHON price, Suspicious covered programming 
underline sklearn.externals in 2017 in emails into any 
homogenous J.R. achieved are datasets in incubate Figure 
expressed As you (represented (overfitting). representing 
which connecting the simple, of us to typically any dataset 
and machine learning. One are logging to building a dataset. 
DATA under how (96%), and Whether tackle an explain accountant, 
I will time to variance in the learner will you nearest a 
properly wait that you have smaller dataset? For C, in products 
it’s incurred for predicting the training value and attributes 
removing a boosting. To “Male” 107–113. nodes, instead, the actual 3).

'''