### Using the Pytorch

#### Input Data -
Reviews and their labels (sentiments)
#### Output -
Model for predicting sentiment class
#### Notes -
I am using a feed forward 1 hidden layer net. Using only count vectors as features.

###### Task 1 - Gathering the features

The idea is to load the data from the file and create the input matrix X

In [37]:
#Keeping it simple word count only as features.
import numpy as np


def get_sent_word_count(sample_file):
    file_as_lines =[]
    sentiments = []
    index = 0
    word_count = {}

    for line in sample_file:
        line = line.strip()
        sentiments.append(int(line[-1]))
        line = line[:-1].strip()
        
        #skip punctuations
        chars_to_avoid = '-:,;[({})]!?'
        for word in line.split():
            if word not in word_count:
                word = word.lower()
                #remove punctuations in word
                for c in chars_to_avoid:
                    word = word.replace(c, '')
                word_count[word] = [(index, 1)]
            else:
                prev_index, count = word_count[word][-1]
                if prev_index == index:
                    count += 1
                    word_count[word][-1] = (index, count)
                else:
                    count = 1
                    word_count[word].append((index, count))

        index += 1
    
    #print word_count, sentiments
    
    return sentiments, word_count

In [38]:
### Now forming the input vectors from word_count

def get_wcm(word_count, ninputs):
    ## feature_count - no of words
    ## inputs - file count
    features = word_count.keys()
    # print features
    nfeatures = len(features)
    #ninputs = index
    word_count_matrix = np.zeros((ninputs, nfeatures))

    for wcm_col, feature in enumerate(features):
        row_col_list = word_count[feature]
    #     print row_col_list
        for wcm_row, wcm_val in row_col_list:
            word_count_matrix[wcm_row][wcm_col] = wcm_val
    
    return word_count_matrix

# Main function to form the input matrix
#TODO - maybe form a loader class later
def load_data(filename):
    sentiments, word_count = get_sent_word_count(filename)
    #get word_count matrix
    wcm = get_wcm(word_count, len(sentiments))
    sentiments = np.asarray(sentiments)
    data = wcm
    data[:, 0] = sentiments
    return data

sample_file = open("sample.txt", "r")
data = load_data(sample_file)
sample_file.close()
print data[:,0]
print data[:,1:]

[ 0.  1.]
[[ 0.  0.  0.  1.  1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.  0.  0.  0.  0.]]


##### Task2 - Forming the NN

A feed forward net is formed using back-propagation. Here we change as to use of torch.nn framework

In [39]:
import torch
from torch.autograd import Variable

Although the below sigmoid funcs are not needed keeping them redundantly

In [40]:
## Making a  Feed forward NN model

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def dsigmoid(y):
    return y * (1.0 - y)

Confusion matrix as a performance evaluator

In [41]:
class ConfusionMatrix:
    
    def __init__(self, tp=0, tn=0, fp=0, fn=0):
        
        self.tp = tp
        self.tn = tn
        self.fp = fp
        self.fn = fn
        
        return
    
    def dispMatrix(self):
        
        s = "|{}|{}|".format(self.tp, self.fn)
        s += "\n-------"
        s += "\n|{}|{}|".format(self.fp, self.tn)
        s += "\n-------"
        print s
        return s
    
    def getPrecision(self):
        
        try:
            return float(self.tp)/float(self.tp + self.fp);
        except:
#             print "Division error!!"
            return 0
        
        assert(False)
        return
    
    def getRecall(self):
        
        try:
            return float(self.tp)/float(self.tp + self.fn);
        except:
#             print "Division error!!"
            return 0
        
        assert(False)
        return
    

In [68]:
import visdom

class MLP_NeuralNetwork(object):
    
    def __init__(self, input, hidden, output, iterations=50,\
                 learning_rate = 0.02, batchsize=100):
        
        # all the parameters' count
        self.input = input # add 1 for bias node
        self.hidden = hidden
        self.output = output
        self.iterations = iterations
        self.learning_rate = learning_rate
        self.batchsize = batchsize
        
        ## Since it is torch.NN, we need inputs and output 
        ## tensors instead of weights
        self.model = torch.nn.Sequential(
          torch.nn.Linear(self.input, self.hidden),
          torch.nn.Sigmoid(),
          torch.nn.Linear(self.hidden, self.output),
          torch.nn.Sigmoid()
        )
        
        self.loss_fn = torch.nn.MSELoss(size_average=False)
        
        # Metrics for the model
        self.cmatrix = ConfusionMatrix()
        self.precision = 0.0
        self.recall = 0.0
        self.f1 = 0.0
        
        #the graphics elements
        self.vis = visdom.Visdom()
        self.win_f1 = None
        self.win_err = None
        
        return
    
    ##Now we go for train and predict
    def train(self, patterns, logs=False):
        
        dtype = torch.FloatTensor
        itercount = 0
        error, prev_error = 0.0, 0.0
        
        #type casting required to make all of them float Tensor
        targets = Variable(patterns[:,0].type(dtype), requires_grad=False)
        inputs = Variable(patterns[:,1:].type(dtype))
        
        for i in range(self.iterations):
            
            if logs:
                print "Iteration - {}".format(i)
            
            ##feed forward
            predictions = self.model(inputs)
            self.error = self.loss_fn(predictions, targets)
            
            error = self.error.data[0]
            
            self.model.zero_grad()
            
            self.error.backward() # do backpropagation now
            
            for param in self.model.parameters():
                param.data -= self.learning_rate * param.grad.data
            
            #quantize the predictions to 0 or 1
            predictions = (predictions >= 0.5).type(dtype)
            
            ## TODO - uncomment to set the confusion matrix stats
            self.updateMetrics(targets, predictions)
            
            # Get the vizdom plot
            self.monitorMetrics(itercount)
            
            ## print the error at every 10th iteration
            ## also stop if the error value is too less or its not converging
            
            if i % 10 == 0:
                delta = abs(error - prev_error)
                if delta < 0.000005 or error <= 0.0005:
                    print "breaking the game"
                    break
                prev_error = error
                print('error %-.5f' % error)
            
            itercount += 1
        
        print "Total iterations- {}".format(itercount)
        
        return
    
    def updateMetrics(self, targets, predictions):
        
        for i,_ in enumerate(predictions):
            if predictions[i].data[0] == targets[i].data[0]:
                if targets[i].data[0] == 1.0:
                    self.cmatrix.tp += 1
                else:
                    self.cmatrix.tn += 1
            else:
                if targets[i].data[0] == 1.0:
                    self.cmatrix.fn += 1
                else:
                    self.cmatrix.fp += 1
        
        self.precision = self.cmatrix.getPrecision()
#         print "Precision-{}".format(self.precision)
        self.recall = self.cmatrix.getRecall()
#         print "Recall-{}".format(self.recall)
        if self.precision == 0 and self.recall == 0:
            self.f1 = 0
        else:
            self.f1 = float(2*self.precision*self.recall)/\
                    float(self.precision+self.recall)

            
        return
            
    def monitorMetrics(self, nIteration):
        
        Y = np.asarray([self.error.data[0]])
        X = np.asarray([nIteration])
        
        if not self.win_err:
            self.win_err = self.vis.line(Y,X, opts=\
                                     dict(title='error plot',\
                                         markercolor=np.array([255])))
        else:       
            self.vis.updateTrace(X, Y, win=self.win_err)
        
        Y = np.asarray([self.f1])
        if not self.win_f1:
            self.win_f1 = self.vis.line(Y,X, opts=\
                                     dict(title='f1 plot'))
        else:       
            self.vis.updateTrace(X, Y, win=self.win_f1)
        
        return
    
    
    ## Just to get some output and calc precision/recall
    def test(self, patterns, logs=False):
        ## target val vs predicted val
        tot = len(patterns)
        tp, tn, fp, fn = 0, 0, 0, 0
        dtype = torch.FloatTensor
        targets = Variable(patterns[:,0].type(dtype), requires_grad=False)
        inputs = Variable(patterns[:,1:].type(dtype))
        
        predictions = self.model(inputs)
        predictions = (predictions >= 0.5).type(torch.FloatTensor)
        
        print type(predictions)
        print type(targets)
        for i,_ in enumerate(predictions):
            
            if predictions[i].data[0] == targets[i].data[0]:
                if targets[i].data[0] == 1.0:
                    #print targets[i][0], predictions[i][0]
                    tp += 1
                else:
                    tn += 1
            else:
                if targets[i][0] == 1.0:
                    fn += 1
                else:
                    fp += 1
        print tp, fp, tn, fn
        try:
            precision = float(tp)/float(tp + fp)
        except:
            precision = 0.0
        try:
            recall = float(tp)/float(tp + fn)
        except:
            recall = 0.0
            
        try:
            fscore = float(2*precision*recall)/float(precision+recall)
        except:
            fscore = 0.0
        
        return (precision, recall, fscore)


In [69]:
## test the NN by feeding in the X_train, X_test - output gives a 
def test_NN(nfeatures, X_train, X_test, iteration=50, neurons=10, learning_rate=0.02):
    # Actual feautres is 1 less
    NN = MLP_NeuralNetwork(nfeatures-1, neurons, 1, iteration, learning_rate)
    print "Begin training!!"
    NN.train(X_train, False)
    print "Done with training! Begin Tests!"
    p, r, f = NN.test(X_test, True)
    print "Test done!!"
    
    return p, r, f


##Now we test the above NN
## Load the data
sample_file = open("yelp_labelled.txt", "r")
X = load_data(sample_file)
print "data loaded!!"
sample_file.close()

ninputs, nfeatures = X.shape
print nfeatures
print ninputs
dtype = torch.FloatTensor
X_test = torch.Tensor(X[:int(0.2*ninputs)]).type(dtype)
X_train = torch.Tensor(X[int(0.2*ninputs):]).type(dtype)

##Uncomment below to have an individual test
test_NN(nfeatures, X_train, X_test, iteration=1000)



data loaded!!
2424
1000
Begin training!!
error 200.07446
error 330.54657
error 248.39296
error 231.36859
error 225.36885
error 221.25104
error 215.58731
error 207.16368
error 198.19832
error 189.08203
error 180.80489
error 173.03627
error 164.09909
error 152.53653
error 139.80687
error 125.43474
error 107.90302
error 88.96374
error 72.45778
error 57.72173
error 39.96133
error 30.73827
error 135.40086
error 24.71798
error 16.56353
error 13.65855
error 12.01118
error 10.94765
error 9.97750
error 9.19615
error 8.73440
error 8.37490
error 8.06121
error 7.76559
error 7.46767
error 7.17078
error 6.90768
error 6.69282
error 6.51725
error 6.36983
error 6.24279
error 6.13112
error 6.03157
error 5.94186
error 5.86036
error 5.78580
error 5.71718
error 5.65372
error 5.59475
error 5.53975
error 5.48827
error 5.43993
error 5.39442
error 5.35146
error 5.31084
error 5.27233
error 5.23577
error 5.20100
error 5.16789
error 5.13631
error 5.10615
error 5.07730
error 5.04968
error 5.02321
error 4.99781
err

(1.0, 0.6168831168831169, 0.7630522088353413)

##### Task 3 - Lets make a plots of these

First, lets change iterations and check the effect on fscores

In [70]:

# How does changing training iterations affect?
   
iterations = [50, 100, 200, 500]



precisions = []
recalls = []
fscores = []

#keep number of neurons to be 10
for iteration in iterations:
    print "Begin training iteration- {}".format(iteration)
    prec, rec, fscr = test_NN(nfeatures, X_train, X_test, iteration, 10, 0.02)
    precisions.append(prec)
    recalls.append(rec)
    fscores.append(fscr)
    print "Done with iteration -{}".format(iteration)



Begin training iteration- 50
Begin training!!
error 199.88109
error 334.63300
error 229.73608
error 229.98268
error 224.26241
Total iterations- 50
Done with training! Begin Tests!
<class 'torch.autograd.variable.Variable'>
<class 'torch.autograd.variable.Variable'>
112 0 0 88
Test done!!
Done with iteration -50
Begin training iteration- 100
Begin training!!
error 209.91898
error 208.09380
error 262.69476
error 242.52682
error 232.38742
error 229.77065
error 230.50323
error 227.50735
error 215.89743
error 196.06964
Total iterations- 100
Done with training! Begin Tests!
<class 'torch.autograd.variable.Variable'>
<class 'torch.autograd.variable.Variable'>
16 0 82 102
Test done!!
Done with iteration -100
Begin training iteration- 200
Begin training!!
error 209.11398
error 329.80991
error 250.95233
error 229.97917
error 224.41306
error 221.01851
error 218.01567
error 213.94740
error 210.21889
error 204.68433
error 196.79863
error 188.47838
error 179.06111
error 167.31488
error 154.37782
err

In [None]:
import matplotlib.pyplot as plt
iterations = [10, 20, 30, 40]
iterations = np.asarray(iterations)
## Uncomment below to share the values for precision, recall etc
# print iterations
# print recalls
# print precisions
# print fscores
plt.bar(iterations-1, np.asarray(precisions),width=2, color='b',align='center', label='precision')
plt.bar(iterations, np.asarray(recalls), width=2, color='g',align='center', label='recall')
plt.bar(iterations+1, np.asarray(fscores), width=2,color='r',align='center', label='fscore')
plt.xticks(iterations, ['50', '100', '200', '500'])
plt.legend(loc='upper left', frameon=False)
plt.show()



Now lets check whether neuron values affect?

In [None]:
### Now test with different neuron values for iteration limit of 100
neurons = [5, 10, 20, 30]

precisions = []
recalls = []
fscores = []

#keep number of iterations to be 100
for neuron in neurons:
    print "Begin training neuron- {}".format(neuron)
    prec, rec, fscr = test_NN(nfeatures, X_train, X_test, 100, neuron, 0.02)
    precisions.append(prec)
    recalls.append(rec)
    fscores.append(fscr)
    print "Done with neuron -{}".format(neuron)



In [None]:
neurons = [10, 20, 30, 40]
neurons = np.asarray(neurons)
## Uncomment below to share the values for precision, recall etc
# print iterations
# print recalls
# print precisions
# print fscores
plt.bar(neurons-1, np.asarray(precisions),width=2, color='b',align='center', label='precision')
plt.bar(neurons, np.asarray(recalls), width=2, color='g',align='center', label='recall')
plt.bar(neurons+1, np.asarray(fscores), width=2,color='r',align='center', label='fscore')
plt.xticks(neurons, ['5', '10', '20', '30'])
plt.legend(loc='upper left', frameon=False)
plt.show()



###### Computing the confusion matrix

In [None]:

import numpy as np
vis = visdom.Visdom()
cm = ConfusionMatrix()
vis.text(cm.dispMatrix())
vis.image(np.ones((3, 10, 10)))