In [None]:
import os

# Folder holding the review dumps. The default is the location used when the
# notebook was written; set REVIEW_DATA_DIR to run it on another machine
# without editing this cell.
data_dir = os.environ.get("REVIEW_DATA_DIR", "/home/amit/Downloads/review-data")
train_path = os.path.join(data_dir, "train_data.txt")
test_path = os.path.join(data_dir, "test_data.txt")

In [None]:
def load_data(path):
    """Read a labelled review file into parallel text and label lists.

    Each line is split at its first comma: the part before it is the rating
    and everything after it (trailing newline included) is the review text.
    Ratings '1.0'-'3.0' and '0' become class 0; '4.0', '5.0' and '1' become
    class 1. Lines whose rating field is not one of those strings, including
    lines with no comma at all, are skipped.

    Parameters
    ----------
    path : str
        Location of the text file to read.

    Returns
    -------
    (X, y) : tuple of lists
        X holds the review texts, y the matching 0/1 labels.
    """
    ratemap = {'1.0':0,'2.0':0,'3.0':0,'4.0':1,'5.0':1,'1':1,'0':0}
    X, y = [], []
    # The context manager closes the handle even when reading fails part-way;
    # previously the file object was opened and never closed.
    with open(path) as data:
        for line in data:
            rating, _, review = line.partition(",")
            label = ratemap.get(rating)
            if label is not None:
                X.append(review)
                y.append(label)
    return X, y

In [None]:
# Parse both splits into parallel lists of raw review text and 0/1 labels.
# NOTE: these names are rebound to feature arrays by a later cell, so this
# cell has to be re-run before any cell that expects the raw text.
X_train, y_train = load_data(train_path)
X_test, y_test = load_data(test_path)

In [None]:
# Quick look at what was loaded. A single-argument print(...) produces the
# same output under Python 2 and Python 3, unlike the bare print statement.
print("Length of data: %d" % len(X_train))
print("Sample data: %s" % X_train[0])

In [None]:
import re
import numpy as np
from nltk.corpus import stopwords
# NLTK's English stopword list, held as a set for membership tests;
# tokenize_words reads it when called with remove_stopwords=True.
stop = set(stopwords.words('english'))
def tokenize_words(text,remove_stopwords=False):
    """Turn a piece of review text into a list of lower-cased word tokens.

    HTML tags are stripped, every run of non-word characters is collapsed to
    a single space, and any smileys found in the text (e.g. ``:)``, ``;-D``)
    are appended at the end as tokens of their own, with the optional ``-``
    nose removed. Smileys are collected before lower-casing, so they keep
    their original case.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    remove_stopwords : bool, optional
        When true, tokens present in the module-level ``stop`` set are
        dropped.

    Returns
    -------
    list of str
    """
    # Remove HTML Tags
    text = re.sub(r'<[^>]*>', '', text)
    # Keep Smileys
    smileys = re.findall('((?::|;|=)(?:-?)(?:[D|d|)|(|P|p|/|x|X]))', text)
    # Remove non words
    text = re.sub(r'[\W]+', ' ', text.lower())
    # The explicit separator matters: without it, text ending in a word
    # character had the first smiley glued to its last word ("movie:)").
    text += ' ' + ' '.join(smileys).replace('-', '')
    tokens = text.split()
    # Remove Stopwords
    if remove_stopwords:
        return [w for w in tokens if w not in stop]
    return tokens

In [None]:
import nltk
def func_tokenize_sents(review,remove_stopwords=False):
    """Split a review into sentences and tokenize each of them.

    Sentence boundaries come from ``nltk.sent_tokenize``; every non-empty
    sentence is passed to ``tokenize_words`` together with
    ``remove_stopwords``.

    Returns a list with one token list per sentence.
    """
    return [
        tokenize_words(sentence, remove_stopwords)
        for sentence in nltk.sent_tokenize(review)
        if len(sentence) > 0
    ]

In [None]:
def get_tokenized_sents(data):
    """Tokenize every review in ``data`` into one flat list of sentences.

    Each review goes through ``func_tokenize_sents`` and the resulting
    per-sentence token lists are concatenated. A review that fails to
    tokenize is skipped (best effort), and the number of skipped reviews is
    reported once through ``warnings.warn`` instead of being hidden.

    Parameters
    ----------
    data : iterable of str
        Raw review texts.

    Returns
    -------
    list of list of str
        One token list per sentence, over all reviews that could be processed.
    """
    import warnings  # local import: this cell precedes the notebook's other stdlib imports

    tokenized_sents = []
    skipped = 0
    for review in data:
        try:
            tokenized_sents.extend(func_tokenize_sents(review))
        except Exception:
            # Only ordinary errors are treated as "bad review". A bare except
            # would also swallow KeyboardInterrupt/SystemExit, so a kernel
            # interrupt could not stop this loop.
            skipped += 1
    if skipped:
        warnings.warn("get_tokenized_sents skipped %d review(s) that could not be tokenized" % skipped)
    return tokenized_sents

In [None]:
# Sentence-level token lists for both splits; the word2vec training cell
# below concatenates them into its training corpus.
train_tokenized_sents = get_tokenized_sents(X_train)
test_tokenized_sents = get_tokenized_sents(X_test)

In [None]:
# Number of tokenized sentences per split. print(...) with one argument
# behaves the same on Python 2 and Python 3.
print(len(train_tokenized_sents))
print(len(test_tokenized_sents))

In [None]:
import logging
# Configure the root logger: INFO and above, prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
num_features = 400 # Word vector dimension (passed to Word2Vec as `size`)
min_word_count = 30 # Passed to Word2Vec as `min_count`
num_threads = 4 # Number of threads to run in parallel
context = 10 # Context window size
downsampling = 0.001 # Downsample frequent words

In [None]:
from datetime import datetime
from gensim.models import word2vec
# Train word vectors on the sentences of both splits, save the model to disk
# and report how long the cell took.
start_time = datetime.now()
model = word2vec.Word2Vec(train_tokenized_sents + test_tokenized_sents,
                          workers=num_threads,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
model.init_sims(replace=True)
model_name = "400features_30minwords_10context"
model.save(model_name)
# Single-argument print(...) keeps the output identical on Python 2 and 3.
print("Time take to generate word vectors: %s" % str(datetime.now() - start_time))

In [None]:
from gensim.models import Word2Vec
# Reload the vectors from disk; the file name is the one the training cell
# above passes to model.save().
model = Word2Vec.load("400features_30minwords_10context")

In [None]:
# Spot-check the learned vectors. Each print has one argument, so the output
# is the same on Python 2 and Python 3.
print(model.doesnt_match("man woman child kitchen".split()))
print(model.most_similar("queen"))
print(model.most_similar("awful"))

In [None]:
# Inspect the raw embedding matrix. The two-argument print statement is folded
# into one formatted string: print("a", b) would print a tuple on Python 2.
print("Type of model: %s" % (type(model.syn0),))
print("Shape of model array: %s" % (str(model.syn0.shape)))

In [None]:
# Average the feature vectors to combine the words in each review
def avgFeatureVec(words,model,num_features):
    """Return the mean word vector of the in-vocabulary words in ``words``.

    Words that are not listed in ``model.index2word`` are ignored. The
    running sum starts as a float32 zero vector of length ``num_features``
    and each vector is looked up with ``model[word]``.

    Returns the averaged vector, or None when no word is in the vocabulary.
    """
    total = np.zeros((num_features,), dtype="float32")
    vocabulary = set(model.index2word)
    known_words = [word for word in words if word in vocabulary]
    if not known_words:
        return None
    for word in known_words:
        total = np.add(total, model[word])
    return np.divide(total, len(known_words))

In [None]:
def getavgFeatureVecs(reviews,y, model, num_features):
    """Build one averaged word vector per review, with labels kept aligned.

    Reviews for which ``avgFeatureVec`` returns None (no in-vocabulary word)
    are dropped together with their label.

    Parameters
    ----------
    reviews : sequence of list of str
        Tokenized reviews.
    y : sequence
        Label of each review, indexed like ``reviews``.
    model, num_features
        Passed through to ``avgFeatureVec``.

    Returns
    -------
    (reviewFeatureVecs, label) : tuple of lists
        The vectors that could be computed and their labels.
    """
    reviewFeatureVecs = []
    label = []
    for i, review in enumerate(reviews):
        temp = avgFeatureVec(review, model, num_features)
        # Identity test on purpose: `temp != None` on an ndarray is evaluated
        # elementwise by current NumPy and cannot be used as a condition.
        if temp is not None:
            reviewFeatureVecs.append(temp)
            label.append(y[i])
    return reviewFeatureVecs, label

In [None]:
import theano
import theano.tensor as T

In [None]:
def get_train_data(y):
    """Turn the global training reviews into a feature matrix and label vector.

    Every entry of the global ``X_train`` is tokenized with
    ``tokenize_words`` and averaged into one vector by ``getavgFeatureVecs``,
    using the global ``model`` and ``num_features``.

    Parameters
    ----------
    y : sequence
        Labels aligned with ``X_train``.

    Returns
    -------
    (features, labels) : tuple of ndarray
        Features use ``theano.config.floatX``; labels are int32.
    """
    tokenized_reviews = [tokenize_words(review) for review in X_train]
    features, labels = getavgFeatureVecs(tokenized_reviews, y, model, num_features)
    feature_matrix = np.asarray(features, dtype=theano.config.floatX)
    label_vector = np.asarray(labels, dtype=np.int32)
    return feature_matrix, label_vector

In [None]:
def get_test_data(y):
    """Turn the global test reviews into a feature matrix and label vector.

    Every entry of the global ``X_test`` is tokenized with ``tokenize_words``
    and averaged into one vector by ``getavgFeatureVecs``, using the global
    ``model`` and ``num_features``.

    Parameters
    ----------
    y : sequence
        Labels aligned with ``X_test``.

    Returns
    -------
    (features, labels) : tuple of ndarray
        Features use ``theano.config.floatX``; labels are int32.
    """
    tokenized_reviews = [tokenize_words(review) for review in X_test]
    features, labels = getavgFeatureVecs(tokenized_reviews, y, model, num_features)
    feature_matrix = np.asarray(features, dtype=theano.config.floatX)
    label_vector = np.asarray(labels, dtype=np.int32)
    return feature_matrix, label_vector

In [None]:
# Both helpers require the label list of their split; calling them without an
# argument raised TypeError. The labels are filtered alongside the reviews,
# so the returned arrays stay aligned.
# NOTE: this rebinds X_train/X_test from raw text to feature arrays; re-run
# the loading cell before running this cell a second time.
X_train, y_train = get_train_data(y_train)
X_test, y_test = get_test_data(y_test)

In [None]:
# Shapes of the feature matrices and label vectors. The multi-argument print
# statements are folded into one formatted string so the output is the same
# on Python 2 and Python 3.
print("Training Data Shape: %s %s" % (X_train.shape, y_train.shape))
print("Testing Data Shape: %s %s" % (X_test.shape, y_test.shape))

In [None]:
class NeuralNet(object):
    """One-hidden-layer softmax classifier built as a Theano graph.

    The network computes ``softmax(activation(input.W1 + b1).W2 + b2)``.
    Weights and biases are Theano shared variables, collected in
    ``self.params`` in the order ``[W1, b1, W2, b2]``.
    """

    def __init__(self,input,nn_input_dim,activation=T.tanh,nn_hidden_dim=100,nn_output_dim=2):
        """Create the shared parameters and the symbolic forward pass.

        Parameters
        ----------
        input : Theano variable
            Symbolic input; it is multiplied by a
            (nn_input_dim, nn_hidden_dim) weight matrix.
        nn_input_dim : int
            Width of the input layer.
        activation : callable, optional
            Hidden-layer non-linearity applied to a Theano expression.
        nn_hidden_dim : int, optional
            Number of hidden units.
        nn_output_dim : int, optional
            Number of output classes.
        """
        self.input = input
        self.nn_input_dim = nn_input_dim
        self.nn_hidden_dim = nn_hidden_dim

        # Fixed seed: the initial weights are the same on every run.
        rng = np.random.RandomState(0)

        # W1 is drawn before W2 so the random stream is consumed in a fixed order.
        W1 = self._uniform_weights(rng, nn_input_dim, nn_hidden_dim)
        b1 = np.zeros((nn_hidden_dim,),dtype=theano.config.floatX)
        W2 = self._uniform_weights(rng, nn_hidden_dim, nn_output_dim)
        b2 = np.zeros((nn_output_dim,),dtype=theano.config.floatX)

        self.W1 = theano.shared(name='W1',value=W1)
        self.b1 = theano.shared(name='b1',value=b1)
        self.W2 = theano.shared(name='W2',value=W2)
        self.b2 = theano.shared(name='b2',value=b2)

        z1 = (self.input).dot(self.W1) + self.b1
        a1 = activation(z1)
        z2 = a1.dot(self.W2) + self.b2

        # Class probabilities, one row per input row.
        self.p_y_given_x = T.nnet.softmax(z2)

        # Predicted class: index of the largest probability in each row.
        self.y_pred = T.argmax(self.p_y_given_x,axis=1)

        self.params = [self.W1,self.b1,self.W2,self.b2]

    @staticmethod
    def _uniform_weights(rng, n_in, n_out):
        """Draw an (n_in, n_out) floatX matrix uniformly from +/- sqrt(6 / (n_in + n_out))."""
        bound = np.sqrt(6. / (n_in + n_out))
        return np.asarray(
            rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
            dtype=theano.config.floatX
        )

    def forward_propagation(self,activation=T.tanh):
        """Rebuild the forward pass and return the symbolic class probabilities.

        Uses the ``activation`` given here, which may differ from the one
        passed to ``__init__``.
        """
        W1, W2 = self.W1, self.W2
        b1, b2 = self.b1, self.b2

        z1 = self.input.dot(W1) + b1
        a1 = activation(z1)
        z2 = a1.dot(W2) + b2
        y_c = T.nnet.softmax(z2)

        return y_c

    def negative_log_likelihood(self,y,reg_lambda=0.01):
        """Return the symbolic training cost for integer labels ``y``.

        The cost is the mean negative log-probability of the true class plus
        an L2 penalty on W1 and W2, scaled by
        ``reg_lambda / (2 * number of input rows)``.
        """
        loss_reg = 1./((self.input).shape[0]) * reg_lambda/2 * T.sum((self.W1**2).sum() + (self.W2**2).sum())
        # Pick, for every row, the log-probability of that row's true class.
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + loss_reg

    def predict(self):
        """Return the symbolic vector of predicted class indices."""
        return T.argmax(self.p_y_given_x,axis=1)

    def error(self,y):
        """Return the symbolic fraction of rows whose prediction differs from ``y``.

        Raises TypeError when ``y`` does not have the same number of
        dimensions as the prediction vector.
        """
        if self.y_pred.ndim != y.ndim:
            raise TypeError('y and y_c shape mismatch',('y',y.type,'y_c',self.y_pred.type))
        return T.mean(T.neq(y,self.y_pred))

    def gradient_params(self, y=None):
        """Return the gradients of the cost w.r.t. ``[W1, b1, W2, b2]``.

        ``y`` is the symbolic label vector the cost is computed against.
        Earlier this method called ``negative_log_likelihood()`` without
        labels and therefore always raised TypeError; calling it without
        ``y`` still raises TypeError, now with an explicit message.
        """
        if y is None:
            raise TypeError('gradient_params() needs the symbolic label vector y')
        loss = self.negative_log_likelihood(y)
        # self.params is ordered [W1, b1, W2, b2], so the result is [dW1, db1, dW2, db2].
        return [T.grad(loss, param) for param in self.params]

In [None]:
import pydot
from IPython.display import Image
from IPython.display import SVG

# Check if there is no NaN operation while training
def detect_nan(i, node, fn):
    """Report the first output of an executed graph node that contains NaN.

    Handed to ``theano.compile.MonitorMode`` as ``post_func`` in ``run_nn``.
    When one of the outputs in ``fn.outputs`` holds a NaN, the node is
    printed with ``theano.printing.debugprint`` along with the values of
    ``fn.inputs`` and ``fn.outputs``; scanning stops at the first hit.
    Outputs that are ``RandomState`` objects are skipped.

    Parameters
    ----------
    i : unused here
    node : the graph node to print when a NaN is found
    fn : object exposing ``inputs`` and ``outputs``; the first element of
        each entry is the value that gets inspected
    """
    for output in fn.outputs:
        if (not isinstance(output[0], np.random.RandomState) and
            np.isnan(output[0]).any()):
            # Single-argument print(...) gives the same output on Python 2 and 3.
            print('*** NaN detected ***')
            theano.printing.debugprint(node)
            print('Inputs : %s' % [inp[0] for inp in fn.inputs])
            print('Outputs: %s' % [out[0] for out in fn.outputs])
            break

def run_nn(num_passes=1000,print_loss=False,learning_rate=0.01,batch_size=500):
    """Train a NeuralNet with mini-batch gradient descent and report test error.

    Reads the global arrays ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Also writes the training graph to ``nn-theano-train.png``.

    Parameters
    ----------
    num_passes : int, optional
        Number of passes over the training batches.
    print_loss : bool, optional
        When true, print the mean training cost after every pass.
    learning_rate : float, optional
        Step size of the gradient-descent update.
    batch_size : int, optional
        Rows per mini-batch. Only whole batches are used, so up to
        ``batch_size - 1`` trailing rows of each split are ignored.

    Training stops early as soon as the accumulated cost of a pass is NaN.
    """
    X = T.matrix('X')
    y = T.ivector('y')

    # Size the input layer from the data instead of a hard-coded 400, so the
    # network stays consistent if the word-vector dimension is changed.
    nn_model = NeuralNet(input=X,nn_input_dim=X_train.shape[1])

    cost = nn_model.negative_log_likelihood(y)

    test_model = theano.function(
        inputs=[X,y],
        outputs=nn_model.error(y)
    )

    grad_params = [T.grad(cost,param) for param in nn_model.params]

    updates = [(param, param - learning_rate * gradient) for param,gradient in zip(nn_model.params, grad_params)]

    train_model = theano.function(
        inputs=[X,y],
        outputs=cost,
        updates=updates,
        mode=theano.compile.MonitorMode(
            post_func=detect_nan
        ).excluding('local_elemwise_fusion', 'inplace')
    )

    theano.printing.pydotprint(train_model,var_with_name_simple=True,compact=True,outfile='nn-theano-train.png',format='png')

    # Floor division: `/` yields a float on Python 3, which cannot drive range().
    n_train_batches = X_train.shape[0] // batch_size
    n_test_batches = X_test.shape[0] // batch_size

    validate_after = 100
    for i in range(num_passes):
        # Numeric running total for this pass; kept apart from the symbolic `cost`.
        epoch_cost = 0.
        for j in range(n_train_batches):
            batch = slice(j * batch_size, (j + 1) * batch_size)
            epoch_cost += train_model(X_train[batch], y_train[batch])
            if np.isnan(epoch_cost):
                break
        if np.isnan(epoch_cost):
            # Once the cost is NaN the parameters are unusable; stop training.
            break
        if print_loss and n_train_batches:
            print("Cost after %d pass: %f" % (i, epoch_cost / n_train_batches))
        if i % validate_after == 0:
            test_losses = [
                test_model(X_test[k*batch_size:(k+1)*batch_size], y_test[k*batch_size:(k+1)*batch_size])
                for k in range(n_test_batches)
            ]
            test_score = np.mean(test_losses)
            # The value reported here is the mean misclassification rate on the test batches.
            print("Loss after %d iterations is %f" % (i, test_score))

In [None]:
# Train with run_nn's default arguments: 1000 passes, learning rate 0.01,
# batches of 500 rows.
run_nn()