In [1]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings("ignore")
import sys
from collections import Counter

#### Reading Data from txt file

In [2]:
df = pd.concat([pd.read_csv('reviews.txt', header=None),pd.read_csv('labels.txt', header=None)], axis = 1)
df.columns = ["Reviews","Labels"]
print(df.shape)
df.head()

(25000, 2)


Unnamed: 0,Reviews,Labels
0,bromwell high is a cartoon comedy . it ran at ...,positive
1,story of a man who has unnatural feelings for ...,negative
2,homelessness or houselessness as george carli...,positive
3,airport starts as a brand new luxury pla...,negative
4,brilliant over acting by lesley ann warren . ...,positive


# 0 - Data Preprocessing

In [3]:
df['Reviews'], df['Labels'] = df['Reviews'].str.lower(), df['Labels'].str.lower()

In [4]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [5]:
#Change the labels to 1 if positive & 0 if negative
label_train = list(map(int, list(df['Labels'][:-1000].str.replace('positive','1').str.replace('negative','0'))))
# label_test = list(map(int, list(df['Labels'][-1000:].str.replace('positive','1').str.replace('negative','0'))))

In [6]:
X, y, X_test, y_test = df["Reviews"][:-1000], df["Labels"][:-1000], df["Reviews"][-1000:], df["Labels"][-1000:]

# I - Bag of Words

Bag of words approach stores the count of occurrence of words in the input node

#### Train the model

In [7]:
def train_set(X, y, learnrate):
    
    unique_review = list(set(X.str.cat(sep = " ").split(" ")))
    word2index = {}
    for i, word in enumerate(unique_review):
        word2index[word] = i
        
    correct_so_far = 0

    input_row = np.zeros(len(unique_review))
    weights_0_1 = np.zeros((len(unique_review),10))
    np.random.seed(1)
    weights_1_2 = np.random.normal(0.0, 1**-0.5, 10)

    start_time = time.time()

    for i in range(len(X)):
        input_row *= 0
        for j in X[i].split(' '):
            input_row[word2index[j]] += 1

        layer_1 = np.dot(input_row, weights_0_1)   #No activation function for hidden layer
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        error_term_2 = (layer_2 - label_train[i]) * layer_2 * (1 - layer_2)
        error_term_1 = np.dot(weights_1_2, error_term_2)

        weights_1_2 -= learnrate * error_term_2 * layer_1
        weights_0_1 -= learnrate * error_term_1 * input_row[:,None]  

        # Keep track of correct predictions
        if(layer_2 >= 0.5 and y[i] == 'positive'):
            correct_so_far += 1
        elif(layer_2 < 0.5 and y[i] == 'negative'):
            correct_so_far += 1

        elapsed_time = float(time.time() - start_time)
        reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
        
        sys.stdout.write("\rProgress:" + str(100 * i/float(len(X)))[:4] \
                 + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                 + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                 + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
        if(i % 2500 == 0):
            print("")
    return weights_0_1, weights_1_2, unique_review, word2index

In [8]:
weights_0_1, weights_1_2, unique_review, word2index = train_set(X, y, learnrate = 0.1)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):169.4 #Correct:1251 #Trained:2501 Training Accuracy:50.0%
Progress:20.8% Speed(reviews/sec):164.7 #Correct:2501 #Trained:5001 Training Accuracy:50.0%
Progress:31.2% Speed(reviews/sec):171.0 #Correct:3751 #Trained:7501 Training Accuracy:50.0%
Progress:41.6% Speed(reviews/sec):174.4 #Correct:5001 #Trained:10001 Training Accuracy:50.0%
Progress:52.0% Speed(reviews/sec):177.7 #Correct:6251 #Trained:12501 Training Accuracy:50.0%
Progress:62.5% Speed(reviews/sec):180.5 #Correct:7501 #Trained:15001 Training Accuracy:50.0%
Progress:72.9% Speed(reviews/sec):180.9 #Correct:8751 #Trained:17501 Training Accuracy:50.0%
Progress:83.3% Speed(reviews/sec):181.3 #Correct:10001 #Trained:20001 Training Accuracy:50.0%
Progress:93.7% Speed(reviews/sec):182.7 #Correct:11251 #Trained:22501 Training Accuracy:50.0%
Progress:99.9% Speed(reviews/sec):182.8 #Correct:12000 #Trained:24000 Training Ac

#### Test Dataset Performance

In [9]:
def test_set(X_test, y_test, weights_0_1, weights_1_2, unique_review, word2index):
    correct_so_far = 0
    input_row = np.zeros(len(unique_review))
    start_time = time.time()
    
    for i in range(len(X_test)):
        input_row *= 0
        for j in X_test.iloc[i].split(' '):
            if (j in word2index.keys()):               #This command is required in test function because there can be few
                input_row[word2index[j]] += 1          #new words in test not present in train which can throw error 
                                                       

        layer_1 = np.dot(input_row, weights_0_1)        #No activation function for hidden layer
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        
        if(layer_2 >= 0.5 and y_test.iloc[i] == 'positive'):
            correct_so_far += 1
        elif(layer_2 < 0.5 and y_test.iloc[i] == 'negative'):
            correct_so_far += 1

        elapsed_time = float(time.time() - start_time)
        reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
        sys.stdout.write("\rProgress:" + str(100 * i/float(len(X_test)))[:4] \
                         + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                         + " #Correct:" + str(correct_so_far) + " #Tested:" + str(i+1) \
                         + " Testing Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")

In [10]:
test_set(X_test, y_test, weights_0_1, weights_1_2, unique_review, word2index)

Progress:99.9% Speed(reviews/sec):1269. #Correct:500 #Tested:1000 Testing Accuracy:50.0%

#### Train the model using Learning Rate = 0.01

In [11]:
weights_0_1, weights_1_2, unique_review, word2index = train_set(X, y, learnrate = 0.01)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):184.2 #Correct:1248 #Trained:2501 Training Accuracy:49.9%
Progress:20.8% Speed(reviews/sec):187.9 #Correct:2498 #Trained:5001 Training Accuracy:49.9%
Progress:31.2% Speed(reviews/sec):188.9 #Correct:3748 #Trained:7501 Training Accuracy:49.9%
Progress:41.6% Speed(reviews/sec):188.6 #Correct:4998 #Trained:10001 Training Accuracy:49.9%
Progress:52.0% Speed(reviews/sec):179.1 #Correct:6248 #Trained:12501 Training Accuracy:49.9%
Progress:62.5% Speed(reviews/sec):178.6 #Correct:7497 #Trained:15001 Training Accuracy:49.9%
Progress:72.9% Speed(reviews/sec):178.3 #Correct:8782 #Trained:17501 Training Accuracy:50.1%
Progress:83.3% Speed(reviews/sec):180.0 #Correct:10063 #Trained:20001 Training Accuracy:50.3%
Progress:93.7% Speed(reviews/sec):180.7 #Correct:11313 #Trained:22501 Training Accuracy:50.2%
Progress:99.9% Speed(reviews/sec):181.3 #Correct:12062 #Trained:24000 Training Ac

#### Train the model using Learning Rate = 0.001

In [12]:
weights_0_1, weights_1_2, unique_review, word2index = train_set(X, y, learnrate = 0.001)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):185.4 #Correct:1261 #Trained:2501 Training Accuracy:50.4%
Progress:20.8% Speed(reviews/sec):188.5 #Correct:2577 #Trained:5001 Training Accuracy:51.5%
Progress:31.2% Speed(reviews/sec):189.3 #Correct:3969 #Trained:7501 Training Accuracy:52.9%
Progress:41.6% Speed(reviews/sec):190.5 #Correct:5448 #Trained:10001 Training Accuracy:54.4%
Progress:52.0% Speed(reviews/sec):185.3 #Correct:6996 #Trained:12501 Training Accuracy:55.9%
Progress:62.5% Speed(reviews/sec):184.0 #Correct:8620 #Trained:15001 Training Accuracy:57.4%
Progress:72.9% Speed(reviews/sec):184.9 #Correct:10258 #Trained:17501 Training Accuracy:58.6%
Progress:83.3% Speed(reviews/sec):184.0 #Correct:11926 #Trained:20001 Training Accuracy:59.6%
Progress:93.7% Speed(reviews/sec):182.6 #Correct:13579 #Trained:22501 Training Accuracy:60.3%
Progress:99.9% Speed(reviews/sec):182.0 #Correct:14653 #Trained:24000 Training A

# II - Increase accuracy - Binary Vector

Binary Vector Representation - does not count how many times each word has occured, but rather stores whether or not a word has occured. <br> Input Node - 1 if word is present 0 otherwise

In [13]:
def train_set(X, y, learnrate):
    
    unique_review = list(set(X.str.cat(sep = " ").split(" ")))
    word2index = {}
    for i, word in enumerate(unique_review):
        word2index[word] = i
    
    correct_so_far = 0

    input_row = np.zeros(len(unique_review))
    weights_0_1 = np.zeros((len(unique_review),10))
    np.random.seed(1)
    weights_1_2 = np.random.normal(0.0, 1**-0.5, 10)

    start_time = time.time()

    for i in range(len(X)):
        input_row *= 0
        for j in X[i].split(' '):
            if (j in word2index.keys()):
                input_row[word2index[j]] = 1

        layer_1 = np.dot(input_row, weights_0_1)         #No activation function for hidden layer
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        error_term_2 = (layer_2 - label_train[i]) * layer_2 * (1 - layer_2)
        error_term_1 = np.dot(weights_1_2, error_term_2)

        weights_1_2 -= learnrate * error_term_2 * layer_1
        weights_0_1 -= learnrate * error_term_1 * input_row[:,None]  

        # Keep track of correct predictions
        if(layer_2 >= 0.5 and y[i] == 'positive'):
            correct_so_far += 1
        elif(layer_2 < 0.5 and y[i] == 'negative'):
            correct_so_far += 1

        elapsed_time = float(time.time() - start_time)
        reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
        
        sys.stdout.write("\rProgress:" + str(100 * i/float(len(X)))[:4] \
                 + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                 + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                 + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
        if(i % 2500 == 0):
            print("")
    return weights_0_1, weights_1_2, unique_review, word2index

We are getting high accuracy even for Learning Rate = 0.1

In [14]:
weights_0_1, weights_1_2, unique_review, word2index = train_set(X, y, learnrate = 0.1)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):175.6 #Correct:1786 #Trained:2501 Training Accuracy:71.4%
Progress:20.8% Speed(reviews/sec):176.2 #Correct:3741 #Trained:5001 Training Accuracy:74.8%
Progress:31.2% Speed(reviews/sec):176.1 #Correct:5833 #Trained:7501 Training Accuracy:77.7%
Progress:41.6% Speed(reviews/sec):176.4 #Correct:7973 #Trained:10001 Training Accuracy:79.7%
Progress:52.0% Speed(reviews/sec):174.2 #Correct:10105 #Trained:12501 Training Accuracy:80.8%
Progress:62.5% Speed(reviews/sec):174.3 #Correct:12236 #Trained:15001 Training Accuracy:81.5%
Progress:72.9% Speed(reviews/sec):174.5 #Correct:14351 #Trained:17501 Training Accuracy:82.0%
Progress:83.3% Speed(reviews/sec):174.6 #Correct:16532 #Trained:20001 Training Accuracy:82.6%
Progress:93.7% Speed(reviews/sec):174.7 #Correct:18721 #Trained:22501 Training Accuracy:83.2%
Progress:99.9% Speed(reviews/sec):174.4 #Correct:20043 #Trained:24000 Training

In [15]:
test_set(X_test, y_test, weights_0_1, weights_1_2, unique_review, word2index)

Progress:99.9% Speed(reviews/sec):1032. #Correct:849 #Tested:1000 Testing Accuracy:84.9%

# III - Increase Computation Speed

Let us try to increase training speed without compromising on accuracy
1. Conventionally,
    Layer1 Output = np.dot(Input_nodes, Weights_0_1) <br>
    Since the input_nodes are binary(1,0) in our case we can simply take <br>
    Layer1 Ouput = sum(Weights_0_1) for every node
2. Eliminate unnecessary multiplications and additions - do not use input nodes with 0 in forward and back prop

In [16]:
def train_set(X_raw, y, learnrate):
    
    unique_review = list(set(X_raw.str.cat(sep = " ").split(" ")))
    word2index = {}
    for i, word in enumerate(unique_review):
        word2index[word] = i
    
    correct_so_far = 0

    input_row = np.zeros(len(unique_review))
    weights_0_1 = np.zeros((len(unique_review),10))
    np.random.seed(1)
    weights_1_2 = np.random.normal(0.0, 1**-0.5, 10)

    # Pre-process training reviews so we can deal directly with the indices of non-zero inputs
    X = []
    for i in range(len(X_raw)):
        indices = set()
        for j in X_raw[i].split(" "):
            indices.add(word2index[j])
        X.append(list(indices))

    start_time = time.time()

    for i in range(len(X)):

        layer_1 = np.zeros(10)
        for j in range(10):
            layer_1[j] = sum(weights_0_1[X[i],j])
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        error_term_2 = (layer_2 - label_train[i]) * layer_2 * (1 - layer_2)
        error_term_1 = np.dot(weights_1_2, error_term_2)

        weights_1_2 -= learnrate * error_term_2 * layer_1
        weights_0_1[X[i]] -= learnrate * error_term_1

        # Keep track of correct predictions
        if(layer_2 >= 0.5 and y[i] == 'positive'):
            correct_so_far += 1
        elif(layer_2 < 0.5 and y[i] == 'negative'):
            correct_so_far += 1

        
        if ((i%2500 == 0) | (i == len(X)-1)):
            elapsed_time = float(time.time() - start_time)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(X)))[:4] \
                     + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                     + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                     + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            print("")

    return weights_0_1, weights_1_2, unique_review, word2index

In [17]:
weights_0_1, weights_1_2, unique_review, word2index = train_set(X,y,learnrate = 0.1)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):2616. #Correct:1784 #Trained:2501 Training Accuracy:71.3%
Progress:20.8% Speed(reviews/sec):2574. #Correct:3795 #Trained:5001 Training Accuracy:75.8%
Progress:31.2% Speed(reviews/sec):2586. #Correct:5886 #Trained:7501 Training Accuracy:78.4%
Progress:41.6% Speed(reviews/sec):2608. #Correct:8044 #Trained:10001 Training Accuracy:80.4%
Progress:52.0% Speed(reviews/sec):2603. #Correct:10180 #Trained:12501 Training Accuracy:81.4%
Progress:62.5% Speed(reviews/sec):2601. #Correct:12305 #Trained:15001 Training Accuracy:82.0%
Progress:72.9% Speed(reviews/sec):2597. #Correct:14421 #Trained:17501 Training Accuracy:82.4%
Progress:83.3% Speed(reviews/sec):2595. #Correct:16586 #Trained:20001 Training Accuracy:82.9%
Progress:93.7% Speed(reviews/sec):2594. #Correct:18773 #Trained:22501 Training Accuracy:83.4%
Progress:99.9% Speed(reviews/sec):2594. #Correct:20102 #Trained:24000 Training

In [18]:
test_set(X_test, y_test, weights_0_1, weights_1_2, unique_review, word2index)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):292.1 #Correct:2 #Tested:2 Testing Accuracy:100.%Progress:0.2% Speed(reviews/sec):368.8 #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):466.7 #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):538.2 #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):592.5 #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):633.8 #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):669.1 #Correct:7 #Tested:8 Testing Accuracy:87.5%Progress:0.8% Speed(reviews/sec):699.2 #Correct:8 #Tested:9 Testing Accuracy:88.8%Progress:0.9% Speed(reviews/sec):721.4 #Correct:9 #Tested:10 Testing Accuracy:90.0%Progress:1.0% Speed(reviews/sec):743.7 #Correct:10 #Tested:11 Testing Accuracy:90.9%Progress:1.1% Speed(reviews/sec):711.7 #Correct:11 #Tested:12 Testing Accuracy:91.6%

Progress:99.9% Speed(reviews/sec):1155. #Correct:848 #Tested:1000 Testing Accuracy:84.8%

# IV - Further Enhance Accuracy

#### Consider words which satisfy following conditions,
1. Occurence of words > 50 (max_count)
2. Positive to Negative ratio of each word > 0.05 (polarity_cutoff)
3. Increase number of training epocs

Find the count of words

In [19]:
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

for i,j in zip(X,y):
    if j == "positive":
        for word in i.split(" "):
            positive_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in i.split(" "):
            negative_counts[word] += 1
            total_counts[word] += 1

Find Positive to negative ratio

In [20]:
unique_review = list(set(X.str.cat(sep = " ").split(" ")))

In [21]:
pos_neg_ratio = Counter()
for i in unique_review:
    pos_neg_ratio[i] = positive_counts[i] / float(negative_counts[i]+1)  #if pos_neg_ratio>1, word occurs more in positive reviews
                                                                             #if 0<pos_neg_ratio<1, word occurs more in negative reviews
for word,ratio in pos_neg_ratio.most_common():
    if(ratio > 1):
        pos_neg_ratio[word] = np.log(ratio)                   #Log function shifts the threshold axis
    else:                                                     #if pos_neg_ratio>1, f(pos_neg_ratio) ==> pos_neg_ratio > 0
        pos_neg_ratio[word] = -np.log((1 / (ratio + 0.01)))   #if 0<pos_neg_ratio<1, f(pos_neg_ratio) ==> pos_neg_ratio < 0        
                                                              #Now, pos_neg_ratio>0 if word is positive; pos_neg_ratio<0 if word is negative

Let us inlcude above codes blocks into the function and take input arguments max_count, polarity_cutoff as threasholds

In [22]:
def train_set(X_raw, y, learnrate, max_count, polarity_cutoff):
    
    unique_review = list(set(X_raw.str.cat(sep = " ").split(" ")))
    
    positive_counts = Counter()
    negative_counts = Counter()
    total_counts = Counter()

    for i,j in zip(X_raw,y):
        if j == "positive":
            for word in i.split(" "):
                positive_counts[word] += 1
                total_counts[word] += 1
        else:
            for word in i.split(" "):
                negative_counts[word] += 1
                total_counts[word] += 1 
    
    pos_neg_ratio = Counter()
    for i in unique_review:
        if total_counts[i] > max_count:
            pos_neg_ratio[i] = positive_counts[i] / float(negative_counts[i]+1)  
            
    for word,ratio in pos_neg_ratio.most_common():
        if(ratio > 1):
            pos_neg_ratio[word] = np.log(ratio)                   
        else:                                                     
            pos_neg_ratio[word] = -np.log((1 / (ratio + 0.01)))   
            
    unique_review_new = []
    for i in unique_review:
        if total_counts[i] > max_count:
            if abs(pos_neg_ratio[i]) > polarity_cutoff:
                unique_review_new.append(i)

    
    word2index = {}
    for i, word in enumerate(unique_review_new):
        word2index[word] = i

    correct_so_far = 0

    input_row = np.zeros(len(unique_review_new))
    weights_0_1 = np.zeros((len(unique_review_new),10))
    np.random.seed(1)
    weights_1_2 = np.random.normal(0.0, 1**-0.5, 10)

    X = []
    for i in range(len(X_raw)):
        indices = set()
        for j in X_raw[i].split(" "):
            if (j in word2index.keys()):
                indices.add(word2index[j])
        X.append(list(indices))

    start_time = time.time()

    for i in range(len(X)):

        layer_1 = np.zeros(10)
        for j in range(10):
            layer_1[j] = sum(weights_0_1[X[i],j])
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        error_term_2 = (layer_2 - label_train[i]) * layer_2 * (1 - layer_2)
        error_term_1 = np.dot(weights_1_2, error_term_2)

        weights_1_2 -= learnrate * error_term_2 * layer_1
        weights_0_1[X[i]] -= learnrate * error_term_1

        # Keep track of correct predictions
        if(layer_2 >= 0.5 and y[i] == 'positive'):
            correct_so_far += 1
        elif(layer_2 < 0.5 and y[i] == 'negative'):
            correct_so_far += 1


        if ((i%2500 == 0) | (i == len(X)-1)):
            elapsed_time = float(time.time() - start_time)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(X)))[:4] \
                     + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                     + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                     + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            print("")

    return weights_0_1, weights_1_2, unique_review_new, word2index

In [23]:
weights_0_1, weights_1_2, unique_review, word2index = train_set(X, y, learnrate = 0.1, 
                                                                max_count = 50, polarity_cutoff = 0.05)

Progress:0.0% Speed(reviews/sec):0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):3259. #Correct:1858 #Trained:2501 Training Accuracy:74.2%
Progress:20.8% Speed(reviews/sec):3208. #Correct:3885 #Trained:5001 Training Accuracy:77.6%
Progress:31.2% Speed(reviews/sec):3218. #Correct:5985 #Trained:7501 Training Accuracy:79.7%
Progress:41.6% Speed(reviews/sec):3234. #Correct:8133 #Trained:10001 Training Accuracy:81.3%
Progress:52.0% Speed(reviews/sec):3233. #Correct:10272 #Trained:12501 Training Accuracy:82.1%
Progress:62.5% Speed(reviews/sec):3237. #Correct:12399 #Trained:15001 Training Accuracy:82.6%
Progress:72.9% Speed(reviews/sec):3230. #Correct:14533 #Trained:17501 Training Accuracy:83.0%
Progress:83.3% Speed(reviews/sec):3224. #Correct:16713 #Trained:20001 Training Accuracy:83.5%
Progress:93.7% Speed(reviews/sec):3220. #Correct:18909 #Trained:22501 Training Accuracy:84.0%
Progress:99.9% Speed(reviews/sec):3219. #Correct:20234 #Trained:24000 Training A

In [24]:
test_set(X_test, y_test, weights_0_1, weights_1_2, unique_review, word2index)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):506.9 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):808.5 #Correct:1 #Tested:3 Testing Accuracy:33.3%Progress:0.3% Speed(reviews/sec):1008. #Correct:2 #Tested:4 Testing Accuracy:50.0%Progress:0.4% Speed(reviews/sec):1151. #Correct:3 #Tested:5 Testing Accuracy:60.0%Progress:0.5% Speed(reviews/sec):1257. #Correct:4 #Tested:6 Testing Accuracy:66.6%Progress:0.6% Speed(reviews/sec):1339. #Correct:5 #Tested:7 Testing Accuracy:71.4%Progress:0.7% Speed(reviews/sec):1405. #Correct:6 #Tested:8 Testing Accuracy:75.0%Progress:0.8% Speed(reviews/sec):1459. #Correct:7 #Tested:9 Testing Accuracy:77.7%Progress:0.9% Speed(reviews/sec):1388. #Correct:8 #Tested:10 Testing Accuracy:80.0%Progress:1.0% Speed(reviews/sec):1431. #Correct:9 #Tested:11 Testing Accuracy:81.8%Progress:1.1% Speed(reviews/sec):1465. #Correct:10 #Tested:12 Testing Accuracy:83.3%P

Progress:99.9% Speed(reviews/sec):2471. #Correct:849 #Tested:1000 Testing Accuracy:84.9%

In [25]:
weights_0_1, weights_1_2, unique_review, word2index = train_set(X, y, learnrate = 0.1, 
                                                                max_count = 50, polarity_cutoff = 0.8)

Progress:0.0% Speed(reviews/sec):0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):10864 #Correct:2088 #Trained:2501 Training Accuracy:83.4%
Progress:20.8% Speed(reviews/sec):10746 #Correct:4205 #Trained:5001 Training Accuracy:84.0%
Progress:31.2% Speed(reviews/sec):10729 #Correct:6327 #Trained:7501 Training Accuracy:84.3%
Progress:41.6% Speed(reviews/sec):10669 #Correct:8483 #Trained:10001 Training Accuracy:84.8%
Progress:52.0% Speed(reviews/sec):10662 #Correct:10615 #Trained:12501 Training Accuracy:84.9%
Progress:62.5% Speed(reviews/sec):10683 #Correct:12759 #Trained:15001 Training Accuracy:85.0%
Progress:72.9% Speed(reviews/sec):10656 #Correct:14883 #Trained:17501 Training Accuracy:85.0%
Progress:83.3% Speed(reviews/sec):10659 #Correct:17056 #Trained:20001 Training Accuracy:85.2%
Progress:93.7% Speed(reviews/sec):10650 #Correct:19227 #Trained:22501 Training Accuracy:85.4%
Progress:99.9% Speed(reviews/sec):10654 #Correct:20521 #Trained:24000 Training A

In [26]:
test_set(X_test, y_test, weights_0_1, weights_1_2, unique_review, word2index)

Progress:99.9% Speed(reviews/sec):3357. #Correct:808 #Tested:1000 Testing Accuracy:80.8%

Use hyper parameter epoch = 20 to train the model

In [27]:
def train_set_epochs(X_raw, y, learnrate, max_count, polarity_cutoff):

    positive_counts = Counter()
    negative_counts = Counter()
    total_counts = Counter()

    for i,j in zip(X_raw,y):
        if j == "positive":
            for word in i.split(" "):
                positive_counts[word] += 1
                total_counts[word] += 1
        else:
            for word in i.split(" "):
                negative_counts[word] += 1
                total_counts[word] += 1 

    pos_neg_ratio = Counter()
    for i in unique_review:
        if total_counts[i] > max_count:
            pos_neg_ratio[i] = positive_counts[i] / float(negative_counts[i]+1)

    for word,ratio in pos_neg_ratio.most_common():
        if(ratio > 1):
            pos_neg_ratio[word] = np.log(ratio)
        else:
            pos_neg_ratio[word] = -np.log((1 / (ratio + 0.01)))

            
    unique_review_new = []
    for i in unique_review:
        if total_counts[i] > max_count:
            if abs(pos_neg_ratio[i]) > polarity_cutoff:
                unique_review_new.append(i)

    
    word2index = {}
    for i, word in enumerate(unique_review_new):
        word2index[word] = i

    

    input_row = np.zeros(len(unique_review_new))
    weights_0_1 = np.zeros((len(unique_review_new),10))
    np.random.seed(1)
    weights_1_2 = np.random.normal(0.0, 1**-0.5, 10)

    X = []
    for i in range(len(X_raw)):
        indices = set()
        for j in X_raw[i].split(" "):
            if (j in word2index.keys()):
                indices.add(word2index[j])
        X.append(list(indices))

    start_time = time.time()
    epochs = 20
    for di in range(epochs):
        correct_so_far = 0
        for i in range(len(X)):

            layer_1 = np.zeros(10)
            for j in range(10):
                layer_1[j] = sum(weights_0_1[X[i],j])
            layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

            error_term_2 = (layer_2 - label_train[i]) * layer_2 * (1 - layer_2)
            error_term_1 = np.dot(weights_1_2, error_term_2)

            weights_1_2 -= learnrate * error_term_2 * layer_1
            weights_0_1[X[i]] -= learnrate * error_term_1

            # Keep track of correct predictions
            if(layer_2 >= 0.5 and y[i] == 'positive'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and y[i] == 'negative'):
                correct_so_far += 1


            if ((i == len(X)-1) & (di%2 == 0)):
                elapsed_time = float(time.time() - start_time)
                reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
                print("\rEpoch:" + str(di) \
                         + " Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                         + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                         + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")

    return weights_0_1, weights_1_2, unique_review_new, word2index

In [28]:
weights_0_1, weights_1_2, unique_review, word2index = train_set_epochs(X, y, learnrate = 0.1, 
                                                                max_count = 50, polarity_cutoff = 0.05)

Epoch:0 Speed(reviews/sec):10912 #Correct:20521 #Trained:24000 Training Accuracy:85.5%
Epoch:2 Speed(reviews/sec):3662. #Correct:21145 #Trained:24000 Training Accuracy:88.1%
Epoch:4 Speed(reviews/sec):2179. #Correct:21298 #Trained:24000 Training Accuracy:88.7%
Epoch:6 Speed(reviews/sec):1557. #Correct:21382 #Trained:24000 Training Accuracy:89.0%
Epoch:8 Speed(reviews/sec):1213. #Correct:21430 #Trained:24000 Training Accuracy:89.2%
Epoch:10 Speed(reviews/sec):992.3 #Correct:21488 #Trained:24000 Training Accuracy:89.5%
Epoch:12 Speed(reviews/sec):839.5 #Correct:21517 #Trained:24000 Training Accuracy:89.6%
Epoch:14 Speed(reviews/sec):727.5 #Correct:21558 #Trained:24000 Training Accuracy:89.8%
Epoch:16 Speed(reviews/sec):642.5 #Correct:21579 #Trained:24000 Training Accuracy:89.9%
Epoch:18 Speed(reviews/sec):575.3 #Correct:21585 #Trained:24000 Training Accuracy:89.9%


In [29]:
test_set(X_test, y_test, weights_0_1, weights_1_2, unique_review, word2index)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Tested:1 Testing Accuracy:0.0%Progress:0.1% Speed(reviews/sec):498.7 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):798.2 #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):987.2 #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):1139. #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):1424. #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):1496. #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):1551. #Correct:7 #Tested:8 Testing Accuracy:87.5%Progress:0.8% Speed(reviews/sec):1773. #Correct:8 #Tested:9 Testing Accuracy:88.8%Progress:0.9% Speed(reviews/sec):1627. #Correct:9 #Tested:10 Testing Accuracy:90.0%Progress:1.0% Speed(reviews/sec):1662. #Correct:10 #Tested:11 Testing Accuracy:90.9%Progress:1.1% Speed(reviews/sec):1681. #Correct:11 #Tested:12 Testing Accuracy:91.6%P

Progress:62.9% Speed(reviews/sec):3383. #Correct:520 #Tested:630 Testing Accuracy:82.5%Progress:63.0% Speed(reviews/sec):3370. #Correct:520 #Tested:631 Testing Accuracy:82.4%Progress:63.1% Speed(reviews/sec):3376. #Correct:521 #Tested:632 Testing Accuracy:82.4%Progress:63.2% Speed(reviews/sec):3381. #Correct:521 #Tested:633 Testing Accuracy:82.3%Progress:63.3% Speed(reviews/sec):3386. #Correct:522 #Tested:634 Testing Accuracy:82.3%Progress:63.4% Speed(reviews/sec):3374. #Correct:522 #Tested:635 Testing Accuracy:82.2%Progress:63.5% Speed(reviews/sec):3379. #Correct:523 #Tested:636 Testing Accuracy:82.2%Progress:63.6% Speed(reviews/sec):3366. #Correct:524 #Tested:637 Testing Accuracy:82.2%Progress:63.7% Speed(reviews/sec):3372. #Correct:525 #Tested:638 Testing Accuracy:82.2%Progress:63.8% Speed(reviews/sec):3377. #Correct:526 #Tested:639 Testing Accuracy:82.3%Progress:63.9% Speed(reviews/sec):3382. #Correct:527 #Tested:640 Testing Accuracy:82.3%Progress:64.0% Speed(reviews/se

# V - Analysing behaviour of Weights

In [30]:
weights_0_1, weights_1_2, unique_review, word2index = train_set(X, y, learnrate = 0.01, 
                                                                max_count = 0, polarity_cutoff = 0.0)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):2578. #Correct:1962 #Trained:2501 Training Accuracy:78.4%
Progress:20.8% Speed(reviews/sec):2548. #Correct:4002 #Trained:5001 Training Accuracy:80.0%
Progress:31.2% Speed(reviews/sec):2561. #Correct:6120 #Trained:7501 Training Accuracy:81.5%
Progress:41.6% Speed(reviews/sec):2583. #Correct:8271 #Trained:10001 Training Accuracy:82.7%
Progress:52.0% Speed(reviews/sec):2583. #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:62.5% Speed(reviews/sec):2586. #Correct:12565 #Trained:15001 Training Accuracy:83.7%
Progress:72.9% Speed(reviews/sec):2583. #Correct:14670 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):2577. #Correct:16833 #Trained:20001 Training Accuracy:84.1%
Progress:93.7% Speed(reviews/sec):2575. #Correct:19015 #Trained:22501 Training Accuracy:84.5%
Progress:99.9% Speed(reviews/sec):2573. #Correct:20335 #Trained:24000 Training

In [31]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in word2index.keys():
        most_similar[word] = np.dot(weights_0_1[word2index[word]],weights_0_1[word2index[focus]])
    
    return most_similar.most_common()

In [1]:
get_most_similar_words("excellent")[:30]

NameError: name 'get_most_similar_words' is not defined

In [33]:
get_most_similar_words("terrible")

[('worst', 0.16966107259049845),
 ('awful', 0.1202684701969124),
 ('waste', 0.11945367265311005),
 ('poor', 0.09275888757443548),
 ('terrible', 0.09142538719772791),
 ('dull', 0.0842092716782236),
 ('poorly', 0.08124154451604203),
 ('disappointment', 0.08006475962136869),
 ('fails', 0.0785997737233375),
 ('disappointing', 0.07733948548032336),
 ('boring', 0.07712785874801288),
 ('unfortunately', 0.07550244970585906),
 ('worse', 0.07060183536419468),
 ('mess', 0.07056429962359041),
 ('stupid', 0.06948482283254304),
 ('badly', 0.06688890366622856),
 ('annoying', 0.06568702190337417),
 ('bad', 0.06309381453757214),
 ('save', 0.06288059749586572),
 ('disappointed', 0.06269235381207285),
 ('wasted', 0.06138718302805127),
 ('supposed', 0.060985452957725166),
 ('horrible', 0.060121772339380125),
 ('laughable', 0.05869840628546764),
 ('crap', 0.05810452866788457),
 ('basically', 0.057218840369636155),
 ('nothing', 0.0571582200430342),
 ('ridiculous', 0.05690548106893143),
 ('lacks', 0.05576656

In [34]:
unique_review = list(set(X.str.cat(sep = " ").split(" ")))
word2index = {}
for i, word in enumerate(unique_review):
    word2index[word] = i

In [35]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in pos_neg_ratio.most_common(500):
    if(word in word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(pos_neg_ratio.most_common()))[0:500]:
    if(word in word2index.keys()):
        words_to_visualize.append(word)

In [36]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratio.keys():
        vectors_list.append(weights_0_1[word2index[word]])
        if(pos_neg_ratio[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")

In [37]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [38]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [39]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words