In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import random
import pickle
from collections import Counter
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
hm_lines = 100000

In [4]:
def create_lexicon(pos,neg):

    lexicon = []
    with open(pos,'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            all_words = word_tokenize(l)
            lexicon += list(all_words)

    with open(neg,'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            all_words = word_tokenize(l)
            lexicon += list(all_words)
            
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    l2 = []
    for w in w_counts:
        #print(w_counts[w])
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    print(len(l2))
    return l2

In [5]:
def sample_handling(sample,lexicon,classification):

    featureset = []

    with open(sample,'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1

            features = list(features)
            featureset.append([features,classification])

    return featureset

In [6]:
def create_feature_sets_and_labels(pos,neg,test_size = 0.1):
    lexicon = create_lexicon(pos,neg)
    features = []
    features += sample_handling('pos.txt',lexicon,[1,0])
    features += sample_handling('neg.txt',lexicon,[0,1])
    random.shuffle(features)
    features = np.array(features)

    testing_size = int(test_size*len(features))

    train_x = list(features[:,0][:-testing_size])
    train_y = list(features[:,1][:-testing_size])
    test_x = list(features[:,0][-testing_size:])
    test_y = list(features[:,1][-testing_size:])

    return train_x,train_y,test_x,test_y

In [7]:
train_x,train_y,test_x,test_y = create_feature_sets_and_labels('pos.txt','neg.txt')

423


# Defining the Neural Network

In [12]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from tqdm import tqdm
# Defining the model parameters

# Defining the number of nodes in the hidden layers 
n_nodes_h1 = 500
n_nodes_h2 = 500
n_nodes_h3 = 500

# Number of classes to classify the input data
n_classes = 2
# In order to handle the large data size, we will only concentrate on a batch of data
batch_size = 100

length = len(train_x[0])
# Defining the output as well as the input data
x = tf.placeholder('float', [None, length])
y = tf.placeholder('float')

In [None]:
# Deining the Neural Network Model
def neural_network_model(data):
    
    
    # Defining the weights for the first hidden layer with the shape no_inputs X n_nodes_h1
    hidden_1_layer = {'weights': tf.Variable(tf.random_normal([length, n_nodes_h1])), 
                     'biases': tf.Variable(tf.random_normal([1,n_nodes_h1]))}
    
    hidden_2_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_h1, n_nodes_h2])), 
                     'biases': tf.Variable(tf.random_normal([1,n_nodes_h2]))}
    
    hidden_3_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_h2, n_nodes_h3])), 
                     'biases': tf.Variable(tf.random_normal([1,n_nodes_h3]))}
    
    output_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_h3, n_classes])), 
                     'biases': tf.Variable(tf.random_normal([1,n_classes]))}
    
    
   
    
    # Implementing the feedforward
    # input_data*weights + biases
    # Then pass this input through a ReLu    
    
    l1 = tf.add(tf.matmul(data,hidden_1_layer['weights']), hidden_1_layer['biases'])
    l1 = tf.nn.relu(l1)
    
    l2 = tf.add(tf.matmul(l1,hidden_2_layer['weights']), hidden_2_layer['biases'])
    l2 = tf.nn.relu(l2)
    
    l3 = tf.add(tf.matmul(l2,hidden_3_layer['weights']), hidden_3_layer['biases'])
    l3 = tf.nn.relu(l3)
    
    output = tf.matmul(l3,output_layer['weights']) + output_layer['biases']

    return output

saver = tf.train.Saver()
tf_log = 'tf.log'

In [14]:
# Training the neural network
def train_neural_network(x,y):
    prediction = neural_network_model(x)
    # Calculating the cost function
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels= y))
    # Optimizing the cost function : Using the AdamOptimizer and the default learning rate of 0.01
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    # Defining the number of epochs : Cycles of feedforward + backprop
    hm_epochs = 10
    
    # Running the tensorflow session
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Running through the epochs and training the network
        for epoch in range(hm_epochs):
            epoch_loss = 0
            i =0
            while i < len(train_x):
                start = i
                end = batch_size + i
                x_t = train_x[start:end]
                y_t = train_y[start:end]
                # Run the optimizer on the cost function
                sess.run(optimizer, feed_dict={x: x_t, y: y_t})
            print("Epoch ", epoch, "completed out of ", hm_epochs)
            
        # Testing the network
        correct = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print("Accuracy" , accuracy.eval({x: test_x, y: test_y})) 

In [None]:
train_neural_network(x,y)