# Process data cell

In [9]:
# Load the Google Drive helper so files stored on Drive can be read from this notebook
from google.colab import drive

# This will prompt for authorization, then expose Drive under /content/drive
drive.mount('/content/drive')

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter
import io

# Shared lemmatizer used by create_lexicon and sample_handling to normalize tokens.
lemmatizer = WordNetLemmatizer()
# Upper bound on the number of lines read from each input file.
hm_lines = 10000000

def create_lexicon(pos, neg):
  """Build the vocabulary used to vectorize samples.

  Reads up to ``hm_lines`` lines from each of the two files, lower-cases and
  tokenizes every line, lemmatizes the tokens and counts them. Only lemmas
  whose total count is strictly between 50 and 1000 are kept.

  Args:
    pos: path of the first text file (opened with cp437 encoding).
    neg: path of the second text file (opened with cp437 encoding).

  Returns:
    List of retained lemmas, ordered by first appearance. Its length is
    also printed.
  """
  lemma_counts = Counter()
  for path in (pos, neg):
    with io.open(path, 'r', encoding='cp437') as handle:
      for line in handle.readlines()[:hm_lines]:
        tokens = word_tokenize(line.lower())
        # Counter keeps first-seen order, so counting line by line yields
        # the same key order as counting one big concatenated list.
        lemma_counts.update(lemmatizer.lemmatize(token) for token in tokens)

  kept = [lemma for lemma, freq in lemma_counts.items() if 50 < freq < 1000]
  print(len(kept))
  return kept
  
def sample_handling(sample, lexicon, classification):
  """Turn every line of a text file into a bag-of-words count vector.

  Each of the first ``hm_lines`` lines is lower-cased, tokenized and
  lemmatized; every token found in ``lexicon`` increments the slot at that
  word's position in the lexicon.

  Args:
    sample: path of the text file to vectorize (opened with cp437 encoding).
    lexicon: list of words; position in the list is the feature index.
    classification: label attached, as-is, to every line of this file.

  Returns:
    List of ``[features, classification]`` pairs, one per line, where
    ``features`` is a list of ``len(lexicon)`` counts.
  """
  # Build the word -> index table once instead of scanning the lexicon list
  # twice (`in` + `.index`) for every token. setdefault keeps the FIRST
  # position of a repeated word, matching what list.index() returned.
  word_to_index = {}
  for position, entry in enumerate(lexicon):
    word_to_index.setdefault(entry, position)

  featureset = []

  with io.open(sample, 'r', encoding='cp437') as f:
    contents = f.readlines()
    for l in contents[:hm_lines]:
      current_words = word_tokenize(l.lower())
      current_words = [lemmatizer.lemmatize(i) for i in current_words]
      features = np.zeros(len(lexicon))
      for word in current_words:
        index_value = word_to_index.get(word.lower())
        if index_value is not None:
          features[index_value] += 1
      features = list(features)
      featureset.append([features, classification])

  return featureset

def create_feature_sets_and_labels(pos, neg, test_size=0.1):
  """Vectorize both corpora, shuffle them and split into train/test sets.

  Labels are one-hot: ``[1, 0]`` for lines of ``pos`` and ``[0, 1]`` for
  lines of ``neg``, so a prediction can be checked by comparing the argmax
  of the network output with the argmax of the label.

  Args:
    pos: path of the positive-sample text file.
    neg: path of the negative-sample text file.
    test_size: fraction of the shuffled samples held out for testing.

  Returns:
    Tuple ``(train_x, train_y, test_x, test_y)``; the ``*_x`` entries are
    lists of count vectors and the ``*_y`` entries are lists of labels.
  """
  lexicon = create_lexicon(pos, neg)
  features = []
  # Vectorize the files that were actually passed in; the lexicon is built
  # from these same paths, so features and vocabulary always match.
  features += sample_handling(pos, lexicon, [1, 0])
  features += sample_handling(neg, lexicon, [0, 1])
  random.shuffle(features)

  testing_size = int(test_size*len(features))
  # Split on an explicit index: slicing with -testing_size breaks when
  # testing_size is 0 ([:-0] is empty and [-0:] is the whole list).
  split_at = len(features) - testing_size

  # Plain list slicing avoids np.array() on ragged [vector, label] pairs,
  # which newer NumPy releases refuse to build without dtype=object.
  train_x = [vector for vector, _ in features[:split_at]]
  train_y = [label for _, label in features[:split_at]]

  test_x = [vector for vector, _ in features[split_at:]]
  test_y = [label for _, label in features[split_at:]]

  return train_x, train_y, test_x, test_y

Mounted at /content/drive


# Feed the data into the neural network

In [10]:
import tensorflow as tf

# Input corpora on the mounted Drive; each file is read line by line.
pos_path = '/content/drive/My Drive/Colab Notebooks/ProcessingOurOwnData/pos.txt'
neg_path = '/content/drive/My Drive/Colab Notebooks/ProcessingOurOwnData/neg.txt'
# Build the lexicon, vectorize both files and split into train/test sets.
train_x, train_y, test_x, test_y = create_feature_sets_and_labels(pos_path, neg_path)

# Width of each of the three hidden layers.
n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500

# Two output classes, matching the two-element one-hot labels.
n_classes = 2
# Number of samples fed to the optimizer per training step.
batch_size = 100

# Input placeholder: any number of rows, one column per lexicon entry.
x = tf.placeholder('float', [None, len(train_x[0])])
# Label placeholder; no static shape is declared.
y = tf.placeholder('float')

def neural_network_model(data):
    """Build the feed-forward graph: three ReLU hidden layers, linear output.

    Layer widths come from the module-level settings (``train_x`` for the
    input width, ``n_nodes_hl1..3`` and ``n_classes``). All weights and
    biases are ``tf.Variable``s initialized with ``tf.random_normal``.

    Args:
        data: input tensor with one column per feature.

    Returns:
        Tensor of un-normalized class scores (logits).
    """
    # Input width followed by the width of every layer, in order.
    widths = [len(train_x[0]), n_nodes_hl1, n_nodes_hl2, n_nodes_hl3, n_classes]

    # Create every layer's parameters up front, weights before biases.
    layers = [
        {'weights': tf.Variable(tf.random_normal([fan_in, fan_out])),
         'biases': tf.Variable(tf.random_normal([fan_out]))}
        for fan_in, fan_out in zip(widths[:-1], widths[1:])
    ]
    hidden_layers = layers[:-1]
    output_layer = layers[-1]

    # Chain the hidden layers: affine transform followed by ReLU.
    activation = data
    for layer in hidden_layers:
        pre_activation = tf.add(tf.matmul(activation, layer['weights']), layer['biases'])
        activation = tf.nn.relu(pre_activation)

    # The output layer stays linear; softmax is applied by the loss.
    return tf.matmul(activation, output_layer['weights']) + output_layer['biases']

def train_neural_network(x):
    """Train the network on the training split and print test accuracy.

    Builds the model on ``x``, minimizes the mean softmax cross-entropy
    against the module-level label placeholder ``y`` with Adam, and runs
    10 epochs of mini-batches of ``batch_size`` samples over ``train_x`` /
    ``train_y``. The summed batch loss is printed after every epoch and the
    accuracy on ``test_x`` / ``test_y`` is printed at the end.

    Args:
        x: input placeholder the model is built on and batches are fed to.
    """
    logits = neural_network_model(x)
    per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y)
    cost = tf.reduce_mean(per_sample_loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    hm_epochs = 10
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(hm_epochs):
            epoch_loss = 0

            # Walk the training set in consecutive batch_size-sized slices;
            # the last slice may be shorter.
            for start in range(0, len(train_x), batch_size):
                stop = start + batch_size
                feed = {x: np.array(train_x[start:stop]),
                        y: np.array(train_y[start:stop])}
                _, batch_cost = sess.run([optimizer, cost], feed_dict=feed)
                epoch_loss += batch_cost

            print('Epoch', epoch+1, 'completed out of',hm_epochs,'loss:',epoch_loss)

        # A sample is a hit when predicted and expected class indices agree.
        hits = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(hits, 'float'))
        print('Accuracy:',accuracy.eval({x:test_x, y:test_y}))

# Build the model on the input placeholder, train it and print the test accuracy.
train_neural_network(x)

423
Epoch 1 completed out of 10 loss: 246572.97900390625
Epoch 2 completed out of 10 loss: 116631.92590332031
Epoch 3 completed out of 10 loss: 71234.6969909668
Epoch 4 completed out of 10 loss: 48191.06187438965
Epoch 5 completed out of 10 loss: 34388.94721984863
Epoch 6 completed out of 10 loss: 25208.71479034424
Epoch 7 completed out of 10 loss: 19681.99220275879
Epoch 8 completed out of 10 loss: 18565.751735687256
Epoch 9 completed out of 10 loss: 22584.425861358643
Epoch 10 completed out of 10 loss: 17088.36088323593
Accuracy: 0.5694184
