In [20]:
import codecs
import io
import itertools
import pickle
import random
from collections import Counter

import nltk
import numpy as numpy
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')

totalLinesToRead = 100
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Get lemmas

In [21]:
def lematizeFile(fileName):
    lex = []
    with open(fileName, 'r') as f:
        contains = f.readlines()
        for line in contains[:totalLinesToRead]:
            words = word_tokenize(line.decode('utf-8', 'replace'))
            lex += list(words)
    return [lemmatizer.lemmatize(i) for i in lex]

In [22]:
positiveLemmas = lematizeFile('./data/netflix-train-pos')
negativeLemmas = lematizeFile('./data/netflix-train-neg')

In [23]:
print("First lemma " + positiveLemmas[0])

First lemma I


In [24]:
print("Positive lemmas: " + str(len(positiveLemmas)))
print("Negative lemmas: " + str(len(negativeLemmas)))

Positive lemmas: 29612
Negative lemmas: 27430


In [25]:
lemmas = positiveLemmas + negativeLemmas

# Build the vocabulary (keep lemmas that occur more than 50 and fewer than 1000 times)

In [26]:
words = Counter(lemmas)
vocabulary = []
for word in words:
    if 50 < words[word] < 1000:
            vocabulary.append(word)

In [27]:
print("Vocabulary: " + str(len(vocabulary)))
print("Sample: " + vocabulary[0])

Vocabulary: 131
Sample: how


# Make bag-of-words (token count) features

In [28]:
def samples(classif, lex, section):
    features = []
    with open(section, 'r') as file1:
        contents = file1.readlines()
        for line in contents[:totalLinesToRead]:
            words = word_tokenize(line.decode('utf-8', 'replace').lower())
            words = [lemmatizer.lemmatize(i) for i in words]
            feats = numpy.zeros(len(lex))
            for w in words:
                if w.lower() in lex:
                    index = lex.index(w.lower())
                    feats[index] += 1
            feats = list(feats)
            features.append([feats, classif])
            
    return features

In [29]:
positive_features = samples([1,0], vocabulary, './data/netflix-train-pos')
negative_features = samples([0,1], vocabulary, './data/netflix-train-neg')

In [30]:
print("Positive features size: " + str(len(positive_features)))
print("Negative features size: " + str(len(negative_features)))

Positive features size: 100
Negative features size: 100


## Create feature list and shuffle it

In [31]:
testRation=0.1

features = positive_features + negative_features
random.shuffle(features)
features = np.array(features)

testSize = int(len(features)*testRation)

training_positive = list(features[:,0][:-testSize])
training_negative = list(features[:,1][:-testSize])

testing_positive = list(features[:,0][-testSize:])
testing_negative = list(features[:,1][-testSize:])

## Save to Pickle

In [32]:
with open('sentiment_set.pickle', 'wb') as f:
    pickle.dump([training_positive, training_negative, testing_positive, testing_negative], f)

## Create Sentiment Neural Network

In [33]:
n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500

n_classes = 2
batch_size = 100

x = tf.placeholder('float', [None, len(training_positive[0])])
y = tf.placeholder('float')

def neural_network_model(data):
    hidden_1_layer = {'weights':tf.Variable(tf.random_normal([len(training_positive[0]), n_nodes_hl1])),
                      'biases':tf.Variable(tf.random_normal([n_nodes_hl1]))}

    hidden_2_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                      'biases':tf.Variable(tf.random_normal([n_nodes_hl2]))}

    hidden_3_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                      'biases':tf.Variable(tf.random_normal([n_nodes_hl3]))}

    output_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                    'biases':tf.Variable(tf.random_normal([n_classes])),}


    l1 = tf.add(tf.matmul(data,hidden_1_layer['weights']), hidden_1_layer['biases'])
    l1 = tf.nn.relu(l1)

    l2 = tf.add(tf.matmul(l1,hidden_2_layer['weights']), hidden_2_layer['biases'])
    l2 = tf.nn.relu(l2)

    l3 = tf.add(tf.matmul(l2,hidden_3_layer['weights']), hidden_3_layer['biases'])
    l3 = tf.nn.relu(l3)

    output = tf.matmul(l3,output_layer['weights']) + output_layer['biases']

    return output

def train_neural_network(x):
    prediction = neural_network_model(x)
    cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y) )
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    hm_epochs = 10
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(hm_epochs):
            epoch_loss = 0
            
            i = 0
            while i < len(training_positive):
                start = i
                end = i+batch_size
                batch_one = numpy.array(training_positive[start:end])
                batch_two = numpy.array(training_negative[start:end])
                
                _, c = sess.run([optimizer, cost], feed_dict={x: batch_one, y: batch_two})
                epoch_loss += c
                i += batch_size

            print('Epoch', epoch+1, 'completed out of',hm_epochs,'loss:',epoch_loss)

        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))

        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy:',accuracy.eval({x:testing_positive, y:testing_negative}))

train_neural_network(x)

('Epoch', 1, 'completed out of', 10, 'loss:', 67842.4140625)
('Epoch', 2, 'completed out of', 10, 'loss:', 49319.125)
('Epoch', 3, 'completed out of', 10, 'loss:', 34726.078125)
('Epoch', 4, 'completed out of', 10, 'loss:', 25761.6884765625)
('Epoch', 5, 'completed out of', 10, 'loss:', 20273.4462890625)
('Epoch', 6, 'completed out of', 10, 'loss:', 15916.265625)
('Epoch', 7, 'completed out of', 10, 'loss:', 12739.115234375)
('Epoch', 8, 'completed out of', 10, 'loss:', 9564.52001953125)
('Epoch', 9, 'completed out of', 10, 'loss:', 6771.039306640625)
('Epoch', 10, 'completed out of', 10, 'loss:', 4458.547119140625)
('Accuracy:', 0.55000001)


## Using more data

In [37]:
import io

def init_process(fin,fout):
    outfile = open(fout,'a')
    with io.open(fin, buffering=200000, encoding='latin-1') as f:
        try:
            for line in f:
                line = line.replace('"','')
                initial_polarity = line.split(',')[0]
                if initial_polarity == '0':
                    initial_polarity = [1,0]
                elif initial_polarity == '4':
                    initial_polarity = [0,1]

                tweet = line.split(',')[-1]
                outline = str(initial_polarity)+':::'+tweet
                outfile.write(outline)
        except Exception as e:
            print(str(e))
    outfile.close()

## Converting the new Sentiment140 data

In [40]:
init_process('./data/training.1600000.processed.noemoticon.csv','train_set.csv')
init_process('./data/testdata.manual.2009.06.14.csv','test_set.csv')

'ascii' codec can't encode characters in position 12-14: ordinal not in range(128)
