In [None]:
import pandas as pd
import numpy as np

In [84]:
am_data = pd.read_csv("amazon_cells_labelled.txt", sep="\t", names=["text_review","sentiment"])
im_data = pd.read_csv("imdb_labelled.txt", sep="\t", names=["text_review", "sentiment"])
ye_data = pd.read_csv("yelp_labelled.txt", sep="\t", names=["text_review", "sentiment"])
print(am_data.head())
print(im_data.head())
print(ye_data.head())
print(ye_data.shape[0])

                                         text_review  sentiment
0  So there is no way for me to plug it in here i...          0
1                        Good case, Excellent value.          1
2                             Great for the jawbone.          1
3  Tied to charger for conversations lasting more...          0
4                                  The mic is great.          1
                                         text_review  sentiment
0  A very, very, very slow-moving, aimless movie ...          0
1  Not sure who was more lost - the flat characte...          0
2  Attempting artiness with black & white and cle...          0
3       Very little music or anything to speak of.            0
4  The best scene in the movie was when Gerardo i...          1
                                         text_review  sentiment
0                           Wow... Loved this place.          1
1                                 Crust is not good.          0
2          Not tasty and the texture was

In [85]:
# Concatenate the 3 dataframes
frames = [am_data, im_data, ye_data]
data = pd.concat(frames)

In [86]:
print(data.shape[0])

2748


In [87]:
import sys
import os
# Creating the positive and negative files
pos_file = open("pos.txt", 'a')
neg_file = open("neg.txt", 'a')
for index, row in data.iterrows():
    if row['sentiment'] == 0:
        neg_file.write(row['text_review'].lower())
        neg_file.write("\n")
    else:
        pos_file.write(row['text_review'].lower())
        pos_file.write("\n")


for filename in os.listdir("positive"):
    if filename.endswith(".txt"):
        with open("positive/"+filename) as infile:
            for line in infile:
                pos_file.write(line)
            
for filename in os.listdir("negative"):
    if filename.endswith(".txt"):
        with open("negative/"+filename) as infile:
            for line in infile:
                neg_file.write(line)
                neg_file.write("\n")
                
for filename in os.listdir("pos_test"):
    if filename.endswith(".txt"):
        with open("pos_test/"+filename) as infile:
            for line in infile:
                pos_file.write(line)
            
for filename in os.listdir("neg_test"):
    if filename.endswith(".txt"):
        with open("neg_test/"+filename) as infile:
            for line in infile:
                neg_file.write(line)
                neg_file.write("\n")
                


pos_file.close()
neg_file.close()

In [88]:
texts = []
labels = []

with open("pos.txt") as infile:
    for line in infile:
        texts.append(line)
        labels.append(0)

with open("neg.txt") as infile:
    for line in infile:
        texts.append(line)
        labels.append(1)

In [89]:
print(len(texts), len(labels))

28525 28525


In [90]:
# Declaration of constants
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [91]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)



Found 126319 unique tokens.
Shape of data tensor: (28525, 1000)
Shape of label tensor: (28525, 2)


In [92]:
# Split the data into training set and validation set
indices = np.arange(data.shape[0])
# Shuffle the data  and labels so that positive and negative are not clubbed together
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [93]:
print(x_train.shape[0], x_val.shape[0])

22820 5705


In [94]:
# Preparing the embedding layer
embeddings_index  = {}
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [95]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [96]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [None]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras import regularizers

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dropout(0.2)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=128,
          epochs=30,
          validation_data=(x_val, y_val))

Train on 22820 samples, validate on 5705 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x113a02ac8>

In [None]:
text = "Amazing museum!"
text = np.array([text.lower()])
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(text)
sequence = tokenizer.texts_to_sequences(text)
data = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
prediction = model.predict(np.array(data))
print(prediction)