In [80]:
import pandas as pd
import numpy as np
import re
import nltk

nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
english_stemmer=nltk.stem.SnowballStemmer('english')
from nltk import word_tokenize
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from gensim import models
from keras.models import Model

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
data = pd.read_csv('imdb_labelled.tsv', 
                   header = None, 
                   delimiter='\t')
data.columns = ['Text', 'Label']
data.head()
print(data.shape)

(748, 2)


In [22]:
data.Label.value_counts()

1    386
0    362
Name: Label, dtype: int64

In [31]:
def data_clean(rev, remove_stopwords=True):
  new_text = re.sub("[^a-zA-Z]"," ", rev)
  words = new_text.lower().split()
  if remove_stopwords:
        sts = set(stopwords.words("english"))
        words = [w for w in words if not w in sts]
  ary=[]
  eng_stemmer = english_stemmer 
  for word in words:
      ary.append(eng_stemmer.stem(word))  #stem the words to retain the root/base word
  return(ary)

data['Text_Clean'] = data['Text'].apply(lambda x: data_clean(x))
data['Text_Final'] = [' '.join(sen) for sen in data['Text_Clean']]
data['tokens'] = data['Text_Clean']

In [32]:
pos=[]
neg=[]
for l in data.Label:
  if l==0:
    pos.append(0)
    neg.append(1)
  elif l==1:
    pos.append(1)
    neg.append(0)

data['Pos']=pos
data['Neg']=neg

data = data[['Text_Final', 'tokens', 'Label', 'Pos', 'Neg']]
data.head()

Unnamed: 0,Text_Final,tokens,Label,Pos,Neg
0,slow move aimless movi distress drift young man,"[slow, move, aimless, movi, distress, drift, y...",0,0,1
1,sure lost flat charact audienc near half walk,"[sure, lost, flat, charact, audienc, near, hal...",0,0,1
2,attempt arti black white clever camera angl mo...,"[attempt, arti, black, white, clever, camera, ...",0,0,1
3,littl music anyth speak,"[littl, music, anyth, speak]",0,0,1
4,best scene movi gerardo tri find song keep run...,"[best, scene, movi, gerardo, tri, find, song, ...",1,1,0


In [36]:
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

In [37]:
all_training_words = [word for tokens in data_train['tokens'] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train['tokens']]
Training_vocab = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(Training_vocab)))
print("Max sentence length is %s" %max(training_sentence_lengths))

6827 words total, with a vocabulary size of 2244
Max sentence length is 677


In [41]:
all_test_words = [word for tokens in data_test['tokens'] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test['tokens']]
TEST_VOCAB = sorted(list(set(all_test_words)))
print('%s words total, with a vocabulary size of %s' % (len(all_test_words), len(TEST_VOCAB)))
print('Max sentence length is %s' % max(test_sentence_lengths))

567 words total, with a vocabulary size of 414
Max sentence length is 24


In [49]:
!pip3 install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=827c941742ba810f7cb4c3fe7efce5e40f14192174490b981b038e1a24f3ddf4
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [51]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')



In [54]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

In [58]:
training_embeddings = get_word2vec_embeddings(wv, data_train, generate_missing=True)

In [59]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

In [63]:
tokenizer = Tokenizer(num_words=len(Training_vocab), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 2244 unique tokens.


In [66]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [69]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = wv[word] if word in wv else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(2245, 300)


In [70]:
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [71]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [72]:
label_names = ['Pos', 'Neg']

In [73]:
y_train = data_train[label_names].values

In [74]:
x_train = train_cnn_data
y_tr = y_train

In [81]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 300)      673500      input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 49, 200)      120200      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 48, 200)      180200      embedding_2[0][0]                
_______________________________________________________________________________________

In [82]:
num_epochs = 3
batch_size = 34

In [83]:
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [85]:
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [87]:
labels = [1,0]
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [88]:
sum(data_test.Label==prediction_labels)/len(prediction_labels)

0.7466666666666667

In [89]:
data_test.Label.value_counts()

0    44
1    31
Name: Label, dtype: int64