In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from  sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import models,layers

from wordcloud import WordCloud, STOPWORDS

import os
# count = 0
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
#         if count > 4 :
#             break
#         count += 1

Using TensorFlow backend.


In [2]:
# Root folder of the extracted IMDB corpus and its two split sub-folders.
imdb_dir = 'aclImdb'
train_dir, test_dir = (os.path.join(imdb_dir, split) for split in ('train', 'test'))

# Accumulators for the raw review strings and their 0/1 sentiment labels;
# they start empty and are filled by the loading loops.
texts, labels = [], []
test_texts, test_labels = [], []

In [3]:
# Read every training review from disk into `texts`, with a parallel list of
# labels (1 for the 'pos' folder, 0 for the 'neg' folder).
# Both lists are rebuilt from scratch so that re-running this cell never appends
# duplicates, and still works after `labels` has been converted to an array.
texts, labels = [], []
for label_type in ['pos', 'neg']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns names in arbitrary order; sorting keeps the sample
    # order (and therefore the seeded train/validation split) reproducible.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # Explicit UTF-8 (assumed encoding of the corpus) instead of the
        # platform default codec; `with` closes the file even if read() raises.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(0 if label_type == 'neg' else 1)

In [None]:
# Read every held-out test review from disk into `test_texts`, with a parallel
# list of labels (1 for the 'pos' folder, 0 for the 'neg' folder).
# Both lists are rebuilt from scratch so that re-running this cell never appends
# duplicates, and still works after `test_labels` has been converted to an array.
test_texts, test_labels = [], []
for label_type in ['pos', 'neg']:
    dir_name = os.path.join(test_dir, label_type)
    # os.listdir returns names in arbitrary order; sorting keeps the sample
    # order stable between runs and machines.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # Explicit UTF-8 (assumed encoding of the corpus) instead of the
        # platform default codec; `with` closes the file even if read() raises.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            test_texts.append(f.read())
        test_labels.append(0 if label_type == 'neg' else 1)

In [None]:
# Sanity check: report the size of each split and fail fast if any review
# ended up without a label (or vice versa).
print(f'Length of texts is {len(texts)}')
print(f'Length of labels is {len(labels)}')
print(f'Length of test_texts is {len(test_texts)}')
print(f'Length of test_labels is {len(test_labels)}')
assert len(texts) == len(labels), 'train texts/labels are out of sync'
assert len(test_texts) == len(test_labels), 'test texts/labels are out of sync'

In [None]:
# Pair every training review with its label in one frame for exploration.
texts_df = pd.DataFrame(data={'texts': texts, 'labels': labels})

In [None]:
# Preview the first five rows (the default for head()).
texts_df.head(n=5)

In [None]:
# Review texts split by sentiment label (1 -> positive, 0 -> negative).
positive = texts_df.loc[texts_df['labels'] == 1, 'texts']
negative = texts_df.loc[texts_df['labels'] == 0, 'texts']

In [None]:
# Word cloud of the most frequent non-stopword terms in the positive reviews.
stopwords = set(STOPWORDS)

# Join the actual review texts. str(Series) would only give pandas' truncated
# display repr (row numbers, "...", the Name/dtype footer), so the cloud would
# be built from a handful of clipped lines instead of the corpus.
wordcloud = WordCloud(
    background_color='white',
    stopwords=stopwords,
    max_words=200,
    max_font_size=40,
    random_state=42,
).generate(' '.join(positive))

fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
# Save before show(): a figure may be released by the backend once displayed.
fig.savefig("word1.png", dpi=900)
plt.show()

In [None]:
# Word cloud of the most frequent non-stopword terms in the negative reviews.
# Join the actual review texts. str(Series) would only give pandas' truncated
# display repr (row numbers, "...", the Name/dtype footer), not the corpus.
wordcloud = WordCloud(
    background_color='black',
    stopwords=stopwords,
    max_words=200,
    max_font_size=40,
    random_state=42,
).generate(' '.join(negative))

fig = plt.figure()
plt.imshow(wordcloud)
plt.axis('off')
# Written to its own file: saving to "word1.png" here would overwrite the
# image produced for the positive reviews.
# Save before show(): a figure may be released by the backend once displayed.
fig.savefig("word2.png", dpi=900)
plt.show()

In [None]:
# Hyper-parameters shared by the tokenizer, the padding step and the embedding.
# Length (in tokens) that every encoded review is padded or cut to.
MAX_LENGTH = 500
# Vocabulary cap handed to the tokenizer; also the row count of the embedding matrix.
MAX_WORDS = 20_000
# Width of one word vector; must agree with the 100-d GloVe file that is loaded.
EMBENDING_DIM = 100

In [None]:
# Fit a tokenizer on the training reviews, capped at MAX_WORDS via num_words,
# then encode every training review as a sequence of integer word indices.
# NOTE(review): the vocabulary is fitted on all of `texts`, i.e. before the
# train/validation split, so validation reviews also contribute to it.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# word -> integer index mapping learned by the tokenizer; iterated later to
# build the embedding matrix.
# NOTE(review): presumably covers every word seen, not only the top MAX_WORDS — confirm.
word_index = tokenizer.word_index

In [None]:
# Report how many entries the tokenizer's word -> index mapping holds.
print(f'Found {len(word_index)} unique tokens.' )

In [None]:
# Bring every encoded review to exactly MAX_LENGTH indices. Padding and
# truncation both happen at the front, which is pad_sequences' default and is
# only spelled out here. Labels become a NumPy array.
data = pad_sequences(sequences, maxlen=MAX_LENGTH, padding='pre', truncating='pre')
labels = np.array(labels)

In [None]:
# Show the shapes of the padded inputs and of the label vector.
print(
    f'Shape of Data tensor is {data.shape}',
    f'Shape of Labels tensor is {labels.shape}',
    sep='\n',
)

In [None]:
# Hold out 20% of the training reviews for validation; the fixed seed keeps
# the partition repeatable for a given input order.
X_train, X_val, y_train, y_val = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Parse the pre-trained GloVe text file into a {word: vector} lookup.
# Each line holds a token followed by its whitespace-separated coefficients.
glove_dir = "glove.6B.100d.txt"  # path of the vectors file (not a directory)

embedding_index = {}
# Explicit UTF-8 instead of the platform default codec (the vocabulary is not
# guaranteed to be ASCII); `with` closes the file even if parsing fails.
with open(glove_dir, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than crash on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

print(f'Found {len(embedding_index)} word vectors')

In [None]:
# Spot-check: the first ten coefficients stored for the word 'go'.
embedding_index['go'][:10]

In [None]:
# One row per tokenizer index and one column per vector component. Rows that
# never receive a pre-trained vector stay all-zero.
embedding_matrix = np.zeros(shape=(MAX_WORDS, EMBENDING_DIM))

In [None]:
# Copy the pre-trained vector of every in-vocabulary word into its row.
for word, idx in word_index.items():
    if idx >= MAX_WORDS:
        # Index falls outside the matrix (beyond the vocabulary cap).
        continue
    vector = embedding_index.get(word)
    if vector is None:
        # Word has no pre-trained vector; its row keeps the zero initialisation.
        continue
    embedding_matrix[idx] = vector

In [None]:
# Wrap the embedding matrix in a DataFrame for inspection.
embedding_df = pd.DataFrame(data=embedding_matrix)

In [None]:
# Display the (rows, columns) of the embedding frame; given how the matrix is
# allocated this should be (MAX_WORDS, EMBENDING_DIM).
embedding_df.shape

In [None]:
# Embedding lookup -> single LSTM layer (32 units) -> sigmoid unit that outputs
# one value per review.
model = models.Sequential([
    layers.Embedding(MAX_WORDS, EMBENDING_DIM, input_length=MAX_LENGTH),
    layers.LSTM(32),
    layers.Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Load the GloVe-derived matrix into the first (embedding) layer and freeze it
# so training does not update these weights.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [None]:
# Compile for binary classification, train for three epochs while tracking
# validation metrics, then persist the learned weights to disk.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=1000,
    epochs=3,
    validation_data=(X_val, y_val),
)
model.save_weights('pre_trained_glove_model.h5')

In [None]:
# Encode the test reviews with the tokenizer fitted on the training texts and
# pad them exactly like the training data (front padding / front truncation,
# pad_sequences' defaults written out). Test labels become a NumPy array.
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_data = pad_sequences(
    test_sequences,
    maxlen=MAX_LENGTH,
    padding='pre',
    truncating='pre',
)
test_labels = np.array(test_labels)

In [None]:
# Run the trained model over the padded test reviews.
# NOTE(review): with the sigmoid Dense(1) head this is presumably an (n, 1)
# array of scores in [0, 1] — confirm.
predictions = model.predict(test_data)

In [None]:
# Threshold the model scores at 0.5 to get hard class predictions.
# Cast to integers and flatten so the result has the same dtype and 1-D layout
# as `test_labels` (0/1 ints) instead of a boolean column.
pred_labels = (predictions > 0.5).astype('int32').ravel()

In [None]:
# Confusion matrix on the test set, drawn as an annotated heatmap.
# confusion_matrix expects (y_true, y_pred) and returns rows = true class,
# columns = predicted class. Plotting the transpose therefore puts the true
# label on the x-axis and the predicted label on the y-axis, which is what the
# axis captions below say. Passing the arguments the other way round would
# silently swap the two axes.
mat = confusion_matrix(test_labels, pred_labels)
plt.figure(figsize=(4, 4))
sns.set()
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=np.unique(test_labels),
            yticklabels=np.unique(test_labels))
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.show()

In [None]:
# Per-class precision / recall / F1 on the test set.
# classification_report expects (y_true, y_pred); with the arguments reversed
# the precision and recall columns would be exchanged.
print(classification_report(test_labels, pred_labels))