In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D, Embedding
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Data pre-processing
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
#from scipy import stats
from numpy import savetxt

import seaborn as sns

ModuleNotFoundError: No module named 'tensorflow'

### Load data set

In [None]:
# Load the tab-separated file; it has no header row, so column names are supplied here.
# NOTE(review): default quote handling may merge lines that contain a stray '"';
# confirm the row count matches the raw file (consider quoting=csv.QUOTE_NONE).
imdb_df = pd.read_csv(
    'sentiment labelled sentences/imdb_labelled.txt',
    sep='\t',
    header=None,
    names=['review', 'sentiment'],
)

### Data Prep, EDA and Data Cleaning

In [None]:
# Column names as a plain Python list.
list(imdb_df.columns)

In [None]:
# (rows, columns) of the loaded frame.
imdb_df.shape

In [None]:
# Preview the first ten rows.
imdb_df.iloc[:10]

In [None]:
# First five review texts.
imdb_df['review'].head()

In [None]:
# First five sentiment labels.
imdb_df['sentiment'].head()

In [None]:
# Print the dtype of each column. The second line previously carried the
# "Sentiment" label even though it reports the review column.
print ('Sentiment Data Type: {}'.format(imdb_df.sentiment.dtype))
print ('Review Data Type: {}'.format(imdb_df.review.dtype))

In [None]:
# Number of rows per sentiment label (class balance).
imdb_df['sentiment'].value_counts()

In [None]:
# Bar chart of how many reviews carry each sentiment label.
# The frame is passed as `data` and the column is referenced by name; the
# original passed the same Series both positionally and as `x=`, which is a
# TypeError on seaborn 0.11 and is interpreted as `data=` on seaborn >= 0.12.
sns.countplot(data=imdb_df, x='sentiment', palette='dark')
plt.title('Sentiment Distribution')
plt.ylabel('Count')
plt.xlabel('Negative or Positive Feedback')
plt.show()

In [None]:
# Count missing values per column.
imdb_df.isnull().sum()

In [None]:
# Identify outliers in the review length (characters per review).
imdb_df['review'].str.len().plot(kind='box')

### Remove outliers from the dataset

In [None]:
# Keep only reviews whose character length lies strictly between the 1st and
# 99th percentile. The lengths are computed once and reused for both the
# quantiles and the mask (the original recomputed them four times and kept an
# unused duplicate of the upper quantile).
review_len = imdb_df.review.str.len()
q_low = review_len.quantile(0.01)
q_hi  = review_len.quantile(0.99)

# Strict inequalities: rows sitting exactly on either cut-off are dropped too.
# .copy() yields an independent frame so that adding columns to it later does
# not raise SettingWithCopyWarning.
# NOTE: this cell rebinds imdb_df, so running it twice trims the data twice.
imdb_df = imdb_df[(review_len < q_hi) & (review_len > q_low)].copy()

In [None]:
# Re-draw the review-length box plot after trimming to confirm the extremes are gone.
imdb_df['review'].str.len().plot(kind='box')

In [None]:
# Distribution of review length (characters) as a histogram.
imdb_df['review'].str.len().plot.hist(title='Review length Histogram')

In [None]:
# Flag reviews that contain any character other than ASCII letters and
# whitespace (digits, punctuation, symbols), then display the flagged rows.
# The original used str.isalpha(), which is False for every multi-word
# sentence, and compared the boolean column with the string 'True', so the
# filter could never match a row.
imdb_df['anySpecialChar'] = imdb_df.review.str.contains(r'[^a-zA-Z\s]', regex=True, na=False)
imdb_df[imdb_df['anySpecialChar']]

In [None]:
# Drop the temporary flag column. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
imdb_df = imdb_df.drop(columns='anySpecialChar', errors='ignore')

# remove special characters and numbers from the reviews
import re
def remove_special_char(data):
    """Strip digits and symbols from a review string.

    Keeps ASCII letters, whitespace and the punctuation . , ! ? / : ; " '
    and deletes every other character.

    Parameters
    ----------
    data : str
        Raw review text.

    Returns
    -------
    str
        The text with all disallowed characters removed.
    """
    # The upper-case range must end at 'Z'. The previous 'A-z' range also
    # covered the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # symbols were silently kept.
    pat = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pat, '', data)
#remove_special_characters('“007 Not sure@% #fun! 558923 do# ** of it.? $500USD!”')

# Clean every review with remove_special_char, element by element.
imdb_df['review'] = imdb_df['review'].map(remove_special_char)

In [None]:
# Lower-case each review and collapse every run of whitespace to a single space.
imdb_df['review'] = imdb_df['review'].apply(lambda text: " ".join(text.lower().split()))

In [None]:
# Summary statistics of the cleaned review length in characters.
review_lengths = imdb_df['review'].str.len()
print('Min lenght of review: ', review_lengths.min())
print('Median lenght of review: ', review_lengths.median())
print('Max lenght of review: ', review_lengths.max())

In [None]:
# Pull the review texts and their labels out of the frame as NumPy arrays.
sentence = np.array(imdb_df.review)
rating = np.array(imdb_df.sentiment)

In [None]:
# Sanity check: the labels were built with np.array(...) in the previous cell.
type(rating)

In [None]:
# Find the longest and shortest sequence lengths in a collection of sequences.
def FindMaxLength(lst):
    """Return the longest and shortest element length found in ``lst``.

    Parameters
    ----------
    lst : iterable of sized objects
        For example the list of token-id lists produced by the tokenizer.

    Returns
    -------
    tuple
        ``(maxLength, minLength)``.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    # Only the lengths are needed. The previous version also evaluated
    # max() over the elements themselves (result unused), which raised for
    # elements that cannot be ordered, e.g. NumPy rows or dicts.
    lengths = [len(x) for x in lst]
    if not lengths:
        raise ValueError('FindMaxLength() requires at least one sequence')
    return max(lengths), min(lengths)
     
# Driver Code
#print('Max lenght: {}, Min length: {}'.format(FindMaxLength(X_train)[0],FindMaxLength(X_train)[1]))

In [None]:
# Visualize training vs. validation accuracy and loss for a fitted model.
def plot_learningCurve(history, epochs):
    """Draw two figures: accuracy per epoch, then loss per epoch.

    Parameters
    ----------
    history : object with a ``history`` dict
        Must provide the keys 'accuracy', 'val_accuracy', 'loss', 'val_loss'.
    epochs : int
        Number of epochs to put on the x axis (1..epochs).
    """
    x_axis = range(1, epochs + 1)
    # (train key, validation key, figure title, y-axis label)
    panels = (
        ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Model Loss', 'Loss'),
    )
    for train_key, val_key, title, y_label in panels:
        plt.plot(x_axis, history.history[train_key])
        plt.plot(x_axis, history.history[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Val'], loc='upper left')
        plt.show()

In [None]:
# Identify stopwords
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.stem import WordNetLemmatizer
# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads in the next cell.
nltk.download('stopwords')

In [None]:
# Build the English stop-word set. The corpus reader is imported under an
# alias because this cell rebinds the name `stopwords` to a plain set; without
# the alias a second run fails with AttributeError ('set' has no 'words').
from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words('english'))

In [None]:
# Treat leftover HTML fragments as stop words as well.
stopwords.update(("br", "href"))

# One long string made of all reviews, separated by single spaces.
textt = " ".join(sentence)

# Render the word cloud, write it to disk, then display it.
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud11.png')
plt.show()

### Split into training and testing sets.

In [None]:
# 80/20 train/test split, stratified on the label, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    sentence,
    rating,
    test_size=0.2,
    stratify=rating,
    random_state=1000,
)

In [None]:
# Shapes of the four arrays: features first (train, test), then labels (train, test).
for label, split in (
    ('Train dataset: ', X_train),
    ('test dataset: ', X_test),
    ('Train dataset: ', Y_train),
    ('test dataset: ', Y_test),
):
    print(label, split.shape)

In [None]:
# Fit the tokenizer on the training text only, then encode both splits as
# lists of integer token ids (padding happens in a later cell).
tokenizer = Tokenizer(oov_token="<OOV>", num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_s, X_test_s = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [None]:
# Longest and shortest tokenized training sequence.
print('Max length: {}, Min length: {}'.format(*FindMaxLength(X_train_s)))

In [None]:
# Vocabulary size: number of distinct words seen by the tokenizer, plus one.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: ', vocab_size)

In [None]:
# Sample of the vocabulary: ten (word, index) pairs, positions 50-59 of word_index.
list(tokenizer.word_index.items())[50:60]

### Pad the tokenized sequences

In [None]:
# Pad (or truncate) BOTH splits to the same width: the longest tokenized
# training sequence. The test split was previously padded to its own maximum
# length, so its width could differ from the training data and from the
# input length the model is built with.
max_len = FindMaxLength(X_train_s)[0]
X_train = pad_sequences(X_train_s, maxlen=max_len)
X_test = pad_sequences(X_test_s, maxlen=max_len)

In [None]:
# Show the first padded training sequence.
print(X_train[0])

In [None]:
# Save the padded features and the labels of both splits to disk.
# NOTE(review): savetxt is called with its default arguments, so the files are
# not comma-separated despite the .csv extension; confirm what readers expect.
for filename, array in (
    ('padded_X_train.csv', X_train),
    ('padded_X_test.csv', X_test),
    ('rating_Y_train.csv', Y_train),
    ('rating_Y_test.csv', Y_test),
):
    savetxt(filename, array)

In [None]:
# Embedding width: the square root of the vocabulary size, rounded to an integer.
vocab_sqrt = np.sqrt(vocab_size)
max_embd = int(round(vocab_sqrt, 0))
print(max_embd)

In [None]:
# Early-stopping callback with a patience of 2 epochs (Keras' default monitored
# quantity applies because `monitor` is not set).
esm = EarlyStopping(patience=2)

# Embedding -> single LSTM layer -> sigmoid unit for binary sentiment.
# The embedding input length is the longest tokenized training sequence.
model = Sequential()
model.add(Embedding(vocab_size, output_dim=max_embd, input_length=FindMaxLength(X_train_s)[0]))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Compile the model (same loss, optimizer and metric as the compile call in the
# model-definition cell).
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for up to 10 epochs in mini-batches of 32, holding out 20% of the
# training data for validation. `callbacks` is documented as a list of
# callbacks, so the early-stopping monitor is wrapped in one.
history = model.fit(X_train, Y_train,
                    validation_split=0.2,
                    epochs=10,
                    callbacks=[esm],
                    batch_size=32)

In [None]:
# Score the model on the held-out test split; evaluate() returns the loss
# first, then the compiled metric.
test_scores = model.evaluate(X_test, Y_test, verbose=False)
loss, accuracy = test_scores
print("Testing Accuracy:  {:.4f} Testing Loss {:.4f}".format(accuracy, loss))

In [None]:
# Plot the curves over the epochs actually run (history.epoch has one entry per recorded epoch).
plot_learningCurve(history,len(history.epoch))

In [None]:
def predict_sentiment(text):
    """Print the predicted sentiment label for one piece of text.

    The text is encoded with the fitted ``tokenizer``, padded to the longest
    training-sequence length, passed to ``model`` and the rounded output is
    printed as an integer. Nothing is returned.

    Parameters
    ----------
    text : str
        Sentence to classify.
    """
    encoded = tokenizer.texts_to_sequences([text])
    pad_to = FindMaxLength(X_train_s)[0]
    padded = pad_sequences(encoded, maxlen=pad_to)
    score = model.predict(padded)
    prediction = int(score.round().item())
    print("Predicted label: ", prediction)

### Test the model on new sentences

In [None]:
# Show the second remaining review together with its label. Positional .iloc
# is used because head()[1] looks up index LABEL 1, which raises KeyError when
# that row was removed by the earlier length filter.
print('Actual Data :{}, Rating {}'.format(imdb_df.review.iloc[1], imdb_df.sentiment.iloc[1]))

In [None]:
# First ten rows of the cleaned frame, for comparison with the predictions below.
imdb_df.iloc[:10]

In [None]:
# Run the classifier on a hand-written sample sentence.
test_sentence2 = "not sure who was more lost - the flat characters or the audience, nearly half of whom walked out"
predict_sentiment(test_sentence2)

In [None]:
# Second sample sentence; note that it reuses the name test_sentence2.
test_sentence2 = "saw the movie today and thought it was a good ...	"
predict_sentiment(test_sentence2)

### Saving the model

In [None]:
# Write the trained model to 'D213Task2.keras' in the current working directory.
model.save('D213Task2.keras')