In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model,save_model
import tensorflow as tf
import numpy as np
from numpy import array
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
import string
from nltk import word_tokenize
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
#stop = stopwords.words('english')
from tensorflow.keras.utils import to_categorical

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
stop_words = stopwords.words('english')
addition = ['•', '!', '"', '#', '”', '“', '$', '%', '&', "'", '–', '(', ')', '*','’', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '…']
stop_words.extend(addition)

In [3]:
data = pd.read_csv('Reviews.csv')

In [4]:
data.dropna(how='any',inplace=True)
data.drop_duplicates(inplace=True, subset=['Score','Text'])
idx = data[data["HelpfulnessNumerator"]>data["HelpfulnessDenominator"]].index
data.drop(index=idx, inplace=True)

In [5]:
def create_target(x):
    return 2 if x>3 else 0 if x<3 else 1
data['target'] = data['Score'].apply(create_target)

In [6]:
neutral = data.loc[data.target==1]
positive = data.loc[data.target==2].sample(40000)
negative = data.loc[data.target==0].sample(40000)
data = pd.concat([positive, negative, neutral])

In [7]:
def preprocess(text):
  text = text.lower()
  text = re.sub(r'\@\w+|\#','', text)
  text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
  text = text.translate(str.maketrans('', '', string.punctuation))
  result = ''.join([i for i in text if not i.isdigit()])
  tweet_tokens = nltk.word_tokenize(result)
  filtered_words = [w for w in tweet_tokens if not w in stop_words]
  lemmatizer = WordNetLemmatizer()
  lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in filtered_words]
  lemma_words=" ".join(lemma_words)
  return lemma_words

In [8]:
data['cleaned'] = data['Text'].apply(preprocess)

In [9]:
data.cleaned.apply(lambda x: len(x.split(" "))).mean()

43.166256718593424

In [10]:
text = data['cleaned'].tolist()
y = data['target']

In [11]:
token = Tokenizer()
token.fit_on_texts(text)

In [12]:
vocab_size  = len(token.word_index) + 1

In [13]:
encoded_text = token.texts_to_sequences(text)

In [14]:
max_length = 50
X = pad_sequences(encoded_text, maxlen=max_length, padding='post')

In [15]:
glove_vectors = dict()

In [16]:
file = open('glove.6B.50d.txt', encoding='utf-8')

for line in file:
    values = line.split()
    word = values[0]
    #storing the word in the variable
    vectors = np.asarray(values[1: ])
    #storing the vector representation of the respective word in the dictionary
    glove_vectors[word] = vectors
file.close()

In [17]:
keys = glove_vectors.keys()

In [18]:
word_vector_matrix = np.zeros((vocab_size, 50))
for word, index in token.word_index.items():
    vector = glove_vectors.get(word)
    if vector is not None:
        word_vector_matrix[index] = vector

In [19]:
y_cat = to_categorical(y)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, random_state = 42, test_size = 0.3, stratify = y_cat)

In [21]:
model =  tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 50, input_length=max_length, weights = [word_vector_matrix], trainable = False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    #tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3,activation = 'softmax')
])

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            5119850   
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 256)           183296    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               164352    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 5,475,949
Trainable params: 356,099
Non-trainable params: 5,119,850
________________________________________

In [23]:
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
             optimizer=tf.keras.optimizers.Adam(1e-4), 
             metrics=['accuracy'])

In [24]:
history = model.fit(x = X_train, y = y_train, epochs = 100, validation_data = (X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [25]:
y = model.evaluate(x = X_test, y = y_test)
y[1]



0.6465640068054199