<a href="https://colab.research.google.com/github/adityakalkeri1/Projects/blob/NLP/Ratings_project/LSTM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem; later cells read the dataset
# from, and write model checkpoints to, paths under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
import nltk
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Report the versions of the main libraries this notebook runs against.
for label, module in (
    ('Tensorflow Version', tf),
    ('Sklearn Version', sklearn),
    ('NLTK version', nltk),
):
    print(label, module.__version__)

Tensorflow Version 2.5.0
Sklearn Version 0.22.2.post1
NLTK version 3.2.5


In [None]:
# Load the full comments dataset from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Datasets/Ratings Project/Full_comments_dataset.csv')

In [None]:
# Inspect the (rows, columns) dimensions of the loaded frame.
df.shape

(75245, 5)

In [None]:
# Preview the first few rows to check the column layout.
df.head()

Unnamed: 0.1,Unnamed: 0,Comment,Product,Website,Rating
0,0,Good camera for beginners.,DSLR,Amazon,5
1,1,Great camera. Perfect product for young buddin...,DSLR,Amazon,5
2,2,Really impressed with Amazon surprise for deli...,DSLR,Amazon,5
3,3,I am really fully content with the product. It...,DSLR,Amazon,5
4,4,This camera is a perfect choice if you are a b...,DSLR,Amazon,5


In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV;
# confirm against the data source). errors='ignore' plus rebinding `df`
# instead of mutating in place keeps this cell idempotent: re-running it no
# longer raises KeyError once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Discard, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
import re
# Download the NLTK stop-word corpus, then load its English word list;
# text_clean filters comment tokens against `stop_words`.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def text_clean(row, stopwords_to_drop=None):
    """Normalise one raw comment string before tokenisation.

    Steps, in order: newlines become spaces, ``@username`` mentions are
    deleted, punctuation characters become spaces, and stop words are
    dropped. Runs of whitespace collapse to single spaces as a consequence
    of the final split/join.

    Args:
        row: The comment text (a ``str``).
        stopwords_to_drop: Optional iterable of words to remove. When omitted,
            the module-level ``stop_words`` list is used. Matching is exact
            and case-sensitive.

    Returns:
        The cleaned comment as one space-separated string.
    """
    if stopwords_to_drop is None:
        stopwords_to_drop = stop_words
    # A set gives constant-time membership checks instead of scanning the
    # list once per word.
    blocked = frozenset(stopwords_to_drop)

    row = re.sub('\n', ' ', row)
    # The trailing '+' removes the whole handle; without it only the '@' and
    # the first character were stripped, leaving the rest of the name behind.
    row = re.sub('@[A-Za-z0-9_]+', '', row)
    # Raw string with '-', '[' and ']' escaped: matches the same characters
    # as before without invalid-escape / nested-set warnings.
    row = re.sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]""", ' ', row)
    kept = [word for word in row.split() if word not in blocked]
    return ' '.join(kept)
# Clean every comment element-wise and store the result back in the column.
df['Comment'] = df['Comment'].map(text_clean)

In [None]:
# Extra tokens to strip from the comments: capitalised 'I' / 'The' / 'It',
# retailer names and product-category labels. Matching is case-sensitive, so
# only these exact spellings are removed.
remove_words = ['I', 'The', 'Amazon','Flipkart', 'It', 'mobiles', 'TV', 'DSLR', 'Smartwatch', 'Laptop']


def words_to_be_removed(row):
    """Return `row` without any whitespace-separated token found in `remove_words`."""
    kept_tokens = filter(lambda token: token not in remove_words, row.split())
    return ' '.join(kept_tokens)
# Strip the listed tokens from every comment, element-wise.
df['Comment'] = df['Comment'].map(words_to_be_removed)

In [None]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance shared by every stem_words call.
porter =PorterStemmer()
def stem_words(row, stemmer=None):
    """Reduce each whitespace-separated word of `row` to its stem.

    Args:
        row: A comment string.
        stemmer: Optional object exposing ``stem(word) -> str``. When omitted,
            the module-level ``porter`` instance is used.

    Returns:
        The stemmed words joined by single spaces.
    """
    if stemmer is None:
        stemmer = porter
    # Split into words first: iterating the string directly yields single
    # characters, and joining on '' would glue the words together, so no
    # actual stemming would take place.
    return ' '.join(stemmer.stem(word) for word in row.split())
# Stem every comment element-wise and store the result back in the column.
df['Comment'] = df['Comment'].map(stem_words)

In [None]:
# Build the feature series and one-hot label matrix, then hold out a test split.
from sklearn.model_selection import train_test_split

X = df['Comment']
# to_categorical turns each integer rating into a one-hot row of ints.
y = tf.keras.utils.to_categorical(df['Rating'], dtype='int')
# Fixed random_state so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit the vocabulary on the training split only; words not in it map to '<OOV>'.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=19000)
tokenizer.fit_on_texts(X_train)

# word_index lists every word seen while fitting; its printed length
# (19964 in the recorded run) can exceed the num_words cap.
word_index = tokenizer.word_index
print('Word Index length= ', len(word_index))

Word Index length=  19964


In [None]:
# Fixed sequence length (in tokens) used by pad_sequences and the Embedding layer.
max_len = 300

In [None]:
# Map every comment in each split to its list of integer token ids.
X_train_tokens, X_test_tokens = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Bring every token sequence to exactly max_len entries; sequences that are
# too long are truncated with truncating='post'.
X_train_pad, X_test_pad = (
    pad_sequences(token_ids, maxlen=max_len, truncating='post')
    for token_ids in (X_train_tokens, X_test_tokens)
)

In [None]:
#Defining model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# Word embeddings feeding two stacked LSTM layers and a 6-way classifier.
# The first LSTM returns the full sequence so the second LSTM can consume it.
# NOTE(review): 6 output units presumably match to_categorical on ratings
# 1-5 (index 0 unused) - confirm against the label matrix shape.
model = Sequential([
    Embedding(20000, 64, input_length=max_len),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(64, dropout=0.2),
    # softmax, not sigmoid: each comment has exactly one rating, and
    # categorical_crossentropy expects a probability distribution over
    # mutually exclusive classes.
    Dense(6, activation='softmax'),
])


In [None]:
# Configure training: Adam with a small fixed starting learning rate,
# categorical cross-entropy on the one-hot labels, accuracy as the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    metrics='accuracy',
)

In [None]:
# Save weights only, and only when training accuracy reaches a new maximum.
checkpoint_filepath = '/content/drive/MyDrive/Deep learning model/Comments_rating/'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)


In [None]:
# Stop training when training accuracy has not improved by at least 0.01
# over 3 epochs.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='accuracy',
    min_delta=0.01,
    patience=3,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
)


In [None]:
# Multiply the learning rate by 0.1 when training accuracy plateaus
# (improvement below 0.05 for 1 epoch), with a floor of 1e-7.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='accuracy',
    factor=0.1,
    patience=1,
    min_delta=0.05,
    min_lr=1e-7,
    verbose=1,
)

In [None]:
# Train on the padded training sequences with checkpointing, early stopping
# and learning-rate reduction; no validation data is passed, so the callbacks
# monitor training accuracy.
# NOTE(review): the recorded output shows "Epoch n/5" while this call sets
# epochs = 1 - the saved output appears to come from a different run; confirm
# the intended epoch count.
history = model.fit(X_train_pad, y_train, epochs = 1, callbacks=[model_checkpoint_callback, early_stop, reduce_lr])

Epoch 1/5
Epoch 2/5

Epoch 00002: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.
Epoch 3/5

Epoch 00003: ReduceLROnPlateau reducing learning rate to 1e-07.
Epoch 4/5


In [None]:
# Score the trained model on the held-out split; the result is
# [loss, accuracy], matching the loss and metric passed to compile.
model.evaluate(X_test_pad, y_test)



[0.48099106550216675, 0.8463132977485657]

In [None]:
# Restore the best weights written by model_checkpoint_callback. They must be
# read from the same location the callback saved to, so reuse
# checkpoint_filepath rather than a separately typed path (the previous
# literal pointed at a different directory, '.../Malignant_comments/').
model.load_weights(checkpoint_filepath)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc25bf7e710>

In [None]:
# Re-evaluate on the held-out split after loading the checkpointed weights.
model.evaluate(X_test_pad, y_test)



[0.48099106550216675, 0.8463132977485657]