<a href="https://colab.research.google.com/github/aditya129712/Restaurant-review-classifier-using-LSTM/blob/main/Restaurant_Review_Classifier_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Restaurant Review Classifier using LSTM

In [1]:
#Import the library and dataset
import pandas as pd

dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

In [2]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
dataset.shape

(1000, 2)

In [5]:
dataset.isnull().sum()

Review    0
Liked     0
dtype: int64

In [9]:
## Get the Independent features
X=dataset.drop('Liked',axis=1)

In [10]:
## Get the Dependent features
y=dataset['Liked']

In [11]:
X.shape

(1000, 1)

In [12]:
y.shape

(1000,)

In [13]:
import tensorflow as tf

In [15]:
tf.__version__

'2.8.2'

In [16]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [17]:
### Vocabulary size
voc_size=100

### Onehot Representation

In [19]:
reviews = X.copy()

In [21]:
reviews['Review'][1]

'Crust is not good.'

In [22]:
reviews

Unnamed: 0,Review
0,Wow... Loved this place.
1,Crust is not good.
2,Not tasty and the texture was just nasty.
3,Stopped by during the late May bank holiday of...
4,The selection on the menu was great and so wer...
...,...
995,I think food should have flavor and texture an...
996,Appetite instantly gone.
997,Overall I was not impressed and would not go b...
998,"The whole experience was underwhelming, and I ..."


In [23]:
import nltk
import re
from nltk.corpus import stopwords

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer ##stemming purpose
ps = PorterStemmer()
corpus = []
for i in range(0, len(reviews)):
    review = re.sub('[^a-zA-Z]', ' ', reviews['Review'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [26]:
corpus

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch',
 'servic prompt',
 'would go back',
 'cashier care ever say still end wayyy overpr',
 'tri cape cod ravoli chicken cranberri mmmm',
 'disgust pretti sure human hair',
 'shock sign indic cash',
 'highli recommend',
 'waitress littl slow servic',
 'place worth time let alon vega',
 'like',
 'burritto blah',
 'food amaz',
 'servic also cute',
 'could care less interior beauti',
 'perform',
 'right red velvet cake ohhh stuff good',
 'never brought salad ask',
 'hole wall great mexican street taco friendli staff',
 'took hour get food tabl restaur food luke warm sever run around like total overwhelm',
 'worst salmon sashimi',
 'also combo like burger fri beer decent deal',
 'like final blow',
 'found place acc

In [27]:
corpus[1]

'crust good'

In [28]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[15, 56, 98],
 [88, 53],
 [15, 32, 7],
 [20, 89, 77, 73, 32, 17, 45, 30, 56],
 [96, 36, 77, 7],
 [39, 57, 82, 21, 85],
 [71, 9, 95],
 [48, 29, 44, 77, 73, 14, 52, 35, 36, 40],
 [43, 77],
 [77, 89],
 [11, 78],
 [78, 56, 41],
 [36, 36, 93, 23, 4, 46, 30, 54],
 [97, 14, 30, 31, 88, 84, 59],
 [28, 48, 24, 81, 4],
 [35, 38, 90, 72],
 [77, 30],
 [35, 42, 80, 11],
 [98, 91, 35, 4, 68, 12],
 [29],
 [50, 76],
 [9, 84],
 [11, 58, 93],
 [77, 36, 96, 59, 53],
 [59],
 [76, 55, 90, 68, 33, 30, 53],
 [57, 74, 63, 25],
 [4, 36, 77, 74, 85, 38, 1, 16],
 [79, 35, 39, 9, 76, 5, 9, 10, 63, 7, 16, 95, 29, 11, 92],
 [86, 85, 57],
 [58, 23, 29, 38, 43, 16, 59, 42],
 [29, 66, 36],
 [43, 98, 21, 77, 5],
 [38, 29, 53, 38, 98, 70, 70, 87, 74, 9, 18, 38, 64],
 [20, 29, 98, 29],
 [98, 46, 5, 47],
 [66, 63, 53, 7],
 [1, 11, 41, 14, 57, 29, 48, 43, 35, 33, 76],
 [15, 40, 61, 68],
 [11, 19],
 [35, 6, 96],
 [42, 53, 38, 78, 87, 75],
 [86, 23, 78, 38, 53, 99, 72, 33, 82, 67, 70, 39, 2],
 [97, 89, 74, 22, 1, 77, 11],
 

In [29]:
corpus[1]

'crust good'

In [30]:
onehot_repr[1]

[88, 53]

### Embedding Representation

In [31]:
sent_length=15
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[15 56 98 ...  0  0  0]
 [88 53  0 ...  0  0  0]
 [15 32  7 ...  0  0  0]
 ...
 [20  7 78 ...  0  0  0]
 [57 98 80 ...  0  0  0]
 [87 38 66 ...  0  0  0]]


In [32]:
embedded_docs[1]

array([88, 53,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [33]:
embedded_docs[0]

array([15, 56, 98,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [34]:
## Creating model
embedding_vector_features=40 ##features representation
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 40)            4000      
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 60,501
Trainable params: 60,501
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
len(embedded_docs),y.shape

(1000, (1000,))

In [36]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [37]:
X_final.shape,y_final.shape

((1000, 15), (1000,))

## Spliting into Train and Test data

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

### Model Training

In [40]:
import tensorflow as tf

In [41]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=0,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=False,
)

In [47]:
### Finally Training
model.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=30, batch_size=64, callbacks = early_stopping)

Epoch 1/30
Epoch 2/30


<keras.callbacks.History at 0x7f08e302ab50>

### Adding Dropout 

In [48]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

### Performance Metrics And Accuracy

In [49]:
y_pred=model.predict(X_test)

In [50]:
y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

In [51]:
from sklearn.metrics import confusion_matrix

In [52]:
confusion_matrix(y_test,y_pred)

array([[173,   0],
       [157,   0]])

In [53]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.5242424242424243

In [54]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.52      1.00      0.69       173
           1       0.00      0.00      0.00       157

    accuracy                           0.52       330
   macro avg       0.26      0.50      0.34       330
weighted avg       0.27      0.52      0.36       330



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
