This notebook reads the Swiggy review dataset and trains a model that classifies the sentiment of each review as Positive or Negative.

In [2]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [3]:
# Load the Swiggy review data from the CSV file and preview the first rows.
data = pd.read_csv('swiggy.csv')
data.head()


Unnamed: 0,ID,Area,City,Restaurant Price,Avg Rating,Total Rating,Food Item,Food Type,Delivery Time,Review
0,1,Suburb,Ahmedabad,600,4.2,6198,Sushi,Fast Food,30-40 min,"Good, but nothing extraordinary."
1,2,Business District,Pune,200,4.7,4865,Pepperoni Pizza,Non-Vegetarian,50-60 min,"Good, but nothing extraordinary."
2,3,Suburb,Bangalore,600,4.7,2095,Waffles,Fast Food,50-60 min,Late delivery ruined it.
3,4,Business District,Mumbai,900,4.0,6639,Sushi,Vegetarian,50-60 min,Best meal I've had in a while!
4,5,Tech Park,Mumbai,200,4.7,6926,Spring Rolls,Gluten-Free,20-30 min,Mediocre experience.


In [5]:
# Small, self-contained demonstration of Tokenizer and pad_sequences
# on three toy sentences.

sentences = [
    'I am a good Boy', 
    'This is a good desk',
    'Boy is roaming around'
]

# Build the vocabulary from the text data.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# The word index is the word -> integer id mapping learned above.
# Note: Jupyter only renders the LAST expression of a cell, so this bare
# name (and `text_to_sequence` below) is evaluated but not displayed.
word_index = tokenizer.word_index
word_index

# Convert the original text to sequences of integer ids.
text_to_sequence = tokenizer.texts_to_sequences(sentences)
text_to_sequence

# pad_sequences: this function ensures that all sequences in a list have the
# same length, which is a requirement for feeding data into a neural network.
# padding='post' puts the zeros at the end of the shorter sequences
# (see the last row of the displayed array).
padded_sequence = pad_sequences(text_to_sequence, padding = 'post')
padded_sequence


array([[ 5,  6,  1,  2,  3],
       [ 7,  4,  1,  2,  8],
       [ 3,  4,  9, 10,  0]], dtype=int32)

In [6]:
# Reload the CSV so that the cleaning steps which follow (they overwrite
# data['Review']) always start from the raw file contents.
data = pd.read_csv('swiggy.csv')
data.head()

Unnamed: 0,ID,Area,City,Restaurant Price,Avg Rating,Total Rating,Food Item,Food Type,Delivery Time,Review
0,1,Suburb,Ahmedabad,600,4.2,6198,Sushi,Fast Food,30-40 min,"Good, but nothing extraordinary."
1,2,Business District,Pune,200,4.7,4865,Pepperoni Pizza,Non-Vegetarian,50-60 min,"Good, but nothing extraordinary."
2,3,Suburb,Bangalore,600,4.7,2095,Waffles,Fast Food,50-60 min,Late delivery ruined it.
3,4,Business District,Mumbai,900,4.0,6639,Sushi,Vegetarian,50-60 min,Best meal I've had in a while!
4,5,Tech Park,Mumbai,200,4.7,6926,Spring Rolls,Gluten-Free,20-30 min,Mediocre experience.


In [7]:
# Data preprocessing: clean the review text and derive the sentiment label.

# Drop incomplete rows first, so that cleaning and labelling only ever see
# real values. Only the two columns this notebook uses are checked: a missing
# value in an unrelated column should not discard an otherwise usable review.
# .copy() makes the result an independent frame, so the column assignments
# below cannot trigger chained-assignment warnings.
data = data.dropna(subset=['Review', 'Avg Rating']).copy()

# Lower-case the text, then remove everything except letters, digits and
# whitespace:
#   "Good, but nothing extraordinary." -> "good but nothing extraordinary"
data['Review'] = (
    data['Review']
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
)

# Sentiment label: 1 when Avg Rating is above 3.5, otherwise 0.
# Vectorised comparison instead of a row-by-row .apply(lambda ...).
# NOTE(review): the label is derived from the 'Avg Rating' column, not from
# the review text itself - confirm this is the intended target.
data['Sentiment'] = (data['Avg Rating'] > 3.5).astype('int64')

# Rows x columns after cleaning (the last expression is what the cell shows).
data.shape

(8000, 11)

In [8]:
# Turn the cleaned reviews into fixed-length integer sequences and extract
# the target labels.
#   Tokenizing: converts each review into a sequence of integer word ids
#   Padding:    ensures every input sequence has the same length

max_features = 5000  # maximum number of words to keep in the tokenizer
max_length = 200     # fixed length of each input sequence after padding

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data['Review'])
text_to_sequence = tokenizer.texts_to_sequences(data['Review'])
# e.g. [[47, 10, 11, 48],
#       [47, 10, 11, 48],
#       [71, 9, 72, 3], ...]

# Model inputs: one padded row of word ids per review.
X = pad_sequences(text_to_sequence, maxlen=max_length)
# e.g. [[0, 0, 0, ..., 10, 11, 48],
#       [0, 0, 0, ..., 10, 11, 48], ...]

# Model targets: the 0/1 sentiment label of each review.
y = data['Sentiment'].to_numpy()
y  # e.g. [1, 1, 1, ...]

array([1, 1, 1, ..., 1, 1, 1])

In [9]:
# Splitting the data into training, validation and test sets
# while maintaining the class distribution

In [10]:
# `stratify` makes train_test_split keep the same proportion of samples of
# each class in both resulting sets as in the array it is given.
#
# Step 1: 80% training / 20% test, stratified on the sentiment labels.
# Step 2: the training part is split again into 90% training / 10% validation,
#         stratified on the training labels.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    stratify=y_train,
)

In [11]:
# RNN Modeling:
# 1. Architecting Model
# 2. Compile Model
# 3. Train Model
# 4. Evaluate Model
# 5. Test Model

In [12]:
# Simple RNN sentiment classifier:
#   word ids -> 16-dim embeddings -> 64-unit SimpleRNN -> 1 sigmoid output
model = tf.keras.models.Sequential([
    # Declare the input shape explicitly instead of passing `input_length`
    # to Embedding: that argument is deprecated in Keras 3 (it only emits a
    # warning there). An explicit Input also builds the model immediately,
    # so summary() can report output shapes and parameter counts.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(input_dim=max_features, output_dim=16),
    # return_sequences=False: only the state after the last time step is
    # passed on to the classifier layer.
    tf.keras.layers.SimpleRNN(units=64, activation='tanh', return_sequences=False),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
model.summary()



In [13]:
# Binary classification setup: Adam optimiser, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Train, monitoring the dedicated validation split after every epoch.
# The test set is kept out of fit() on purpose: if it were used as
# validation data, the final evaluation below would no longer be measured
# on data the training run has never seen.
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_val, y_val),
    batch_size=32,
)

# Final, one-off evaluation on the untouched test set.
# evaluate() returns the loss followed by the compiled metrics, so index 1
# is the accuracy requested in model.compile(metrics=['accuracy']).
score = model.evaluate(X_test, y_test)

print(f'Test Accuracy: {score[1]:.2f}')

Epoch 1/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step - accuracy: 0.6977 - loss: 0.6119 - val_accuracy: 0.7156 - val_loss: 0.5988
Epoch 2/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 37ms/step - accuracy: 0.7160 - loss: 0.5971 - val_accuracy: 0.7156 - val_loss: 0.5973
Epoch 3/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 47ms/step - accuracy: 0.7160 - loss: 0.5968 - val_accuracy: 0.7156 - val_loss: 0.5976
Epoch 4/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 61ms/step - accuracy: 0.7160 - loss: 0.5963 - val_accuracy: 0.7156 - val_loss: 0.5996
Epoch 5/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 60ms/step - accuracy: 0.7160 - loss: 0.5969 - val_accuracy: 0.7156 - val_loss: 0.5976
Epoch 6/10
[1m 70/180[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m6s[0m 61ms/step - accuracy: 0.7151 - loss: 0.5978

KeyboardInterrupt: 

Test Accuracy: 0.72
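This means the model has achieved a test accuracy of 72%. Note, however, that the training log shows accuracy frozen at about 0.716 from the second epoch onward for both the training and validation data, so this figure should be compared against the share of the majority class before concluding that the model has learned anything from the review text.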

In [94]:
# Predict sentiments: preprocess a single review the same way as the training
# data, run it through the model and report the result.
def make_prediction(review, threshold=0.5):
    """Classify one review as Positive or Negative.

    The text goes through the same steps as the training reviews:
    lower-casing, removal of everything except letters, digits and
    whitespace, conversion to word ids with the notebook-level `tokenizer`,
    and padding to `max_length`. The padded sequence is then scored by the
    notebook-level `model`.

    Args:
        review: Raw review text (a string).
        threshold: Minimum predicted probability for the "Positive" label.
            Defaults to 0.5, the cut-off this notebook used originally.

    Returns:
        A string of the form 'Positive (Probability: 0.68)'.
    """
    # Preprocess
    review = review.lower()
    review = re.sub(r'[^a-z0-9\s]', '', review)

    # Tokenize (texts_to_sequences expects a list of texts)
    seq = tokenizer.texts_to_sequences([review])

    # Padding
    padded_seq = pad_sequences(seq, maxlen=max_length)

    # One input row and a single sigmoid output unit -> take element [0][0].
    # float() turns the NumPy scalar into a plain Python float.
    prediction = float(model.predict(padded_seq)[0][0])

    label = 'Positive' if prediction >= threshold else 'Negative'
    return f"{label} (Probability: {prediction:.2f})"


review = 'Late delivery ruined it'

make_prediction(review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


'Positive (Probability: 0.68)'