In [1]:
import pandas as pd
import numpy as np
import time

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [2]:
# Load the full dataset
df = pd.read_csv('olareviews.csv', low_memory=False)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
0,0.0,f64f8374-6302-42b2-b853-98012d9c6e78,2213mehakjot Kaur9e,https://play-lh.googleusercontent.com/a/AGNmyx...,you have started an option of auto but it is n...,1.0,0.0,5.8.1,3/24/2023 11:04,,
1,1.0,57f7056b-588b-43c2-8278-eed057738c78,Mahesh Mahi,https://play-lh.googleusercontent.com/a-/ACB-R...,Super,5.0,0.0,6.1.5,3/24/2023 11:00,,
2,2.0,cc569be8-6fd8-49be-8689-18e0db1ce2f7,poonam Sharma,https://play-lh.googleusercontent.com/a-/ACB-R...,Good 😊,5.0,0.0,6.1.5,3/24/2023 10:52,,
3,3.0,ef1f9c19-717e-4cd0-a458-ea577ac33489,kids Orignal,https://play-lh.googleusercontent.com/a-/ACB-R...,Bakwas app login is not safe,1.0,0.0,6.1.5,3/24/2023 10:29,,
4,4.0,75d1193d-51ed-4c7b-88fa-4dd951c4f68f,Ankur Garg,https://play-lh.googleusercontent.com/a-/ACB-R...,"Worst app, they charge cancellation fee even w...",1.0,0.0,,3/24/2023 9:48,,


In [4]:
df.columns

Index(['Unnamed: 0', 'reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt'],
      dtype='object')

In [5]:
# dropping unwanted columns
df.drop(['Unnamed: 0', 'reviewId', 'userName', 'userImage', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt'], axis = 1, inplace = True)

In [6]:
df.dropna(inplace=True)

In [7]:
df['score'] = df['score'].astype('int')

In [8]:
df.head()

Unnamed: 0,content,score
0,you have started an option of auto but it is n...,1
1,Super,5
2,Good 😊,5
3,Bakwas app login is not safe,1
4,"Worst app, they charge cancellation fee even w...",1


In [13]:
# Randomly sample 5000 entries
df = df.sample(n=5000, random_state=42)

In [14]:
def lowercase_text(text):
    return text.lower()

# Apply the lowercase function
df['lowercase_text'] = df['content'].apply(lowercase_text)

In [15]:
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# Apply the remove punctuation function
df['no_punctuation_text'] = df['lowercase_text'].apply(remove_punctuation)

In [16]:
def tokenize_text(text):
    return word_tokenize(text)

# Apply the tokenization function
df['tokenized_text'] = df['no_punctuation_text'].apply(tokenize_text)

In [17]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(words):
    return [word for word in words if word not in stop_words]

# Apply the remove stopwords function
df['no_stopwords_text'] = df['tokenized_text'].apply(remove_stopwords)

In [18]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(words):
    return [lemmatizer.lemmatize(word) for word in words]

# Apply the lemmatization function
df['lemmatized_text'] = df['no_stopwords_text'].apply(lemmatize_words)

In [19]:
# Join the lemmatized words back into a single string
df['cleaned_text'] = df['lemmatized_text'].apply(lambda words: ' '.join(words))

In [20]:
df.head()

Unnamed: 0,content,score,lowercase_text,no_punctuation_text,tokenized_text,no_stopwords_text,lemmatized_text,cleaned_text
154890,Good,5,good,good,[good],[good],[good],good
39676,Sab se compney hai Ye Drivar ke liye to bhaut ...,1,sab se compney hai ye drivar ke liye to bhaut ...,sab se compney hai ye drivar ke liye to bhaut ...,"[sab, se, compney, hai, ye, drivar, ke, liye, ...","[sab, se, compney, hai, ye, drivar, ke, liye, ...","[sab, se, compney, hai, ye, drivar, ke, liye, ...",sab se compney hai ye drivar ke liye bhaut bek...
6287,Worst app drivers do not accept the request.,1,worst app drivers do not accept the request.,worst app drivers do not accept the request,"[worst, app, drivers, do, not, accept, the, re...","[worst, app, drivers, accept, request]","[worst, app, driver, accept, request]",worst app driver accept request
40174,Not available in my City Jaunpur,2,not available in my city jaunpur,not available in my city jaunpur,"[not, available, in, my, city, jaunpur]","[available, city, jaunpur]","[available, city, jaunpur]",available city jaunpur
115105,Worst app in the world the driver cancel booki...,1,worst app in the world the driver cancel booki...,worst app in the world the driver cancel booki...,"[worst, app, in, the, world, the, driver, canc...","[worst, app, world, driver, cancel, booking, d...","[worst, app, world, driver, cancel, booking, d...",worst app world driver cancel booking despite ...


In [21]:
# Initialize TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(df['cleaned_text'])

In [22]:
start_time = time.time()

# Prepare labels for multi-class classification
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df['score'])  # Convert labels to 0-based integers
y_categorical = to_categorical(y_encoded)  # Convert integer labels to one-hot

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_categorical, test_size=0.2, random_state=42)

# Define the model
model_nn_tfidf = Sequential([
    Dense(512, activation='relu', input_dim=X_train.shape[1]),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(5, activation='softmax')  # One output unit per class
])

# Compile the model
model_nn_tfidf.compile(optimizer='adam',
                       loss='categorical_crossentropy',  # Suitable for multi-class classification
                       metrics=['accuracy'])

# Model summary
model_nn_tfidf.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
# Train the model
history = model_nn_tfidf.fit(X_train, y_train,
                             epochs=1,
                             batch_size=32,
                             validation_data=(X_test, y_test),
                             verbose=1)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.6408 - loss: 1.0787 - val_accuracy: 0.8170 - val_loss: 0.6617


In [24]:
# Evaluate the model on the test data
loss, accuracy_nn_tfidf = model_nn_tfidf.evaluate(X_test, y_test, verbose=0)
print('TF-IDF Vectorizer using Neural Network')
print('Test Accuracy:', accuracy_nn_tfidf)

# Make predictions (return probabilities)
y_pred_nn_tfidf_prob = model_nn_tfidf.predict(X_test)
y_pred_nn_tfidf = encoder.inverse_transform(y_pred_nn_tfidf_prob.argmax(axis=1))  # Convert probabilities to labels

# Convert one-hot encoded test labels back to labels
y_test_labels = encoder.inverse_transform(y_test.argmax(axis=1))

print('Classification Report:')
print(classification_report(y_test_labels, y_pred_nn_tfidf))

end_time = time.time()
time_nn_tfidf = end_time - start_time
print(f"Execution time: {end_time - start_time} seconds")

TF-IDF Vectorizer using Neural Network
Test Accuracy: 0.8169999718666077
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Classification Report:
              precision    recall  f1-score   support

           1       0.84      0.96      0.90       552
           2       0.00      0.00      0.00        40
           3       0.00      0.00      0.00        36
           4       0.00      0.00      0.00        62
           5       0.78      0.92      0.84       310

    accuracy                           0.82      1000
   macro avg       0.32      0.38      0.35      1000
weighted avg       0.70      0.82      0.76      1000

Execution time: 7.49054741859436 seconds


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
