In [1]:
import pandas as pd

# Load the dataset
file_path = '/content/reviews.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset
dataset_info = data.info()

# Display the first few rows of the dataset
first_rows = data.head()

dataset_info, first_rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12495 entries, 0 to 12494
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              12495 non-null  object
 1   userName              12495 non-null  object
 2   userImage             12495 non-null  object
 3   content               12495 non-null  object
 4   score                 12495 non-null  int64 
 5   thumbsUpCount         12495 non-null  int64 
 6   reviewCreatedVersion  10333 non-null  object
 7   at                    12495 non-null  object
 8   replyContent          5818 non-null   object
 9   repliedAt             5818 non-null   object
 10  sortOrder             12495 non-null  object
 11  appId                 12495 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.1+ MB


(None,
                                             reviewId  \
 0  gp:AOqpTOEhZuqSqqWnaKRgv-9ABYdajFUB0WugPGh-SG-...   
 1  gp:AOqpTOH0WP4IQKBZ2LrdNmFy_YmpPCVrV3diEU9KGm3...   
 2  gp:AOqpTOEMCkJB8Iq1p-r9dPwnSYadA5BkPWTf32Z1azu...   
 3  gp:AOqpTOGFrUWuKGycpje8kszj3uwHN6tU_fd4gLVFy9z...   
 4  gp:AOqpTOHls7DW8wmDFzTkHwxuqFkdNQtKHmO6Pt9jhZE...   
 
                            userName  \
 0                          Eric Tie   
 1                        john alpha   
 2                       Sudhakar .S   
 3  SKGflorida@bellsouth.net DAVID S   
 4                     Louann Stoker   
 
                                            userImage  \
 0  https://play-lh.googleusercontent.com/a-/AOh14...   
 1  https://play-lh.googleusercontent.com/a-/AOh14...   
 2  https://play-lh.googleusercontent.com/a-/AOh14...   
 3  https://play-lh.googleusercontent.com/-75aK0WF...   
 4  https://play-lh.googleusercontent.com/-pBcY_Z-...   
 
                                              content  score  t

In [2]:

simplified_data = data[['content', 'score']]

simplified_data.head()


Unnamed: 0,content,score
0,I cannot open the app anymore,1
1,I have been begging for a refund from this app...,1
2,Very costly for the premium version (approx In...,1
3,"Used to keep me organized, but all the 2020 UP...",1
4,Dan Birthday Oct 28,1


In [3]:
from sklearn.model_selection import train_test_split

# Divide the dataset into training and test sets
train_data, test_data = train_test_split(simplified_data, test_size=0.2, random_state=42)

# Display the sizes of the training and test sets
train_data_size = train_data.shape[0]
test_data_size = test_data.shape[0]

train_data_size, test_data_size


(9996, 2499)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

train_data['sentiment'] = train_data['score'].apply(lambda x: 'positive' if x > 3 else ('neutral' if x == 3 else 'negative'))
test_data['sentiment'] = test_data['score'].apply(lambda x: 'positive' if x > 3 else ('neutral' if x == 3 else 'negative'))

modellr= make_pipeline(
    TfidfVectorizer(stop_words='english'),
    LogisticRegression(max_iter=1000)
)

modellr.fit(train_data['content'], train_data['sentiment'])

predictions = modellr.predict(test_data['content'])

report = classification_report(test_data['sentiment'], predictions)
accuracy = accuracy_score(test_data['sentiment'], predictions)

report, accuracy


('              precision    recall  f1-score   support\n\n    negative       0.68      0.84      0.75       958\n     neutral       0.40      0.09      0.14       421\n    positive       0.76      0.83      0.79      1120\n\n    accuracy                           0.71      2499\n   macro avg       0.61      0.58      0.56      2499\nweighted avg       0.67      0.71      0.67      2499\n',
 0.7070828331332533)

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import numpy as np


tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['content'])
X_train = tokenizer.texts_to_sequences(train_data['content'])
X_test = tokenizer.texts_to_sequences(test_data['content'])

maxlen = 100  # You might want to choose a different `maxlen`
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

encoder = LabelEncoder()
encoder.fit(train_data['sentiment'])
y_train = encoder.transform(train_data['sentiment'])
y_test = encoder.transform(test_data['sentiment'])
num_classes = np.max(y_train) + 1
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100, input_length=maxlen))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.5370147824287415


We used two algorithms LSTM and logistic regression and we found that Logistic regression was performing better than LSTM With 71% accuracy

In [9]:
import pickle
filename='trained_model.sav'
pickle.dump(modellr,open(filename,'wb'))

In [10]:
loaded_model=pickle.load(open('/content/trained_model.sav','rb'))

In [11]:
import pickle

# Load the model from the pickle file
with open('/content/trained_model.sav', 'rb') as file:
    modellr = pickle.load(file)

# Make predictions
predictions = modellr.predict(test_data['content'])

# Evaluate predictions
report = classification_report(test_data['sentiment'], predictions)
accuracy = accuracy_score(test_data['sentiment'], predictions)

print("Classification Report:\n", report)
print("Accuracy:", accuracy)


Classification Report:
               precision    recall  f1-score   support

    negative       0.68      0.84      0.75       958
     neutral       0.40      0.09      0.14       421
    positive       0.76      0.83      0.79      1120

    accuracy                           0.71      2499
   macro avg       0.61      0.58      0.56      2499
weighted avg       0.67      0.71      0.67      2499

Accuracy: 0.7070828331332533


2 : Negative Score 3 : Neutral Score 4 : Positive Score 5 : Very Positive