In [1]:
import pandas as pd
import numpy as np
from NaivesBayes import NBSentimentModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score

In [2]:

# Assuming the CSV is correctly loaded into amazon_data
amazon_data = pd.read_csv(
    'amazon.csv',
)

# Ensure 'score' is numeric and handle errors if present
amazon_data['rating'] = pd.to_numeric(amazon_data['rating'], errors='coerce')

# Convert scores to binary ratings (1 if score >= 3, else 0)
amazon_data['binary_rating'] = (amazon_data['rating'] >= 3).astype(int)

# Split data into training and testing sets
train_data, test_data = train_test_split(amazon_data, test_size=0.2, random_state=42)
train_data, dev_data = train_test_split(train_data, test_size=0.4, random_state=42)

# Prepare training and testing data
train_sentences = train_data['review_content'].fillna("").tolist()
train_labels = train_data['binary_rating'].tolist()
dev_sentences = dev_data['review_content'].fillna("").tolist()
dev_labels = dev_data['binary_rating'].tolist()
test_sentences = test_data['review_content'].fillna("").tolist()
test_labels = test_data['binary_rating'].tolist()

best_accuracy = 0
best_ngram_size = 1

# Try n-gram sizes from 1 to 8
for ngram_size in range(1, 12):
    model = NBSentimentModel(ngram_size=ngram_size)
    model.fit(train_sentences, train_labels)
    predicted_dev_ratings = model.predict(dev_sentences)
    accuracy = accuracy_score(dev_labels, predicted_dev_ratings)
    print(f"Model accuracy on dev data with n-gram size {ngram_size}: {accuracy}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_ngram_size = ngram_size

print(f"Best n-gram size based on dev data: {best_ngram_size}")

Model accuracy on dev data with n-gram size 1: 0.9722814498933902
Model accuracy on dev data with n-gram size 2: 0.9872068230277186
Model accuracy on dev data with n-gram size 3: 0.9914712153518124
Model accuracy on dev data with n-gram size 4: 0.9914712153518124
Model accuracy on dev data with n-gram size 5: 0.9957356076759062
Model accuracy on dev data with n-gram size 6: 0.9957356076759062
Model accuracy on dev data with n-gram size 7: 0.9957356076759062
Model accuracy on dev data with n-gram size 8: 0.9957356076759062
Model accuracy on dev data with n-gram size 9: 0.9957356076759062
Model accuracy on dev data with n-gram size 10: 0.9957356076759062
Model accuracy on dev data with n-gram size 11: 0.9957356076759062
Best n-gram size based on dev data: 5


In [3]:
# Assuming NBSentimentModel is a defined or imported class for handling Naive Bayes sentiment analysis
#Best n-gram size after tesing on dev set is 5
model = NBSentimentModel(ngram_size=5)
model.fit(train_sentences, train_labels)

# Predict ratings based on test data
predicted_ratings = model.predict(test_sentences)
test_data['predicted_rating'] = predicted_ratings

# Evaluate model performance
accuracy = accuracy_score(test_labels, predicted_ratings)
conf_matrix = confusion_matrix(test_labels, predicted_ratings)

print(f"Model accuracy on test data: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)

Model accuracy on test data: 0.9965870307167235
Confusion Matrix:
[[  0   1]
 [  0 292]]


In [4]:
#oding a binary conversion of the rating columns such that a rating of anything >=3 is 1 and is 0 otherwise.
amazon_data['binary_rating'] = (amazon_data['rating'] >= 3).astype(int)
print(amazon_data[['review_content', 'binary_rating']])


                                         review_content  binary_rating
0     Looks durable Charging is fine tooNo complains...              1
1     I ordered this cable to connect my phone to An...              1
2     Not quite durable and sturdy,https://m.media-a...              1
3     Good product,long wire,Charges good,Nice,I bou...              1
4     Bought this instead of original apple, does th...              1
...                                                 ...            ...
1460  I received product without spanner,Excellent p...              1
1461  ok,got everything as mentioned but the measuri...              1
1462  plastic but cool body ,u have to find sturdy s...              1
1463  I have installed this in my kitchen working fi...              1
1464  It does it job perfectly..only issue is temp c...              1

[1465 rows x 2 columns]


In [5]:
restaurant = pd.read_csv("Restaurant_Reviews.csv")
train_data, test_data = train_test_split(restaurant, test_size=0.2, random_state=42)
test_sentences = test_data['Review'].fillna("").tolist()
test_labels = test_data['Liked'].tolist()

predicted_ratings = model.predict(test_sentences)
test_data['predicted_rating'] = predicted_ratings

accuracy = accuracy_score(test_labels, predicted_ratings)
print(f"Model accuracy on test data: {accuracy}")

Model accuracy on test data: 0.52


In [6]:
movie = pd.read_csv("IMDB Dataset.csv")

movie['binary_rating'] = movie['sentiment'].map({'positive': 1, 'negative': 0})

movie['binary_rating'] = pd.to_numeric(movie['binary_rating'], errors='coerce')

newdf_0 = movie[movie['binary_rating'] == 0]
newdf_1 = movie[movie['binary_rating'] == 1]

newdf_1_downsampled = newdf_1.sample(n=500, random_state=42)
newdf_0_downsampled = newdf_0.sample(n=500, random_state=42)
movie = pd.concat([newdf_0_downsampled, newdf_1_downsampled])
movie = movie.sample(frac=1, random_state=42).reset_index(drop=True)

train_data, test_data = train_test_split(movie, test_size=0.2, random_state=42)
test_sentences = test_data['review'].fillna("").tolist()
test_labels = test_data['binary_rating'].tolist()

predicted_ratings = model.predict(test_sentences)
test_data['predicted_rating'] = predicted_ratings

accuracy = accuracy_score(test_labels, predicted_ratings)
print(f"Model accuracy on test data: {accuracy}")

Model accuracy on test data: 0.5
