In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the CSV file into a DataFrame
csv_file_path = '/content/all_kindle_review .csv'
reviews_df = pd.read_csv(csv_file_path)



In [2]:
reviews_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [3]:
reviews_df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,rating,unixReviewTime
count,12000.0,12000.0,12000.0,12000.0
mean,5999.5,10024.275667,3.25,1344537000.0
std,3464.24595,10502.233123,1.421619,43693740.0
min,0.0,0.0,1.0,960249600.0
25%,2999.75,2999.75,2.0,1316218000.0
50%,5999.5,5999.5,3.5,1356826000.0
75%,8999.25,12475.75,4.25,1376870000.0
max,11999.0,47770.0,5.0,1405814000.0


In [6]:

# Function to map ratings to sentiment labels
def map_rating_to_sentiment(rating):
    if rating <= 2:
        return 'negative'
    elif rating == 3:
        return 'neutral'
    else:
        return 'positive'

# Apply the sentiment mapping to the rating column
reviews_df['sentiment'] = reviews_df['rating'].apply(map_rating_to_sentiment)

# Select relevant columns and drop any missing reviews
reviews_df_clean = reviews_df[['reviewText', 'sentiment']].dropna()


In [7]:
reviews_df_clean.head()

Unnamed: 0,reviewText,sentiment
0,"Jace Rankin may be short, but he's nothing to ...",neutral
1,Great short read. I didn't want to put it dow...,positive
2,I'll start by saying this is the first of four...,neutral
3,Aggie is Angela Lansbury who carries pocketboo...,neutral
4,I did not expect this type of book to be in li...,positive


In [9]:

# Using TF-IDF vectorizer to convert text to numerical features
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Balance the dataset by upsampling the neutral class
# Separate the classes
negative_reviews = reviews_df_clean[reviews_df_clean.sentiment == 'negative']
neutral_reviews = reviews_df_clean[reviews_df_clean.sentiment == 'neutral']
positive_reviews = reviews_df_clean[reviews_df_clean.sentiment == 'positive']

# Upsample the neutral class to match the positive class
neutral_upsampled = resample(neutral_reviews,
                             replace=True,    # sample with replacement
                             n_samples=len(positive_reviews), # match positive class
                             random_state=42)

# Combine the balanced dataset
balanced_reviews_df = pd.concat([negative_reviews, neutral_upsampled, positive_reviews])


In [10]:
balanced_reviews_df.head()

Unnamed: 0,reviewText,sentiment
6,This has the makings of a good story... unfort...,negative
10,"I received a copy of book to review. ""A secret...",negative
12,I read the other reviews and decided to give i...,negative
15,I was very disappointed with this book... not ...,negative
17,How many times is author going to get Amazon t...,negative


In [11]:

# Split the balanced data into training and test sets
X_balanced_train, X_balanced_test, y_balanced_train, y_balanced_test = train_test_split(
    balanced_reviews_df['reviewText'], balanced_reviews_df['sentiment'], test_size=0.2, random_state=42)

# Fit the TF-IDF vectorizer on the training data and transform both train and test data
X_balanced_train_vec = tfidf_vectorizer.fit_transform(X_balanced_train)
X_balanced_test_vec = tfidf_vectorizer.transform(X_balanced_test)

# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_balanced_train_vec, y_balanced_train)

# Predict using the Random Forest model
y_rf_pred = rf_model.predict(X_balanced_test_vec)

# Evaluate the Random Forest model
rf_classification_report = classification_report(y_balanced_test, y_rf_pred, target_names=['negative', 'neutral', 'positive'])

# Output the classification report
print(rf_classification_report)

# Save the model using joblib for future use
import joblib
joblib.dump(rf_model, 'random_forest_sentiment_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

              precision    recall  f1-score   support

    negative       0.84      0.70      0.76       767
     neutral       0.95      0.93      0.94      1296
    positive       0.80      0.91      0.85      1137

    accuracy                           0.87      3200
   macro avg       0.86      0.85      0.85      3200
weighted avg       0.87      0.87      0.87      3200



['tfidf_vectorizer.pkl']