<a href="https://colab.research.google.com/github/awebstudent/Sentiment_Analyzer/blob/main/Sentiment7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#approx 150,000 reviews
import pandas as pd
import json

# Function to read JSON lines file
def read_json_lines(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        data = [json.loads(line) for line in lines]
    return data

# File path to the JSON lines file
file_path = 'Prime_Pantry_5.json'

# Read the JSON lines data
data = read_json_lines(file_path)

# Normalize the JSON data
df = pd.json_normalize(data)

# Save the DataFrame to a CSV file
df.to_csv('amazon_reviews.csv', index=False)

print("Conversion completed. CSV file saved as 'amazon_reviews.csv'.")


Conversion completed. CSV file saved as 'amazon_reviews.csv'.


In [5]:
# Step 1: Importing Libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [6]:
df = pd.read_csv('amazon_reviews.csv')
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style.Size:,style.Style:,style.Format:
0,4.0,True,"09 24, 2015",A31Y9ELLA1JUB0,B0000DIWNI,Her Royal Peepness Princess HoneyBunny Blayze,I purchased this Saran premium plastic wrap af...,Pretty Good For plastic Wrap,1443052800,,,,,
1,5.0,True,"06 23, 2015",A2FYW9VZ0AMXKY,B0000DIWNI,Mary,I am an avid cook and baker. Saran Premium Pl...,"The Best Plastic Wrap for your Cooking, Baking...",1435017600,,,,,
2,5.0,True,"06 13, 2015",A1NE43T0OM6NNX,B0000DIWNI,Tulay C,"Good wrap, keeping it in the fridge makes it e...",Good and strong.,1434153600,,,,,
3,4.0,True,"06 3, 2015",AHTCPGK2CNPKU,B0000DIWNI,OmaShops,I prefer Saran wrap over other brands. It does...,Doesn't cling as well to dishes as other brand...,1433289600,,,,,
4,5.0,True,"04 20, 2015",A25SIBTMVXLB59,B0000DIWNI,Nitemanslim,Thanks,Five Stars,1429488000,,,,,


In [7]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
#  Step 3: Data Preprocessing
def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    # Convert to lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join tokens back into text
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

df['clean_text'] = df['reviewText'].apply(preprocess_text)






In [9]:
# Remove rows with empty 'clean_text' after preprocessing
df.dropna(subset=['clean_text'], inplace=True)

# Step 4: Split the Dataset
X = df['clean_text']
y = df['overall']  # Assuming 'overall' contains the target variable

In [10]:
# Convert target variable to categorical format (positive, negative, neutral)
def categorize_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating <= 2:
        return 'negative'
    else:
        return 'neutral'

y_categorical = y.apply(categorize_sentiment)

X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

In [11]:
# Step 5: Feature Extraction
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [38]:

joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [12]:
# Step 6: Model Training
# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_tfidf, y_train)

# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train_tfidf, y_train)


In [13]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    cm = confusion_matrix(y_test, predictions)
    return accuracy, report, cm

# Evaluate Logistic Regression model
logistic_accuracy, logistic_report, logistic_cm = evaluate_model(logistic_model, X_test_tfidf, y_test)
print("Logistic Regression Model Accuracy:", logistic_accuracy)
print("Logistic Regression Model Classification Report:\n", logistic_report)
print("Logistic Regression Model Confusion Matrix:\n", logistic_cm)

# Evaluate Naive Bayes model
nb_accuracy, nb_report, nb_cm = evaluate_model(nb_model, X_test_tfidf, y_test)
print("Naive Bayes Model Accuracy:", nb_accuracy)
print("Naive Bayes Model Classification Report:\n", nb_report)
print("Naive Bayes Model Confusion Matrix:\n", nb_cm)

# Evaluate Random Forest model
rf_accuracy, rf_report, rf_cm = evaluate_model(rf_model, X_test_tfidf, y_test)
print("Random Forest Model Accuracy:", rf_accuracy)
print("Random Forest Model Classification Report:\n", rf_report)
print("Random Forest Model Confusion Matrix:\n", rf_cm)


Logistic Regression Model Accuracy: 0.902641701139415
Logistic Regression Model Classification Report:
               precision    recall  f1-score   support

    negative       0.70      0.38      0.49      1412
     neutral       0.52      0.15      0.24      1828
    positive       0.92      0.99      0.95     24318

    accuracy                           0.90     27558
   macro avg       0.71      0.51      0.56     27558
weighted avg       0.88      0.90      0.88     27558

Logistic Regression Model Confusion Matrix:
 [[  536    88   788]
 [  136   278  1414]
 [   91   166 24061]]
Naive Bayes Model Accuracy: 0.8885260178532549
Naive Bayes Model Classification Report:
               precision    recall  f1-score   support

    negative       0.85      0.11      0.19      1412
     neutral       0.54      0.02      0.04      1828
    positive       0.89      1.00      0.94     24318

    accuracy                           0.89     27558
   macro avg       0.76      0.38      0.39  

In [14]:
def ensemble_predict(X):
    preds_logistic = logistic_model.predict(X)
    preds_nb = nb_model.predict(X)
    preds_rf = rf_model.predict(X)

    preds = []
    for i in range(X.shape[0]):  # Use X.shape[0] to get the number of rows
        # Voting - Simple Majority Voting
        pred = max(set([preds_logistic[i], preds_nb[i], preds_rf[i]]), key=[preds_logistic[i], preds_nb[i], preds_rf[i]].count)
        preds.append(pred)
    return preds


In [15]:
# Step 8: Model Evaluation
def evaluate_model(predictions, y_test):
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    cm = confusion_matrix(y_test, predictions)
    return accuracy, report, cm

# Generate predictions using the ensemble model
ensemble_predictions = ensemble_predict(X_test_tfidf)

# Evaluate the ensemble model
ensemble_accuracy, ensemble_report, ensemble_cm = evaluate_model(ensemble_predictions, y_test)

# Print evaluation results
print("Ensemble Model Accuracy:", ensemble_accuracy)
print("Ensemble Model Classification Report:\n", ensemble_report)
print("Ensemble Model Confusion Matrix:\n", ensemble_cm)

Ensemble Model Accuracy: 0.898033238986864
Ensemble Model Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.26      0.39      1412
     neutral       0.68      0.07      0.13      1828
    positive       0.90      1.00      0.95     24318

    accuracy                           0.90     27558
   macro avg       0.78      0.44      0.49     27558
weighted avg       0.88      0.90      0.86     27558

Ensemble Model Confusion Matrix:
 [[  366    23  1023]
 [   79   135  1614]
 [   30    41 24247]]


In [29]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(max_iter=3000), param_grid, cv=5)

grid_search.fit(X_train_tfidf, y_train)
best_params = grid_search.best_params_

# Train Logistic Regression model with best parameters
best_logistic_model = LogisticRegression(max_iter=3000, C=best_params['C'])

best_logistic_model.fit(X_train_tfidf, y_train)


In [30]:
# Plot confusion matrix
import matplotlib.pyplot as plt
import numpy as np
def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
   # Evaluate best logistic regression model
best_logistic_predictions = best_logistic_model.predict(X_test_tfidf)
best_logistic_accuracy, best_logistic_report, best_logistic_cm = evaluate_model(best_logistic_predictions, y_test)




In [31]:
def hybrid_model(X):
    logistic_preds = best_logistic_model.predict(X)
    ensemble_preds = ensemble_predict(X)

    hybrid_preds = []
    for i in range(len(logistic_preds)):
        if logistic_preds[i] == ensemble_preds[i]:
            hybrid_preds.append(logistic_preds[i])
        else:
            # Use ensemble prediction if logistic and ensemble predictions differ
            hybrid_preds.append(ensemble_preds[i])

    return hybrid_preds

# Generate predictions using the hybrid model
hybrid_predictions = hybrid_model(X_test_tfidf)

# Evaluate the hybrid model
hybrid_accuracy, hybrid_report, hybrid_cm = evaluate_model(hybrid_predictions, y_test)

# Print evaluation results for hybrid model
print("Hybrid Model Accuracy:", hybrid_accuracy)
print("Hybrid Model Classification Report:\n", hybrid_report)
print("Hybrid Model Confusion Matrix:\n", hybrid_cm)


Hybrid Model Accuracy: 0.898033238986864
Hybrid Model Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.26      0.39      1412
     neutral       0.68      0.07      0.13      1828
    positive       0.90      1.00      0.95     24318

    accuracy                           0.90     27558
   macro avg       0.78      0.44      0.49     27558
weighted avg       0.88      0.90      0.86     27558

Hybrid Model Confusion Matrix:
 [[  366    23  1023]
 [   79   135  1614]
 [   30    41 24247]]


In [35]:
import joblib


# Save the hybrid model to a file
joblib.dump(hybrid_model, 'hybrid_model.pkl')

['hybrid_model.pkl']

In [37]:
# Sample inputs for testing
sample_inputs = [
    "This product is great! I love it.",
    "This product is terrible. Waste of money.",
    "Not bad, but could be better."
]

# Preprocess the sample inputs
preprocessed_samples = [preprocess_text(text) for text in sample_inputs]

# Convert the preprocessed texts into TF-IDF features
sample_tfidf = vectorizer.transform(preprocessed_samples)

# Use the hybrid model to make predictions on the sample inputs
sample_predictions = []

for sample in sample_tfidf:
    logistic_pred = best_logistic_model.predict(sample)[0]
    nb_pred = nb_model.predict(sample)[0]
    rf_pred = rf_model.predict(sample)[0]

    # Voting - Simple Majority Voting
    # Count the occurrences of each sentiment prediction
    pred_counts = {'positive': 0, 'neutral': 0, 'negative': 0}
    pred_counts[logistic_pred] += 1
    pred_counts[nb_pred] += 1
    pred_counts[rf_pred] += 1

    # Get the sentiment with the maximum count
    final_prediction = max(pred_counts, key=pred_counts.get)

    sample_predictions.append(final_prediction)

# Print the predictions
for input_text, prediction in zip(sample_inputs, sample_predictions):
    print(f"Input: {input_text}\nPredicted Sentiment: {prediction}\n")






Input: This product is great! I love it.
Predicted Sentiment: positive

Input: This product is terrible. Waste of money.
Predicted Sentiment: negative

Input: Not bad, but could be better.
Predicted Sentiment: positive



In [39]:


# Save the models
joblib.dump(logistic_model, 'logistic_model.pkl')
joblib.dump(nb_model, 'nb_model.pkl')
joblib.dump(rf_model, 'rf_model.pkl')



['rf_model.pkl']