# **Model and Dataset**



*   Logistic Regression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
*   Amazon Reviews for Sentiment Analysis Dataset: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews




In [1]:
pip install scikit-learn==1.3.2



In [2]:
import pandas as pd
import numpy as np
import nltk
import re
nltk.download('vader_lexicon')
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.sentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Read the train text file
with open('/content/train.ft.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Extracting labels and review text
labels_train = [line.split(' ', 1)[0] for line in lines]
reviews_train = [line.split(' ', 1)[1][:-1] for line in lines]

# Creating a DataFrame for train set
train_df = pd.DataFrame({'Label': labels_train, 'Review': reviews_train})

train_df

Unnamed: 0,Label,Review
0,__label__2,Stuning even for the non-gamer: This sound tra...
1,__label__2,The best soundtrack ever to anything.: I'm rea...
2,__label__2,Amazing!: This soundtrack is my favorite music...
3,__label__2,Excellent Soundtrack: I truly like this soundt...
4,__label__2,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
48595,__label__1,Good as long as it lasts: This laptop was real...
48596,__label__2,very good: this is a very good laptop. I have ...
48597,__label__2,"A ""must-have"" for people who want to look behi..."
48598,__label__2,A book for all people: The remarkable thing ab...


In [4]:
# Read the test text file
with open('/content/test.ft.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Extracting labels and review text
labels_test = [line.split(' ', 1)[0] for line in lines]
reviews_test = [line.split(' ', 1)[1][:-1] for line in lines]

# Creating a DataFrame for test set
test_df = pd.DataFrame({'Label': labels_test, 'Review': reviews_test})

test_df

Unnamed: 0,Label,Review
0,__label__2,Great CD: My lovely Pat has one of the GREAT v...
1,__label__2,One of the best game music soundtracks - for a...
2,__label__1,Batteries died within a year ...: I bought thi...
3,__label__2,"works fine, but Maha Energy is better: Check o..."
4,__label__2,Great for the non-audiophile: Reviewed quite a...
...,...,...
55767,__label__1,"If you must read this book, try the library! S..."
55768,__label__1,TOO MUCH OF THE SAME: This book is absolute no...
55769,__label__1,Not so effective: My girls are still laying an...
55770,__label__2,Very Nice: I wasn't sure what to expect when I...


# **Using NLTK to Define a Threshold**


*   The original dataset did not contain neutral sentiments, so NLTK compound scores were used to relabel



In [5]:
# Initialize NLTK's Sentiment Intensity Analyzer
nltk_analyzer = SentimentIntensityAnalyzer()

# Calculate compound scores for all reviews in training set
train_compound_scores = np.array([nltk_analyzer.polarity_scores(review)['compound'] for review in train_df['Review']])

In [6]:
# Calculate compound scores for all reviews in testing set
test_compound_scores = np.array([nltk_analyzer.polarity_scores(review)['compound'] for review in test_df['Review']])

In [7]:
# Define threshold as -0.2 to 0.2
neutral_threshold = 0.2

# Relabeling the labels in the training set to include neutral sentiment
train_df['Sentiment'] = np.where((train_compound_scores >= -neutral_threshold) & (train_compound_scores <= neutral_threshold),
  0, np.where(train_df['Label'] == '__label__1', -1, 1))

In [8]:
# Relabeling the labels in the testing set to include neutral sentiment
test_df['Sentiment'] = np.where((test_compound_scores >= -neutral_threshold) & (test_compound_scores <= neutral_threshold), 0, np.where(test_df['Label'] == '__label__1', -1, 1))

# **Vectorizing**

In [34]:
# Vectorize the text using TF-IDF for train set
vectorizer = CountVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.8,
    binary=True,
    strip_accents='ascii',
    analyzer='word'
)

X_train_vec = vectorizer.fit_transform(train_df['Review'])
y_train = train_df['Sentiment']

In [35]:
# Vectorize the text using count vectorizer for test set
X_test_vec = vectorizer.transform(test_df['Review'])
y_test = test_df['Sentiment']

# **Training and Saving Model**

In [36]:
from collections import Counter
cnt = Counter(y_train)
label_sum = sum(cnt.values())
proportions = {label: count / label_sum for label, count in cnt.items()}
print(proportions)

{1: 0.49232510288065845, -1: 0.4392798353909465, 0: 0.06839506172839506}


In [37]:
# Define Logistic Regression Model
model = LogisticRegression(
    C=0.1,
    solver='sag',
    max_iter=2000,
    penalty='l2',
    class_weight=proportions,
    multi_class='multinomial')

# Fitting model to the scaled data
model.fit(X_train_vec, y_train)

In [38]:
from sklearn.pipeline import Pipeline
from joblib import dump

# Create a pipeline to combine scaling and the model
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('logistic_regression', model)
])

# Save the trained model using joblib
model_file_path = 'trained_logistic_regression.joblib'
dump(pipeline, model_file_path)

['trained_logistic_regression.joblib']

# **Testing and Evaluating**

In [39]:
# Predict on the test set
predicted_labels = model.predict(X_test_vec)

In [40]:
# Evaluate the model
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, predicted_labels))

Accuracy: 0.83
              precision    recall  f1-score   support

          -1       0.81      0.87      0.84     24535
           0       0.00      0.00      0.00      3960
           1       0.85      0.92      0.89     27277

    accuracy                           0.83     55772
   macro avg       0.56      0.60      0.58     55772
weighted avg       0.77      0.83      0.80     55772



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
