In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Load the review dataset. The path is absolute (looks like a Google Colab
# location — adjust when running elsewhere).
df = pd.read_csv("/content/amazon.csv")
# Preview five random rows.
df.sample(5)

In [None]:
# Drop the 'Unnamed: 0' column (presumably a stale CSV index — confirm) and
# every row that has no review text.
# `errors='ignore'` together with rebinding (instead of `inplace=True`) keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
df = (
    df
    .drop(columns='Unnamed: 0', errors='ignore')
    .dropna(subset=['reviewText'])
)

In [None]:
# Spot-check a single random row.
df.sample()

In [None]:
# (rows, columns) of the current frame.
df.shape

In [None]:
# Map the numeric rating onto three sentiment classes.

def maping(rating):
  """Translate a rating into a sentiment class id.

  Returns 0 when the rating is 2 or lower, 1 when it is exactly 3,
  and 2 in every other case.
  """
  if rating <= 2:
    return 0
  return 1 if rating == 3 else 2

# Derive the target column: each 'overall' rating becomes a class id via maping.
df["sentiments"]=df['overall'].apply(maping)


In [None]:
# Preview three random rows, now including the 'sentiments' column.
df.sample(3)

In [None]:
# Class distribution: number of rows per sentiment class, ordered by class id.

df["sentiments"].value_counts().sort_index()

In [None]:
# text processing

# The NLTK corpora must be on disk before they are read. Fetching them here
# lets the notebook run top to bottom on a fresh kernel instead of failing
# with LookupError when the stop-word list is built.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [None]:
# Download the NLTK 'stopwords' corpus read by stopwords.words("english").
nltk.download('stopwords')

In [None]:
# Download the NLTK 'wordnet' corpus (presumably what WordNetLemmatizer
# relies on — confirm).
nltk.download('wordnet')

In [None]:
# Function to clean raw review text
def processed_text(text):
  """Normalise a review into a space-joined string of lemmatized tokens.

  Steps, in order:
    1. cast to str and lower-case;
    2. delete every character that is not a-z or whitespace;
    3. split on whitespace;
    4. drop tokens found in ``stop_words`` (very common words such as
       "is", "am", "to" that usually don't add much meaning);
    5. lemmatize the remaining tokens with ``lemmatizer``, i.e. reduce each
       word to its base (dictionary) form.
  """
  lowered = str(text).lower()
  letters_only = re.sub(r'[^a-z\s]', "", lowered)

  kept = []
  for token in letters_only.split():
    if token in stop_words:
      continue
    kept.append(lemmatizer.lemmatize(token))

  return " ".join(kept)

# Store the cleaned review text. NOTE: the column name contains a space
# ('clean _text'); other cells reference it by this exact name.
df['clean _text']=df["reviewText"].apply(processed_text)

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Plot A: Class Balance
# Bar chart of the row count per value of the 'sentiments' column.

sns.countplot(x='sentiments', data=df)
plt.title('Distribution of Sentiment Classes')
plt.xticks([0, 1, 2])
plt.xlabel('Sentiment Class')
plt.ylabel('Count')

In [None]:
# Plot B: Review Length Analysis
# Number of whitespace-separated tokens in each processed review.
df['review_length'] = df['clean _text'].apply(lambda x: len(x.split()))

# Draw on a dedicated single-axes figure. Selecting slot 2 of a 1x2 grid with
# `plt.subplot(1, 2, 2)` leaves the left slot empty and squeezes the histogram
# into half of the canvas.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='review_length', hue='sentiments', kde=True, ax=ax)
ax.set_title('Review Length Distribution by Class')
ax.set_xlabel('Number of Words')
ax.set_xlim(0, 200)  # Limit x-axis to 200 words for better visibility
plt.show()

In [None]:
#model training
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:

# train test split

X = df['clean _text']   # model input: cleaned review text
y = df['sentiments']    # target: sentiment class id

# stratify=y keeps the class proportions the same in the train and test
# partitions, so a small class is not under-represented in the 20% hold-out.
# random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# pipe line flow

# Step 1: TfidfVectorizer - Converts text to numerical vectors based on word importance
#         (vocabulary capped at the 5000 most frequent terms).
# Step 2: LogisticRegression - A linear classification model.
#         `multi_class` is intentionally not passed: the parameter is deprecated
#         in recent scikit-learn releases, and with a non-liblinear solver such
#         as 'saga' the default already fits a multinomial model when the
#         target has more than two classes.
#         `random_state` pins saga's data shuffling so repeated runs give the
#         same fitted model.

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),  # text documents -> numerical vectors
    ('tf1', LogisticRegression(solver='saga', max_iter=1000, random_state=42)),
])

In [None]:
# Fit both pipeline steps on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predicted class ids for the held-out split.
y_pred =pipeline.predict(X_test)

In [None]:
# Fraction of held-out reviews whose predicted class matches the true class.
accuracy_score(y_test, y_pred)


In [None]:

# Detailed per-class metrics on the held-out split.
#   Precision - accuracy of positive predictions.
#   Recall    - ability to find all positive instances.
#   F1-Score  - harmonic mean of precision and recall.
target_names = ['Negative', 'Neutral', 'Positive']  # display names for classes 0, 1, 2

print("Classification Report")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=target_names,
    )
)

In [None]:
# Confusion matrix heatmap: rows are true classes, columns are predictions,
# so off-diagonal cells show where the model is getting confused
# (e.g., misclassifying Neutral as Positive).
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
def predict_sentiment(text):
    """Classify a raw review string and report the model's confidence.

    The input is cleaned with ``processed_text`` (the same preprocessing that
    produced the training column) before it is handed to the fitted
    ``pipeline``, which performs the vectorization itself.

    Args:
        text: Raw review text.

    Returns:
        A string with the original input, the predicted sentiment label
        ('Negative', 'Neutral' or 'Positive') and the probability the model
        assigned to that label.
    """
    labels = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

    # The model was trained on cleaned text, so inference must see the same form.
    cleaned = processed_text(text)

    prediction_idx = pipeline.predict([cleaned])[0]
    probabilities = pipeline.predict_proba([cleaned])[0]

    # predict_proba columns are ordered like pipeline.classes_; look the column
    # up instead of assuming that class id == column position.
    class_position = list(pipeline.classes_).index(prediction_idx)
    confidence = probabilities[class_position]

    # The reported label is the predicted class itself. Confidence is only
    # shown alongside it; it must not be used to pick the label.
    predicted_label = labels[prediction_idx]
    return f"Input : {text}\n Predicted sentiment : {predicted_label} (Confidance : {confidence})"


In [None]:
# Test cases
# Smoke-test predict_sentiment on a sentence that mixes praise and criticism.

print(predict_sentiment("i love this product but hate its design "))

In [None]:
# Display the frame, including the columns added in this notebook
# ('sentiments', 'clean _text', 'review_length').
df