The following Python libraries are used in the implementation:
pandas: For data manipulation and analysis.
nltk: For text preprocessing tasks (tokenization, stop word removal).
sklearn: For machine learning algorithms, text vectorization (TF-IDF), and model evaluation.


In [None]:

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re

# Download NLTK resources (for stopwords)

In [None]:
# Fetch the stop-word corpus that stopwords.words('english') reads during preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Step 1: Load dataset (e.g., Amazon reviews)
## Replace this with the path to your dataset
## Example dataset: 'Dataset-SA.csv' (the file loaded in the next cell)

In [None]:
# Location of the reviews CSV; edit this one constant to point at your own copy.
DATASET_PATH = '/content/Dataset-SA.csv'
data = pd.read_csv(DATASET_PATH, encoding='latin-1')


# Check for missing values and drop them
# Drop rows with missing values in 'Rate', 'Sentiment', and 'Review' columns

In [None]:
# Rows lacking any of these fields cannot be used for training or evaluation.
required_columns = ['Rate', 'Sentiment', 'Review']
data.dropna(subset=required_columns, inplace=True)


# Step 2: Data Preprocessing
# Function to clean and preprocess the text


In [None]:
import re
from nltk.corpus import stopwords

_STOP_WORDS_CACHE = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text, stop_words=None):
    """Clean a single review for vectorization.

    Strips every character that is not an ASCII letter or whitespace,
    lowercases the result, and drops stop words.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example a
            NaN float) is treated as missing.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = _english_stop_words()

    # No flags argument here: the 4th positional parameter of re.sub is
    # `count`, so passing re.I there capped the substitution at two
    # characters. The class already lists both letter cases explicitly.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Apply text preprocessing

In [None]:
# Store the cleaned text back in 'Review', the column the TF-IDF step reads.
# Writing it to 'Rate' would overwrite the ratings and leave the vectorizer
# working on the raw, uncleaned text.
data['Review'] = data['Review'].apply(preprocess_text)


# Step 3: Feature Extraction (TF-IDF Vectorizer)
# Convert text into numerical features using TF-IDF

In [None]:
# NOTE(review): the vectorizer is fit on the whole corpus before the
# train/test split below, so IDF weights are learned from test documents too.
vectorizer = TfidfVectorizer(max_features=5000)
# Keep the TF-IDF matrix sparse: MultinomialNB and train_test_split both accept
# SciPy sparse input, and a dense copy needs rows x 5000 floats of memory.
X = vectorizer.fit_transform(data['Review'])
y = data['Sentiment']


# Step 4: Split Data into Training and Test Sets

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


# Step 5: Train a Naive Bayes Classifier

In [None]:
# Multinomial Naive Bayes fitted on the TF-IDF training features.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)


# Step 6: Make Predictions on the Test Set

In [None]:
# Predicted sentiment label for every held-out review.
y_pred = model.predict(X=X_test)


# Step 7: Evaluate the Model


In [None]:
# Fraction of held-out reviews whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 91.74%


# Classification Report


In [None]:
# Per-class precision, recall and F1 on the held-out set.
report = classification_report(y_test, y_pred)
print('Classification Report:')
print(report)

Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.83      0.83      2100
     neutral       0.14      0.01      0.01       823
    positive       0.93      0.98      0.96     13862

    accuracy                           0.92     16785
   macro avg       0.63      0.61      0.60     16785
weighted avg       0.88      0.92      0.90     16785



# Confusion Matrix


In [None]:
# Rows are true labels, columns are predicted labels.
matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(matrix)

Confusion Matrix:
[[ 1745     7   348]
 [  180     6   637]
 [  184    30 13648]]
