<h1 style="text-align: center;">Amazon Review Sentiment Analysis<br>Using NLP</h1>

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

### Loading the dataset

Dataset : [Amazon review dataset ](https://www.kaggle.com/code/khanmdsaifullahanjar/amazon-review-sentiment-analysis-using-nltk)

In [27]:
# Load the dataset
df = pd.read_csv('amazon.csv')

# Display the first few rows
print(df.head())

                                          reviewText  Positive
0  This is a one of the best apps acording to a b...         1
1  This is a pretty good version of the game for ...         1
2  this is a really cool game. there are a bunch ...         1
3  This is a silly game and can be frustrating, b...         1
4  This is a terrific game on any pad. Hrs of fun...         1


In [28]:
df.columns

Index(['reviewText', 'Positive'], dtype='object')

In [29]:
df.rename(columns={'Positive': 'sentiment'}, inplace=True)

In [30]:
# Check for class balance
print(df['sentiment'].value_counts())

sentiment
1    15233
0     4767
Name: count, dtype: int64


### Data Preprocessing:
* Text Cleaning: Remove HTML tags, punctuation, and non-alphabetic characters.
* Tokenization: Split text into individual words.
* Stopword Removal: Remove common words that don't contribute much meaning (e.g., 'the', 'and').
* Lemmatization: Reduce words to their base form (e.g., 'running' to 'run').

In [52]:
import nltk
nltk.download('punkt', force=True)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [53]:
nltk.data.path.append('/root/nltk_data')


In [54]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [55]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# # Download NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')


# Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Lowercasing
    tokens = [word.lower() for word in tokens]
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_review'] = df['reviewText'].apply(preprocess_text)


### Text Vectorization:

Converting text data into numerical form that the machine learning model can process.

Common techniques include:
* TF-IDF (Term Frequency-Inverse Document Frequency): Weighs words based on their importance in a document.
* Bag of Words (BoW): Converts each review into a vector of word counts or frequencies.
* Word Embedding: Convert words into vectors using models like Word2Vec, GloVe, etc.


In [56]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_review']).toarray()
y = df['sentiment']  # Assuming 'sentiment' is the label column

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Train a Classifier

In [57]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predictions
y_pred = classifier.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.889
              precision    recall  f1-score   support

           0       0.86      0.64      0.73       958
           1       0.89      0.97      0.93      3042

    accuracy                           0.89      4000
   macro avg       0.88      0.80      0.83      4000
weighted avg       0.89      0.89      0.88      4000



### Function to predict sentiment

In [59]:
def predict_sentiment(review):
    # Preprocess the input review
    cleaned_review = preprocess_text(review)
    # Transform using the vectorizer
    review_vector = vectorizer.transform([cleaned_review])
    # Predict sentiment
    sentiment = classifier.predict(review_vector)[0]
    return sentiment


### Prediction

In [60]:
# Example Input Review
input_review = "The product is excellent and works perfectly! Highly recommend it."

# Predict sentiment
predicted_sentiment = predict_sentiment(input_review)

# Display results
print(f"Review: {input_review}")
print(f"Predicted Sentiment: {'Positive' if predicted_sentiment == 1 else 'Negative'}")

Review: The product is excellent and works perfectly! Highly recommend it.
Predicted Sentiment: Positive


In [74]:
# Example Input Review
input_review = "The product is very bad and works not at all perfec! Never recommend it."

# Predict sentiment
predicted_sentiment = predict_sentiment(input_review)

# Display results
print(f"Review: {input_review}")
print(f"Predicted Sentiment: {'Positive' if predicted_sentiment == 1 else 'Negative'}")

Review: The product is very bad and works not at all perfec! Never recommend it.
Predicted Sentiment: Negative
