In [None]:
!pip install nltk




In [None]:
# Import pandas library for data manipulation and analysis
# Pandas helps us work with datasets in table format (rows and columns)
import pandas as pd

# Import Google Drive utility from Google Colab
# This allows Colab to access files stored in our Google Drive
from google.colab import drive

# Mount Google Drive to the Colab environment
# After mounting, Drive files will be accessible under /content/drive
drive.mount('/content/drive')

# Import regular expression module
# Regex is used to clean text by removing unwanted patterns like symbols and numbers
import re

# Import Natural Language Toolkit (NLTK)
# NLTK provides useful tools for text preprocessing in NLP tasks
import nltk

# Import stopwords list from NLTK
# Stopwords are common words (like 'is', 'the', 'and') that do not add meaning
from nltk.corpus import stopwords

# Import WordNet Lemmatizer from NLTK
# Lemmatizer converts words to their base form (e.g., running → run)
from nltk.stem import WordNetLemmatizer


Mounted at /content/drive


In [None]:
# Path where the curated Parquet folder is stored in Google Drive
# This folder was downloaded from HDFS (Gold layer) and uploaded to Drive
folder_path = '/content/drive/MyDrive/appliance_reviews_curated/'

# Read the Parquet dataset into a Pandas DataFrame
# Pandas can read a folder containing Parquet part files
# This is safe because Parquet is columnar and schema-aware
df = pd.read_parquet(folder_path)
df_ml = df.copy()


In [None]:
df.shape
df.head()

Unnamed: 0,asin,reviewerID,verified,overall,reviewText,summary,unixReviewTime,title,brand,category
0,B00002N7IL,A3SHVDMM83IHJ4,True,5.0,Just what I needed for my electric range. Matc...,Matched pigtail cord and works great.,1426809600,"Leviton 5206 50 Amp, 125/250 Volt, NEMA 10-50R...",Leviton,"[Appliances, Parts & Accessories, Range Parts ..."
1,B00002N7IL,A3SHVDMM83IHJ4,True,5.0,Just what I needed for my electric range. Matc...,Matched pigtail cord and works great.,1426809600,"Leviton 5206 50 Amp, 125/250 Volt, NEMA 10-50R...",Leviton,"[Appliances, Parts & Accessories, Range Parts ..."
2,B00004SQHD,A2OXDRWBASV91Y,True,5.0,I like the fact that the wire ends have mounti...,complete package,1366416000,Coleman Cable 09045 5-Foot Range Cord,Coleman Cable,"[Appliances, Parts & Accessories, Range Parts ..."
3,B00004SQHD,A2OXDRWBASV91Y,True,5.0,I like the fact that the wire ends have mounti...,complete package,1366416000,Coleman Cable 09045 5-Foot Range Cord,Coleman Cable,"[Appliances, Parts & Accessories, Range Parts ..."
4,B00004SQHD,A2KG6AWJSWILPR,True,5.0,"Needed another couple of feet with new dryer, ...",Perfect Fit,1363392000,Coleman Cable 09045 5-Foot Range Cord,Coleman Cable,"[Appliances, Parts & Accessories, Range Parts ..."


Create Sentiment Labels (RULE-BASED)

In [None]:
# Define a function to convert numeric ratings into sentiment labels
# We use this because the dataset does not already have sentiment (positive/negative) tags
def label_sentiment(rating):

    # Check if the rating value is greater than or equal to 4
    # Ratings 4 and 5 generally mean the customer is satisfied
    if rating >= 4:
        # Return 'positive' sentiment for high ratings
        return "positive"

    # Check if the rating value is exactly 3
    # Rating 3 is considered neither good nor bad
    elif rating == 3:
        # Return 'neutral' sentiment for average rating
        return "neutral"

    # If the rating is less than 3 (i.e., 1 or 2)
    # These ratings indicate dissatisfaction
    else:
        # Return 'negative' sentiment for low ratings
        return "negative"

# Apply the label_sentiment function to the 'overall' column of the DataFrame
# The 'apply' function runs label_sentiment on each rating value
df["sentiment"] = df["overall"].apply(label_sentiment)

# Count the number of records in each sentiment category
# This helps us understand class distribution (positive, neutral, negative)
df["sentiment"].value_counts()



Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,472311
negative,63997
neutral,28102


Combine Text Fields

In [None]:
# Combine the 'summary' and 'reviewText' columns into one single text column
# This gives the model more context by using both short and detailed reviews
# fillna("") replaces missing values with empty strings to avoid errors during concatenation
df["text"] = df["summary"].fillna("") + " " + df["reviewText"].fillna("")

# Select only the columns required for machine learning
# 'text' will be used as the input feature for the model
# 'sentiment' will be used as the target label
df = df[["text", "sentiment"]]

# Display the first few rows of the DataFrame
# This helps verify that text combination and column selection worked correctly
df.head()




Unnamed: 0,text,sentiment
0,Matched pigtail cord and works great. Just wha...,positive
1,Matched pigtail cord and works great. Just wha...,positive
2,complete package I like the fact that the wire...,positive
3,complete package I like the fact that the wire...,positive
4,Perfect Fit Needed another couple of feet with...,positive


**Text Preprocessing**

In [None]:
# Download the stopwords dataset from NLTK
# Stopwords are common words (like 'the', 'is', 'and') that do not add much meaning
# These will be removed during text preprocessing
nltk.download("stopwords")

# Download the WordNet dataset
# WordNet is required for lemmatization (converting words to their base form)
nltk.download("wordnet")

# Download the Open Multilingual WordNet dataset
# This supports WordNet lemmatizer with better word relationships
# Required dependency for WordNet to work correctly
nltk.download("omw-1.4")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

Initialize NLP Tools

In [None]:
# Load the list of English stopwords from NLTK
# Stopwords are common words (like 'is', 'the', 'and') that do not add useful meaning
# Converting the list to a set improves lookup speed during text processing
stop_words = set(stopwords.words("english"))

# Initialize the WordNet Lemmatizer
# This object will be used to convert words into their base/root form
# Example: 'running' → 'run', 'cars' → 'car'
lemmatizer = WordNetLemmatizer()




Define Clean Text Function

In [None]:
# Define a function to clean and preprocess raw text data
# This function prepares text so that it can be used by ML/NLP models
def clean_text(text):

    # Step 1: Convert the entire text to lowercase
    # This ensures consistency (e.g., 'Good' and 'good' are treated the same)
    text = text.lower()

    # Step 2: Remove punctuation, numbers, and special characters
    # The regex keeps only alphabets (a–z, A–Z) and spaces
    # This helps remove noise from the text
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # Step 3: Tokenize the text
    # Split the cleaned sentence into individual words (tokens)
    tokens = text.split()

    # Step 4: Remove stopwords and apply lemmatization
    # - Stopwords are removed because they do not add meaningful information
    # - Lemmatization converts words to their base form
    cleaned_tokens = [
        lemmatizer.lemmatize(word)   # Convert word to its root form
        for word in tokens           # Loop through each word in the text
        if word not in stop_words    # Keep only meaningful words
    ]

    # Step 5: Join the cleaned tokens back into a single string
    # This produces the final cleaned text used for feature extraction
    return " ".join(cleaned_tokens)



Apply Cleaning to Dataset

In [None]:
# Apply the clean_text function to each value in the 'text' column
# This preprocesses all reviews by cleaning, tokenizing, removing stopwords, and lemmatizing
# The cleaned output is stored in a new column called 'clean_text'
df["clean_text"] = df["text"].apply(clean_text)



Validate Preprocessing

In [None]:
# Display the first 5 rows of the original and cleaned text columns

df[["text", "clean_text"]].head(5)


Unnamed: 0,text,clean_text
0,Matched pigtail cord and works great. Just wha...,matched pigtail cord work great needed electri...
1,Matched pigtail cord and works great. Just wha...,matched pigtail cord work great needed electri...
2,complete package I like the fact that the wire...,complete package like fact wire end mounting t...
3,complete package I like the fact that the wire...,complete package like fact wire end mounting t...
4,Perfect Fit Needed another couple of feet with...,perfect fit needed another couple foot new dry...


Drop Original Text

In [None]:
# Select only the final columns required for model training
# 'clean_text' will be used as the input feature (processed review text)
# 'sentiment' will be used as the target label for prediction
df = df[["clean_text", "sentiment"]]

# Display the first few rows of the final dataset
df.head()



Unnamed: 0,clean_text,sentiment
0,matched pigtail cord work great needed electri...,positive
1,matched pigtail cord work great needed electri...,positive
2,complete package like fact wire end mounting t...,positive
3,complete package like fact wire end mounting t...,positive
4,perfect fit needed another couple foot new dry...,positive


Import Required Libraries

In [None]:
# Import TF-IDF Vectorizer from scikit-learn
# TF-IDF converts text data into numerical features that ML models can understand
# It gives higher weight to important words and lower weight to very common words
from sklearn.feature_extraction.text import TfidfVectorizer

# Import train_test_split function from scikit-learn
# This is used to split the dataset into training and testing sets
# Training data is used to build the model, testing data is used to evaluate performance
from sklearn.model_selection import train_test_split



Split Data (Before Vectorization)

In [None]:
# Assign the input feature variable X
# 'clean_text' contains the processed review text used for model training
X = df["clean_text"]

# Assign the target variable y
# 'sentiment' contains the labels (positive, neutral, negative)
y = df["sentiment"]

# Split the dataset into training and testing sets
# X_train, y_train → used to train the ML model
# X_test, y_test → used to evaluate the model performance
X_train, X_test, y_train, y_test = train_test_split(
    X,                 # Input features (cleaned text)
    y,                 # Target labels (sentiment)
    test_size=0.3,     # 30% data used for testing, 70% for training
    random_state=7,    # Fixed value to ensure reproducible results
    stratify=y         # Maintains the same class distribution in train and test sets
)


Initialize TF-IDF Vectorizer

In [None]:
# Initialize the TF-IDF Vectorizer
# This converts cleaned text into numerical feature vectors for ML models
tfidf = TfidfVectorizer(

    # Limit the number of features (unique words/phrases) to the top 5000
    # This helps control dimensionality and reduces memory usage
    max_features=5000,

    # Use both unigrams (single words) and bigrams (two-word combinations)
    # This helps capture more context, such as "not good" or "very bad"
    ngram_range=(1, 2)
)


Fit on Training Data, Transform Both

In [None]:
# Fit the TF-IDF vectorizer on the training text data
# This learns the vocabulary and IDF values only from training data
# Using fit_transform prevents data leakage from the test set
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the test text data using the already learned TF-IDF vocabulary
# We do NOT use fit again to ensure fair model evaluation
X_test_tfidf = tfidf.transform(X_test)


Validate Vectorization

In [None]:
# Print the shape of the TF-IDF transformed training data
# This shows the number of training samples and the number of features created
# Format: (number of rows, number of TF-IDF features)
print("TF-IDF Train shape:", X_train_tfidf.shape)

# Print the shape of the TF-IDF transformed testing data
# This confirms that both train and test sets have the same number of features
print("TF-IDF Test shape:", X_test_tfidf.shape)



TF-IDF Train shape: (395087, 5000)
TF-IDF Test shape: (169323, 5000)


Import Required ML Libraries

In [None]:
# Import Logistic Regression model from scikit-learn
# Logistic Regression is a linear classification algorithm
# It works very well with high-dimensional and sparse data like TF-IDF vectors
from sklearn.linear_model import LogisticRegression

# Import accuracy_score metric
# Accuracy measures the percentage of correct predictions made by the model
from sklearn.metrics import accuracy_score

# Import classification_report
# This provides detailed metrics like precision, recall, and F1-score for each class
from sklearn.metrics import classification_report

# Import confusion_matrix
# Confusion matrix shows correct and incorrect predictions for each class
from sklearn.metrics import confusion_matrix


Initialize the Model

In [None]:
# Initialize the Logistic Regression classification model
# This model will be used to predict sentiment from TF-IDF features
model = LogisticRegression(

    # Set maximum number of iterations for the optimization algorithm
    # Increased to ensure the model converges when dealing with large feature spaces
    max_iter=1000,

    # Use all available CPU cores to speed up model training
    # Setting -1 tells scikit-learn to use maximum parallel processing
    n_jobs=-1
)


Train the Model

In [None]:
# Train the Logistic Regression model using the training dataset
# The model learns the relationship between TF-IDF feature vectors and sentiment labels
model.fit(
    X_train_tfidf,   # TF-IDF transformed training text data (input features)
    y_train          # Corresponding sentiment labels (target values)
)


Make Predictions on Test Data

In [None]:
# Use the trained Logistic Regression model to predict sentiment labels
# Predictions are made on the TF-IDF transformed test dataset (unseen data)
y_pred = model.predict(X_test_tfidf)



Evaluate Model Accuracy

In [None]:
# Use the trained Logistic Regression model to predict sentiment labels
# Predictions are made on the TF-IDF transformed test dataset
# This data was not seen by the model during training, so it tests generalization
y_pred = model.predict(X_test_tfidf)


Detailed Classification Report

In [None]:
# Generate a detailed classification report for model evaluation
# This report shows precision, recall, F1-score, and support for each sentiment class
# It helps us understand how well the model performs on positive, neutral, and negative reviews
print(classification_report(
    y_test,   # Actual sentiment labels from the test dataset
    y_pred    # Predicted sentiment labels generated by the model
))



              precision    recall  f1-score   support

    negative       0.83      0.81      0.82     19199
     neutral       0.76      0.37      0.49      8431
    positive       0.95      0.98      0.97    141693

    accuracy                           0.93    169323
   macro avg       0.85      0.72      0.76    169323
weighted avg       0.93      0.93      0.93    169323



Confusion Matrix

In [None]:
# Create a confusion matrix to evaluate model prediction performance
# The confusion matrix compares actual labels with predicted labels
# Rows represent actual sentiment classes
# Columns represent predicted sentiment classes
conf_matrix = confusion_matrix(
    y_test,   # Actual sentiment labels from the test dataset
    y_pred    # Predicted sentiment labels from the model
)

# Print a heading for clarity
print("Confusion Matrix:")

# Display the confusion matrix values
# This helps identify how many predictions were correct and where the model made errors
print(conf_matrix)



Confusion Matrix:
[[ 15555    369   3275]
 [  1605   3089   3737]
 [  1669    608 139416]]


Label Mapping

In [None]:
# Show class labels order used in confusion matrix
print("Class labels:", model.classes_)


Class labels: ['negative' 'neutral' 'positive']


Understand How Logistic Regression Works (Concept)

Before code, understand this clearly:

Logistic Regression assigns a weight to each word feature

Positive weight → pushes prediction towards positive

Negative weight → pushes prediction towards negative

Weight magnitude → importance of the word

Extract Feature Names from TF-IDF

In [None]:
# Retrieve the feature names (words and n-grams) learned by the TF-IDF vectorizer
# These features represent the vocabulary used to convert text into numerical form
# Useful for model interpretation and understanding which words influence predictions
feature_names = tfidf.get_feature_names_out()


Extract Model Coefficients

In [None]:
# Retrieve the learned coefficients (weights) from the Logistic Regression model
# These coefficients represent how strongly each TF-IDF feature influences predictions
# Shape of coefficients: (number_of_classes, number_of_features)
# Each row corresponds to a sentiment class
coefficients = model.coef_



Inspect Class Labels Order

In [None]:
# Display the order of sentiment classes learned by the Logistic Regression model
# This order corresponds to the rows in the model's coefficient matrix (model.coef_)
# Knowing this is important for correctly interpreting feature importance per class
model.classes_


array(['negative', 'neutral', 'positive'], dtype=object)

Convert Coefficients to DataFrame

In [None]:
# Import pandas library for data manipulation
# Pandas is used here to organize model coefficients in a readable table format
import pandas as pd

# Create a DataFrame to store Logistic Regression coefficients
# This helps interpret which words influence each sentiment class
coef_df = pd.DataFrame(

    # Transpose the coefficients matrix
    # After transpose: each row represents a word (feature)
    coefficients.T,

    # Set feature names (words / n-grams) as row labels
    # This makes it easy to identify which coefficient belongs to which word
    index=feature_names,

    # Set sentiment classes as column names
    # Each column shows how strongly a word affects that sentiment
    columns=model.classes_
)


Top Words for POSITIVE Sentiment

In [None]:
# Identify the top words that most strongly contribute to POSITIVE sentiment
# Higher coefficient values mean the word increases the likelihood of positive prediction
coef_df["positive"] \
    .sort_values(ascending=False).head(15)
    # Sort words by coefficient strength (highest to lowest)
    # Select the top 15 most influential positive words



Unnamed: 0,positive
five star,9.165211
four star,8.824231
perfect,8.060564
great,7.801559
five,7.529752
perfectly,7.091083
excellent,6.294458
saved,5.889499
love,5.618744
easy,5.11055


Top Words for NEGATIVE Sentiment

In [None]:
# Identify the top words that most strongly contribute to NEGATIVE sentiment
# Higher coefficient values increase the likelihood of negative sentiment prediction
coef_df["negative"] \
    .sort_values(ascending=False).head(15)
    # Sort words by coefficient strength (highest to lowest)
    # Select the top 15 most influential negative words



Unnamed: 0,negative
one star,11.020948
two star,10.176104
junk,4.382462
doesnt work,4.308311
disappointing,4.121475
returning,4.065718
return,4.044324
useless,4.017734
worse,3.779542
didnt work,3.64568


Neutral


In [None]:
# Identify the top words that most strongly contribute to NEUTRAL sentiment
# Higher coefficient values increase the likelihood of predicting neutral sentiment
coef_df["neutral"] \
    .sort_values(ascending=False).head(15)   # Sort words by coefficient strength (highest to lowest)
                             # Select the top 15 most influential neutral words


Unnamed: 0,neutral
three star,13.59639
okay,3.810325
however,3.140609
average,3.123599
ok,2.877392
made work,2.189988
luckily,2.00979
pricey,1.991181
otherwise,1.955132
slower,1.776399


Compare Positive vs Negative Weights

In [None]:
# Create a new column to measure the difference between positive and negative influence
# A high positive value means the word strongly favors positive sentiment over negative
# A high negative value (not shown here) would favor negative sentiment over positive
coef_df["pos_minus_neg"] = coef_df["positive"] - coef_df["negative"]

# Sort words by the positive-minus-negative score in descending order
# This highlights words that most strongly push predictions toward POSITIVE sentiment
coef_df["pos_minus_neg"] \
    .sort_values(ascending=False).head(15) # Highest positive influence first
                             # Show top 15 most influential words



Unnamed: 0,pos_minus_neg
perfect,14.47227
great,14.420742
five star,14.153639
four star,13.719066
perfectly,12.408808
five,11.263687
excellent,10.910803
love,10.45401
saved,10.266442
easy,9.498801


Train Alternative Models (SVM & Naive Bayes)

Import Naive Bayes

In [None]:
# Import Multinomial Naive Bayes classifier from scikit-learn
# Multinomial Naive Bayes is specifically designed for text data
# It works well with frequency-based features like TF-IDF or Bag-of-Words
from sklearn.naive_bayes import MultinomialNB


Initialize the Model

In [None]:
# Initialize the Multinomial Naive Bayes classifier
# This model is commonly used for text classification tasks
# It is fast, simple, and works well with TF-IDF or Bag-of-Words features
nb_model = MultinomialNB()


Train the Model

In [None]:
# Train the Multinomial Naive Bayes model using TF-IDF features
# The model learns the probability distribution of words for each sentiment class
nb_model.fit(
    X_train_tfidf,   # TF-IDF transformed training text data
    y_train          # Corresponding sentiment labels
)



Predict on Test Data

In [None]:
# Predict sentiments for test data
y_pred_nb = nb_model.predict(X_test_tfidf)


Evaluate Naive Bayes

In [None]:
# Import evaluation metrics from scikit-learn
# accuracy_score calculates overall prediction accuracy
# classification_report provides precision, recall, and F1-score for each class
from sklearn.metrics import accuracy_score, classification_report

# Calculate and print the accuracy of the Naive Bayes model
# Accuracy represents the proportion of correct predictions on the test dataset
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))

# Print a heading for readability
print("\nNaive Bayes Classification Report:\n")

# Generate and display a detailed classification report
# This helps evaluate Naive Bayes performance for each sentiment class
print(classification_report(
    y_test,      # Actual sentiment labels
    y_pred_nb    # Predicted sentiment labels by Naive Bayes model
))



Naive Bayes Accuracy: 0.9162252027190636

Naive Bayes Classification Report:

              precision    recall  f1-score   support

    negative       0.78      0.74      0.76     19199
     neutral       0.71      0.30      0.42      8431
    positive       0.94      0.98      0.96    141693

    accuracy                           0.92    169323
   macro avg       0.81      0.67      0.71    169323
weighted avg       0.91      0.92      0.91    169323



Support Vector Machine (Linear SVM)

Import Linear SVM

In [None]:
# Import Linear Support Vector Classifier from scikit-learn
# LinearSVC is a Support Vector Machine (SVM) algorithm with a linear kernel
# It works very well for high-dimensional and sparse text data like TF-IDF features
# Often performs strongly in sentiment analysis tasks
from sklearn.svm import LinearSVC



Initialize the Model

In [None]:
# Initialize the Linear Support Vector Machine (SVM) classifier
# LinearSVC is well-suited for text classification with TF-IDF features
svm_model = LinearSVC(

    # Regularization parameter
    # Controls the trade-off between model complexity and classification error
    # Higher C -> less regularization, lower C -> more regularization
    C=1.0,

    # Maximum number of iterations for the optimization algorithm
    # Increased to ensure convergence with high-dimensional text data
    max_iter=5000
)

Train the SVM Model

In [None]:
# Train the Linear SVM model using TF-IDF features
# The model learns a decision boundary that best separates sentiment classes
# Training is done only on the training dataset
svm_model.fit(
    X_train_tfidf,   # TF-IDF transformed training text data
    y_train          # Corresponding sentiment labels
)



Predict on Test Data

In [None]:
# Use the trained Linear SVM model to predict sentiment labels
# Predictions are made on the TF-IDF transformed test dataset
# This evaluates how well the SVM model generalizes to unseen data
y_pred_svm = svm_model.predict(X_test_tfidf)



Evaluate SVM

In [None]:
# Print the accuracy score of the SVM model
# Accuracy represents the proportion of correct sentiment predictions on test data
print("SVM Accuracy:", accuracy_score(
    y_test,        # Actual sentiment labels
    y_pred_svm     # Predicted sentiment labels by SVM model
))

# Print a blank line and heading for better readability
print("\nSVM Classification Report:\n")

# Generate and display a detailed classification report for the SVM model
# This includes precision, recall, F1-score, and support for each sentiment class
print(classification_report(
    y_test,        # Actual sentiment labels from test dataset
    y_pred_svm     # Predicted sentiment labels
))



SVM Accuracy: 0.9334880671852023

SVM Classification Report:

              precision    recall  f1-score   support

    negative       0.82      0.82      0.82     19199
     neutral       0.86      0.32      0.47      8431
    positive       0.95      0.99      0.97    141693

    accuracy                           0.93    169323
   macro avg       0.88      0.71      0.75    169323
weighted avg       0.93      0.93      0.93    169323



Compare All Models

In [None]:
# Create a DataFrame to compare accuracy scores of different models
# This helps summarize and compare model performance in a clean tabular format
results = pd.DataFrame({

    # List of machine learning models used in the project
    "Model": [
        "Logistic Regression",
        "Naive Bayes",
        "Linear SVM"
    ],

    # Corresponding accuracy scores for each model
    # Accuracy is calculated using predictions on the test dataset
    "Accuracy": [
        accuracy_score(y_test, y_pred),       # Accuracy of Logistic Regression
        accuracy_score(y_test, y_pred_nb),    # Accuracy of Naive Bayes
        accuracy_score(y_test, y_pred_svm)    # Accuracy of Linear SVM
    ]
})

# Display the comparison table
# This clearly shows which model performed best on the dataset
results



Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.933482
1,Naive Bayes,0.916225
2,Linear SVM,0.933488


Predict Sentiment for Custom Text

Choose the Best Model

In [None]:
best_model = svm_model


Create Prediction Function

In [None]:
# Define a function to predict sentiment for a new, unseen text input
# This function can be used for real-time or user-input sentiment prediction
def predict_sentiment(text):
    """
    Takes a raw input string and returns the predicted sentiment label.
    """

    # Step 1: Clean and preprocess the input text
    # Applies the same preprocessing steps used during model training
    cleaned = clean_text(text)

    # Step 2: Convert the cleaned text into TF-IDF features
    # We use transform (not fit_transform) to avoid changing the learned vocabulary
    # The text is wrapped in a list because TF-IDF expects iterable input
    vectorized = tfidf.transform([cleaned])

    # Step 3: Predict the sentiment using the trained best-performing model
    # [0] is used because the prediction is returned as an array
    prediction = best_model.predict(vectorized)[0]

    # Return the predicted sentiment label (positive / neutral / negative)
    return prediction


Test with Custom Inputs

In [None]:
print(predict_sentiment("This product is absolutely amazing and works perfectly"))
print(predict_sentiment("The appliance stopped working after two days"))
print(predict_sentiment("It is okay, not great but not terrible"))


positive
negative
neutral


In [None]:
print(predict_sentiment("just ok"))
print(predict_sentiment("Highly recommend this, very satisfied"))


neutral
positive


Tableau


In [None]:
# Reload curated data from Parquet (Gold layer export)
df_context = pd.read_parquet('/content/drive/MyDrive/appliance_reviews_curated/')
# Format date as dd/mm/yyyy string















In [None]:
df_context["review_date"] = pd.to_datetime(
    df_context["unixReviewTime"],
    unit="s"
)

df_context["review_date"] = df_context["review_date"].dt.strftime("%d/%m/%Y")
df_context.head()

Unnamed: 0,asin,reviewerID,verified,overall,reviewText,summary,unixReviewTime,title,brand,category,review_date
0,B00002N7IL,A3SHVDMM83IHJ4,True,5.0,Just what I needed for my electric range. Matc...,Matched pigtail cord and works great.,1426809600,"Leviton 5206 50 Amp, 125/250 Volt, NEMA 10-50R...",Leviton,"[Appliances, Parts & Accessories, Range Parts ...",20/03/2015
1,B00002N7IL,A3SHVDMM83IHJ4,True,5.0,Just what I needed for my electric range. Matc...,Matched pigtail cord and works great.,1426809600,"Leviton 5206 50 Amp, 125/250 Volt, NEMA 10-50R...",Leviton,"[Appliances, Parts & Accessories, Range Parts ...",20/03/2015
2,B00004SQHD,A2OXDRWBASV91Y,True,5.0,I like the fact that the wire ends have mounti...,complete package,1366416000,Coleman Cable 09045 5-Foot Range Cord,Coleman Cable,"[Appliances, Parts & Accessories, Range Parts ...",20/04/2013
3,B00004SQHD,A2OXDRWBASV91Y,True,5.0,I like the fact that the wire ends have mounti...,complete package,1366416000,Coleman Cable 09045 5-Foot Range Cord,Coleman Cable,"[Appliances, Parts & Accessories, Range Parts ...",20/04/2013
4,B00004SQHD,A2KG6AWJSWILPR,True,5.0,"Needed another couple of feet with new dryer, ...",Perfect Fit,1363392000,Coleman Cable 09045 5-Foot Range Cord,Coleman Cable,"[Appliances, Parts & Accessories, Range Parts ...",16/03/2013


In [None]:
# Recreate combined text
df_context["text"] = (
    df_context["summary"].fillna("") + " " +
    df_context["reviewText"].fillna("")
)


In [None]:

# Apply SAME preprocessing function
df_context["clean_text"] = df_context["text"].apply(clean_text)

In [None]:

# Transform text using trained TF-IDF vectorizer
X_all_tfidf = tfidf.transform(df_context["clean_text"])

# Predict sentiment using trained SVM model
df_context["predicted_sentiment"] = svm_model.predict(X_all_tfidf)

In [None]:

tableau_df = df_context[[
    "brand",
    "overall",
    "review_date",
    "predicted_sentiment"
]]



tableau_df.shape

(564410, 4)

In [None]:
tableau_df.to_csv(
    "amazon_brand_sentiment_tableau.csv",
    index=False
)
