In [None]:
# Install the spaCy library
# spaCy is an industry-grade NLP library used for fast and efficient text processing
!pip install spacy

# Download the small English language model for spaCy
# This model provides tokenization, lemmatization, stopwords, and basic NLP features
!python -m spacy download en_core_web_sm


In [None]:
# Import pandas library for data manipulation and analysis
# Pandas helps us work with datasets in table format (rows and columns)
import pandas as pd


# Import the spaCy library
# spaCy provides an efficient NLP pipeline for text processing
import spacy

# Import Google Drive utility from Google Colab
# This allows Colab to access files stored in our Google Drive
from google.colab import drive

# Mount Google Drive to the Colab environment
# After mounting, Drive files will be accessible under /content/drive
drive.mount('/content/drive')

# Load the small English spaCy pipeline exactly once.
# (The model used to be loaded twice in this cell; the first, fully-enabled
# pipeline was immediately overwritten, so that load was wasted work.)
# parser, NER and tagger are disabled because the cleaning code in this
# notebook only reads token.is_stop, token.is_punct and token.is_alpha.
# NOTE(review): other components shipped with the model may still run on every
# document; inspect nlp.pipe_names and consider disabling those as well.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner", "tagger"]
)



In [None]:
# Google Drive location of the curated Parquet dataset
# (exported from the HDFS Gold layer and then uploaded to Drive)
folder_path = '/content/drive/MyDrive/appliance_reviews_curated/'

# Load the dataset stored under that folder into one Pandas DataFrame
df = pd.read_parquet(path=folder_path)

Create Sentiment Labels (RULE-BASED)

In [None]:
# Convert a numeric star rating into a sentiment label.
# The dataset has no sentiment tags of its own, so they are derived from the rating.
def label_sentiment(rating):
    """
    Map a star rating to a sentiment label.

    Thresholds:
    - rating >= 4      -> "positive"
    - 3 <= rating < 4  -> "neutral"
    - rating < 3       -> "negative"

    Integer ratings 1-5 are labelled as before. Fractional ratings between
    3 and 4 (e.g. 3.5) are now "neutral" instead of falling through to
    "negative".

    Raises:
        ValueError: if rating is NaN. A missing rating used to be labelled
            "negative" silently, which would corrupt the training labels.
    """
    # NaN is the only value that does not compare equal to itself
    if rating != rating:
        raise ValueError("rating is missing (NaN); cannot derive a sentiment label")

    # High ratings: the customer is satisfied
    if rating >= 4:
        return "positive"

    # Middle of the scale: neither good nor bad
    if rating >= 3:
        return "neutral"

    # Everything below 3 indicates dissatisfaction
    return "negative"

# Derive a sentiment label for every review from its star rating
# ('overall' column); label_sentiment is called once per rating value
df["sentiment"] = df["overall"].map(label_sentiment)

# Show how many reviews fall into each sentiment class
# (quick check of the class distribution)
df["sentiment"].value_counts()



Combine Text Fields

In [None]:
# Build one text column from the short 'summary' and the long 'reviewText'.
# Missing values are replaced by empty strings first, so the concatenation
# never produces NaN for a review that lacks one of the two fields.
df["text"] = df["summary"].fillna("") + " " + df["reviewText"].fillna("")

# Keep only what the model needs: 'text' (input) and 'sentiment' (target).
# .copy() makes the result an independent DataFrame; later cells add new
# columns to df, and doing that on a selection of another frame can raise
# pandas' SettingWithCopyWarning.
df = df[["text", "sentiment"]].copy()

# Preview the first rows to verify the combination and column selection
df.head()




Define spaCy-based Text Cleaning Function

In [None]:
# Fast text cleaning with spaCy (no lemmatization: only token flags are used)
def clean_text_spacy_fast(text):
    """
    Clean a single raw text with the module-level spaCy pipeline `nlp`.

    Steps:
    - Tokenization
    - Lowercasing
    - Stopword removal
    - Punctuation removal
    - Removal of every token that is not purely alphabetic

    Args:
        text: raw input string. None, NaN and the empty string are treated
            as "no text".

    Returns:
        The kept tokens, lowercased and joined by single spaces
        ("" when nothing is left or no text was given).
    """
    # Missing values (None / NaN float) and empty strings have nothing to
    # tokenize; return early instead of handing a non-string to spaCy.
    if text is None or text == "" or (isinstance(text, float) and text != text):
        return ""

    # Run the (reduced) spaCy pipeline on the text
    doc = nlp(text)

    # Keep lowercase surface forms of the informative tokens only
    tokens = [
        token.text.lower()      # original token text, lowercased
        for token in doc
        if not token.is_stop    # drop stopwords
        and not token.is_punct  # drop punctuation
        and token.is_alpha      # drop numbers / mixed alphanumerics
    ]

    # Join tokens into a single cleaned string
    return " ".join(tokens)


Apply spaCy Preprocessing to Dataset

In [None]:
# tqdm draws a progress bar around the (long-running) preprocessing loop
from tqdm import tqdm

# Stream all reviews through spaCy in batches (nlp.pipe) and clean each
# resulting Doc: keep lowercase, alphabetic, non-stopword, non-punctuation
# tokens and join them back into one string per review.
cleaned_texts = [
    " ".join(
        tok.text.lower()
        for tok in doc
        if tok.is_alpha and not (tok.is_stop or tok.is_punct)
    )
    for doc in tqdm(
        nlp.pipe(
            df["text"],         # raw review text
            batch_size=2000,    # documents handed to spaCy per batch
            n_process=1         # single process (chosen for Colab stability)
        ),
        total=len(df),
        desc="spaCy preprocessing"
    )
]

# Store the cleaned strings next to the raw text
df["clean_text_spacy"] = cleaned_texts


spaCy preprocessing: 100%|██████████| 564410/564410 [24:52<00:00, 378.12it/s]


In [None]:
# Show raw and cleaned text side by side for the first five reviews
# (visual sanity check of the preprocessing step)
df.loc[:, ["text", "clean_text_spacy"]].head(5)


Unnamed: 0,text,clean_text_spacy
0,Matched pigtail cord and works great. Just wha...,matched pigtail cord works great needed electr...
1,Matched pigtail cord and works great. Just wha...,matched pigtail cord works great needed electr...
2,complete package I like the fact that the wire...,complete package like fact wire ends mounting ...
3,complete package I like the fact that the wire...,complete package like fact wire ends mounting ...
4,Perfect Fit Needed another couple of feet with...,perfect fit needed couple feet new dryer perfe...


Encode Sentiment Label

In [None]:
# LabelEncoder (scikit-learn) turns categorical text labels into integers
from sklearn.preprocessing import LabelEncoder

# The encoder learns the label -> integer mapping when it is fitted
label_encoder = LabelEncoder()

# Fit on the sentiment column and store the integer codes in a new column
df["sentiment_encoded"] = label_encoder.fit_transform(df["sentiment"])

# Display the learned mapping (needed later to interpret model predictions).
# In the recorded run: negative -> 0, neutral -> 1, positive -> 2
{
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_)
    )
}


{'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}

Train–Test Split

In [None]:
# train_test_split divides the data into a training part and a held-out part
# used to measure how well the LSTM generalizes to unseen reviews
from sklearn.model_selection import train_test_split

# Model input: spaCy-cleaned text. Target: integer-encoded sentiment.
X, y = df["clean_text_spacy"], df["sentiment_encoded"]

# 80% training / 20% testing.
# NOTE(review): the earlier preview of df shows consecutive rows with the same
# (truncated) text. If the dataset contains exact duplicates, copies of one
# review can land on both sides of the split and inflate the test metrics;
# consider dropping duplicates before splitting - confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,        # keep the class proportions equal in both parts
    random_state=42,   # fixed seed so the split is reproducible
    test_size=0.2      # 20% of the rows go to the test set
)

# Report the size of both parts for confirmation
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 451528
Testing samples: 112882


Tokenize Text Using Keras Tokenizer

In [None]:
# Keras Tokenizer: maps each word to an integer index
from tensorflow.keras.preprocessing.text import Tokenizer

# Upper bound on the vocabulary used when texts are converted to sequences
# (limits embedding size and memory usage)
MAX_VOCAB_SIZE = 20000

# Words that are not part of the kept vocabulary are replaced by "<OOV>"
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_VOCAB_SIZE)

# Learn the word index from the TRAINING texts only, so that no
# information from the test set leaks into the vocabulary
tokenizer.fit_on_texts(X_train)

# Encode both splits with the same fitted tokenizer
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# word_index holds every distinct word seen during fitting, so this number
# can be larger than MAX_VOCAB_SIZE (51078 in the recorded run)
print("Vocabulary size:", len(tokenizer.word_index))


Vocabulary size: 51078


Pad Sequences (Fixed-Length Input)

In [None]:
# pad_sequences brings all integer sequences to one common length,
# which the LSTM needs for batched input
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every review is represented by exactly this many token positions
MAX_SEQUENCE_LENGTH = 100

# Pad / truncate the training and the test sequences with identical settings:
# shorter sequences get zeros appended, longer ones lose their tail
X_train_pad, X_test_pad = (
    pad_sequences(
        sequences,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding="post",
        truncating="post"
    )
    for sequences in (X_train_seq, X_test_seq)
)

# Confirm the resulting array shapes
print("Padded training shape:", X_train_pad.shape)
print("Padded testing shape:", X_test_pad.shape)



Padded training shape: (451528, 100)
Padded testing shape: (112882, 100)


Build LSTM Model Architecture

In [None]:
# Import required Keras components
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Number of sentiment classes (one output unit per class)
NUM_CLASSES = len(label_encoder.classes_)

# Lightweight LSTM classifier: Embedding -> LSTM -> Dropout -> softmax
model = Sequential()

# Word embeddings (64 dimensions, reduced from 128 for speed).
# mask_zero=True marks index 0 as padding so the LSTM skips those time steps.
# The inputs are post-padded with zeros; without masking the LSTM's final
# state is computed after consuming the padding of every short review.
# NOTE(review): in the recorded (unmasked) run accuracy stayed near 0.837 for
# the first epochs, which equals the share of the positive class in the test
# set (94462 / 112882) - presumably a symptom of exactly this; confirm by
# retraining.
model.add(
    Embedding(
        input_dim=MAX_VOCAB_SIZE,
        output_dim=64,
        mask_zero=True
    )
)

# Single LSTM layer (64 units, reduced from 128) returning only its last state
model.add(
    LSTM(
        units=64,
        return_sequences=False
    )
)

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer: class probabilities
model.add(
    Dense(
        NUM_CLASSES,
        activation="softmax"
    )
)

# Build explicitly so summary() can report parameter counts before training
model.build(input_shape=(None, MAX_SEQUENCE_LENGTH))

# Show the layer overview
model.summary()


Compile the LSTM Model

In [None]:
# Configure training; compile() has to run before model.fit()
model.compile(
    # labels are integer-encoded and there are more than two classes
    loss="sparse_categorical_crossentropy",
    # report accuracy after every epoch
    metrics=["accuracy"],
    # Adam optimizer
    optimizer="adam"
)


Train the LSTM Model

In [None]:
# Fit the LSTM on the padded training sequences and keep the per-epoch
# metrics in `history`.
# NOTE(review): the held-out test set doubles as validation data here. That is
# fine for monitoring only, but it must not drive early stopping or model
# selection, otherwise the test metrics are no longer unbiased.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=256,       # samples per gradient update
    epochs=15,            # passes over the training data
    validation_data=(X_test_pad, y_test),
    verbose=1             # print one progress line per epoch
)


Epoch 1/15
[1m1764/1764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 263ms/step - accuracy: 0.8332 - loss: 0.5653 - val_accuracy: 0.8368 - val_loss: 0.5448
Epoch 2/15
[1m1764/1764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 262ms/step - accuracy: 0.8383 - loss: 0.5449 - val_accuracy: 0.8373 - val_loss: 0.5430
Epoch 3/15
[1m1764/1764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m461s[0m 261ms/step - accuracy: 0.8440 - loss: 0.5032 - val_accuracy: 0.8429 - val_loss: 0.4590
Epoch 4/15
[1m1764/1764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 265ms/step - accuracy: 0.8401 - loss: 0.4431 - val_accuracy: 0.8369 - val_loss: 0.4402
Epoch 5/15
[1m1764/1764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m471s[0m 267ms/step - accuracy: 0.8422 - loss: 0.4715 - val_accuracy: 0.8925 - val_loss: 0.3932
Epoch 6/15
[1m1764/1764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m456s[0m 258ms/step - accuracy: 0.8966 - loss: 0.3626 - val_accuracy: 0.9106 - val_loss:

Evaluate LSTM Model Performance

In [None]:
# Metrics used to judge classification quality
from sklearn.metrics import accuracy_score, classification_report

# model.predict yields one probability per class and review;
# the predicted class is the index of the largest probability
test_probabilities = model.predict(X_test_pad)
y_pred_lstm = test_probabilities.argmax(axis=1)

# Share of test reviews whose sentiment was predicted correctly
lstm_accuracy = accuracy_score(y_test, y_pred_lstm)

print("LSTM Accuracy:", lstm_accuracy)

# Per-class precision, recall and F1-score, labelled with the
# human-readable class names instead of the integer codes
print("\nLSTM Classification Report:\n")
lstm_report = classification_report(
    y_test,
    y_pred_lstm,
    target_names=label_encoder.classes_
)
print(lstm_report)


[1m3528/3528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 16ms/step
LSTM Accuracy: 0.9091174855158484

LSTM Classification Report:

              precision    recall  f1-score   support

    negative       0.78      0.76      0.77     12800
     neutral       0.37      0.23      0.28      5620
    positive       0.94      0.97      0.96     94462

    accuracy                           0.91    112882
   macro avg       0.70      0.65      0.67    112882
weighted avg       0.90      0.91      0.90    112882



Test LSTM with Custom Text

In [None]:
# Predict the sentiment of one raw text with the trained LSTM.
# The text goes through the same cleaning, tokenization and padding
# steps that were applied to the training data.
def predict_sentiment_lstm(text):
    """
    Return the predicted sentiment label for a raw input text.

    Args:
        text: raw review text.

    Returns:
        The sentiment label (as stored in label_encoder.classes_) whose
        predicted probability is highest.
    """

    # Step 1: clean the input with the spaCy preprocessing function
    cleaned_text = clean_text_spacy_fast(text)

    # Step 2: encode the cleaned text with the tokenizer fitted on the training data
    sequence = tokenizer.texts_to_sequences([cleaned_text])

    # Step 3: pad / truncate to the input length the LSTM was trained with
    padded_sequence = pad_sequences(
        sequence,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding="post",
        truncating="post"
    )

    # Step 4: class probabilities for this single sample.
    # verbose=0 suppresses the per-call Keras progress bar, which otherwise
    # clutters the output every time the function is called.
    prediction_probs = model.predict(padded_sequence, verbose=0)

    # Step 5: index of the most probable class
    predicted_class = prediction_probs.argmax(axis=1)[0]

    # Step 6: translate the integer code back into the sentiment label
    return label_encoder.inverse_transform([predicted_class])[0]


# ---- Try the LSTM on a few hand-written example reviews ----
# (one clearly positive, one clearly negative, one in between)
for sample_review in (
    "This product works perfectly and I am very happy",
    "The appliance stopped working after two days",
    "It is okay, not great but not terrible",
):
    print(predict_sentiment_lstm(sample_review))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
positive




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step




neutral
