# Unit 5 - Incremental Capstone - NLP

### Aravind Kothandaraman
2509 batch

#### This is being worked upon using the "car reviews" zip file dataset provided in the IC dataset.

# Install and Import necessary packages

In [1]:
# Install required packages (only once; skip if already installed)
# NLTK (Natural Language Toolkit) is a Python library for teaching and working with human language data, offering tools for tokenization, tagging, and parsing.
!pip install nltk




[notice] A new release of pip is available: 25.1.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import the necessary packages
import nltk
from nltk.tokenize import sent_tokenize

In [3]:
# Download required NLTK data
nltk.download('punkt')  # Sentence tokenizer
nltk.download('words')  # English word list
nltk.download('stopwords')  # Common stopwords
nltk.download('wordnet')  # WordNet lexical database
nltk.download('punkt_tab') # Download punkt_tab resource

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Import WordNet, a lexical database used for lemmatization and semantic analysis
from nltk.corpus import wordnet

# Import a list of common English stopwords (e.g., "the", "and", "is") used for filtering out non-informative words
from nltk.corpus import stopwords

In [5]:
import pandas as pd
import numpy as np

# Load the "reviews.zip" dataset, extract the dataset file and cleanse/preprocess the text data.

## *Task-1 work from IC*

## Extract the dataset from the zip file

In [4]:
source_zip = 'review.zip'
import zipfile
with zipfile.ZipFile(source_zip, 'r') as zip_ref:
    zip_ref.extractall('.')

## Load the data and review

In [42]:
dataset = 'Car_Reviews_Database.csv'

In [43]:
# tried with the `utf-8` encoding, but it didnt work.
# attempted with `latin1` encoding and it worked.
car_reviews_df = pd.read_csv(dataset, encoding="latin1")

In [44]:
car_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Year    278 non-null    int64 
 1   Model   278 non-null    object
 2   Review  234 non-null    object
dtypes: int64(1), object(2)
memory usage: 6.6+ KB


In [45]:
car_reviews_df.head(10)

Unnamed: 0,Year,Model,Review
0,2009,Honda,Although arguably the first-generation Insight...
1,2009,Honda,2009 Honda Accord EX-L 4 : This car is very c...
2,2010,Honda,I have owed and driven Honda products for 20 y...
3,2010,Honda,"Honda Accord Euro L : The seats are average, b..."
4,2011,Honda,Honda HR-V: Continuous variable transmission ...
5,2011,Honda,Not much has changed with the historically sec...
6,2012,Honda,Honda Ballade 150 1.5: This is the most reliab...
7,2012,Honda,"Ride quality is top-notch, though communicatio..."
8,2013,Honda,Honda Jazz Hybrid 1.4 : This is my second Hond...
9,2013,Honda,"The CR-V's voluminous cargo area, quick-foldin..."


## Data preprocessing - Rename columns for easier handling

In [46]:
# rename columns for easier handling
car_reviews_df.rename(columns={'Review': 'review_text', 'Model ': 'car_make', 'Year': 'car_year'}, inplace=True)

In [47]:
car_reviews_df['review_text'].head(10)

0    Although arguably the first-generation Insight...
1    2009 Honda Accord EX-L 4  : This car is very c...
2    I have owed and driven Honda products for 20 y...
3    Honda Accord Euro L : The seats are average, b...
4     Honda HR-V: Continuous variable transmission ...
5    Not much has changed with the historically sec...
6    Honda Ballade 150 1.5: This is the most reliab...
7    Ride quality is top-notch, though communicatio...
8    Honda Jazz Hybrid 1.4 : This is my second Hond...
9    The CR-V's voluminous cargo area, quick-foldin...
Name: review_text, dtype: object

### Text cleaning tasks

### Word tokenization, lemmatization and stopwords removal

using tokens and stopwords removal to remove the noise from the text data.

**Next step is lemmatization.**

###### *Writing my own words about understanding and its purpose of usage*

This is to find the root word of the words/texts in the given dataset, so that we try to minimize the total characters/words as our input to the NLP. The lesser the tokens are, the better the efficiency and performance of the NLP models.

### Bringing all these requirements into a function

*As John recommended...*

In [48]:
# need these for the wordnet_pos_tag usage
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [49]:
import string
import re
import nltk

def get_wordnet_pos(word):
  tag = nltk.pos_tag([word])[0][1][0].upper()
  tag_dict = {"J": wordnet.ADJ,
              "N": wordnet.NOUN,
              "V": wordnet.VERB,
              "R": wordnet.ADV}
  return tag_dict.get(tag, wordnet.NOUN)

def clean_reviews(review_text):
  """Clean a single review string.

  Improvements added:
  - Handles None/NaN inputs safely.
  - Normalizes literal "[NBSP]" markers and Unicode non-breaking spaces (\xa0).
  - Replaces newlines, carriage returns and tabs with spaces.
  - Collapses repeated whitespace to a single space and strips ends.
  - Continues lowercasing, punctuation removal, tokenization, lemmatization and stopword removal.
  """
  # handle missing values
  if pd.isna(review_text):
    return ''

  # ensure we have a string
  review_text = str(review_text)

  # Normalize common NBSP representations (literal marker and unicode non-breaking space)
  review_text = review_text.replace('[NBSP]', ' ')
  review_text = review_text.replace('\xa0', ' ')

  # Replace newlines, carriage returns and tabs with a space
  review_text = re.sub(r'[\r\n\t]+', ' ', review_text)

  # Collapse multiple whitespace characters into a single space and strip
  review_text = re.sub(r'\s+', ' ', review_text).strip()

  # converting to lower_case
  review_text = review_text.lower()

  # remove punctuations
  review_text = re.sub(f"[{re.escape(string.punctuation)}]", '', review_text)

  # word tokenization
  word_tokens = nltk.word_tokenize(review_text)

  # lemmatization
  lemmatizer = nltk.WordNetLemmatizer()
  lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in word_tokens]

  # remove stopwords
  stop_words = set(stopwords.words('english'))
  filtered_tokens = [word for word in lemmatized_tokens if word not in stop_words]
  filtered_text = ' '.join(filtered_tokens)

  return filtered_text

### Handling missing values in `review_text`
In NLP tasks, rows with missing `review_text` cannot provide any information for sentiment analysis or modeling. Therefore, we should identify and remove these rows before proceeding with cleaning and analysis.

In [50]:
car_reviews_df.isnull().sum()

car_year        0
car_make        0
review_text    44
dtype: int64

In [51]:
# check for nulls in the review_text column
print(f"Nulls in review_text before: {car_reviews_df['review_text'].isnull().sum()}")

# Drop rows where review_text is null
car_reviews_df.dropna(subset=['review_text'], inplace=True)

print(f"Nulls in review_text after: {car_reviews_df['review_text'].isnull().sum()}")
print(f"DataFrame size after dropping nulls: {len(car_reviews_df)}")

Nulls in review_text before: 44
Nulls in review_text after: 0
DataFrame size after dropping nulls: 234


In [52]:
car_reviews_df['cleaned_review_text'] = car_reviews_df['review_text'].apply(clean_reviews)

In [53]:
# Remove rows that became empty after cleaning (e.g., reviews consisting only of stopwords or punctuation)
print(f"Empty cleaned_review_text before: {(car_reviews_df['cleaned_review_text'] == '').sum()}")
car_reviews_df = car_reviews_df[car_reviews_df['cleaned_review_text'] != '']
print(f"Empty cleaned_review_text after: {(car_reviews_df['cleaned_review_text'] == '').sum()}")
print(f"Final DataFrame size for modeling: {len(car_reviews_df)}")

Empty cleaned_review_text before: 0
Empty cleaned_review_text after: 0
Final DataFrame size for modeling: 234


In [54]:
# check again the df after cleaning and renaming
car_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 234 entries, 0 to 277
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   car_year             234 non-null    int64 
 1   car_make             234 non-null    object
 2   review_text          234 non-null    object
 3   cleaned_review_text  234 non-null    object
dtypes: int64(1), object(3)
memory usage: 9.1+ KB


In [55]:
car_reviews_df[car_reviews_df['review_text'].isnull()]

Unnamed: 0,car_year,car_make,review_text,cleaned_review_text


### Summary from Pre-Processing

- We have only 234 reviews in total, after cleaning and removing nulls/empty reviews, for our sentiment analysis task. This is a small dataset for NLP tasks, which may lead to overfitting and limited generalization.
- So we are going to use VADER sentiment analysis tool to generate sentiment labels from the reviews, as the dataset does not have any sentiment labels.

### Option-1: Sentimental analysis using VADER
As we have only customer reviews and no sentiment labels in our dataset, we need to generate the sentiment from the customer reviews before we could Label Encode them.

In [17]:
# Install vaderSentiment if not already installed
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2



[notice] A new release of pip is available: 25.1.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [56]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment intensity analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment scores and labels
def get_vader_sentiment(text):
    if pd.isna(text):
        return 'Neutral'
    score = analyzer.polarity_scores(text)
    compound = score['compound']

    if compound >= 0.05:
        return 'Positive', compound
    elif compound <= -0.05:
        return 'Negative', compound
    else:
        return 'Neutral', compound

In [58]:
# Apply the function to the 'review_text' column
car_reviews_df[['sentiment_vdr', 'confidence']] = car_reviews_df['cleaned_review_text'].apply(lambda x: pd.Series(get_vader_sentiment(x)))

# Display the distribution of generated sentiments
print(car_reviews_df['sentiment_vdr'].value_counts())

# Display the first few rows with sentiments
car_reviews_df[['cleaned_review_text', 'sentiment_vdr', 'confidence']].head(10)

sentiment_vdr
Positive    207
Negative     19
Neutral       8
Name: count, dtype: int64


Unnamed: 0,cleaned_review_text,sentiment_vdr,confidence
0,although arguably firstgeneration insights hy...,Positive,0.4767
1,2009 honda accord exl 4 car comfortable sporty...,Positive,0.8176
2,owe driven honda product 20 year purchase vehi...,Negative,-0.5984
3,honda accord euro l seat average little rear l...,Positive,0.7184
4,honda hrv continuous variable transmission fai...,Negative,-0.5423
5,much change historically secondbestselling hon...,Positive,0.1531
6,honda ballade 150 15 reliable car ever comfort...,Positive,0.9242
7,ride quality topnotch though communication roa...,Positive,0.802
8,honda jazz hybrid 14 second honda first one ho...,Positive,0.6558
9,crvs voluminous cargo area quickfolding seat f...,Positive,0.296


### Option-2: Using Pre-trained BERT model for sentiment analysis

In [60]:
!pip install transformers

Collecting transformers
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.20.3-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.3.7-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached pyyaml-6.0.3-cp312-cp312-win_amd64.whl.metadata (2.4 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.2-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting typer-slim (from transformers)
  Downloading typer_slim-0.21.1-py3-none-any.whl.metadata (16 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<2.0,>=1.3.0->transformers)
  Downloading fsspec-2026.1.0-py3-none-any.whl.metadata (10 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub<


[notice] A new release of pip is available: 25.1.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [61]:
# Use pre-trained transformer models
from transformers import pipeline

# Load pre-trained sentiment model
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

# Analyze your reviews
results = sentiment_pipeline(car_reviews_df['cleaned_review_text'].tolist())

# Extract predictions
car_reviews_df['sentiment_hft'] = [r['label'] for r in results]
car_reviews_df['confidence_hft'] = [r['score'] for r in results]

print(car_reviews_df[['cleaned_review_text', 'sentiment_hft', 'confidence_hft']].head())

  from .autonotebook import tqdm as notebook_tqdm
PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


NameError: name 'torch' is not defined

### Label encode the sentiment column

In [20]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
car_reviews_df['sentiment_lbl'] = label_encoder.fit_transform(car_reviews_df['sentiment_vdr'])

In [21]:
print(car_reviews_df.head())

print(car_reviews_df['sentiment'].value_counts())
print(car_reviews_df['sentiment_lbl'].value_counts())

   car_year car_make                                        review_text  \
0      2009    Honda  Although arguably the first-generation Insight...   
1      2009    Honda  2009 Honda Accord EX-L 4  : This car is very c...   
2      2010    Honda  I have owed and driven Honda products for 20 y...   
3      2010    Honda  Honda Accord Euro L : The seats are average, b...   
4      2011    Honda   Honda HR-V: Continuous variable transmission ...   

                                 cleaned_review_text sentiment  sentiment_lbl  
0  although arguably firstgeneration insights hy...  Positive              2  
1  2009 honda accord exl 4 car comfortable sporty...  Positive              2  
2  owe driven honda product 20 year purchase vehi...  Negative              0  
3  honda accord euro l seat average little rear l...  Positive              2  
4  honda hrv continuous variable transmission fai...  Negative              0  
sentiment
Positive    195
Neutral      53
Negative     30
Name: count

### Comparing the `CountVectorizer()` vs `TfIdfVectorizer()` from our cleaned review texts

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


def compare_vectorizers_logreg(
        df,
        text_col='cleaned_review_text',
        label_col='sentiment_lbl',
        test_size=0.2,
        random_state=42,
        max_features=5000,
        logreg_kwargs=None,
        verbose=True
):
    """
    Task: Split text/labels, train LogisticRegression with CountVectorizer and TfidfVectorizer,
    print dataset sizes + accuracies + winner, and return fitted artifacts for reuse.
    """
    if logreg_kwargs is None:
        logreg_kwargs = {}

    cleaned_reviews_X = df[text_col]
    encoded_sentiments_y = df[label_col]

    X_train_text, X_test_text, y_train, y_test = train_test_split(
        cleaned_reviews_X, encoded_sentiments_y, test_size=test_size, random_state=random_state
    )

    if verbose:
        print(f"Train size: {len(X_train_text)}")
        print(f"Test size: {len(X_test_text)}")
        print(f"Total: {len(X_train_text) + len(X_test_text)}")
        print(f"Original data size: {len(cleaned_reviews_X)}")

    count_vec = CountVectorizer(max_features=max_features)
    X_train_count = count_vec.fit_transform(X_train_text)
    X_test_count = count_vec.transform(X_test_text)

    model_count = LogisticRegression(**logreg_kwargs)
    model_count.fit(X_train_count, y_train)
    score_count = model_count.score(X_test_count, y_test)

    if verbose:
        print(f"CountVectorizer Accuracy: {score_count:.4f}")

    tfidf_vec = TfidfVectorizer(max_features=max_features)
    X_train_tfidf = tfidf_vec.fit_transform(X_train_text)
    X_test_tfidf = tfidf_vec.transform(X_test_text)

    model_tfidf = LogisticRegression(**logreg_kwargs)
    model_tfidf.fit(X_train_tfidf, y_train)
    score_tfidf = model_tfidf.score(X_test_tfidf, y_test)

    if verbose:
        print(f"TfidfVectorizer Accuracy: {score_tfidf:.4f}")

        if score_tfidf > score_count:
            print(f"\n✅ TfidfVectorizer wins by {(score_tfidf - score_count) * 100:.2f}%")
        elif score_tfidf == score_count:
            print(f"\n✅ Tie: Both models perform equally well")
        else:
            print(f"\n✅ CountVectorizer wins by {(score_count - score_tfidf) * 100:.2f}%")

    return {
        'X_train_text': X_train_text,
        'X_test_text': X_test_text,
        'y_train': y_train,
        'y_test': y_test,
        'count_vectorizer': count_vec,
        'tfidf_vectorizer': tfidf_vec,
        'count_model': model_count,
        'tfidf_model': model_tfidf,
        'count_accuracy': score_count,
        'tfidf_accuracy': score_tfidf,
    }


In [23]:
results = compare_vectorizers_logreg(car_reviews_df)

Train size: 222
Test size: 56
Total: 278
Original data size: 278
CountVectorizer Accuracy: 0.8214
TfidfVectorizer Accuracy: 0.7679

✅ CountVectorizer wins by 5.36%


In [24]:
# Make sure these are different
print("\nFirst 3 train reviews:")
print(results['X_train_text'][:3])
print("\nFirst 3 test reviews:")
print(results['X_test_text'][:3])


First 3 train reviews:
260                                                     
124    2019 buick regal offer plenty cargo space good...
33     2015 hyundai sonata limited 24 seat beautiful ...
Name: cleaned_review_text, dtype: object

First 3 test reviews:
30     hyundai build warm hatch elantra sedan good co...
126    purchase vehicle replacement yukon denali tota...
199    acura mdx sport hybrid lot go crossover lack a...
Name: cleaned_review_text, dtype: object


In [None]:
# checking for data integrity between training and test data
print(f"Total reviews: {len(car_reviews_df['cleaned_review_text'])}")
print(f"Train set: {len(results['y_train'])}")
print(f"Test set: {len(results['y_test'])}")

# Check label distribution
print("\nLabel distribution in test set:")
print(pd.Series(results['y_test']).value_counts())

*Below code was derived with the help of ClaudeCode AI assistant to perform some diagnostics to check for data leakage and other potential issues with the model performance.*

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score

# 1. Check for data leakage
print("="*60)
print("DIAGNOSTIC 1: Data Leakage Check")
print("="*60)
print(f"Train size: {len(results['X_train_text'])}")
print(f"Test size: {len(results['X_test_text'])}")
print(f"Overlap: {len(set(results['X_train_text']) & set(results['X_test_text']))} reviews")
if len(set(results['X_train_text']) & set(results['X_test_text'])) > 0:
    print("⚠️ WARNING: Train and test sets overlap!")

# 2. Cross-validation (more realistic)
print("\n" + "="*60)
print("DIAGNOSTIC 2: Cross-Validation Scores")
print("="*60)
# We need to vectorize before cross_val_score if we want to use the text directly,
# or use the already vectorized data.
X_tfidf = results['tfidf_vectorizer'].transform(results['X_train_text'])
cv_scores = cross_val_score(results['tfidf_model'], X_tfidf, results['y_train'], cv=5)
print(f"CV Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
if cv_scores.mean() < 0.95:
    print("✅ More realistic scores - model isn't just memorizing")
else:
    print("⚠️ Still suspiciously high")

# 3. Dataset size and distribution
print("\n" + "="*60)
print("DIAGNOSTIC 3: Dataset Analysis")
print("="*60)
print(f"Total samples: {len(car_reviews_df['cleaned_review_text'])}")
print(f"Classes: {len(set(results['y_train']))}")
print("\nClass distribution:")
print(pd.Series(results['y_train']).value_counts())
print(f"\nAverage review length: {np.mean([len(str(r).split()) for r in results['X_train_text']]):.1f} words")

# 4. Look at what model is using
print("\n" + "="*60)
print("DIAGNOSTIC 4: Most Important Features")
print("="*60)
feature_names = results['tfidf_vectorizer'].get_feature_names_out()
if hasattr(results['tfidf_model'], 'coef_'):
    # For binary/multiclass logistic regression
    for idx, class_name in enumerate(label_encoder.classes_):
        top_features = np.argsort(results['tfidf_model'].coef_[idx])[-10:]
        print(f"\nTop features for '{class_name}':")
        print([feature_names[i] for i in top_features])

## Findings summary

So far, the two vectorization techniques have been compared using Logistic Regression as the model. The TfidfVectorizer has shown a slight edge in accuracy over CountVectorizer. But still they show 100% accuracy which is suspiciously high for a real-world NLP task, indicating potential data leakage. The above some extra analysis diagnostics were performed to check for data leakage, cross-validation scores, dataset size and distribution, and important features used by the model. To address this, I am going to try check the uniquess of the reviews and resolve them.

In [None]:
# TRIAL 1 -Remove duplicates BEFORE splitting
unique_reviews_df_1 = car_reviews_df.drop_duplicates(subset=['cleaned_review_text'], keep='first')
print(f"Original size: {len(car_reviews_df)}")
print(f"After removing duplicates: {len(unique_reviews_df_1)}")

In [None]:
# TRIAL 2 - Try with the dupes removal on the original reviews_text column
unique_reviews_df = car_reviews_df.drop_duplicates(subset=['review_text'], keep='first')
print(f"Original size: {len(car_reviews_df)}")
print(f"After removing duplicates based on original review_text: {len(unique_reviews_df)}")

After carefully reviewing the data, we have to take only 300 unique reviews for our further analysis to avoid data leakage. So, we will proceed with the `unique_reviews_df` DataFrame for the next steps.

In [None]:
unique_reviews_df['cleaned_reviews_v2'] = unique_reviews_df['review_text'].apply(clean_reviews)

unique_reviews_df.head()

In [None]:
unique_review_comparison_results = compare_vectorizers_logreg(df=unique_reviews_df, text_col='cleaned_review_text', label_col='sentiment_lbl')

## Task-2 sentimental analysis

# Task
Perform sentiment analysis on the `Car_Reviews_Database.csv` dataset. This involves splitting the preprocessed and vectorized text data into training and testing sets, then training and evaluating both Logistic Regression and Naïve Bayes models. Additionally, prepare the cleaned text data for deep learning by tokenizing, creating word embeddings, and padding sequences, followed by building and evaluating an LSTM neural network. Finally, install the Hugging Face Transformers library, tokenize the cleaned text data for BERT, and then build and evaluate a pre-trained BERT model fine-tuned for sentiment classification.

In [None]:
from sklearn.model_selection import train_test_split

# Using TF-IDF vectorized data from unique_review_comparison_results
X = unique_review_comparison_results['tfidf_vectorizer'].transform(unique_reviews_df['cleaned_review_text']).toarray()
y = unique_reviews_df['sentiment'].astype(str)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")


## Build and Evaluate Logistic Regression Model

### Subtask:
Train a Logistic Regression model on the training data and evaluate its performance using appropriate metrics (e.g., accuracy, precision, recall, F1-score).


**Reasoning**:
To train and evaluate a Logistic Regression model, I will import the necessary classes and functions, initialize the model, fit it to the training data, make predictions on the test data, and then calculate and print the required evaluation metrics.



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize Logistic Regression model
# max_iter is set to a higher value to ensure convergence
# solver='liblinear' is generally good for small datasets or when L1/L2 regularization is used
log_reg_model = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

# Fit the model to the training data
print("Training Logistic Regression model...")
log_reg_model.fit(X_train, y_train)
print("Model training complete.")

# Make predictions on the test data
y_pred_log_reg = log_reg_model.predict(X_test)

# Calculate and print evaluation metrics
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
precision_log_reg = precision_score(y_test, y_pred_log_reg, average='weighted')
recall_log_reg = recall_score(y_test, y_pred_log_reg, average='weighted')
f1_log_reg = f1_score(y_test, y_pred_log_reg, average='weighted')

print(f"\n--- Logistic Regression Model Evaluation ---")
print(f"Accuracy: {accuracy_log_reg:.4f}")
print(f"Precision (weighted): {precision_log_reg:.4f}")
print(f"Recall (weighted): {recall_log_reg:.4f}")
print(f"F1-Score (weighted): {f1_log_reg:.4f}")

## Build and Evaluate Naïve Bayes Model

### Subtask:
Train a Naïve Bayes classifier on the training data and evaluate its performance using appropriate metrics.


#### Reasoning:
To train and evaluate a Naïve Bayes model, I will import the necessary classes and functions, initialize the model, fit it to the training data, make predictions on the test data, and then calculate and print the required evaluation metrics.

**Reasoning**:
Implement the Naïve Bayes model training and evaluation as described in the instructions, including importing necessary libraries, initializing the model, fitting it to the training data, making predictions, and calculating evaluation metrics.



In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize Multinomial Naive Bayes model
nb_model = MultinomialNB(fit_prior=True, class_prior=None)

# Fit the model to the training data
print("Training Naïve Bayes model...")
nb_model.fit(X_train, y_train)
print("Model training complete.")

# Make predictions on the test data
y_pred_nb = nb_model.predict(X_test)

# Calculate and print evaluation metrics
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')

print(f"\n--- Naïve Bayes Model Evaluation ---")
print(f"Accuracy: {accuracy_nb:.4f}")
print(f"Precision (weighted): {precision_nb:.4f}")
print(f"Recall (weighted): {recall_nb:.4f}")
print(f"F1-Score (weighted): {f1_nb:.4f}")

## Prepare Data for Deep Learning Models (LSTM)

### Subtask:
Tokenize the cleaned text data for deep learning, create word embeddings, and pad sequences to a uniform length for LSTM input.


**Reasoning**:
To prepare the cleaned text data for deep learning models like LSTM, I will first import the necessary Keras utilities. Then, I'll initialize a Tokenizer to build a vocabulary from the cleaned reviews, convert these texts into numerical sequences, and finally pad these sequences to a uniform length suitable for neural network input.



In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize a Tokenizer
# Set num_words to a suitable number based on your vocabulary size. Let's use 5000 as a starting point.
# The vocab from CountVectorizer had 111 unique words, so 5000 is more than enough.
# We will use the 'cleaned_review_text' column for this.
tokenizer = Tokenizer(num_words=5000, oov_token='<unk>')

# Fit the tokenizer on the 'cleaned_review_text' column to build the vocabulary
print("Fitting tokenizer on cleaned review text...")
tokenizer.fit_on_texts(car_reviews_df['cleaned_review_text'])
print("Tokenizer fitting complete.")

# Convert texts to sequences of integers
print("Converting texts to sequences...")
sequences = tokenizer.texts_to_sequences(car_reviews_df['cleaned_review_text'])
print("Text to sequence conversion complete.")

# Determine the maximum sequence length
max_sequence_length = max([len(x) for x in sequences])
print(f"Maximum sequence length: {max_sequence_length}")

# Pad the sequences to a uniform length
# Use 'post' padding and truncation to maintain the beginning of the reviews
print("Padding sequences...")
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')
print("Sequence padding complete.")

print(f"Shape of padded sequences: {padded_sequences.shape}")
# Display the first 5 padded sequences
print("\nFirst 5 padded sequences:")
print(padded_sequences[:5])

**Reasoning**:
Now that the text data has been tokenized and padded, the next step in preparing data for deep learning models like LSTM is to prepare the target variable for model training. This involves encoding the categorical sentiment labels into a numerical format suitable for deep learning, such as one-hot encoding or label encoding, depending on the model's output layer.



In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Encode the target variable (sentiment) to numerical values
print("Encoding sentiment labels...")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(car_reviews_df['sentiment'])

# Convert integer labels to one-hot encoded vectors for multi-class classification
y_dl = to_categorical(y_encoded)
print("Sentiment labels encoded and one-hot converted.")

print(f"Shape of one-hot encoded labels (y_dl): {y_dl.shape}")
print("First 5 one-hot encoded labels:")
print(y_dl[:5])
print("Original sentiment categories:", label_encoder.classes_)

**Reasoning**:
The data has been preprocessed for deep learning models. Now, I need to split the padded sequences (features) and the one-hot encoded labels (target) into training and testing sets to prepare for building the LSTM model.



In [None]:
from sklearn.model_selection import train_test_split

# Split the padded sequences and one-hot encoded labels into training and testing sets
# Use a test_size of 0.2 and random_state for reproducibility
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(padded_sequences, y_dl, test_size=0.2, random_state=42)

print(f"Shape of X_train_dl: {X_train_dl.shape}")
print(f"Shape of X_test_dl: {X_test_dl.shape}")
print(f"Shape of y_train_dl: {y_train_dl.shape}")
print(f"Shape of y_test_dl: {y_test_dl.shape}")

## Build and Evaluate LSTM Model

### Subtask:
Construct and train an LSTM (Long Short-Term Memory) neural network for sentiment classification, then evaluate its performance.


**Reasoning**:
To construct and train the LSTM model, I will import the necessary Keras layers, define model parameters, build a Sequential model with Embedding, SpatialDropout1D, LSTM, and Dense layers, compile it, and then train and evaluate it using the prepared deep learning datasets.



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

# 1. Define model parameters
vocabulary_size = tokenizer.num_words
embedding_dim = 128 # A common choice for embedding dimensions
input_length = max_sequence_length
num_sentiment_classes = y_dl.shape[1]

print(f"Vocabulary size: {vocabulary_size}")
print(f"Embedding dimension: {embedding_dim}")
print(f"Input sequence length: {input_length}")
print(f"Number of sentiment classes: {num_sentiment_classes}")

# 2. Build the LSTM model
print("Building LSTM model...")
lstm_model = Sequential([
    Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=input_length),
    SpatialDropout1D(0.2),
    LSTM(128),
    Dense(num_sentiment_classes, activation='softmax')
])
print("LSTM model built.")

# 3. Compile the model
print("Compiling LSTM model...")
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print("LSTM model compiled.")

lstm_model.summary()

# 4. Train the LSTM model
print("Training LSTM model...")
history = lstm_model.fit(
    X_train_dl,
    y_train_dl,
    epochs=5, # Number of epochs can be tuned
    batch_size=128, # Batch size can be tuned
    validation_data=(X_test_dl, y_test_dl),
    verbose=1
)
print("LSTM model training complete.")

# 5. Evaluate the model
print("Evaluating LSTM model performance on test set...")
loss, accuracy = lstm_model.evaluate(X_test_dl, y_test_dl, verbose=0)

print(f"\n--- LSTM Model Evaluation ---")
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

**Reasoning**:
The previous code produced a warning about the deprecated `input_length` argument in the `Embedding` layer. Removing this argument will resolve the warning as Keras can infer the input length from the input data.



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

# 1. Define model parameters
vocabulary_size = tokenizer.num_words
embedding_dim = 128 # A common choice for embedding dimensions
input_length = max_sequence_length
num_sentiment_classes = y_dl.shape[1]

print(f"Vocabulary size: {vocabulary_size}")
print(f"Embedding dimension: {embedding_dim}")
print(f"Input sequence length: {input_length}")
print(f"Number of sentiment classes: {num_sentiment_classes}")

# 2. Build the LSTM model
print("Building LSTM model...")
lstm_model = Sequential([
    Embedding(input_dim=vocabulary_size, output_dim=embedding_dim), # Removed input_length
    SpatialDropout1D(0.2),
    LSTM(128),
    Dense(num_sentiment_classes, activation='softmax')
])
print("LSTM model built.")

# 3. Compile the model
print("Compiling LSTM model...")
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print("LSTM model compiled.")

lstm_model.summary()

# 4. Train the LSTM model
print("Training LSTM model...")
history = lstm_model.fit(
    X_train_dl,
    y_train_dl,
    epochs=5, # Number of epochs can be tuned
    batch_size=128, # Batch size can be tuned
    validation_data=(X_test_dl, y_test_dl),
    verbose=1
)
print("LSTM model training complete.")

# 5. Evaluate the model
print("Evaluating LSTM model performance on test set...")
loss, accuracy = lstm_model.evaluate(X_test_dl, y_test_dl, verbose=0)

print(f"\n--- LSTM Model Evaluation ---")
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

## Prepare Data for BERT Model

### Subtask:
Install the Hugging Face Transformers library and tokenize the cleaned text data for BERT.

**Reasoning**:
The next step is to install the Hugging Face Transformers library, which is necessary for working with BERT models.



In [None]:
!pip install transformers

**Reasoning**:
With the `transformers` library installed, I will now import the necessary BERT tokenizer and tokenize the `cleaned_review_text` column from the DataFrame. This prepares the text data into an input format suitable for BERT models.



In [None]:
from transformers import BertTokenizer
import torch

# Load the BERT tokenizer
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Function to tokenize and encode text for BERT
def tokenize_for_bert(text, tokenizer, max_len=128):
    encoded_dict = tokenizer.encode_plus(
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,      # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Return attention mask.
                        return_tensors = 'pt',     # Return PyTorch tensors.
                   )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

print("Tokenizing and encoding text for BERT...")
# Apply the tokenizer to the 'cleaned_review_text' column
bert_input_ids = []
bert_attention_masks = []

max_bert_len = 128 # Define a max length for BERT sequences

for text in car_reviews_df['cleaned_review_text']:
    input_id, attention_mask = tokenize_for_bert(text, tokenizer_bert, max_bert_len)
    bert_input_ids.append(input_id)
    bert_attention_masks.append(attention_mask)

# Convert the lists of tensors into single tensors
bert_input_ids = torch.cat(bert_input_ids, dim=0)
bert_attention_masks = torch.cat(bert_attention_masks, dim=0)

print("Tokenization and encoding complete.")
print(f"Shape of BERT input IDs: {bert_input_ids.shape}")
print(f"Shape of BERT attention masks: {bert_attention_masks.shape}")

# Prepare the labels for BERT (using the previously encoded y_encoded)
# Convert y_encoded to a PyTorch tensor
labels_bert = torch.tensor(y_encoded)
print(f"Shape of BERT labels: {labels_bert.shape}")