In [7]:
# I had imported all the necessary core Python libraries for data handling and preprocessing
import numpy as np
import pandas as pd
import re
import string

# I had imported the HuggingFace Transformers library to use DistilBERT for embeddings
from transformers import pipeline, DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# I had imported scikit-learn libraries for machine learning models and preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# I had imported SMOTE to handle imbalanced classes in the dataset
from imblearn.over_sampling import SMOTE

In [8]:
df = pd.read_csv('/content/Datafiniti_Hotel_Reviews_Jun19.csv')
df.head()

In [9]:
# I had created a new column 'sentiment' by mapping ratings to sentiment categories

df["sentiment"] = df["reviews.rating"].map({
    0: "negative",
    1: "negative",
    2: "negative",
    3: "neutral",
    4: "positive",
    5: "positive"
})


# Display sample data
df.head()

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,primaryCategories,city,country,keys,latitude,...,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sourceURLs,websites,sentiment
0,AWE2FvX5RxPSIh2RscTK,2018-01-18T18:43:12Z,2019-05-20T23:55:47Z,5620 Calle Real,"Hotels,Hotels and motels,Hotel and motel mgmt....",Accommodation & Food Services,Goleta,US,us/ca/goleta/5620callereal/-1127060008,34.44178,...,3,https://www.tripadvisor.com/Hotel_Review-g3243...,"This hotel was nice and quiet. Did not know, t...",Best Western Plus Hotel,San Jose,UnitedStates,tatsurok2018,https://www.tripadvisor.com/Hotel_Review-g3243...,https://www.bestwestern.com/en_US/book/hotel-r...,neutral
1,AVwcj_OhkufWRAb5wi9T,2016-11-06T20:21:05Z,2019-05-20T23:31:56Z,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Accommodation & Food Services,Carmel by the Sea,US,us/ca/carmelbythesea/5thandsancarlospobox3574/...,36.55722,...,4,https://www.tripadvisor.com/Hotel_Review-g3217...,We stayed in the king suite with the separatio...,Clean rooms at solid rates in the heart of Carmel,San Francisco,CA,STEPHEN N,http://www.tripadvisor.com/Hotel_Review-g32172...,http://www.bestwestern.com,positive
2,AVwcj_OhkufWRAb5wi9T,2016-11-06T20:21:05Z,2019-05-20T23:31:56Z,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Accommodation & Food Services,Carmel by the Sea,US,us/ca/carmelbythesea/5thandsancarlospobox3574/...,36.55722,...,3,https://www.tripadvisor.com/Hotel_Review-g3217...,"Parking was horrible, somebody ran into my ren...",Business,Prescott Valley,AZ,15Deborah,http://www.tripadvisor.com/Hotel_Review-g32172...,http://www.bestwestern.com,neutral
3,AVwcj_OhkufWRAb5wi9T,2016-11-06T20:21:05Z,2019-05-20T23:31:56Z,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Accommodation & Food Services,Carmel by the Sea,US,us/ca/carmelbythesea/5thandsancarlospobox3574/...,36.55722,...,5,https://www.tripadvisor.com/Hotel_Review-g3217...,Not cheap but excellent location. Price is som...,Very good,Guaynabo,PR,Wilfredo M,http://www.tripadvisor.com/Hotel_Review-g32172...,http://www.bestwestern.com,positive
4,AVwcj_OhkufWRAb5wi9T,2016-11-06T20:21:05Z,2019-05-20T23:31:56Z,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Accommodation & Food Services,Carmel by the Sea,US,us/ca/carmelbythesea/5thandsancarlospobox3574/...,36.55722,...,2,https://www.tripadvisor.com/Hotel_Review-g3217...,If you get the room that they advertised on th...,Low chance to come back here,Reno,NV,Luc D,http://www.tripadvisor.com/Hotel_Review-g32172...,http://www.bestwestern.com,negative


In [10]:
cols_to_drop = ['id', 'dateAdded', 'dateUpdated', 'address', 'categories',
               'primaryCategories', 'city', 'country', 'keys', 'latitude',
               'reviews.sourceURLs','reviews.title', 'reviews.userCity',
                'reviews.userProvince', 'reviews.username', 'sourceURLs', 'websites', 'sentiment']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

In [11]:
# I had imported AutoTokenizer and pipeline from HuggingFace to clean text using DistilBERT tokenizer

from transformers import AutoTokenizer, pipeline

# Then Loaded DistilBERT tokenizer and sentiment analysis pipeline
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Function to clean text using DistilBERT tokenizer
def clean_text_with_distilbert(text):
    if not isinstance(text, str) or text.strip() == "":
        return ""  # Return empty string for non-text values

    # Tokenize and remove special characters
    inputs = tokenizer(text, truncation=True, max_length=512) # I had tokenized the text using tokenizer and limited to 512 tokens
    tokens = inputs["input_ids"] # Then had extracted input token IDs

    # Convert tokens back to words and remove punctuation/symbols
    words = [tokenizer.decode([t]) for t in tokens if tokenizer.decode([t]).isalnum()]
    # The I joined the cleaned words back into a sentence
    cleaned_text = " ".join(words)

    return cleaned_text



# Function to classify sentiment using cleaned text
def classify_sentiment(text):
    cleaned_text = clean_text_with_distilbert(text)

    if cleaned_text.strip() == "":
        return "neutral"  # If text is empty after cleaning assume neutral

    # Get sentiment using DistilBERT pipeline
    result = sentiment_pipeline(cleaned_text)[0]

    label = result["label"].lower()
    score = result["score"]

    # Define confidence threshold for neutral sentiment
    neutral_threshold = 0.6
    if score < neutral_threshold:
        return "neutral"
    return label

# Apply cleaning and sentiment classification
df["cleaned_review"] = df["reviews.text"].apply(clean_text_with_distilbert)
df["sentiment_llm"] = df["cleaned_review"].apply(classify_sentiment)

# Display results
df[["reviews.text", "cleaned_review", "sentiment_llm"]].head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,reviews.text,cleaned_review,sentiment_llm
0,"This hotel was nice and quiet. Did not know, t...",this hotel was nice and quiet did not know the...,negative
1,We stayed in the king suite with the separatio...,we stayed in the king suite with the separatio...,positive
2,"Parking was horrible, somebody ran into my ren...",parking was horrible somebody ran into my rent...,negative
3,Not cheap but excellent location. Price is som...,not cheap but excellent location price is some...,positive
4,If you get the room that they advertised on th...,if you get the room that they advertised on th...,negative


In [12]:
cols_to_drop = ['id', 'dateAdded', 'dateUpdated', 'address', 'categories',
               'primaryCategories', 'city', 'country', 'keys', 'latitude',
               'reviews.sourceURLs','reviews.title', 'reviews.userCity', 'reviews.text'
                'reviews.userProvince', 'reviews.username', 'sourceURLs', 'websites', 'sentiment']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

In [13]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
!pip install datasets
from datasets import Dataset
import pandas as pd

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Define label mapping
label_map = {"negative": 0, "neutral": 1, "positive": 2}

# Convert text labels into numeric labels
df["labels"] = df["sentiment_llm"].map(label_map)


# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["cleaned_review"], truncation=True, padding="max_length")

from sklearn.model_selection import train_test_split

# Tokenize
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Convert Hugging Face Dataset to Pandas for splitting
# Split entire dataset into training and testing
train_test = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
# dataset_subset = Dataset.from_pandas(train_test)  # Convert back to Hugging Face Dataset



# Remove unnecessary columns (keeping 'cleaned_review' and 'labels')
tokenized_datasets = tokenized_datasets.remove_columns(["sentiment_llm"])  # Keep 'cleaned_review' and 'labels'
tokenized_datasets.set_format("torch")  # Convert dataset format to PyTorch

# Load DistilBERT model for classification (3 sentiment classes)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test["train"],  # Use only 10% of total dataset for training
    eval_dataset=train_test["test"]  # Use part of 10% for evaluation
)

# Train the model
trainer.train()


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mameykpathare[0m ([33mameykpathare-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.3501,0.418256
2,0.2086,0.337848


TrainOutput(global_step=4000, training_loss=0.29300049591064453, metrics={'train_runtime': 916.7242, 'train_samples_per_second': 17.453, 'train_steps_per_second': 4.363, 'total_flos': 2119516176384000.0, 'train_loss': 0.29300049591064453, 'epoch': 2.0})

In [14]:
### I had repeated the same process for test data

batch_size = 4  # Reduce to optimize memory
embedding_list = [] # Storage for embeddings


# Load the DistilBERT model (make sure it's the same as used in training)
from transformers import DistilBertModel
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased") # Base model without classification head

# Configure GPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect hardware
bert_model.to(device)  # Move model to appropriate device

# Batch processing loop
for i in range(0, len(train_test["train"]), batch_size):
    # Getting batch of cleaned reviews
    batch_texts = train_test["train"][i:i+batch_size]["cleaned_review"]  # Access from Dataset

    # Tokenize batch (rest remains the same)
    inputs = tokenizer(batch_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    # Move inputs to device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Forward pass (no gradient calculation)
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Extract [CLS] token embeddings
    batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # First token embedding
    embedding_list.append(batch_embeddings)  # Accumulating batches


# Created full embedding matrix
X = np.vstack(embedding_list) # Combine all batches
y = np.array(train_test["train"]["labels"])  # Get corresponding labels from dataset not dataframe

In [15]:
# Reduce dimensions from 768 to 50
pca = PCA(n_components=50) # Reduce to 50 principal components
bert_reduced = pca.fit_transform(X)  #  Apply PCA to embeddings

# Feature scaling
scaler = StandardScaler()  # Initialize scaler
bert_scaled = scaler.fit_transform(bert_reduced)  # Standardize features

In [16]:
import string
import numpy as np

# Calculated the features for ALL reviews in the original DataFrame
df["review_length"] = df["cleaned_review"].apply(lambda x: len(x.split()))
df["punctuation_count"] = df["cleaned_review"].apply(lambda x: sum(1 for c in x if c in string.punctuation))

# Got the matching features for our current split
# Since we can't use indices we'll match based on the review text
train_texts = train_test["train"]["cleaned_review"]
subset_features = np.array([
    [
        df[df["cleaned_review"] == text]["review_length"].values[0],
        df[df["cleaned_review"] == text]["punctuation_count"].values[0]
    ]
    for text in train_texts
])

# Combined with BERT embeddings
X_combined = np.hstack((bert_scaled, subset_features))

# Getting labels from the split
y = np.array(train_test["train"]["labels"])

In [17]:
# I had applied SMOTE to oversample minority classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [18]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Scale entire feature set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression with higher max_iter
log_reg = LogisticRegression(max_iter=500, solver="lbfgs")
log_reg.fit(X_train_scaled, y_train)

# Predict and evaluate Logistic Regression
y_pred_lr = log_reg.predict(X_test_scaled)
print("Logistic Regression:\n", classification_report(y_test, y_pred_lr))

Logistic Regression:
               precision    recall  f1-score   support

           0       0.91      0.93      0.92      1249
           1       0.96      1.00      0.98      1179
           2       0.94      0.88      0.91      1195

    accuracy                           0.94      3623
   macro avg       0.94      0.94      0.94      3623
weighted avg       0.94      0.94      0.94      3623



In [19]:
!pip install xgboost
from xgboost import XGBClassifier

# Ensure labels start from 0 for XGBoost and Random Forest
y_train_fixed = y_train - y_train.min()  # Shifting labels
y_test_fixed = y_test - y_test.min()  # Shifted labels


# Train XGBoost with  labels
xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb.fit(X_train_scaled, y_train_fixed)

# Predict and  XGBoost
y_pred_xgb = xgb.predict(X_test_scaled)
print("XGBoost:\n", classification_report(y_test_fixed, y_pred_xgb))

# Train Random Forest with corrected labels
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train_scaled, y_train_fixed)  # Use y_train_fixed

# Predict and evaluate Random Forest
y_pred_rf = rf.predict(X_test_scaled)
print("Random Forest:\n", classification_report(y_test_fixed, y_pred_rf))

XGBoost:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94      1249
           1       0.99      1.00      1.00      1179
           2       0.96      0.91      0.93      1195

    accuracy                           0.96      3623
   macro avg       0.96      0.96      0.96      3623
weighted avg       0.96      0.96      0.96      3623

Random Forest:
               precision    recall  f1-score   support

           0       0.90      0.95      0.93      1249
           1       0.96      1.00      0.98      1179
           2       0.96      0.87      0.91      1195

    accuracy                           0.94      3623
   macro avg       0.94      0.94      0.94      3623
weighted avg       0.94      0.94      0.94      3623



In [20]:
# Train SVM
svm = SVC(probability=True, random_state=42)
svm.fit(X_train_scaled, y_train_fixed)  # Ensure consistent labels

# Got prediction probabilities
prob_lr = log_reg.predict_proba(X_test_scaled)
prob_rf = rf.predict_proba(X_test_scaled)
prob_svm = svm.predict_proba(X_test_scaled)
prob_xgb = xgb.predict_proba(X_test_scaled)

# Weighted averaging of predictions
final_probs = (prob_lr + prob_rf + prob_svm + prob_xgb) / 4
final_preds = np.argmax(final_probs, axis=1)

print("Ensemble Model:\n", classification_report(y_test_fixed, final_preds))

Ensemble Model:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      1249
           1       1.00      1.00      1.00      1179
           2       0.96      0.92      0.94      1195

    accuracy                           0.96      3623
   macro avg       0.96      0.96      0.96      3623
weighted avg       0.96      0.96      0.96      3623

