In [7]:
import pickle

def _read_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# 1) Cleaned review DataFrame
df_bike = _read_pickle('../data/df_bike_clean.pkl')
print("Loaded cleaned reviews:", df_bike.shape)

# 2) Pre-computed TF–IDF feature matrix and the matching sentiment labels
X_reviews = _read_pickle('../data/X_reviews.pkl')
y_reviews = _read_pickle('../data/y_reviews.pkl')

print("X_reviews:", X_reviews.shape)
print("y_reviews value counts:\n", y_reviews.value_counts())



Loaded cleaned reviews: (50000, 4)
X_reviews: (50000, 115)
y_reviews value counts:
 sentiment
negative    16840
positive    16777
neutral     16383
Name: count, dtype: int64


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the labels
# and uses a fixed random_state so the partition is repeatable.
split_options = dict(test_size=0.2, stratify=y_reviews, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X_reviews, y_reviews, **split_options
)

# Quick shape check of both partitions.
print("Train set:", X_train.shape, y_train.shape)
print("Test  set:", X_test.shape,  y_test.shape)


Train set: (40000, 115) (40000,)
Test  set: (10000, 115) (10000,)


In [9]:
from sklearn.linear_model  import LogisticRegression
from sklearn.naive_bayes   import MultinomialNB
from sklearn.metrics       import classification_report, confusion_matrix

def _fit_and_report(title, estimator):
    """Fit `estimator` on the training split and print its test-set metrics.

    Prints `title`, the classification report and the confusion matrix for
    the held-out split, then returns the test-set predictions.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)

    print(title)
    print(classification_report(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    return predictions


# NOTE(review): the recorded run reports 1.00 on every metric for both models;
# confirm the features do not leak the label before relying on these scores.

# --- Logistic Regression ---
lr = LogisticRegression(max_iter=1000)
preds_lr = _fit_and_report("=== Logistic Regression ===", lr)

# --- Multinomial Naïve Bayes ---
nb = MultinomialNB()
preds_nb = _fit_and_report("\n=== Multinomial Naïve Bayes ===", nb)



=== Logistic Regression ===
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      3368
     neutral       1.00      1.00      1.00      3277
    positive       1.00      1.00      1.00      3355

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Confusion Matrix:
 [[3368    0    0]
 [   0 3277    0]
 [   0    0 3355]]

=== Multinomial Naïve Bayes ===
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      3368
     neutral       1.00      1.00      1.00      3277
    positive       1.00      1.00      1.00      3355

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Confusion Matrix:
 [[3368    0    0]
 [   0 3277    0]
 [   0    0 3355]]


In [14]:
# LSTM Model 
import numpy as np
from sklearn.preprocessing     import LabelEncoder
from sklearn.model_selection   import train_test_split
from tensorflow.keras.preprocessing.text   import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models     import Sequential
from tensorflow.keras.layers     import Embedding, LSTM, Dense

from tensorflow.keras.utils import set_random_seed

# Tunables for the LSTM experiment, kept in one place so the tokenizer and the
# embedding layer cannot drift apart.
SEED = 42           # split random_state and Python/NumPy/TensorFlow RNG seed
VOCAB_SIZE = 5000   # Tokenizer(num_words=...) and Embedding(input_dim=...)
MAX_LEN = 100       # length every review is padded/truncated to

# Seed the RNGs so weight initialisation, dropout and batch shuffling are
# repeatable from run to run.
set_random_seed(SEED)

# 1) Encode string labels to ints
le = LabelEncoder()
y_enc = le.fit_transform(y_reviews)            # y_reviews from Cell 1

# The labels and the review text come from separate pickles; make sure they
# describe the same rows before indexing one with positions from the other.
review_texts = df_bike['cleaned_review']
assert len(review_texts) == len(y_enc), "reviews and labels differ in length"

# 2) Pick the train/test rows *before* fitting the tokenizer.
#    Splitting row positions yields the same stratified partition as splitting
#    the padded matrix would (the split depends only on the row count, the
#    labels and the seed), but lets the vocabulary be learned from training
#    rows only, so no test-set text leaks into preprocessing.
row_positions = np.arange(len(y_enc))
train_idx, test_idx = train_test_split(
    row_positions,
    test_size=0.2,
    stratify=y_enc,
    random_state=SEED
)

# 3) Tokenize & pad: fit on the training reviews, transform every review
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(review_texts.iloc[train_idx])
seqs   = tokenizer.texts_to_sequences(review_texts)
padded = pad_sequences(seqs, maxlen=MAX_LEN)

X_seq_train, X_seq_test = padded[train_idx], padded[test_idx]
y_seq_train, y_seq_test = y_enc[train_idx], y_enc[test_idx]

print("Sequence train shape:", X_seq_train.shape, "Label train shape:", y_seq_train.shape)

# 4) Build & compile the LSTM
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(le.classes_), activation='softmax')   # one output unit per class
])
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',   # labels are integer-encoded, not one-hot
    metrics=['accuracy']
)

# 5) Train, holding out 10% of the training rows for validation
history = model.fit(
    X_seq_train,
    y_seq_train,
    validation_split=0.1,
    epochs=5,
    batch_size=32
)

# 6) Evaluate on the untouched test rows
# NOTE(review): the recorded run reached 1.0000 validation accuracy after one
# epoch; confirm the review text does not trivially encode the label.
loss, acc = model.evaluate(X_seq_test, y_seq_test)
print(f"LSTM Test Loss: {loss:.3f}, Test Acc: {acc:.3f}")



Sequence train shape: (40000, 100) Label train shape: (40000,)
Epoch 1/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 38ms/step - accuracy: 0.9527 - loss: 0.1592 - val_accuracy: 1.0000 - val_loss: 3.7594e-05
Epoch 2/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 39ms/step - accuracy: 1.0000 - loss: 2.0445e-04 - val_accuracy: 1.0000 - val_loss: 4.5783e-05
Epoch 3/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 40ms/step - accuracy: 1.0000 - loss: 5.1919e-05 - val_accuracy: 1.0000 - val_loss: 9.3853e-06
Epoch 4/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 40ms/step - accuracy: 1.0000 - loss: 1.3751e-05 - val_accuracy: 1.0000 - val_loss: 4.0151e-06
Epoch 5/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 39ms/step - accuracy: 1.0000 - loss: 6.1420e-06 - val_accuracy: 1.0000 - val_loss: 2.0234e-06
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step -

In [None]:
# Task 3: Key Topic Extraction
# **Objective:** Identify recurring themes in customer reviews.


In [15]:
# Prepare for topic extraction
import pandas as pd
from gensim.corpora import Dictionary

# Read the cleaned reviews back from disk (same pickle path used for df_bike).
df = pd.read_pickle('../data/df_bike_clean.pkl')

# Whitespace-tokenise every cleaned review.
documents = [review.split() for review in df['cleaned_review']]

# Vocabulary, pruned with no_below=10 / no_above=0.5 to drop very rare and
# very common tokens.
dictionary = Dictionary(documents)
dictionary.filter_extremes(no_below=10, no_above=0.5)

# One bag-of-words vector per review.
corpus = list(map(dictionary.doc2bow, documents))


In [16]:
# Train LDA and show topics
import gensim

# Fit a 10-topic LDA model on the bag-of-words corpus; random_state is pinned
# for reproducibility.
lda_settings = dict(num_topics=10, passes=5, random_state=42)
lda = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

# Show each topic as its weighted top-word string.
for topic_id, weighted_terms in lda.print_topics():
    print(f"Topic {topic_id:02d}:", weighted_terms)


Topic 00: 0.158*"process" + 0.128*"seat" + 0.128*"comfort" + 0.112*"return" + 0.051*"experience" + 0.049*"absolutely" + 0.049*"fantastic" + 0.048*"loved" + 0.045*"quality" + 0.045*"entire"
Topic 01: 0.112*"service" + 0.066*"rental" + 0.056*"bike" + 0.054*"extremely" + 0.054*"urgent" + 0.054*"improvement" + 0.054*"need" + 0.053*"topnotch" + 0.053*"great" + 0.053*"made"
Topic 02: 0.205*"customer" + 0.197*"service" + 0.076*"fast" + 0.076*"friendly" + 0.076*"fun" + 0.076*"impressive" + 0.068*"poor" + 0.066*"expensive" + 0.065*"quality" + 0.019*"rental"
Topic 03: 0.162*"app" + 0.162*"mobile" + 0.093*"ride" + 0.057*"everything" + 0.056*"loved" + 0.055*"flawless" + 0.055*"booking" + 0.055*"fine" + 0.054*"rave" + 0.054*"wouldnt"
Topic 04: 0.192*"nothing" + 0.098*"great" + 0.098*"either" + 0.098*"complain" + 0.098*"wasnt" + 0.098*"went" + 0.098*"smoothly" + 0.098*"especially" + 0.021*"availability" + 0.020*"support"
Topic 05: 0.267*"bike" + 0.203*"condition" + 0.108*"decent" + 0.072*"ride" + 0.

In [21]:
from IPython.display import Markdown, display


# Build the topic table from the fitted LDA model. No cell shown above this one
# defines `df_topics`, so without this the cell fails on a fresh kernel
# (Restart & Run All). Index is the zero-padded topic id; "Top Words" holds the
# ten highest-weighted words of the topic joined by ", ".
df_topics = pd.DataFrame(
    {
        "Top Words": [
            ", ".join(word for word, _weight in lda.show_topic(topic_id, topn=10))
            for topic_id in range(lda.num_topics)
        ]
    },
    index=[f"{topic_id:02d}" for topic_id in range(lda.num_topics)],
)

# Render one bold-labelled markdown line per topic, words separated by bullets.
for idx, row in df_topics.iterrows():
    md = f"**Topic {idx}**: " + " • ".join(row["Top Words"].split(", "))
    display(Markdown(md))


**Topic 00**: process • seat • comfort • return • experience • absolutely • fantastic • loved • quality • entire

**Topic 01**: service • rental • bike • extremely • urgent • improvement • need • topnotch • great • made

**Topic 02**: customer • service • fast • friendly • fun • impressive • poor • expensive • quality • rental

**Topic 03**: app • mobile • ride • everything • loved • flawless • booking • fine • rave • wouldnt

**Topic 04**: nothing • great • either • complain • wasnt • went • smoothly • especially • availability • support

**Topic 05**: bike • condition • decent • ride • best • terrible • nightmare • okay • excellent • wonderful

**Topic 06**: life • battery • nothing • rental • okay • wellmaintained • smooth • special • functional • extraordinary

**Topic 07**: experience • rental • ever • ruined • worst • entire • process • worked • average • expected

**Topic 08**: process • wait • long • awful • time • seamless • perfect • major • letdown • worth

**Topic 09**: experience • rental • bike • superb • outstanding • renting • need • replaced • immediately • horrible

In [17]:
# Tag every review with the single topic that carries the largest LDA weight.
topic_per_review = []
for bow in corpus:
    topic_weights = lda.get_document_topics(bow)   # (topic_id, weight) pairs
    strongest = max(topic_weights, key=lambda pair: pair[1])
    topic_per_review.append(strongest[0])
df['dominant_topic'] = topic_per_review


In [18]:
# Share of reviews per dominant topic, ordered by topic id.
(
    df['dominant_topic']
    .value_counts(normalize=True)
    .sort_index()
)


dominant_topic
0    0.08534
1    0.10020
2    0.06928
3    0.09514
4    0.06534
5    0.10854
6    0.11668
7    0.12854
8    0.13118
9    0.09976
Name: proportion, dtype: float64

## Key Insights
### Most common themes

 - Topic 08 (Wait Times & Process Frustrations): ~13.1% of reviews

 - Topic 07 (Extremes of Rental Experience): ~12.9%

 - Topic 06 (Battery Life & Maintenance): ~11.7%

### Least common themes

 - Topic 04 (Smooth Operations / No Complaints): ~6.5%

 - Topic 02 (Customer Service Quality): ~6.9%