In [1]:
# Imports & Load Raw Data
import pandas as pd


df = pd.read_csv('../data/bike_rental_reviews.csv', encoding='latin-1')
print("Raw data shape:", df.shape)


Raw data shape: (50000, 2)


In [2]:
# Text Cleaning Setup
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# download once
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = text.split()
    return " ".join(lemmatizer.lemmatize(t) for t in tokens if t not in stop_words)

# apply cleaning
df['cleaned_review'] = df['review_text'].astype(str).apply(clean_text)
print("Applied cleaning. Sample:")
display(df[['review_text','cleaned_review']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sheilamcgovern/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sheilamcgovern/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sheilamcgovern/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Applied cleaning. Sample:


Unnamed: 0,review_text,cleaned_review
0,"The entire process was easy, and the availabil...",entire process easy availability high quality
1,Standard rental process. The mobile app was ac...,standard rental process mobile app acceptable
2,One of the best bike rentals Iâve had. The m...,one best bike rental ive mobile app made even ...
3,One of the best bike rentals Iâve had. The c...,one best bike rental ive customer service made...
4,Not worth the money. The seat comfort was a ma...,worth money seat comfort major letdown


In [3]:
# Drop Duplicates
# keep first occurrence of each unique cleaned_review
df_unique = df.drop_duplicates(subset='cleaned_review').reset_index(drop=True)
print("After deduplication:", df_unique.shape)


After deduplication: (300, 3)


In [4]:
# Vectorize with TF–IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_unique = vectorizer.fit_transform(df_unique['cleaned_review'])
y_unique = df_unique['sentiment']

print("Feature matrix shape:", X_unique.shape)
print("Unique label counts:\n", y_unique.value_counts())


Feature matrix shape: (300, 115)
Unique label counts:
 sentiment
positive    100
neutral     100
negative    100
Name: count, dtype: int64


In [5]:
# Persist Deduplicated Artifacts
import pickle

with open('../data/df_unique.pkl',    'wb') as f: pickle.dump(df_unique,      f)
with open('../data/X_unique.pkl',     'wb') as f: pickle.dump(X_unique,       f)
with open('../data/y_unique.pkl',     'wb') as f: pickle.dump(y_unique,       f)
with open('../data/vectorizer_dup.pkl','wb') as f: pickle.dump(vectorizer,    f)

print("Saved deduped DataFrame and TF–IDF artifacts to ../data/")


Saved deduped DataFrame and TF–IDF artifacts to ../data/
