In [1]:
import pandas as pd
import numpy as np
import re
from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer

In [2]:
# Load the raw reviews, telling the parser explicitly that fields are quoted
# with `"` and that backslash is the escape character.
raw_reviews_path = "C:/Users/Gharat/Downloads/recommendation-engine/data/raw/reviews.csv"
reviews_df = pd.read_csv(
    raw_reviews_path,
    encoding="utf-8",
    quotechar='"',
    escapechar="\\",
)

# Preview the first rows to confirm the file parsed as expected
reviews_df.head(5)

Unnamed: 0,review_id,user_id,location_id,reviews,ratings
0,1,41,8,Absolutely breathtaking panoramic views of the...,5.0
1,2,16,15,"The coastline was okay, nothing to particularl...",3.0
2,3,44,48,A truly profound spiritual experience. The air...,5.0
3,4,38,28,"The offbeat trails, while promising a secluded...",2.0
4,5,14,15,Loved the quiet stretches of beaches and the s...,4.0


In [3]:
# Remove, in place, every row where any of the key fields is missing
required_columns = ["user_id", "location_id", "reviews", "ratings"]
reviews_df.dropna(subset=required_columns, inplace=True)

In [4]:
# Clean review text
def clean_review(text):
    """Normalize a raw review into lowercase words separated by single spaces.

    Parameters
    ----------
    text : object
        Raw review text. Non-string values are converted with ``str``.
        ``None`` and float NaN are treated as an empty review.

    Returns
    -------
    str
        Lowercased text containing only the letters ``a``-``z`` and single
        spaces, with no leading or trailing whitespace. Empty when the input
        is missing or contains no ASCII letters.
    """
    # Missing values would otherwise be stringified into the bogus words
    # "none" / "nan" and leak into the text features.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Apostrophes (straight or curly) are removed outright so contractions
    # stay a single token, e.g. "don't" -> "dont".
    text = re.sub(r"['\u2019]", "", text)
    # Every other non-letter becomes a space. Deleting it instead would fuse
    # words separated only by punctuation ("food/drinks" -> "fooddrinks").
    text = re.sub(r"[^a-z\s]", " ", text)
    # Collapse whitespace runs introduced above and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run the cleaner over every raw review and keep the result alongside the original
reviews_df["cleaned_review"] = reviews_df["reviews"].map(clean_review)

In [5]:
# Keep ratings inside the 1.0–5.0 scale
reviews_df["ratings"] = reviews_df["ratings"].clip(1.0, 5.0)

cleaned_text = reviews_df["cleaned_review"]

# TextBlob polarity score for each cleaned review
reviews_df["sentiment"] = cleaned_text.map(
    lambda review: TextBlob(review).sentiment.polarity
)

# Size of each cleaned review: character count and whitespace-separated word count
reviews_df["review_len_char"] = cleaned_text.str.len()
reviews_df["review_len_words"] = cleaned_text.str.split().str.len()

In [6]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned reviews, with the vocabulary capped at 300 terms
tfidf = TfidfVectorizer(
    max_features=300,
)
tfidf_matrix = tfidf.fit_transform(reviews_df["cleaned_review"])

# Reduce the TF-IDF matrix to 50 components; the seed is fixed for repeatable runs
svd = TruncatedSVD(
    n_components=50,
    random_state=0,
)
review_embeddings = svd.fit_transform(tfidf_matrix)

# Store one embedding row per review in a single object column
reviews_df["review_vector"] = [row for row in review_embeddings]

In [7]:
# Preview the first rows of the frame, including the newly added feature columns
reviews_df.head(5)

Unnamed: 0,review_id,user_id,location_id,reviews,ratings,cleaned_review,sentiment,review_len_char,review_len_words,review_vector
0,1,41,8,Absolutely breathtaking panoramic views of the...,5.0,absolutely breathtaking panoramic views of the...,0.225,216,33,"[0.37974855812545666, -0.22364180024569233, 0...."
1,2,16,15,"The coastline was okay, nothing to particularl...",3.0,the coastline was okay nothing to particularly...,0.18,215,38,"[0.2736029762048802, 0.0037990119590877985, -0..."
2,3,44,48,A truly profound spiritual experience. The air...,5.0,a truly profound spiritual experience the air ...,0.080556,229,36,"[0.4305076893104259, 0.03492404411061479, 0.04..."
3,4,38,28,"The offbeat trails, while promising a secluded...",2.0,the offbeat trails while promising a secluded ...,0.001042,246,35,"[0.22299582526967668, -0.00798626286482945, 0...."
4,5,14,15,Loved the quiet stretches of beaches and the s...,4.0,loved the quiet stretches of beaches and the s...,0.241667,235,40,"[0.4762284922550467, -0.07020500655321586, -0...."


In [10]:
# Write the processed reviews to disk, leaving the DataFrame index out of the file.
# NOTE(review): review_vector holds NumPy arrays, which CSV stores only as their
# string form — confirm downstream readers can parse that back.
processed_reviews_path = "C:/Users/Gharat/Downloads/recommendation-engine/data/processed/reviews.csv"
reviews_df.to_csv(processed_reviews_path, index=False)