# Review-based User Recommendations

In [2]:
import pandas as pd

pd.set_option("display.max_colwidth", 120)

from pathlib import Path

data_folder = Path("./data/recommendations/")

In [66]:
calendar_raw = pd.read_csv(data_folder / "calendar.csv")
neighbourhoods_raw = pd.read_csv(data_folder / "neighbourhoods.csv")
reviews_summary_raw = pd.read_csv(data_folder / "reviews_summary.csv")
listings_raw = pd.read_csv(data_folder / "listings.csv", low_memory=False)
listings_summary_raw = pd.read_csv(data_folder / "listings_summary.csv")
reviews_raw = pd.read_csv(data_folder / "reviews.csv")

In [67]:
import geojson

with open(data_folder / "neighbourhoods.geojson") as f:
    neighbourhoods_gj = geojson.load(f)

## Sentiment extraction

In [68]:
reviews_raw.shape

(1486236, 6)

In [69]:
# clean reivews
import re

reviews = reviews_raw[
    ~reviews_raw["comments"].str.match("host canceled this reservation").fillna(True)
].reset_index(
    drop=True
)  # remove cancelled
reviews["comments"] = (
    reviews["comments"].map(lambda x: re.sub("\w*\d\w*", " ", x)).reset_index(drop=True)
)  # remove numbers

In [104]:
reviews = reviews.dropna(subset=["comments"]).reset_index(drop=True)

reviews["comments"] = (
    reviews["comments"]
    .str.replace("\n", " ")
    .str.replace("\t", " ")
    .replace("!", "")
    .str.strip(
        "\" \n \t ! # % ' ( ) * + \r . , - / : ; < > ｡ = ? &"
    )  # remove all symbol-only comments
).reset_index(drop=True)
reviews["comments"] = reviews["comments"].replace("", pd.NA)
reviews["comments"] = reviews["comments"][
    ~reviews["comments"].map(lambda x: len(x) < 5).reset_index(drop=True)
]
reviews = reviews.dropna(subset=["comments"]).reset_index(drop=True)

In [105]:
reviews.shape

(1279642, 8)

In [122]:
from langdetect import detect


def detect_err(comment):
    try:
        return detect(comment)
    except:
        return "na"


reviews["language"] = reviews["comments"].map(detect_err)

In [None]:
# save stage
# reviews.to_parquet(data_folder / "reviews_clean_lang.parquet")
reviews = pd.read_parquet(data_folder / "reviews_clean_lang.parquet")

In [None]:
reviews["language"].value_counts().iloc[:10]

en       1279642
fr         69306
es         35314
de         23909
it         16918
ko         11602
zh-cn       9163
pt          6009
nl          4806
ro          2847
Name: language, dtype: int64

In [132]:
reviews = reviews.query("language == 'en'").reset_index(drop=True)

In [136]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
compund_scorer = lambda comment: (sia.polarity_scores(comment))["compound"]
compund_scorer(reviews["comments"][0])

0.9413

In [137]:
reviews["comment_score"] = reviews["comments"].map(compund_scorer)

In [71]:
# reviews.to_parquet(data_folder / "reviews_sentiment.parquet")
reviews = pd.read_parquet(data_folder / "reviews_sentiment.parquet")

In [103]:
reviews.sample(5).sort_values("comment_score")

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,language,comment_score
960823,22575808,541573851,2019-10-05,146612026,Sam,SCAM ! Doesn’t exist,en,-0.6932
1268628,37741921,553553314,2019-10-24,100642119,Toni,Great space in a fabulous location. My kids LOVED the breakfast that was provided! Only a few minutes walk to the L...,en,0.925
99350,889540,72809657,2016-05-04,29335357,Brianna,LOCATION! This place was awesome! The flat was perfect and you're getting what you're paying for. It's fantastic. Ri...,en,0.9768
294478,5209535,412371030,2019-02-15,237478138,Paul,"I had a lovely stay at Cliffs property, it was a clean, well appointed and thoughtfully designed space with some lov...",en,0.9781
543184,12896212,152794847,2017-05-18,2149086,Jo-Anne,"Everything was perfect about Jee's flat. It was right in the heart of things, so easy to access from the airport and...",en,0.9947


# Recommender System

In [3]:
# TODO sample per listing id instead of generally
reviews = pd.read_parquet(data_folder / "reviews_sentiment.parquet")[
    ["reviewer_id", "listing_id", "comment_score"]
]

rec_df = reviews.sample(frac=0.1, random_state=7)
rec_df["comment_score"] = (rec_df["comment_score"] + 1) / 2  # get rid of zeros

In [4]:
rec_df.nunique()

reviewer_id      123023
listing_id        35849
comment_score      6044
dtype: int64

In [5]:
# Need to clone funk svd manually and edit the setup.py https://github.com/gbolmier/funk-svd/tree/master
from surprise import KNNWithMeans, Reader, Dataset, accuracy, SVD
from surprise.model_selection import cross_validate, train_test_split

In [6]:
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(rec_df, reader)  # .build_full_trainset()
trainset, testset = train_test_split(data, test_size=0.25)

In [8]:
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
model = KNNWithMeans(sim_options=sim_options)
predictions = model.fit(trainset).test(testset)
print(accuracy.rmse(predictions))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.1417
0.14170569737512212


In [7]:
model = SVD()
predictions = model.fit(trainset).test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.1411
0.14112489482058113


## ChatGPT Tagging

In [116]:
import openai
import re


def get_sentiment(input_text):
    prompt = f"""You are a sentiment analyser. Analyze the text.
    Respond in the json format  {{'topic':sentiment'}}
    There can be multiple topics for each housing review.
    Return ONLY the json and nothing else.
    
    Text:'{input_text}'"""

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.5,
    )
    response_text = response.choices[0].message["content"].strip()

    return response_text


# Test single example
sample_text = """Great place to stay, was perfect for the night I 
needed it during my weekend in London for visiting the city. 
It's very hard to go to the center with public transport, situated in a 
calm neighborhood and the Room is clean and comfortable"""

response = get_sentiment(sample_text)
print(response)

{
  "housing_review": "positive",
  "location": "positive",
  "public_transport": "negative",
  "neighborhood": "positive",
  "room": "positive"
}


# Archived trials

In [52]:
user_listing_mat = rec_df.pivot_table(
    index="reviewer_id", columns="listing_id", values="comment_score"
)
user_listing_mat.shape

(25319, 15521)

In [53]:
userrows = list(user_listing_mat.index)
users_index = {userrows[i]: i for i in range(len(userrows))}

itemcols = list(user_listing_mat.columns)
items_index = {itemcols[i]: i for i in range(len(itemcols))}

In [54]:
from scipy.sparse.linalg import svds
import numpy as np


def recommend_predictions(df_rec, k):
    """
    :param df_rec: dataframe with polarity values for known reviewer/listing pairs
    :param k: number of features to keep for SVD

    returns the dataframe with predicted polarity values for all (user,item) pairs
    """
    # get utility matrix
    util_mat = df_rec.pivot_table(
        index="reviewer_id", columns="listing_id", values="comment_score"
    )

    # keep track of reviewers and listings
    reviewer_rows = list(util_mat.index)
    users_index = {reviewer_rows[i]: i for i in range(len(reviewer_rows))}
    listing_cols = list(util_mat.columns)
    items_index = {listing_cols[i]: i for i in range(len(listing_cols))}

    # mask NaN and remove means
    mask = np.isnan(util_mat)
    masked_arr = np.ma.masked_array(util_mat, mask)
    item_means = np.mean(masked_arr, axis=0)
    util_mat = masked_arr.filled(item_means)
    means = np.tile(item_means, (util_mat.shape[0], 1))
    util_mat_demeaned = util_mat - means

    # run SVD
    U, sigma, Vt = svds(util_mat_demeaned, k=k)
    sigma = np.diag(sigma)
    all_predicted_polarity = np.dot(np.dot(U, sigma), Vt) + means

    return all_predicted_polarity, users_index, items_index

In [61]:
pred, _, _ = recommend_predictions(rec_df, k=25)

In [62]:
# evaluate on known reviewer-listing pairs

this_pred = []  # to store the predicted ratings
for i, row in rec_df.iterrows():
    user = row["reviewer_id"]
    item = row["listing_id"]

    try:
        u_index = users_index[user]
        if item in items_index:
            i_index = items_index[item]
            pred_rating = predicted_scores[u_index, i_index]
        else:
            pred_rating = np.mean(svdout[u_index, :])
        this_pred.append(pred_rating)

    except:
        print("error with user", user, "at index", i)

In [63]:
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(rec_df["comment_score"], this_pred))
print(rmse)

0.1479933619716791


In [64]:
no_of_features = [10, 25, 40, 50]
rmse = []

for k in no_of_features:
    pred_out, _, _ = recommend_predictions(rec_df, k=k)

    this_pred = []
    for i, row in rec_df.iterrows():
        user = row["reviewer_id"]
        item = row["listing_id"]

        try:
            u_index = users_index[user]
            if item in items_index:
                i_index = items_index[item]
                pred_rating = pred_out[u_index, i_index]
            else:
                pred_rating = np.mean(svdout[u_index, :])
            this_pred.append(pred_rating)

        except:
            print("error with user", user, "at index", i)
            continue

    rmse_i = np.sqrt(mean_squared_error(rec_df["comment_score"], this_pred))
    rmse.append(rmse_i)

    print(k, rmse_i)

10 0.1525856784649185
25 0.1479933619716791
40 0.14393227628284896
50 0.1414792097088569


### FUNK SVD

In [45]:
reviews = pd.read_parquet(data_folder / "reviews_sentiment.parquet").sample(
    frac=0.5, random_state=7
)  # TODO sample per listing id instead of generally
rec_df = reviews[["reviewer_id", "listing_id", "comment_score"]]
rec_df.nunique()

reviewer_id      562688
listing_id        56795
comment_score     12189
dtype: int64

In [65]:
# Need to clone funk svd manually and edit the setup.py https://github.com/gbolmier/funk-svd/tree/master
from funk_svd import SVD


X = rec_df.copy(deep=True)
X.columns = ["u_id", "i_id", "rating"]

train = X.sample(frac=0.8, random_state=7)
val = X.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
test = X.drop(train.index.tolist()).drop(val.index.tolist())


svd = SVD(
    lr=0.001,
    reg=0.005,
    n_epochs=100,
    n_factors=50,
    early_stopping=True,
    shuffle=False,
    min_rating=-1,
    max_rating=1,
)

svd.fit(X=train, X_val=val)

pred = svd.predict(test)

rmse = np.sqrt(mean_squared_error(test["rating"], pred))

Preprocessing data...

Preprocessing data...

Epoch 1/100  | val_loss: 0.08 - val_rmse: 0.28 - val_mae: 0.18 - took 0.0 sec
Epoch 2/100  | val_loss: 0.08 - val_rmse: 0.28 - val_mae: 0.18 - took 0.0 sec

Training took 0 sec
