In [2]:
import pandas as pd

pd.set_option("display.max_colwidth", 120)

from pathlib import Path

data_folder = Path("./data/recommendations/")

In [45]:
calendar_raw = pd.read_csv(data_folder / "calendar.csv")
neighbourhoods_raw = pd.read_csv(data_folder / "neighbourhoods.csv")
reviews_summary_raw = pd.read_csv(data_folder / "reviews_summary.csv")
listings_raw = pd.read_csv(data_folder / "listings.csv", low_memory=False)
listings_summary_raw = pd.read_csv(data_folder / "listings_summary.csv")
reviews_raw = pd.read_csv(data_folder / "reviews.csv")

In [2]:
import geojson

with open(data_folder / "neighbourhoods.geojson") as f:
    neighbourhoods_gj = geojson.load(f)

## Analyze reviews

In [68]:
reviews_raw.shape

NameError: name 'reviews_raw' is not defined

In [109]:
# clean reivews
import re

reviews = reviews_raw[
    ~reviews_raw["comments"].str.match("host canceled this reservation").fillna(True)
].reset_index(
    drop=True
)  # remove cancelled
reviews["comments"] = (
    reviews["comments"].map(lambda x: re.sub("\w*\d\w*", " ", x)).reset_index(drop=True)
)  # remove numbers

In [117]:
reviews = reviews.dropna(subset=["comments"]).reset_index(drop=True)


reviews["comments"] = (
    reviews["comments"]
    .str.replace("\n", " ")
    .str.replace("\t", " ")
    .replace("!", "")
    .str.strip(
        "\" \n \t ! # % ' ( ) * + \r . , - / : ; < > ｡ = ? &"
    )  # remove all symbol-only chars
).reset_index(drop=True)
reviews["comments"] = reviews["comments"].replace("", pd.NA)
reviews["comments"] = reviews["comments"][
    ~reviews["comments"].map(lambda x: len(x) < 5).reset_index(drop=True)
]
reviews = reviews.dropna(subset=["comments"]).reset_index(drop=True)

In [115]:
reviews.shape

(1479564, 6)

In [122]:
from langdetect import detect


def detect_err(comment):
    try:
        return detect(comment)
    except:
        return "na"


reviews["language"] = reviews["comments"].map(detect_err)

In [126]:
reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,language
0,11551,30672,2010-03-21,93896,Shar-Lyn,"The flat was bright, comfortable and clean and...",en
1,11551,32236,2010-03-29,97890,Zane,We stayed with Adriano and Valerio for a week ...,en
2,90700,337227,2011-06-27,311071,Miqua,it was all in all the perfect week!\r chilton ...,en
3,90700,378738,2011-07-17,224367,Prateek,"I'll start with the host, and then move on to ...",en
4,90700,543840,2011-09-18,1115024,Jennifer,Great location. Plenty to do just steps outsid...,en
...,...,...,...,...,...,...,...
1479559,39740287,559509688,2019-11-04,182032644,Isabel,"A very good stay, I would repeat for sure",en
1479560,22701498,558667202,2019-11-03,65955902,Shereen,"Set in a lovely development with onsite bar, c...",en
1479561,38398365,552239161,2019-10-21,60436496,Chee Ling,Website hidden by Airbnb) a.best owner and gen...,en
1479562,38398365,559541617,2019-11-04,97684167,Carolyn,This flat is perfection! Everything you need i...,en


In [124]:
# save stage
reviews.to_parquet(data_folder / "reviews_clean_lang.parquet")

In [130]:
reviews["language"].value_counts().iloc[:10]

en       1279642
fr         69306
es         35314
de         23909
it         16918
ko         11602
zh-cn       9163
pt          6009
nl          4806
ro          2847
Name: language, dtype: int64

In [132]:
reviews = reviews.query("language == 'en'").reset_index(drop=True)

In [136]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
compund_scorer = lambda comment: (sia.polarity_scores(comment))["compound"]
compund_scorer(reviews["comments"][0])

0.9413

In [137]:
reviews["comment_score"] = reviews["comments"].map(compund_scorer)

In [138]:
reviews.to_parquet(data_folder / "reviews_sentiment.parquet")

In [146]:
reviews.sort_values("comment_score")

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,language,comment_score
611193,14629530,407511297,2019-02-02,8258215,Salva,I don't like writing bad reviews. And this is probably the first one I write on AirBnB. And judging by M&B's track r...,en,-0.9985
1216944,33651970,463925992,2019-06-04,111616612,Emi,Avoid this place !!!!!******* The host left me waiting in the street for hours while he is having dinner !! Didn'...,en,-0.9982
559770,13422066,425474758,2019-03-18,153775364,Mark,Nicholas and Emilie are great hosts by all accounts but we only dealt with their lettings agency and unfortunately e...,en,-0.9981
637794,15161649,338693750,2018-10-20,2514709,Debbie Omkari,"When I arived at the property, the host thought I had got the day wrong as I had made an alteration. But I knew I w...",en,-0.9979
543367,12896220,483328502,2019-07-07,74921010,Will,"What can I say? The pictures look great, but the place is not. It's tiny, unsafe, loud, and most of the appliances...",en,-0.9979
...,...,...,...,...,...,...,...,...
242163,3982582,467285810,2019-06-10,122814336,Andy,We had a wonderful time staying in Philip and Euardo's gorgeous London flat. It is beautiful and spacious as it look...,en,0.9996
21313,198279,61428195,2016-02-04,15743934,Rob,I had an amazing time in London and big part of that was thanks to Radhika & Lorenzo. That made me feel welcome from...,en,0.9996
103507,970650,510515365,2019-08-16,190909789,Steffi,I have been staying a whole week at Hazel's and it was truly the best that could have happened to me. right after st...,en,0.9996
1045740,25339365,338389654,2018-10-19,173390041,Priya,We are so lucky to have found Alice’s home available for the time and where we needed. Alice graciously accommodate...,en,0.9997


# RecSys

In [46]:
# TODO sample per listing id instead of generally
reviews = pd.read_parquet(data_folder / "reviews_sentiment.parquet")[
    ["reviewer_id", "listing_id", "comment_score"]
]

rec_df = reviews.sample(frac=0.1, random_state=7)
rec_df["comment_score"] = (rec_df["comment_score"] + 1) / 2  # get rid of zeros

In [47]:
rec_df.nunique()

reviewer_id      123023
listing_id        35849
comment_score      6044
dtype: int64

In [48]:
data

<surprise.dataset.DatasetAutoFolds at 0x14006c760>

In [49]:
# Need to clone funk svd manually and edit the setup.py https://github.com/gbolmier/funk-svd/tree/master
from surprise import KNNWithMeans, Reader, Dataset, accuracy
from surprise.model_selection import cross_validate, train_test_split

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(rec_df, reader)  # .build_full_trainset()
trainset, testset = train_test_split(data, test_size=0.25)

sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
model = KNNWithMeans(sim_options=sim_options)
predictions = model.fit(trainset).test(testset)
print(accuracy.rmse(predictions))

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x140193cd0>

RMSE: 0.1431


0.14311728839882143

In [52]:
user_listing_mat = rec_df.pivot_table(
    index="reviewer_id", columns="listing_id", values="comment_score"
)
user_listing_mat.shape

(25319, 15521)

In [53]:
userrows = list(user_listing_mat.index)
users_index = {userrows[i]: i for i in range(len(userrows))}

itemcols = list(user_listing_mat.columns)
items_index = {itemcols[i]: i for i in range(len(itemcols))}

In [54]:
from scipy.sparse.linalg import svds
import numpy as np


def recommend_predictions(df_rec, k):
    """
    :param df_rec: dataframe with polarity values for known reviewer/listing pairs
    :param k: number of features to keep for SVD

    returns the dataframe with predicted polarity values for all (user,item) pairs
    """
    # get utility matrix
    util_mat = df_rec.pivot_table(
        index="reviewer_id", columns="listing_id", values="comment_score"
    )

    # keep track of reviewers and listings
    reviewer_rows = list(util_mat.index)
    users_index = {reviewer_rows[i]: i for i in range(len(reviewer_rows))}
    listing_cols = list(util_mat.columns)
    items_index = {listing_cols[i]: i for i in range(len(listing_cols))}

    # mask NaN and remove means
    mask = np.isnan(util_mat)
    masked_arr = np.ma.masked_array(util_mat, mask)
    item_means = np.mean(masked_arr, axis=0)
    util_mat = masked_arr.filled(item_means)
    means = np.tile(item_means, (util_mat.shape[0], 1))
    util_mat_demeaned = util_mat - means

    # run SVD
    U, sigma, Vt = svds(util_mat_demeaned, k=k)
    sigma = np.diag(sigma)
    all_predicted_polarity = np.dot(np.dot(U, sigma), Vt) + means

    return all_predicted_polarity, users_index, items_index

In [61]:
pred, _, _ = recommend_predictions(rec_df, k=25)

In [62]:
# evaluate on known reviewer-listing pairs

this_pred = []  # to store the predicted ratings
for i, row in rec_df.iterrows():
    user = row["reviewer_id"]
    item = row["listing_id"]

    try:
        u_index = users_index[user]
        if item in items_index:
            i_index = items_index[item]
            pred_rating = predicted_scores[u_index, i_index]
        else:
            pred_rating = np.mean(svdout[u_index, :])
        this_pred.append(pred_rating)

    except:
        print("error with user", user, "at index", i)

In [63]:
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(rec_df["comment_score"], this_pred))
print(rmse)

0.1479933619716791


In [64]:
no_of_features = [10, 25, 40, 50]
rmse = []

for k in no_of_features:
    pred_out, _, _ = recommend_predictions(rec_df, k=k)

    this_pred = []
    for i, row in rec_df.iterrows():
        user = row["reviewer_id"]
        item = row["listing_id"]

        try:
            u_index = users_index[user]
            if item in items_index:
                i_index = items_index[item]
                pred_rating = pred_out[u_index, i_index]
            else:
                pred_rating = np.mean(svdout[u_index, :])
            this_pred.append(pred_rating)

        except:
            print("error with user", user, "at index", i)
            continue

    rmse_i = np.sqrt(mean_squared_error(rec_df["comment_score"], this_pred))
    rmse.append(rmse_i)

    print(k, rmse_i)

10 0.1525856784649185
25 0.1479933619716791
40 0.14393227628284896
50 0.1414792097088569


### FUNK SVD

In [45]:
reviews = pd.read_parquet(data_folder / "reviews_sentiment.parquet").sample(
    frac=0.5, random_state=7
)  # TODO sample per listing id instead of generally
rec_df = reviews[["reviewer_id", "listing_id", "comment_score"]]
rec_df.nunique()

reviewer_id      562688
listing_id        56795
comment_score     12189
dtype: int64

In [65]:
# Need to clone funk svd manually and edit the setup.py https://github.com/gbolmier/funk-svd/tree/master
from funk_svd import SVD


X = rec_df.copy(deep=True)
X.columns = ["u_id", "i_id", "rating"]

train = X.sample(frac=0.8, random_state=7)
val = X.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
test = X.drop(train.index.tolist()).drop(val.index.tolist())


svd = SVD(
    lr=0.001,
    reg=0.005,
    n_epochs=100,
    n_factors=50,
    early_stopping=True,
    shuffle=False,
    min_rating=-1,
    max_rating=1,
)

svd.fit(X=train, X_val=val)

pred = svd.predict(test)

rmse = np.sqrt(mean_squared_error(test["rating"], pred))

Preprocessing data...

Preprocessing data...

Epoch 1/100  | val_loss: 0.08 - val_rmse: 0.28 - val_mae: 0.18 - took 0.0 sec
Epoch 2/100  | val_loss: 0.08 - val_rmse: 0.28 - val_mae: 0.18 - took 0.0 sec

Training took 0 sec
