In [1]:
import pandas as pd
from pathlib import Path

data_folder = Path("./data/recommendations/")

In [45]:
calendar_raw = pd.read_csv(data_folder / "calendar.csv")
neighbourhoods_raw = pd.read_csv(data_folder / "neighbourhoods.csv")
reviews_summary_raw = pd.read_csv(data_folder / "reviews_summary.csv")
listings_raw = pd.read_csv(data_folder / "listings.csv", low_memory=False)
listings_summary_raw = pd.read_csv(data_folder / "listings_summary.csv")
reviews_raw = pd.read_csv(data_folder / "reviews.csv")

In [3]:
import geojson

with open(data_folder / "neighbourhoods.geojson") as f:
    neighbourhoods_gj = geojson.load(f)

## Analyze reviews

In [46]:
reviews_raw.shape

(1486236, 6)

0          False
990706     False
990705     False
990704     False
990703     False
           ...  
1479863      NaN
1480313      NaN
1482655      NaN
1483551      NaN
1485904      NaN
Name: comments, Length: 1486236, dtype: object

In [109]:
# clean reivews
import re

reviews = reviews_raw[
    ~reviews_raw["comments"].str.match("host canceled this reservation").fillna(True)
].reset_index(
    drop=True
)  # remove cancelled
reviews["comments"] = (
    reviews["comments"].map(lambda x: re.sub("\w*\d\w*", " ", x)).reset_index(drop=True)
)  # remove numbers

In [117]:
reviews = reviews.dropna(subset=["comments"]).reset_index(drop=True)


reviews["comments"] = (
    reviews["comments"]
    .str.replace("\n", " ")
    .str.replace("\t", " ")
    .replace("!", "")
    .str.strip(
        "\" \n \t ! # % ' ( ) * + \r . , - / : ; < > ｡ = ? &"
    )  # remove all symbol-only chars
).reset_index(drop=True)
reviews["comments"] = reviews["comments"].replace("", pd.NA)
reviews["comments"] = reviews["comments"][
    ~reviews["comments"].map(lambda x: len(x) < 5).reset_index(drop=True)
]
reviews = reviews.dropna(subset=["comments"]).reset_index(drop=True)

In [115]:
reviews.shape

(1479564, 6)

In [122]:
from langdetect import detect


def detect_err(comment):
    try:
        return detect(comment)
    except:
        return "na"


reviews["language"] = reviews["comments"].map(detect_err)

In [126]:
reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,language
0,11551,30672,2010-03-21,93896,Shar-Lyn,"The flat was bright, comfortable and clean and...",en
1,11551,32236,2010-03-29,97890,Zane,We stayed with Adriano and Valerio for a week ...,en
2,90700,337227,2011-06-27,311071,Miqua,it was all in all the perfect week!\r chilton ...,en
3,90700,378738,2011-07-17,224367,Prateek,"I'll start with the host, and then move on to ...",en
4,90700,543840,2011-09-18,1115024,Jennifer,Great location. Plenty to do just steps outsid...,en
...,...,...,...,...,...,...,...
1479559,39740287,559509688,2019-11-04,182032644,Isabel,"A very good stay, I would repeat for sure",en
1479560,22701498,558667202,2019-11-03,65955902,Shereen,"Set in a lovely development with onsite bar, c...",en
1479561,38398365,552239161,2019-10-21,60436496,Chee Ling,Website hidden by Airbnb) a.best owner and gen...,en
1479562,38398365,559541617,2019-11-04,97684167,Carolyn,This flat is perfection! Everything you need i...,en


In [124]:
# save stage
reviews.to_parquet(data_folder / "reviews_clean_lang.parquet")

In [130]:
reviews["language"].value_counts().iloc[:10]

en       1279642
fr         69306
es         35314
de         23909
it         16918
ko         11602
zh-cn       9163
pt          6009
nl          4806
ro          2847
Name: language, dtype: int64

In [132]:
reviews = reviews.query("language == 'en'").reset_index(drop=True)

In [136]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
compund_scorer = lambda comment: (sia.polarity_scores(comment))["compound"]
compund_scorer(reviews["comments"][0])

0.9413

In [137]:
reviews["comment_score"] = reviews["comments"].map(compund_scorer)

In [138]:
reviews.to_parquet(data_folder / "reviews_sentiment.parquet")