In [1]:
import eli5



In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.metrics import roc_auc_score

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
import pandas as pd

In [7]:
listings_df = pd.read_csv("/home/anatoli/Документы/airbnb/listings.csv")

In [73]:
listings_df["price_value"] = listings_df["price"].apply(lambda x: x[1:].replace(",", "")).astype(float)

In [74]:
listings_df["price_per_accommodation"] = listings_df["price_value"] / listings_df["accommodates"]

In [85]:
dataset_df = listings_df[
    (listings_df["review_scores_rating"] >= 90) & \
    (listings_df["accommodates"] < 3)
][["room_type", "neighbourhood_cleansed", "price_per_accommodation", "name", "description", "neighborhood_overview", "latitude", "longitude"]].copy()

In [86]:
avg_prices_df = dataset_df.groupby(["room_type", "neighbourhood_cleansed"])["price_per_accommodation"].agg(["mean", "median", "std"]).reset_index()

In [87]:
avg_prices_df["max"] = avg_prices_df["mean"] + avg_prices_df["std"]

In [199]:
prices_map_df = avg_prices_df.set_index(["neighbourhood_cleansed", "room_type"])["mean"].unstack(-1)

In [200]:
prices_map_df = prices_map_df.sort_values("Private room", ascending=False).iloc[0:20]

In [201]:
for c in prices_map_df.columns:
    prices_map_df[c] = prices_map_df[c].apply(lambda x: "{:.2f} $".format(x))
    prices_map_df[c] = prices_map_df[c].str.replace("nan \$", "")

In [202]:
prices_map_df.index.name = ""

In [203]:
prices_map_df.columns.name = ""

In [204]:
prices_map_df

Unnamed: 0,Entire home/apt,Hotel room,Private room,Shared room
,,,,
Westminster,69.08 $,87.38 $,47.56 $,27.30 $
Kensington and Chelsea,56.62 $,66.00 $,46.05 $,44.00 $
Hammersmith and Fulham,46.19 $,54.92 $,37.73 $,26.39 $
Southwark,46.25 $,,35.88 $,25.00 $
Barking and Dagenham,31.08 $,,33.44 $,
City of London,58.10 $,84.00 $,33.10 $,
Camden,55.73 $,34.94 $,32.44 $,98.79 $
Haringey,35.16 $,17.00 $,32.07 $,17.60 $
Islington,48.29 $,48.00 $,31.53 $,21.62 $


In [94]:
merged_df = dataset_df.merge(avg_prices_df, on=["room_type", "neighbourhood_cleansed"])

In [95]:
merged_df["target"] = False
merged_df.loc[merged_df["price_per_accommodation"] >= merged_df["max"], "target"] = True

In [96]:
merged_df.dropna(inplace=True)

In [161]:
merged_df["target"].value_counts()

False    15368
True       915
Name: target, dtype: int64

In [98]:
train_indices, test_indices = train_test_split(merged_df.index, test_size=0.3)

In [99]:
merged_df["test"] = False
merged_df.loc[test_indices, "test"] = True

In [102]:
vectorizer = TfidfVectorizer(min_df=5, max_features=10000, stop_words='english')

In [103]:
merged_df["full_text"] = merged_df["description"] + " " +  merged_df["neighborhood_overview"]

In [104]:
def remove_tags(x):
    return x.replace("<", " ").replace(">", " ")

In [105]:
merged_df["full_text"] = merged_df["full_text"].apply(remove_tags)

In [106]:
X_train = vectorizer.fit_transform(merged_df[~merged_df["test"]]["full_text"])

In [107]:
X_test = vectorizer.transform(merged_df[merged_df["test"]]["full_text"])

In [108]:
y_train = merged_df[~merged_df["test"]]["target"]

In [109]:
y_test = merged_df[merged_df["test"]]["target"]

In [110]:
model = LogisticRegression()

In [111]:
model.fit(X_train, y_train)

LogisticRegression()

In [112]:
prediction_train = model.predict_proba(X_train).T[1]

In [113]:
X_test

<4885x7887 sparse matrix of type '<class 'numpy.float64'>'
	with 419065 stored elements in Compressed Sparse Row format>

In [114]:
prediction_test = model.predict_proba(X_test).T[1]

In [115]:
roc_auc_score(y_train, prediction_train)

0.906552101353414

In [116]:
roc_auc_score(y_test, prediction_test)

0.6751013625789299

In [117]:
eli5.explain_weights(model, feature_names=vectorizer.get_feature_names(), top=100)

Weight?,Feature
+2.236,ealing
+1.818,barnes
+1.727,property
+1.648,stylish
+1.610,hotel
+1.527,views
+1.517,croydon
+1.357,comfort
+1.339,modern
+1.293,islington


In [118]:
import numpy as np

In [119]:
feature_names = np.array(vectorizer.get_feature_names())

In [142]:
best_features = feature_names[np.argsort(-model.coef_)][0, 0:200]

In [143]:
from tqdm import tqdm_notebook

In [144]:
regions = listings_df["neighbourhood_cleansed"].str.lower().unique().tolist()

In [186]:
found_places = []
all_found_places_df = []

for place in tqdm_notebook(best_features):
    place_indices = merged_df["neighborhood_overview"].str.lower().str.contains(place)
    place_df = merged_df[place_indices][["latitude", "longitude", "target", "price_per_accommodation", "mean"]].copy()
    place_df["place"] = place
    if min(place_df[["latitude", "longitude"]].std() < 0.02):
        print(place)
        if any([place in region for region in regions]):
            print("Not included")
            continue
        found_places.append(place)
        all_found_places_df.append(place_df)


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

barnes
putney
coulsdon
duplex
leyton
hoxton
pinner
pitshanger
beckenham
sidcup
clerkenwell
heated
samsung
belsize
leytonstone
deptford



In [187]:
found_places_df = pd.concat(all_found_places_df)

In [188]:
found_places_df["price_per_accommodation_diff"] = (found_places_df["price_per_accommodation"] - found_places_df["mean"]) / found_places_df["mean"] * 100

In [189]:
# listings_df["neighbourhood_cleansed"].value_counts()

In [190]:
# index = merged_df["full_text"].str.lower().str.contains("amazon")
# merged_df[index.fillna(False)]["full_text"].sample().iloc[0]

In [191]:
found_places_df["place"].unique()

array(['barnes', 'putney', 'coulsdon', 'duplex', 'leyton', 'hoxton',
       'pinner', 'pitshanger', 'beckenham', 'sidcup', 'clerkenwell',
       'heated', 'samsung', 'belsize', 'leytonstone', 'deptford'],
      dtype=object)

In [192]:
# place_names = {'spitalfields': "Spitalfields",
#  'chatsworth': 'Chatsworth',
#  'kensal': 'Kensal',
#  'leyton': 'Leyton',
#  'exmouth': 'Exmouth',
#  'parsons': 'Parsons Green',
#  'aldgate': 'Aldgate',
#  'wanstead': 'Wanstead',
#  'leytonstone': "Leyton",
#  'askew': 'Askew',
#  'cricklewood': 'Cricklewood',
#  'chancery': 'Chancery Lane',
#  'coulsdon': 'Coulsdon',
#  'isleworth': 'Isleworth',
#  'norbiton': 'Norbiton',
#  'marksman': 'The Marksman',
#  'goodmayes': 'Goodmayes',
#  'aeltc': 'aeltc'.upper()}
place_names = {
    feature: feature.capitalize()
    for feature in found_places_df["place"].unique()
    if feature not in ["duplex", "samsung", "heated"]
}

In [193]:
found_places_df["Place"] = found_places_df["place"].apply(lambda x: place_names.get(x))

In [194]:
# found_places_df.groupby("place").agg({
#     "price_per_accommodation": "mean",
#     "mean": "mean"
# })

In [195]:
import plotly.express as px

px.scatter_mapbox(found_places_df[~found_places_df["Place"].isnull()], lat="latitude", lon="longitude", color="Place", hover_name="Place",
                  color_continuous_scale=px.colors.cyclical.IceFire, size_max=15, zoom=9, mapbox_style="stamen-terrain")
