In [1542]:
import eli5

In [1543]:
from sklearn.linear_model import LogisticRegression

In [1544]:
from sklearn.metrics import roc_auc_score

In [1545]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [1546]:
from sklearn.model_selection import train_test_split

In [1547]:
import pandas as pd

In [1548]:
import plotly.express as px

In [1549]:
import json

In [1550]:
import geopandas as gpd

In [1551]:
direction = -1 # Cheap 
# direction = 1 # Expensive

In [1552]:
features = json.load(open("/home/anatoli/Документы/airbnb/neighbourhoods.geojson"))

In [1553]:
listings_df = pd.read_csv("/home/anatoli/Документы/airbnb/listings.csv")

In [1554]:
listings_df["price_value"] = listings_df["price"].apply(lambda x: x[1:].replace(",", "")).astype(float)

In [1555]:
listings_df["price_per_accommodation"] = listings_df["price_value"] / listings_df["accommodates"]

In [1556]:
dataset_df = listings_df[
    (listings_df["review_scores_rating"] >= 95) & \
    (listings_df["accommodates"] < 3)
][["room_type", "neighbourhood_cleansed", "price_per_accommodation", "name", "description", "neighborhood_overview", "latitude", "longitude"]].copy()

In [1557]:
avg_prices_df = \
    dataset_df.groupby(["room_type", "neighbourhood_cleansed"])["price_per_accommodation"]\
    .agg(["mean", "median", "std"]).reset_index()

In [1558]:
# avg_prices_df["max"] = avg_prices_df["mean"] + 0.5 * avg_prices_df["std"]
# avg_prices_df["min"] = avg_prices_df["mean"] - 0.5 * avg_prices_df["std"]

In [1559]:
prices_map_df = avg_prices_df.set_index(["neighbourhood_cleansed", "room_type"])["mean"].unstack(-1)

In [1560]:
geo_df = gpd.GeoDataFrame.from_features(features)

In [1561]:
room_prices_df = avg_prices_df[avg_prices_df["room_type"] == "Private room"].copy()

In [1562]:
room_prices_df = geo_df.merge(room_prices_df, left_on="neighbourhood", right_on="neighbourhood_cleansed").set_index("neighbourhood")

In [1563]:
# fig = px.choropleth_mapbox(
#     room_prices_df, geojson=room_prices_df["geometry"], locations=room_prices_df.index,
#     color="mean",
#     color_continuous_scale="Viridis",
#     range_color=(room_prices_df["mean"].min(), room_prices_df["mean"].max()),
#     mapbox_style="carto-positron",
#     zoom=9, 
#     center = {"lat": 51.5074, "lon": -0.1278},
#     opacity=0.5,
# )
# fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
# fig.show()

In [1564]:
prices_map_df = prices_map_df.sort_values("Private room", ascending=False).iloc[0:20]

In [1565]:
for c in prices_map_df.columns:
    prices_map_df[c] = prices_map_df[c].apply(lambda x: "{:.2f} $".format(x))
    prices_map_df[c] = prices_map_df[c].str.replace("nan \$", "")

In [1566]:
prices_map_df.index.name = ""

In [1567]:
prices_map_df.columns.name = ""

In [1568]:
prices_map_df

Unnamed: 0,Entire home/apt,Hotel room,Private room,Shared room
,,,,
Westminster,72.12 $,105.77 $,45.58 $,29.75 $
Kensington and Chelsea,59.79 $,72.69 $,41.07 $,45.88 $
Hammersmith and Fulham,47.18 $,60.90 $,38.38 $,25.62 $
Southwark,47.08 $,,38.02 $,28.60 $
Barking and Dagenham,34.88 $,,36.68 $,
Haringey,37.41 $,17.00 $,34.67 $,22.00 $
City of London,55.26 $,84.00 $,34.30 $,
Camden,58.42 $,48.72 $,32.71 $,28.38 $
Wandsworth,43.28 $,56.00 $,32.33 $,30.00 $


In [1569]:
# merged_df["price_per_accommodation_diff"]

In [1570]:
merged_df = dataset_df.merge(avg_prices_df, on=["room_type", "neighbourhood_cleansed"])

In [1571]:
merged_df["price_per_accommodation_diff"] = (merged_df["price_per_accommodation"] - merged_df["mean"]) / merged_df["mean"]

In [1572]:
merged_df["target"] = False
merged_df.loc[direction * merged_df["price_per_accommodation"] >= direction * merged_df["mean"], "target"] = True

In [1573]:
merged_df.dropna(inplace=True)

In [1574]:
merged_df["target"].value_counts()

True     8340
False    4374
Name: target, dtype: int64

In [1575]:
train_indices, test_indices = train_test_split(merged_df.index, test_size=0.3)

In [1576]:
merged_df["test"] = False
merged_df.loc[test_indices, "test"] = True

In [1577]:
vectorizer = TfidfVectorizer(min_df=5, max_features=50000, stop_words='english')

In [1578]:
merged_df["full_text"] = merged_df["description"] + " " +  merged_df["neighborhood_overview"]

In [1579]:
def remove_tags(x):
    return x.replace("<", " ").replace(">", " ")

In [1580]:
merged_df["full_text"] = merged_df["full_text"].apply(remove_tags)

In [1581]:
X_train = vectorizer.fit_transform(merged_df[~merged_df["test"]]["full_text"])

In [1582]:
X_test = vectorizer.transform(merged_df[merged_df["test"]]["full_text"])

In [1583]:
y_train = merged_df[~merged_df["test"]]["target"]

In [1584]:
y_test = merged_df[merged_df["test"]]["target"]

In [1585]:
from sklearn.model_selection import GridSearchCV

In [1586]:
grid = {"C": np.logspace(-2, 0, 30)}
# grid = {"n_estimators": [200]}

In [1587]:
from sklearn.ensemble import GradientBoostingClassifier

In [1588]:
model = LogisticRegression(solver="liblinear")
# model = GradientBoostingClassifier()

In [1589]:
search = GridSearchCV(model, grid, cv=3, scoring="roc_auc")

In [1590]:
search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': array([0.01      , 0.01172102, 0.01373824, 0.01610262, 0.01887392,
       0.02212216, 0.02592944, 0.03039195, 0.03562248, 0.04175319,
       0.04893901, 0.05736153, 0.06723358, 0.07880463, 0.09236709,
       0.10826367, 0.1268961 , 0.14873521, 0.17433288, 0.20433597,
       0.23950266, 0.28072162, 0.32903446, 0.38566204, 0.45203537,
       0.52983169, 0.62101694, 0.72789538, 0.85316785, 1.        ])},
             scoring='roc_auc')

In [1591]:
search.best_score_

0.6092233665425278

In [1592]:
search.best_estimator_

LogisticRegression(C=0.38566204211634725, solver='liblinear')

In [1593]:
prediction_train = search.predict_proba(X_train).T[1]

In [1594]:
prediction_test = search.predict_proba(X_test).T[1]

In [1595]:
roc_auc_score(y_train, prediction_train), roc_auc_score(y_test, prediction_test)

(0.7956814686218842, 0.6738933834693805)

In [1596]:
eli5.explain_weights(search.best_estimator_, feature_names=vectorizer.get_feature_names(), top=100)

Weight?,Feature
+1.623,double
+1.280,bus
+1.272,peckham
+1.190,studio
+1.089,kitchen
+1.050,couples
+1.029,cooking
+1.007,venice
+1.007,east
+0.985,couple


In [1597]:
import numpy as np

In [1598]:
feature_names = np.array(vectorizer.get_feature_names())

In [1599]:
best_features = feature_names[np.argsort(-search.best_estimator_.coef_)][0, 0:1000]

In [1600]:
from tqdm import tqdm_notebook

In [1601]:
regions = listings_df["neighbourhood_cleansed"].str.lower().unique().tolist()

In [1602]:
from scipy.stats import ttest_ind, mannwhitneyu

In [1603]:
# found_places = []
# all_found_places_df = []

In [1604]:
for place in tqdm_notebook(best_features):
    place_indices = merged_df["neighborhood_overview"].str.lower().str.contains(place)
    place_df = merged_df[place_indices][["latitude", "longitude", "target", "price_per_accommodation_diff", "mean"]].copy()
    place_df["place"] = place
    place_df["direction"] = direction
    if min(place_df[["latitude", "longitude"]].std() < 0.02) and (place_df.shape[0] > 10):
        total_mean_diff = merged_df["price_per_accommodation_diff"].mean()
        place_mean_diff = place_df["price_per_accommodation_diff"].mean()
        place_pvalue = mannwhitneyu(place_df["price_per_accommodation_diff"], merged_df["price_per_accommodation_diff"]).pvalue
        
        print(place, place_mean_diff, place_pvalue)
        
        if (total_mean_diff * direction > place_mean_diff * direction) or (place_pvalue > 0.05):
            print("Not significant")
            continue
        
        if any([place in region for region in regions]):
            print("Not included")
            continue
        found_places.append(place)
        all_found_places_df.append(place_df)


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

peckham -0.23773441473356643 1.5319166097077098e-11
bethnal -0.12258846099117059 0.04852760026187321
morden -0.09163795759594177 0.23783997434338466
Not significant
maida -0.23188371544257771 9.18792107301194e-08
tooting -0.08948724378020521 0.00018505052038862813
newington -0.13880645673142725 0.05897416612355697
Not significant
dalston -0.11170691318917861 0.16845540124717107
Not significant
camberwell -0.2412468843313477 7.111759608033074e-08
catford -0.27794479537968103 0.0031158253650241875
portobello -0.11232699463486105 1.9516552003023763e-05
kilburn -0.08617936844332762 0.06423863359189368
Not significant
golborne -0.2206218386597863 0.00713469606912019
bayswater -0.25824893414631783 5.321803109657346e-06
feltham -0.32158724206256195 0.0015585853932511386
queensway -0.16770294754827195 0.0062769548744738175
marylebone 0.012441580890617874 0.005904159441795267
Not significant
holloway -0.08990007405820492 0.26343285846085857
Not significant
roehampton -0.2604162392169496 0.04563

In [1605]:
found_places_df = pd.concat(all_found_places_df)

In [1606]:
found_places_df

Unnamed: 0,latitude,longitude,target,price_per_accommodation_diff,mean,place,direction
1,51.52842,-0.11141,True,0.618430,31.512012,clerkenwell,1
6,51.52231,-0.09144,True,0.396293,31.512012,clerkenwell,1
148,51.52310,-0.10351,True,0.570830,31.512012,clerkenwell,1
174,51.52312,-0.10605,False,-0.127317,31.512012,clerkenwell,1
177,51.53156,-0.11290,False,-0.079716,31.512012,clerkenwell,1
...,...,...,...,...,...,...,...
13409,51.51573,-0.19841,True,-0.584044,72.123053,ledbury,-1
13432,51.51754,-0.19747,False,0.649958,72.123053,ledbury,-1
13497,51.51617,-0.19251,True,-0.653370,72.123053,ledbury,-1
13521,51.52570,-0.20090,True,-0.327538,72.123053,ledbury,-1


In [1456]:
# found_places_df["price_per_accommodation_diff"] = (found_places_df["price_per_accommodation"] - found_places_df["mean"]) / found_places_df["mean"] * 100

In [1457]:
# listings_df["neighbourhood_cleansed"].value_counts()

In [1681]:
index = merged_df["full_text"].str.lower().str.contains("bethnal")
merged_df[index.fillna(False)]["full_text"].sample().iloc[0]

" b The space /b  br / Located near the city center, in the cool neighborhood of Bethnal Green, it’s my apartment, offers full confort to any guest considering staying in a nice place for long term, situated in zone 2📍 E3 4TN 📍 br /  br / KEY FEATURES: br / Fast WiFi ⚡️ br / Fully furnished 🏠 br / Area full of shops and parks⛹️\u200d♂️ br /  br / ⭐️EXCELLENT LOCATION : ZONE 2 ⭐️ br /  4 Mins. To Mile End tube Station. br / 25 Mins. to City center.  br /  br / What you can reach in 20 min: br / -Shoredicht  br / -Liverpool street. br / -Tower bridge.   br / - St. Paul cathedral.   br / - Stratford shopping center.  br / - Canary Wharf.  br / -Victoria park.  br / And get to the city center in 25 min📍 br /  br / ABOUT MILE END:  br / Mile End is an area in the East End of London centred, 1.5 mile northeast of Liverpool Street station. It currently also gives its name to an electoral ward of the London Borough of Tower Hamlets and co-encompasses St Peter's ward. Part of the area holds con

In [1608]:
found_places_df["place"].unique()

array(['clerkenwell', 'hoxton', 'battersea', 'columbia', 'salusbury',
       'abbeville', 'barbican', 'bloomsbury', 'redchurch', 'turnham',
       'farringdon', 'parsons', 'mare', 'geffrye', 'haggerston',
       'peckham', 'bethnal', 'maida', 'tooting', 'camberwell', 'catford',
       'portobello', 'golborne', 'bayswater', 'feltham', 'queensway',
       'roehampton', 'stoke', 'canada', 'swiss', 'ladywell', 'sisters',
       'quays', 'elephant', 'edmonton', 'finsbury', 'streatham',
       'rotherhithe', 'nunhead', 'bussey', 'limehouse', 'zebra',
       'ledbury'], dtype=object)

In [1610]:
found_places_df["direction"].value_counts()

-1    2256
 1    1252
Name: direction, dtype: int64

In [1611]:
place_names = {
    feature: feature.capitalize()
    for feature in found_places_df["place"].unique()
    if feature not in ["duplex", "samsung", "heated"]
}

In [1612]:
found_places_df["Place"] = found_places_df["place"].apply(lambda x: place_names.get(x))

In [1613]:
# found_places_df.groupby("place").agg({
#     "price_per_accommodation": "mean",
#     "mean": "mean"
# })

In [1615]:
found_places_df["Costs"] = found_places_df["direction"].apply({1: "Expensive", -1: "Cheap"}.get)

In [1634]:
found_places_df["Price difference"] = found_places_df["price_per_accommodation_diff"].apply(lambda x: "{:.2f} %".format(x * 100))

In [1635]:
# found_places_df[]

In [1636]:
help(px.colors.qualitative)

Help on module _plotly_utils.colors.qualitative in _plotly_utils.colors:

NAME
    _plotly_utils.colors.qualitative - Qualitative color sequences are appropriate for data that has no natural ordering, such as categories, colors, names, countries etc. The color sequences in this module are mostly meant to be passed in as the `color_discrete_sequence` argument to various functions.

FUNCTIONS
    swatches(template=None)
        Parameters
        ----------
        template : str or dict or plotly.graph_objects.layout.Template instance
            The figure template name or definition.
        
        Returns
        -------
        fig : graph_objects.Figure containing the displayed image
            A `Figure` object. This figure demonstrates the color scales and
            sequences in this module, as stacked bar charts.

DATA
    __all__ = ['swatches']

FILE
    /home/anatoli/anaconda3/lib/python3.7/site-packages/_plotly_utils/colors/qualitative.py




In [1647]:
px.colors.qualitative.G10

['#3366CC',
 '#DC3912',
 '#FF9900',
 '#109618',
 '#990099',
 '#0099C6',
 '#DD4477',
 '#66AA00',
 '#B82E2E',
 '#316395']

In [1675]:
import plotly.express as px

px.scatter_mapbox(
    found_places_df[~found_places_df["Place"].isnull() & (found_places_df["direction"] == 1)], 
    lat="latitude", lon="longitude", color="Place", hover_name="Place",
    hover_data=["Price difference"],
    size_max=15, zoom=11, mapbox_style="carto-positron",
    width=900, height=700
)

In [1673]:
import plotly.express as px

px.scatter_mapbox(
    found_places_df[~found_places_df["Place"].isnull() & (found_places_df["direction"] == -1)], 
    lat="latitude", lon="longitude", color="Place", hover_name="Place",
    hover_data=["Price difference"],
    size_max=15, zoom=10, mapbox_style="carto-positron",
    width=900, height=700
)