In [101]:
import pandas as pd

In [102]:
listings_df = pd.read_csv("/home/anatoli/Документы/airbnb/listings.csv")

In [103]:
listings_df["price_value"] = listings_df["price"].apply(lambda x: x[1:].replace(",", "")).astype(float)

In [187]:
dataset_df = listings_df[listings_df["review_scores_rating"] >= 90][["room_type", "neighbourhood_cleansed", "price_value", "name", "description", "neighborhood_overview", "latitude", "longitude"]].copy()

In [188]:
# dataset_df = dataset_df.dropna()

In [189]:
avg_prices_df = dataset_df.groupby(["room_type", "neighbourhood_cleansed"])["price_value"].agg(["mean", "median", "std"]).reset_index()

In [190]:
avg_prices_df["max"] = avg_prices_df["mean"] + avg_prices_df["std"]

In [191]:
prices_map_df = avg_prices_df.set_index(["neighbourhood_cleansed", "room_type"])["mean"].unstack(-1)

In [192]:
prices_map_df.sort_values("Private room", ascending=False).iloc[0:10]

room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_cleansed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Westminster,206.954969,191.071429,82.647343,39.909091
City of London,386.654054,168.0,73.88,
Kensington and Chelsea,178.025962,142.375,73.761341,61.857143
Hammersmith and Fulham,135.755054,156.6,61.643695,47.111111
Camden,143.819528,63.434783,57.972914,112.583333
Southwark,137.912243,89.0,57.265269,108.444444
Richmond upon Thames,152.117021,132.75,54.158385,112.5
Barking and Dagenham,80.636364,,53.820755,40.0
Islington,191.040398,106.0,53.789195,37.2
Haringey,115.080756,34.0,53.611465,19.6


In [196]:
merged_df = dataset_df.merge(avg_prices_df, on=["room_type", "neighbourhood_cleansed"])

In [197]:
merged_df["target"] = False
merged_df.loc[merged_df["price_value"]>= merged_df["max"], "target"] = True

In [198]:
merged_df.dropna(inplace=True)

In [199]:
merged_df["target"].value_counts()

False    28091
True      1766
Name: target, dtype: int64

In [200]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [201]:
from sklearn.model_selection import train_test_split

In [202]:
train_indices, test_indices = train_test_split(merged_df.index, test_size=0.3)

In [203]:
merged_df["test"] = False
merged_df.loc[test_indices, "test"] = True

In [204]:
vectorizer = TfidfVectorizer(min_df=5, max_features=10000, stop_words='english')

In [205]:
merged_df["full_text"] = merged_df["description"] + " " +  merged_df["neighborhood_overview"]

In [303]:
def remove_tags(x):
    return x.replace("<", " ").replace(">", " ")

In [304]:
merged_df["full_text"] = merged_df["full_text"].apply(remove_tags)

In [305]:
X_train = vectorizer.fit_transform(merged_df[~merged_df["test"]]["full_text"])

In [306]:
X_test = vectorizer.transform(merged_df[merged_df["test"]]["full_text"])

In [307]:
y_train = merged_df[~merged_df["test"]]["target"]

In [308]:
y_test = merged_df[merged_df["test"]]["target"]

In [309]:
from sklearn.linear_model import LogisticRegression

In [310]:
from sklearn.metrics import roc_auc_score

In [311]:
model = LogisticRegression()

In [312]:
model.fit(X_train, y_train)

LogisticRegression()

In [313]:
prediction_train = model.predict_proba(X_train).T[1]

In [314]:
X_test

<8958x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 816251 stored elements in Compressed Sparse Row format>

In [315]:
prediction_test = model.predict_proba(X_test).T[1]

In [316]:
roc_auc_score(y_train, prediction_train)

0.901457451204206

In [317]:
roc_auc_score(y_test, prediction_test)

0.8209212425701323

In [318]:
import eli5

In [319]:
eli5.explain_weights(model, feature_names=vectorizer.get_feature_names(), top=100)

Weight?,Feature
+3.444,rooms
+3.199,family
+3.054,bedrooms
+2.687,groups
+2.526,master
+2.251,stunning
+2.226,shoreditch
+2.189,families
+2.079,bbq
+2.018,house


In [320]:
import numpy as np

In [321]:
feature_names = np.array(vectorizer.get_feature_names())

In [322]:
best_features = feature_names[np.argsort(-model.coef_)][0, 0:500]

In [323]:
from tqdm import tqdm_notebook

In [324]:
regions = listings_df["neighbourhood_cleansed"].str.lower().unique().tolist()

In [325]:
found_places = []
all_found_places_df = []

for place in tqdm_notebook(best_features):
    place_indices = merged_df["neighborhood_overview"].str.lower().str.contains(place)
    place_df = merged_df[place_indices][["latitude", "longitude", "target", "price_value"]].copy()
    place_df["place"] = place
    if min(place_df[["latitude", "longitude"]].std() < 0.02):
        print(place)
        if any([place in region for region in regions]):
            print("Not included")
            continue
        found_places.append(place)
        all_found_places_df.append(place_df)


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))

parsons
leyton
leytonstone
fireplaces
aldgate
exmouth
isleworth
kensal
closets
wanstead
goodmayes
askew
blitz
aeltc
chatsworth
cricklewood
maids
skylights
norbiton
spitalfields
chancery
colleagues
coulsdon
marksman
lgbtqia



In [326]:
found_places_df = pd.concat(all_found_places_df)

In [328]:
# listings_df["neighbourhood_cleansed"].value_counts()

In [428]:
index = merged_df["full_text"].str.lower().str.contains("chancery")
merged_df[index.fillna(False)]["full_text"].sample().iloc[0]

"Light & airy modern family home with fantastic garden in Bloomsbury, Zone 1, in the heart of London’s legal area close to Chancery Lane & Holborn. A vibrant area with restaurants, shops, markets, and close to four tube stations.  Easy walking distance to Lamb Conduit St, Exmouth Market, British Museum, Kings Cross & Granary Square, Farringdon, and also very close to Covent Garden, Soho, & Oxford St, Smithfield, the Barbican & short bus trips to Shoreditch, Hoxton, Borough Market & Camden Town. br /  br /  b The space /b  br / This is a 1st & 2nd floor light-filled family home in WC1, Zone 1, in a small privately owned apartment building, with a lift.  It is very spacious & comfortable with a fantastic garden to enjoy.   br /  br / Downstairs - Entrance hall, Lounge & kitchen (pictured), Shower room (pictured), Laundry room, Double bedroom (sofa bed), Conservatory dining room (pictured), Leafy garden with BBQ & dining area (pictured) with retractable awning. There's a large flatscreen 

In [429]:
place_names = {'spitalfields': "Spitalfields",
 'chatsworth': 'Chatsworth',
 'kensal': 'Kensal',
 'leyton': 'Leyton',
 'exmouth': 'Exmouth',
 'parsons': 'Parsons Green',
 'aldgate': 'Aldgate',
 'wanstead': 'Wanstead',
 'leytonstone': "Leyton",
 'askew': 'Askew',
 'cricklewood': 'Cricklewood',
 'chancery': 'Chancery Lane',
 'coulsdon': 'Coulsdon',
 'isleworth': 'Isleworth',
 'norbiton': 'Norbiton',
 'marksman': 'The Marksman',
 'goodmayes': 'Goodmayes',
 'aeltc': 'aeltc'.upper()}

In [430]:
found_places_df["place_name"] = found_places_df["place"].apply(place_names.get)

In [432]:
import plotly.express as px

px.scatter_mapbox(found_places_df[~found_places_df["place_name"].isnull()], lat="latitude", lon="longitude", color="place_name", hover_name="place_name",
                  color_continuous_scale=px.colors.cyclical.IceFire, size_max=15, zoom=9, mapbox_style="stamen-terrain")
