# Identifying ghost hotels

In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import re
from pysal.lib import weights

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances_chunked
from sklearn.cluster import DBSCAN
from numbers import Number
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

In [2]:
# Listings with NLP-normalised descriptions (gzipped CSV hosted on GitHub).
# `id` is read as float64; every text column is read as str.
norm_df = pd.read_csv(
    'https://github.com/jreades/i2p/blob/master/data/clean/2020-08-24-listings-with-nlp-subset.csv.gz?raw=true',
    compression='gzip',
    dtype={
        'id': np.float64,
        **dict.fromkeys(['listing_url', 'name', 'description', 'description_norm'], str),
    },
)

# Reference attributes from the source listings file, restricted to the
# columns this notebook uses. Numeric columns are read as float64 and
# `price` as str.
ref_df = pd.read_csv(
    'https://github.com/jreades/i2p/blob/master/data/src/2020-08-24-listings.csv.gz?raw=true',
    compression='gzip',
    usecols=[
        'id',
        'room_type',
        'calculated_host_listings_count',
        'calculated_host_listings_count_entire_homes',
        'host_id',
        'host_name',
        'availability_365',
        'latitude',
        'longitude',
        'price',
        'number_of_reviews_ltm',
    ],
    dtype={
        **dict.fromkeys(['room_type', 'host_name', 'price'], str),
        **dict.fromkeys(
            [
                'id',
                'calculated_host_listings_count',
                'calculated_host_listings_count_entire_homes',
                'host_id',
                'availability_365',
                'latitude',
                'longitude',
                'number_of_reviews_ltm',
            ],
            np.float64,
        ),
    },
)

In [3]:
# Inner join (the pd.merge default): keep only listings present in both files.
df = pd.merge(norm_df, ref_df, on='id')

# Remove hotel rooms and rows missing an id, a host listing count or a latitude.
df.drop(df[(df.room_type == 'Hotel room') |
           (df.id.isna()) |
           (df.calculated_host_listings_count.isna()) |
           (df.latitude.isna())].index, inplace=True)

#Converting columns
# regex=False strips the characters literally: '$' is an end-of-string anchor
# when treated as a regular expression, and the default for `regex` differs
# between pandas versions.
df['price'] = (df['price'].str.replace('$', '', regex=False)
                          .str.replace(',', '', regex=False)
                          .astype('float'))
ints = ['id', 'host_id', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'availability_365', 'number_of_reviews_ltm']
for i in ints:
    print(f"Converting {i}")
    try:
        df[i] = df[i].astype('int')
    except ValueError:
        # Plain int cannot hold missing values. Fall back to the nullable
        # 64-bit integer dtype: it keeps the NAs and is wide enough for
        # listing/host ids (a 16-bit unsigned integer tops out at 65535).
        print("\tConverting to nullable 64-bit integer.")
        df[i] = df[i].astype(pd.Int64Dtype())

#Setting custom index
df.set_index('id', inplace=True)

#Dropping non-revenue generating listings
# (listings with no reviews in the last twelve months)
df.drop(df[df['number_of_reviews_ltm'] < 1].index, inplace=True)

# Candidates: entire homes with a normalised description whose host lists
# at least two entire homes.
potential_ghosts = df[(df['calculated_host_listings_count_entire_homes'] >= 2) &
                   (df['room_type'] == 'Entire home/apt') &
                   (~df['description_norm'].isna())].copy()

print(f'There are {potential_ghosts.shape[0]} candidates for ghost listings, which make up {(potential_ghosts.shape[0]/df.shape[0])*100:.2f}% of revenue-generating listings in London.')

Converting id
Converting host_id
Converting calculated_host_listings_count
Converting calculated_host_listings_count_entire_homes
Converting availability_365
Converting number_of_reviews_ltm
There are 9424 candidates for ghost listings, which make up 28.06% of revenue-generating listings in London.


In [4]:
# Normalised descriptions of the candidate listings, one document per listing.
corpus = potential_ghosts['description_norm'].values

# TF-IDF over unigrams and bigrams. max_df / min_df discard terms that occur
# in more than 50% or fewer than 5% of the documents.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.5,
    use_idf=True,
)

# fit: build the vocabulary of words/ngrams and their IDF scores.
# transform: build the document-by-term matrix of TF-IDF scores.
# toarray: densify the result for the cosine similarity step.
tcorpus = vectorizer.fit_transform(corpus).toarray()

In [5]:
# Square frame of pairwise cosine distances between the TF-IDF rows,
# labelled on both axes with the candidate listings' index.
keys = potential_ghosts.index
cosine_distances = pd.DataFrame(
    data=squareform(pdist(tcorpus, metric='cosine')),
    index=keys,
    columns=keys,
)

In [6]:
# Keep listings that have more than two entries under the 0.2 cosine-distance
# threshold in their row, then cut the distance matrix down to those listings.
close_counts = (cosine_distances < 0.2).sum(axis=1)
ghost_listings = close_counts[close_counts > 2].index.tolist()
ghost_listings_mat = cosine_distances.loc[ghost_listings, ghost_listings]
ghost_listings_mat.shape

(2694, 2694)

#### Trying out DBSCAN

In [7]:
#Set up data frame to store results
# Take an explicit copy: later cells add cluster-label columns to `results`
# and drop rows from it in place, which must not touch (or warn about)
# `potential_ghosts`.
results = potential_ghosts.loc[ghost_listings].copy()

In [8]:
def db_cluster(df1, df2, eps:Number, metric:str, name:str, min_samples:int=3):
    '''Run DBSCAN on df1 and store the cluster labels in df2 as column `name`.

    Parameters
    ----------
    df1 : array-like
        Input passed to ``DBSCAN.fit``: a feature matrix, or a square
        distance matrix when ``metric='precomputed'``.
    df2 : pandas.DataFrame
        Frame with one row per sample in df1, in the same order.
        It is modified in place.
    eps : Number
        Passed to DBSCAN as ``eps``.
    metric : str
        Passed to DBSCAN as ``metric``.
    name : str
        Name of the column that receives the labels.
    min_samples : int, default 3
        Passed to DBSCAN as ``min_samples``.

    Returns
    -------
    pandas.DataFrame
        df2 itself (the same object), now carrying the label column.

    Raises
    ------
    ValueError
        If df1 and df2 do not have the same number of rows.
    '''
    # Check before fitting so a mismatch does not cost a full clustering run.
    if len(df1) != len(df2):
        raise ValueError(
            f'df1 has {len(df1)} rows but df2 has {len(df2)}; '
            'they must describe the same samples in the same order.')
    output = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(df1)
    s = pd.Series(output.labels_, index=df2.index, name=name)
    # Labels are in df1's row order, so assign them positionally rather than
    # by index alignment (which fails when df2's index has duplicates).
    df2[name] = output.labels_
    print(s.value_counts())
    return df2

In [9]:
# Cluster on the precomputed cosine distances; labels are written to
# results['text_dbscan'].
db_cluster(
    df1=ghost_listings_mat,
    df2=results,
    eps=0.2,
    metric='precomputed',
    name='text_dbscan',
)

 139    147
 202    106
 148     50
 293     42
-1       37
       ... 
 11       3
 7        3
 5        3
 1        3
 0        3
Name: text_dbscan, Length: 414, dtype: int64


Unnamed: 0_level_0,listing_url,name,description,description_norm,host_id,host_name,latitude,longitude,room_type,price,availability_365,number_of_reviews_ltm,calculated_host_listings_count,calculated_host_listings_count_entire_homes,text_dbscan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
47687,https://www.airbnb.com/rooms/47687,Cosy Double studio in Zone 2 Hammersmith (6),<b>The space</b><br />Our studios are located ...,space studio locate hammersmith hammersmith le...,216660,Boris,51.49506,-0.22750,Entire home/apt,78.00,365,1,9,7,0
63948,https://www.airbnb.com/rooms/63948,Cosy Double studio in Zone 2 Hammersmith (4),<b>The space</b><br />Our studios are located ...,space studio locate hammersmith hammersmith le...,216660,Boris,51.49402,-0.22724,Entire home/apt,57.00,325,1,9,7,0
107048,https://www.airbnb.com/rooms/107048,A Luxury Studio Suite in Clerkenwell,"Situated in the heart of vibrant Clerkenwell, ...",situate heart vibrant clerkenwell many conside...,259088,Simon,51.52407,-0.10554,Entire home/apt,93.00,205,18,4,3,1
264776,https://www.airbnb.com/rooms/264776,Huge Four Bedroom Apartment,An extremely large and sunny four bedroom grou...,extremely large sunny four bedroom ground floo...,1389063,Sue,51.44251,-0.01989,Entire home/apt,218.00,312,7,11,11,2
264778,https://www.airbnb.com/rooms/264778,Two Bedroom Newly Refurbished Apartment,A large and sunny two bedroom second floor apa...,large sunny two bedroom second floor apartment...,1389063,Sue,51.44368,-0.02195,Entire home/apt,120.00,346,4,11,11,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44428146,https://www.airbnb.com/rooms/44428146,Greater London 1 Bedroom Cosy Flat,We are still accepting bookings and taking ext...,still accept booking take extra cleaning preca...,338510519,Blue Star,51.49358,-0.18963,Entire home/apt,98.00,66,4,22,22,409
44585910,https://www.airbnb.com/rooms/44585910,"Gem in the heart of Notting Hill , can host 4 ppl",Beautiful One Bedroom apartment located in the...,beautiful one bedroom apartment locate heart w...,339017866,Inesa,51.51002,-0.19454,Entire home/apt,87.00,67,1,5,5,403
44626200,https://www.airbnb.com/rooms/44626200,★Beautiful & Comfy 1BR with GardenView BAYSWAT...,This is a Bright and Beautiful apartment with ...,bright beautiful apartment garden view private...,240473406,Etsh,51.51133,-0.18156,Entire home/apt,57.86,68,1,24,24,313
44737446,https://www.airbnb.com/rooms/44737446,Amazing ONE Bedroom Apartment Kensington,Lovely 1 bedroom apartment perfect for couples...,lovely bedroom apartment perfect couple family...,338510519,Blue Star,51.49338,-0.18967,Entire home/apt,114.00,65,2,22,22,404


In [10]:
# Ids of the listings that the text clustering labelled as noise (-1).
text_noise = results.loc[results['text_dbscan'].eq(-1)].index.tolist()

In [11]:
# Report how many listings the text clustering labelled as noise (-1).
print(f'{len(text_noise)} listings are considered noise and will be dropped in the analysis')

37 listings are considered noise and will be dropped in the analysis


#### Spatial DBSCAN

In [12]:
# Point geometry for every listing, built from longitude/latitude in EPSG:4326.
listing_points = gpd.points_from_xy(df['longitude'], df['latitude'], crs='epsg:4326')
gdf = gpd.GeoDataFrame(df, geometry=listing_points)

# Borough boundaries; the listings are reprojected to the boroughs' CRS
# so that both layers can be mapped together later.
boros = gpd.read_file('https://github.com/jreades/i2p/blob/master/data/src/Boroughs.gpkg?raw=true')
gdf = gdf.to_crs(boros.crs)

In [13]:
# Remove, in place, the listings labelled as text noise (-1), then show
# the remaining shape.
is_text_noise = results['text_dbscan'] == -1
results.drop(index=results.loc[is_text_noise].index, inplace=True)
results.shape

(2657, 15)

In [14]:
# Coordinates only, in (latitude, longitude) column order, for the spatial
# clustering step.
spatial_dbs_listings = results.loc[:, ['latitude', 'longitude']]
spatial_dbs_listings.head()

Unnamed: 0_level_0,latitude,longitude
id,Unnamed: 1_level_1,Unnamed: 2_level_1
47687,51.49506,-0.2275
63948,51.49402,-0.22724
107048,51.52407,-0.10554
264776,51.44251,-0.01989
264778,51.44368,-0.02195


In [15]:
# Degrees -> radians for both coordinate columns, ahead of the
# 'haversine' clustering in the next cell.
spatial_dbs_listings = np.radians(spatial_dbs_listings)

The following steps are adapted from Geoff Boeing's post "Clustering to Reduce Spatial Data Set Size" (https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/).

In [16]:
# eps in radians: the maximum distance (0.3 km) between two listings that
# could be in the same building, divided by kilometres per radian (6371.0088).
e = 0.3 / 6371.0088

# Cluster the radian coordinates; labels are written to results['geo_dbscan'].
db_cluster(
    df1=spatial_dbs_listings,
    df2=results,
    eps=e,
    metric='haversine',
    name='geo_dbscan',
)

 1      656
 8      294
 10     196
 11     164
-1       97
       ... 
 86       3
 84       3
 72       3
 70       3
 128      3
Name: geo_dbscan, Length: 130, dtype: int64


Unnamed: 0_level_0,listing_url,name,description,description_norm,host_id,host_name,latitude,longitude,room_type,price,availability_365,number_of_reviews_ltm,calculated_host_listings_count,calculated_host_listings_count_entire_homes,text_dbscan,geo_dbscan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
47687,https://www.airbnb.com/rooms/47687,Cosy Double studio in Zone 2 Hammersmith (6),<b>The space</b><br />Our studios are located ...,space studio locate hammersmith hammersmith le...,216660,Boris,51.49506,-0.22750,Entire home/apt,78.00,365,1,9,7,0,0
63948,https://www.airbnb.com/rooms/63948,Cosy Double studio in Zone 2 Hammersmith (4),<b>The space</b><br />Our studios are located ...,space studio locate hammersmith hammersmith le...,216660,Boris,51.49402,-0.22724,Entire home/apt,57.00,325,1,9,7,0,0
107048,https://www.airbnb.com/rooms/107048,A Luxury Studio Suite in Clerkenwell,"Situated in the heart of vibrant Clerkenwell, ...",situate heart vibrant clerkenwell many conside...,259088,Simon,51.52407,-0.10554,Entire home/apt,93.00,205,18,4,3,1,1
264776,https://www.airbnb.com/rooms/264776,Huge Four Bedroom Apartment,An extremely large and sunny four bedroom grou...,extremely large sunny four bedroom ground floo...,1389063,Sue,51.44251,-0.01989,Entire home/apt,218.00,312,7,11,11,2,2
264778,https://www.airbnb.com/rooms/264778,Two Bedroom Newly Refurbished Apartment,A large and sunny two bedroom second floor apa...,large sunny two bedroom second floor apartment...,1389063,Sue,51.44368,-0.02195,Entire home/apt,120.00,346,4,11,11,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44428146,https://www.airbnb.com/rooms/44428146,Greater London 1 Bedroom Cosy Flat,We are still accepting bookings and taking ext...,still accept booking take extra cleaning preca...,338510519,Blue Star,51.49358,-0.18963,Entire home/apt,98.00,66,4,22,22,409,10
44585910,https://www.airbnb.com/rooms/44585910,"Gem in the heart of Notting Hill , can host 4 ppl",Beautiful One Bedroom apartment located in the...,beautiful one bedroom apartment locate heart w...,339017866,Inesa,51.51002,-0.19454,Entire home/apt,87.00,67,1,5,5,403,11
44626200,https://www.airbnb.com/rooms/44626200,★Beautiful & Comfy 1BR with GardenView BAYSWAT...,This is a Bright and Beautiful apartment with ...,bright beautiful apartment garden view private...,240473406,Etsh,51.51133,-0.18156,Entire home/apt,57.86,68,1,24,24,313,11
44737446,https://www.airbnb.com/rooms/44737446,Amazing ONE Bedroom Apartment Kensington,Lovely 1 bedroom apartment perfect for couples...,lovely bedroom apartment perfect couple family...,338510519,Blue Star,51.49338,-0.18967,Entire home/apt,114.00,65,2,22,22,404,10


In [17]:
# Remove, in place, the listings that the spatial clustering labelled as
# noise (-1) -- 97 of them in the run recorded above.
is_geo_noise = results['geo_dbscan'] == -1
results.drop(index=results.loc[is_geo_noise].index, inplace=True)

In [18]:
# A distinct ghost hotel is a group of listings sharing both a spatial and
# a text cluster: one row per (geo_dbscan, text_dbscan) pair with its size.
cluster_pair_sizes = results.groupby(['geo_dbscan', 'text_dbscan']).size()
hotels = cluster_pair_sizes.reset_index(name='count')

In [19]:
# (rows, columns): one row per distinct (geo_dbscan, text_dbscan) pair.
hotels.shape

(497, 3)

In [20]:
# Use each row's index value as the identifier of that likely hotel.
hotels['hotel_id'] = hotels.index.to_numpy()

In [27]:
# merge() discards the left frame's index, so expose the listing id as a
# column first and restore it as the index afterwards.
results['id'] = results.index.values
# Name the join keys explicitly instead of relying on "all shared column
# names", and validate that each listing matches at most one hotel row so
# the merge can never silently duplicate listings.
hotels_results = results.merge(
    hotels,
    how='left',
    on=['geo_dbscan', 'text_dbscan'],
    validate='many_to_one',
).set_index('id')

In [28]:
# Preview: each listing now carries the size (`count`) and `hotel_id`
# of its (geo_dbscan, text_dbscan) group.
hotels_results.head()

Unnamed: 0_level_0,listing_url,name,description,description_norm,host_id,host_name,latitude,longitude,room_type,price,availability_365,number_of_reviews_ltm,calculated_host_listings_count,calculated_host_listings_count_entire_homes,text_dbscan,geo_dbscan,count,hotel_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
47687,https://www.airbnb.com/rooms/47687,Cosy Double studio in Zone 2 Hammersmith (6),<b>The space</b><br />Our studios are located ...,space studio locate hammersmith hammersmith le...,216660,Boris,51.49506,-0.2275,Entire home/apt,78.0,365,1,9,7,0,0,3,0
63948,https://www.airbnb.com/rooms/63948,Cosy Double studio in Zone 2 Hammersmith (4),<b>The space</b><br />Our studios are located ...,space studio locate hammersmith hammersmith le...,216660,Boris,51.49402,-0.22724,Entire home/apt,57.0,325,1,9,7,0,0,3,0
107048,https://www.airbnb.com/rooms/107048,A Luxury Studio Suite in Clerkenwell,"Situated in the heart of vibrant Clerkenwell, ...",situate heart vibrant clerkenwell many conside...,259088,Simon,51.52407,-0.10554,Entire home/apt,93.0,205,18,4,3,1,1,3,2
264776,https://www.airbnb.com/rooms/264776,Huge Four Bedroom Apartment,An extremely large and sunny four bedroom grou...,extremely large sunny four bedroom ground floo...,1389063,Sue,51.44251,-0.01989,Entire home/apt,218.0,312,7,11,11,2,2,3,108
264778,https://www.airbnb.com/rooms/264778,Two Bedroom Newly Refurbished Apartment,A large and sunny two bedroom second floor apa...,large sunny two bedroom second floor apartment...,1389063,Sue,51.44368,-0.02195,Entire home/apt,120.0,346,4,11,11,2,2,3,108


In [33]:
# Geometry-bearing rows of gdf for the listings assigned to a hotel.
hotel_listing_ids = hotels_results.index.to_numpy()
geo_hotels_results = gdf.loc[hotel_listing_ids]

In [None]:

#Extract clusters of listings that have similar descriptions to over 2 other listings
#2 because 1 will always be the listing itself, and the other could be another spare room in an apartment
# Rows are filtered with the same mask used to build `ghost_listings`; for each
# kept row, collect the column labels whose cosine distance is below 0.2.
# The result is a list of lists, one inner list per kept row, in row order.
ghost_clusters = cosine_distances[(cosine_distances < 0.2).sum(1) > 2].apply(lambda row: row[row < 0.2].index.tolist(), 1).tolist()

In [None]:
#Creating distance band weights for ghost listings
#1. Convert original df to gdf and reproject so it's in BNG
# NOTE(review): this cell repeats the gdf/boros construction from the
# "Spatial DBSCAN" section verbatim, including the download of the boroughs file.
gdf = gpd.GeoDataFrame(df, 
      geometry=gpd.points_from_xy(df['longitude'], df['latitude'], crs='epsg:4326'))
boros = gpd.read_file('https://github.com/jreades/i2p/blob/master/data/src/Boroughs.gpkg?raw=true')
# Reproject the listings to the CRS of the boroughs layer.
gdf = gdf.to_crs(boros.crs)

In [None]:
#Create separate df of ghost listings
# Copy so the subset is independent of gdf.
ghost_gdf = gdf.loc[ghost_listings].copy()
# Distance-band spatial weights with a threshold of 300, in the units of
# ghost_gdf's CRS (presumably metres after the reprojection -- TODO confirm).
# NOTE(review): a later cell looks neighbours up with test_dbw[listing_id], which
# assumes the weights are keyed by ghost_gdf's index -- verify for the installed pysal version.
test_dbw = weights.DistanceBand.from_dataframe(ghost_gdf, 300)

In [None]:
# Pair every ghost listing with its list of similar listings, then display
# the pairs ordered from the largest cluster to the smallest.
listing_clusters = [(listing, cluster) for listing, cluster in zip(ghost_listings, ghost_clusters)]
sorted(listing_clusters, key=lambda pair: len(pair[1]), reverse=True)

In [None]:
ghost_hotels = set()  # frozensets of listing ids, one per suspected hotel
seen = set()          # listings that already belong to some hotel

# Visit the largest similarity clusters first: the assumption is that the
# longest clusters contain every smaller subset a listing could belong to.
largest_first = sorted(listing_clusters, key=lambda pair: len(pair[1]), reverse=True)

for listing, similar in largest_first:
    # Skip listings that were already placed in a hotel.
    if listing in seen:
        continue

    # Members must be both textually similar and spatial neighbours.
    hotel = set(similar) & set(test_dbw[listing])

    # Require more than one member, none of which is already in a hotel.
    if len(hotel) <= 1 or hotel & seen:
        continue

    # The intersection does not contain the listing itself, so add it.
    hotel.add(listing)
    ghost_hotels.add(frozenset(hotel))
    seen.update(hotel)

In [None]:

# `entire_homes` is not defined in any cell of this notebook, so compute the
# denominator here: entire-home listings in `df`, which at this point only
# holds listings with at least one review in the last twelve months.
# NOTE(review): confirm this matches the originally intended denominator.
entire_homes = int((df['room_type'] == 'Entire home/apt').sum())
print(f'There are {len(seen)} listings in {len(ghost_hotels)} suspected ghost hotels, which account for {(len(seen)/entire_homes)*100:.2f}% of revenue-generating entire home listings in London.')

In [None]:
# The six suspected hotels with the most listings, largest first.
sorted(ghost_hotels, key=len, reverse=True)[:6]

In [None]:
# Look up three listings by hard-coded id (presumably one suspected hotel
# picked from the output above -- verify). The ids must be present in
# gdf's index, otherwise .loc raises a KeyError.
gdf.loc[[32010389, 34456467, 34456554]]

In [None]:
# Map every suspected ghost hotel on top of the borough boundaries.
fig, ax = plt.subplots(1,1, figsize=(12,10))
boros.plot(ax=ax)
for h in ghost_hotels:
    # Each hotel is a frozenset of listing ids; .loc does not accept set
    # indexers (TypeError in pandas >= 2.0), so pass a list instead.
    gdf.loc[list(h)].plot(ax=ax)
ax.set_title('Suspected ghost hotels');