# Reproducible Analysis

In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances_chunked
from itertools import chain

## Identifying ghost hotels

### Loading and cleaning data

In [5]:
norm_df = pd.read_csv('https://github.com/jreades/i2p/blob/master/data/clean/2020-08-24-listings-with-nlp-subset.csv.gz?raw=true',
                compression='gzip', dtype={'id':np.float64, 'listing_url':str, 'name':str, 'description':str, 'description_norm':str})
ref_df = pd.read_csv('https://github.com/jreades/i2p/blob/master/data/src/2020-08-24-listings.csv.gz?raw=true',
                    compression='gzip', usecols=['id', 'room_type', 'calculated_host_listings_count', 'host_id', 'host_name', 'availability_365'],
                    dtype={'id':np.float64, 'room_type':str, 'calculated_host_listings_count':np.float64, 'host_id':np.float64, 'host_name':str, 'availability_365':np.float64})

#### Joining df with normalised description to df with room type, then dropping hotel rooms, serviced apartments, and null normalised descriptions

In [3]:
df = pd.merge(norm_df, ref_df, on='id') #default is inner so we leave it
df.drop(df[(df.room_type == 'Hotel room') |
           (df.id.isna()) |
           (df.calculated_host_listings_count.isna()) |
           (df.description.isna()) |
           (df.description_norm.isna())].index, inplace=True)

ints = ['id', 'host_id', 'calculated_host_listings_count', 'availability_365']
for i in ints:
    print(f"Converting {i}")
    try:
        df[i] = df[i].astype('int')
    except ValueError as e:
        print("\tConverting to unsigned 16-bit integer.")
        df[i] = df[i].astype(pd.UInt16Dtype())

df.drop(df[df['calculated_host_listings_count'] <= 2].index, inplace=True)
df.set_index('id', inplace=True)

Converting id
Converting host_id
Converting calculated_host_listings_count
Converting availability_365


In [4]:
df.shape[0]

22256

#### To filter or not to filter serviced apartments?

Serviced apartments are an established type of tourist accommodation with similar characteristics to ghost hotels: their units are typically located in the same building, and units listed on Airbnb would likely have similar descriptions. However, established serviced apartments may have planning permission to operate as such, unlike ghost hotels. 

I experimented with filtering out listings that are likely to be legitimate serviced apartments before proceeding with text similarity analysis, but ultimately decided against it for the following reasons:
1. Listings described as 'serviced apartments' or 'serviced accommodation' account for a small percentage of commercially-run Airbnb listings.
2. Listings in buildings classed as 'residential' are still sometimes described as serviced apartments for marketing purposes, and there is no way to distinguish between these listings and legitimate serviced apartment listings.

In [6]:
serviced = df[df.description.str.contains(r'(?:serviced (?:apartments?|accommodation)+)', regex=True, flags=re.IGNORECASE)][['description', 'name', 'calculated_host_listings_count', 'availability_365']]
print(f'{(serviced.shape[0]/df.shape[0])*100:.2f}% of commercially-run Airbnb listings are described as serviced apartments.')

2.07% of commercially-run Airbnb listings are described as serviced apartments.


### Text similarity

In [8]:
corpus = df.description_norm.values

#### Creating TF/IDF vectorizer

In [9]:
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2), max_df=0.5, min_df=0.05)
#ignore terms that appear in over 50% or less than 5% of documents
tcorpus = vectorizer.fit_transform(corpus).toarray() 
#fit creates list of words/ngrams + IDF scores
#transform creates matrix of TF/IDF scores per word/ngram
#convert to array for cosine similarity step

#### Creating cosine distance matrix

In [10]:
def reduce_func(D_chunk, start):
    neigh = [np.flatnonzero(d < 0.15) for d in D_chunk] #index positions of listings with a cosine distance less than 0.2
    return neigh

ghost_idx = [] #set up list of indices to use for slicing original df
validation = [] #list of arrays, each containing int index of similar listings
for chunk in pairwise_distances_chunked(tcorpus, metric='cosine', reduce_func=reduce_func): #each chunk is a list of arrays
    for idx, d in enumerate(chunk):
        if (d.shape[0] >= 4):
            ghost_idx.append(idx)
            validation.append(d)
            
    #if the length of an array (d.shape[0]) is greater than or equal to 3,
    #the listing it corresponds to has low cosine distances from three or more listings,
    #indicating a likely ghost hotel

In [11]:
len(validation)

5703

In [45]:
pd.set_option('display.max_colwidth', None)
df.iloc[validation[938]][['description', 'host_id', 'name', 'room_type', 'availability_365']]

Unnamed: 0_level_0,description,host_id,name,room_type,availability_365
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20781314,"A twin room, situated within heart of London's Central East End conveniently positioned for the Canary Wharf, London City Centre, West-End, Olympic Village and all the great things London has to offer. The room comes with wardrobe, chest of drawers; desk, flat screen TV, shared bathrooms/kitchen with cooking facilities, a washing machine, fridge, iron etc<br /><br />We provide fresh bed linen and towels, free WI-FI and is centrally heated.<br /><br /><b>The space</b><br />The London tube stations of Stepney Green, Whitechapel and Bethnal Green is only 5 minutes (500 m) walk away.<br /><br />The bed and breakfast is very close to the Spitalfields, Brick lane and Columbia road markets. You can find there unique presents and souvenirs, art, vintage and hand made clothes and jewellery, try excellent food and absorb the urban atmosphere of East London.<br /><br />You can easily get any part of London by Underground, Overground, Buses, River boat (Thames Clipper) or on street rental bikes. O",148577632,Cosmopolitan Rooms London Whitechapel - Room 3,Private room,28
20781539,"A twin room, situated within heart of London's Central East End conveniently positioned for the Canary Wharf, London City Centre, West-End, Olympic Village and all the great things London has to offer. The room comes with wardrobe, chest of drawers; desk, flat screen TV, shared bathrooms/kitchen with cooking facilities, a washing machine, fridge, iron etc<br /><br />We provide fresh bed linen and towels, free WI-FI and is centrally heated.<br /><br /><b>The space</b><br />The London tube stations of Stepney Green, Whitechapel and Bethnal Green is only 5 minutes (500 m) walk away.<br /><br />The bed and breakfast is very close to the Spitalfields, Brick lane and Columbia road markets. You can find there unique presents and souvenirs, art, vintage and hand made clothes and jewellery, try excellent food and absorb the urban atmosphere of East London.<br /><br />You can easily get any part of London by Underground, Overground, Buses, River boat (Thames Clipper) or on street rental bikes. O",148577632,Cosmopolitan Rooms London Whitechapel - Room 6,Private room,172
21304969,"A single room, situated within heart of London's Central East End conveniently positioned for the Canary Wharf, London City Centre, West-End, Olympic Village and all the great things London has to offer. The room comes with wardrobe, chest of drawers; desk, flat screen TV, shared bathrooms/kitchen with cooking facilities, a washing machine, fridge, iron etc<br /><br />We provide fresh bed linen and towels, free WI-FI and is centrally heated.<br /><br /><b>The space</b><br />The London tube stations of Stepney Green, Whitechapel and Bethnal Green is only 5 minutes (500 m) walk away.<br /><br />The bed and breakfast is very close to the Spitalfields, Brick lane and Columbia road markets. You can find there unique presents and souvenirs, art, vintage and hand made clothes and jewellery, try excellent food and absorb the urban atmosphere of East London.<br /><br />You can easily get any part of London by Underground, Overground, Buses, River boat (Thames Clipper) or on street rental bikes.",148577632,Cosmopolitan Rooms London Whitechapel - Room 1,Private room,29
21410243,"A twin room, situated within heart of London's Central East End conveniently positioned for the Canary Wharf, London City Centre, West-End, Olympic Village and all the great things London has to offer. The room comes with wardrobe, chest of drawers; desk, flat screen TV, shared bathrooms/kitchen with cooking facilities, a washing machine, fridge, iron etc<br /><br />We provide fresh bed linen and towels, free WI-FI and is centrally heated.<br /><br /><b>The space</b><br />The London tube stations of Stepney Green, Whitechapel and Bethnal Green is only 5 minutes (500 m) walk away.<br /><br />The bed and breakfast is very close to the Spitalfields, Brick lane and Columbia road markets. You can find there unique presents and souvenirs, art, vintage and hand made clothes and jewellery, try excellent food and absorb the urban atmosphere of East London.<br /><br />You can easily get any part of London by Underground, Overground, Buses, River boat (Thames Clipper) or on street rental bikes. O",148577632,Cosmopolitan Rooms London Whitechapel - Room 5,Private room,28


Some issues:
- Some listing clusters really are multiple spare rooms in someone's house (tend to be 3-4 listings in these clusters, e.g. no. 42 + 46 for validation list, when length of array is set to greater than 3)
- Some listing clusters are hotel rooms listed as private rooms (Crawford Suites, Central Hoxton)
- Some really big listing clusters are just property management companies with rooms in a variety of locations (Sonder

In [11]:
print(f'{len(ghost_idx)} listings are suspected ghost hotels.')

5703 listings are suspected ghost hotels.


In [13]:
ghost_listings = df.iloc[ghost_idx, :]

In [30]:
pd.set_option('display.max_colwidth', None)
ghost_listings.sample(5)[['description', 'host_id', 'calculated_host_listings_count']]

Unnamed: 0_level_0,description,host_id,calculated_host_listings_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18744353,"Thank you for viewing ROSEMONT ROAD <br />We have a one bedroom ground floor flat as part of a detached family house. The flat has separate entrance and is close to buses and 4 stations; <br />Acton Town on Piccadilly and District line, West Acton on Central Line, Acton Central and Acton Main Line (6 minutes into Paddington).<br /><br />Morrison supermarket and Acton high street offering variety of restaurants pubs and bars are within a few minutes walk. Westfields shopping center is a 10 minute bus ride away.<br /><br /><b>The space</b><br />Cozy sitting room with a fitted kitchen including an electric oven & hob, washing machine, fridge freezer, microwave, toaster and kettle. <br />Bedroom with en suite shower room.<br /><br /><b>Guest access</b><br />The flat has a private entrance and is self contained.<br /><br /><b>Other things to note</b><br />Happy to give you a quote for pick up at any airport.",4261235,16
4364932,"Double bed, on second floor & adjacent to good bathroom (shared when student son at home). Wardrobe, drawers, desk, bedside tables with lamps, House in a friendly and safe residential neighbourhood.<br />Add £10 per night for couples, singles preferred.<br /><br /><b>The space</b><br />This is a nice room and very quiet and private. We also have two more expensive double rooms on the first floor so you can be 6 guests at one time<br /><br /><b>Guest access</b><br />Guests can make hot drinks in the kitchen whenever they like and of course have breakfast at whatever time is preferred - we provide tea and coffee (proper coffee machine), orange juice, cereals, milk, fruit, bread and butter (toaster) and all sorts of jams and spreads.<br /><br /><b>Other things to note</b><br />The bathroom is shared with our 20 year old son when he is home from University",6856235,3
14795298,"You are welcome to stay in our single room within a guest house - Great Location, only 4 minutes walking distance from Goodge Street Station - Perfect option for couples or solo travelleres who want to share lodgement<br /><br /><b>The space</b><br />We will be happy to host you in our Guest House in Fitzrovia. The flat has all you need to enjoy a great stay in central London. <br />The room is equipped with a double bed, tv, internet access and a desk. In the common areas, the apartment has a living room, a full equipped kitchen and a bathroom with shower.<br /><br /><b>Guest access</b><br />- Guests will have full access to common areas<br />- Feel at home, but please, treat the apartment as if it was yours<br />- We give full independence to our guests<br /><br /><b>Other things to note</b><br />- Please, read carefully and consider house rules",988379,43
20248295,"Hosted by me, Stephanie, and my brother, James. <br /><br />Situated in Grove Park of Chiswick, this beautiful 4 bedroom house is spacious and comfortable. The room is a double size bedroom. <br /><br />5 minute walk from Chiswick overground station (20 min train journey to Waterloo)<br /><br />Close to Gunnersbury tube station (0.4 miles) and Chiswick Park (District line)<br /><br />E3 bus can take you to Turnham Green station in 5 minutes. <br /><br />Close to Chiswick Business Park & GSK<br /><br />Chiswick House beautiful park close by.",116510596,4
20076729,"""Welcome to our bright room in Shoreditch! Perfect if you'd like to explore London in the hip and happening Shoreditch! You have some of Londons best restaurants, bars, clubs, markets and shopping destinations right on your doorstep. The apartment is perfect for exploring East""<br /><br /><b>The space</b><br />Room has a spacious and quirky feel to it with fresh colored walls, stylish wooden floors and tonnes of natural light coming through. Space features an ensuite bathroom, a small kitchenette (microwave, toaster, kettle, fridge?). You have access to other facilities in the building such a large communal kitchen, lounge, a washer/dryer and also the amazing communal roof terrace!<br /><br /><b>Guest access</b><br />Room is your own. Cleaned professionally with linens, it's prepared for each new visitor. Guests have access to the communal kitchen, lounge and rooftop terrace.<br /><br /><b>Other things to note</b><br />No parties, No smoking, No pets, No extra guests The room received",142570782,40


In [15]:
ghost_listings[
    ghost_listings['host_id'] == 33889201
].shape[0]

224

#### Previous approaches

In [None]:
cosine_distances = pd.DataFrame(squareform(pdist(tcorpus, metric='cosine')), index=keys, columns=keys)

In [None]:
cosine_distances.head(5)

#### Identifying listings that have multiple low cosine distances

My initial approach was to try to identify likely ghost hotels, but given that listings can appear in multiple clusters of varying lengths, it would be simpler to just identify _listings_ that are likely to be in ghost hotels

In [None]:
ghost_listings = cosine_distances[(cosine_distances < 0.2).sum(1) > 2].apply(lambda row: row[row < 0.2].index.tolist(), 1).tolist()

In [30]:
def FindMaxLength(lst): 
    maxList = max(lst, key = len) 
    maxLength = max(map(len, lst)) 
      
    return maxList, maxLength
  
print(FindMaxLength(validation)) 

(array([ 6488,  6491,  6747,  6749,  6843,  6844,  6846,  6880,  6881,
        6943,  6944,  6980,  7040,  7041,  7210,  7212,  7236,  7237,
        7238,  7239,  7241,  7263,  7354,  7974,  8009,  8171,  8257,
        8487,  8488,  8741,  8742,  8776,  8779,  9510,  9511,  9512,
        9543,  9555,  9672,  9673,  9677,  9678,  9679, 11026, 11036,
       11037, 11038, 11039, 11044, 11051, 11052, 11053, 11054, 11055,
       11056, 11057, 11058, 11059, 11060, 11062, 11063, 11064, 11065,
       11066, 11067, 11068, 11069, 11456, 11457, 11460, 11462, 11561,
       11562, 11563, 11564, 11565, 11566, 11575, 11771, 11960, 11961,
       12135, 12144, 12166, 12167, 12168, 12171, 12212, 12213, 12241,
       12243, 12256, 12260, 12273, 12277, 12299, 12300, 12301, 12302,
       12303, 12304, 12305, 12319, 12320, 12321, 12322, 12324, 12325,
       12326, 12327, 12414, 12435, 12436, 12437, 12438, 12439, 12498,
       13450, 14496, 15177, 15178, 15179, 15180, 15181, 15182, 15299,
       15324, 15325

In [None]:
counts = [len(h) for h in ghost_hotels]
sorted(counts, reverse=True)[:5]

In [None]:
ghost_listings[counts.index(60)]

In [None]:
ghost_hotels = set()
for i in ghost_listings:
    ghost_hotels.add(frozenset(i)) 
#the frozensets show that clusters of listings in different areas by the same host can have high cosine similarity
#need to find a better threshold
#but there are some ghost hotels that have cosine distances just under 0.2
#maybe draw buffers around each listing and 

In [None]:
len(ghost_hotels)

#### Different approach - creating a set of individual listings likely to be part of ghost hotels

In [None]:
ghost_listings2 = cosine_distances[(cosine_distances < 0.2).sum(1) > 2].index.tolist()

In [None]:
df.loc[cosine_distances[cosine_distances[40187901] < 0.2].index, ['description', 'host_id']] #it's picking up on ghost hotels with multiple associated accounts!