# Reproducible Analysis

In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances_chunked
from itertools import chain

## Identifying ghost hotels

### Loading and cleaning data

In [2]:
norm_df = pd.read_csv('https://github.com/jreades/i2p/blob/master/data/clean/2020-08-24-listings-with-nlp-subset.csv.gz?raw=true',
                compression='gzip', dtype={'id':np.float64, 'listing_url':str, 'name':str, 'description':str, 'description_norm':str})
ref_df = pd.read_csv('https://github.com/jreades/i2p/blob/master/data/src/2020-08-24-listings.csv.gz?raw=true',
                    compression='gzip', usecols=['id', 'room_type', 'calculated_host_listings_count', 'host_id', 'host_name', 'availability_365'],
                    dtype={'id':np.float64, 'room_type':str, 'calculated_host_listings_count':np.float64, 'host_id':np.float64, 'host_name':str, 'availability_365':np.float64})

#### Joining df with normalised description to df with room type, then dropping hotel rooms, serviced apartments, and null normalised descriptions

In [3]:
df = pd.merge(norm_df, ref_df, on='id') #default is inner so we leave it
df.drop(df[(df.room_type == 'Hotel room') |
           (df.id.isna()) |
           (df.calculated_host_listings_count.isna()) |
           (df.description.isna()) |
           (df.description_norm.isna())].index, inplace=True)

ints = ['id', 'host_id', 'calculated_host_listings_count', 'availability_365']
for i in ints:
    print(f"Converting {i}")
    try:
        df[i] = df[i].astype('int')
    except ValueError as e:
        print("\tConverting to unsigned 16-bit integer.")
        df[i] = df[i].astype(pd.UInt16Dtype())

df.drop(df[df['calculated_host_listings_count'] <= 2].index, inplace=True)
df.set_index('id', inplace=True)

Converting id
Converting host_id
Converting calculated_host_listings_count
Converting availability_365


In [4]:
df.shape[0]

22256

#### To filter or not to filter serviced apartments?

Serviced apartments are an established type of tourist accommodation with similar characteristics to ghost hotels: their units are typically located in the same building, and units listed on Airbnb would likely have similar descriptions. However, established serviced apartments may have planning permission to operate as such, unlike ghost hotels. 

I experimented with filtering out listings that are likely to be legitimate serviced apartments before proceeding with text similarity analysis, but ultimately decided against it for the following reasons:
1. Listings described as 'serviced apartments' or 'serviced accommodation' account for a small percentage of commercially-run Airbnb listings.
2. Listings in buildings classed as 'residential' are still sometimes described as serviced apartments for marketing purposes, and there is no way to distinguish between these listings and legitimate serviced apartment listings.

In [43]:
serviced = df[df.description.str.contains(r'(?:serviced (?:apartments?|accommodation)+)', regex=True, flags=re.IGNORECASE)][['description', 'name', 'calculated_host_listings_count', 'availability_365']]
print(f'{(serviced.shape[0]/df.shape[0])*100:.2f}% of commercially-run Airbnb listings are described as serviced apartments.')

1.46% of commercially-run Airbnb listings are described as serviced apartments.


### Text similarity

In [5]:
corpus = df.description_norm.values

#### Creating TF/IDF vectorizer

In [6]:
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2), max_df=0.5, min_df=0.05)
#ignore terms that appear in over 50% or less than 5% of documents
tcorpus = vectorizer.fit_transform(corpus).toarray() 
#fit creates list of words/ngrams + IDF scores
#transform creates matrix of TF/IDF scores per word/ngram
#convert to array for cosine similarity step

#### Creating cosine distance matrix

In [13]:
def reduce_func(D_chunk, start):
    neigh = [np.flatnonzero(d < 0.15) for d in D_chunk] #index positions of listings with a cosine distance less than 0.2
    return neigh

ghost_idx = [] #set up list of indices to use for slicing original df
for chunk in pairwise_distances_chunked(tcorpus, metric='cosine', reduce_func=reduce_func): #each chunk is a list of arrays
    ghost_idx.append([idx for idx, d in enumerate(chunk) if d.shape[0] >= 3])
            
    #if the length of an array (d.shape[0]) is greater than or equal to 3,
    #the listing it corresponds to has low cosine distances from three or more listings,
    #indicating a likely ghost hotel

In [14]:
ghost_idx_flat = list(chain.from_iterable(ghost_idx)) #flatten ghost_idx, because right now there is a nested list for each chunk

In [15]:
print(f'{len(ghost_idx_flat)} listings are suspected ghost hotels.')

7682 listings are suspected ghost hotels.


In [16]:
ghost_listings = df.iloc[ghost_idx_flat, :]

In [27]:
pd.set_option('display.max_colwidth', None)
ghost_listings.sample(5)[['description', 'host_id', 'calculated_host_listings_count']]

Unnamed: 0_level_0,description,host_id,calculated_host_listings_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5946100,"Newly refurbished flat, on 2nd and 3rd floors of elegant Edwardian building with views of woodlands in an up-market part of London. 3 mins walk from Highgate station.<br /><br /><b>The space</b><br />Quiet residential flat with parking, 3 minutes' walk from Highgate tube station on the Northern Line, 20 minutes from central London.<br /><br /><b>Guest access</b><br />The whole flat, two bedrooms and one sitting/dinning room.<br /><br /><b>Other things to note</b><br />There are two flights of stairs and the rooms are on different levels",30868653,3
8000622,Quiet private double bedroom in a very nice flatshare in the heart of Shoreditch. Topfloor flat with modern fitted kitchen open on the balcony. So much light! Nice shared bathroom and separated toilet.Sharing with busy permanent young professionals,1591664,21
15996748,"The property is on North End Road,1 minute walk from West Kensington station.Central London is easy accessible. Big Ben,London Eye,Buckingham Palace etc all within 15 minutes by train. The property is close to all the local amenities and supermarkets. You will find great many cafes, shops ,bars and restaurant all within few steps .The property is cleaned daily and kept tidy.Great base for solo travellers and couples for visiting the city centre.Enjoy the convenience by staying close to City<br /><br /><b>The space</b><br />The private room is in the same flat as my other listing ""Private Your Second Home in London"". The flat has 3 rooms , 2 of which are listed on Airbnb. The third single room has a long term occupant ( professional lady ). The apartment bathroom and kitchen will be shared with the guest from the room and the lady. It will always be cleaned daily and kept tidy. We give free WiFi to all our guests. Fresh towels and linens provided<br /><br /><b>Guest access</b><br />Gue",92601704,17
13250149,"Cosy double bedroom located in the Eastend. Five minutes away from central line and hammersmith and city line. Located close to many parks and 24hr bus services and Queen Mary univeristy. Includes self catered breakfast. I also provide private long term stay to avoid any extra fees! feel free to contact me.<br /><br /><b>The space</b><br />Breakfast included in price.<br /><br /><b>Guest access</b><br />Shared bathroom, two toilets, shared kitchen",74356612,10
18408491,"Traditional terrace house with architectural extension, located on a residential street away from the noise and traffic. 15 mins walk to the tube with easy links to Liverpool Street and the West End. The house is very light with generously sized rooms, period fire places wooden floor boards and architectural features. Close to The Olympic Park and Westfield shopping with lovely walks along the River Lea, Hackney Marshes, and Chatsworth Road market on a Sunday.<br /><br /><b>The space</b><br />Clean, tidy with lots of character and nature around.<br />The large bedroom you will be staying in has a double bed and original features. The room overlooks the beautiful south west facing garden. It is light and sunny in the day time and fills with a golden glow when the sun sets. Other features include, wooden floor boards, fire place and wooden double glazed sash window with blinds. The bathroom has a separate shower and a bath with brand new contemporary fittings. The architectural kitchen i",43775018,5


In [25]:
ghost_listings[
    ghost_listings['host_id'] == 33889201
].shape[0]

297

#### Previous approaches

In [None]:
cosine_distances = pd.DataFrame(squareform(pdist(tcorpus, metric='cosine')), index=keys, columns=keys)

In [None]:
cosine_distances.head(5)

#### Identifying listings that have multiple low cosine distances

My initial approach was to try to identify likely ghost hotels, but given that listings can appear in multiple clusters of varying lengths, it would be simpler to just identify _listings_ that are likely to be in ghost hotels

In [None]:
ghost_listings = cosine_distances[(cosine_distances < 0.2).sum(1) > 2].apply(lambda row: row[row < 0.2].index.tolist(), 1).tolist()

In [None]:
def FindMaxLength(lst): 
    maxList = max(lst, key = len) 
    maxLength = max(map(len, lst)) 
      
    return maxList, maxLength
  
print(FindMaxLength(ghost_hotels)) 

In [None]:
counts = [len(h) for h in ghost_hotels]
sorted(counts, reverse=True)[:5]

In [None]:
ghost_listings[counts.index(60)]

In [None]:
ghost_hotels = set()
for i in ghost_listings:
    ghost_hotels.add(frozenset(i)) 
#the frozensets show that clusters of listings in different areas by the same host can have high cosine similarity
#need to find a better threshold
#but there are some ghost hotels that have cosine distances just under 0.2
#maybe draw buffers around each listing and 

In [None]:
len(ghost_hotels)

#### Different approach - creating a set of individual listings likely to be part of ghost hotels

In [None]:
ghost_listings2 = cosine_distances[(cosine_distances < 0.2).sum(1) > 2].index.tolist()

In [None]:
df.loc[cosine_distances[cosine_distances[40187901] < 0.2].index, ['description', 'host_id']] #it's picking up on ghost hotels with multiple associated accounts!