# Reproducible Analysis

In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances_chunked

## Identifying ghost hotels

### Loading and cleaning data

In [2]:
# Listings subset carrying the raw and normalised descriptions.
# `id` is read as float64 here and converted to an integer type after cleaning.
norm_df = pd.read_csv(
    'https://github.com/jreades/i2p/blob/master/data/clean/2020-08-24-listings-with-nlp-subset.csv.gz?raw=true',
    dtype={
        'id': np.float64,
        'listing_url': str,
        'name': str,
        'description': str,
        'description_norm': str,
    },
    compression='gzip',
)

# Reference listings file: only the columns needed to characterise hosts and room types.
ref_df = pd.read_csv(
    'https://github.com/jreades/i2p/blob/master/data/src/2020-08-24-listings.csv.gz?raw=true',
    usecols=['id', 'room_type', 'calculated_host_listings_count', 'host_id', 'host_name', 'availability_365'],
    dtype={
        'id': np.float64,
        'room_type': str,
        'calculated_host_listings_count': np.float64,
        'host_id': np.float64,
        'host_name': str,
        'availability_365': np.float64,
    },
    compression='gzip',
)

#### Joining the df with normalised descriptions to the df with room types, then dropping hotel rooms, rows with missing ids, host listing counts or descriptions, and hosts with fewer than two listings (serviced apartments are kept; see below)

In [3]:
# Join the normalised descriptions to the host/room-type columns.
# pd.merge defaults to an inner join, so only ids present in both frames survive.
df = pd.merge(norm_df, ref_df, on='id')

# Rows that cannot be used: hotel rooms, and rows missing the id, the host's
# listing count, or either version of the description.
unusable = (
    (df.room_type == 'Hotel room')
    | df.id.isna()
    | df.calculated_host_listings_count.isna()
    | df.description.isna()
    | df.description_norm.isna()
)
df = df[~unusable].copy()

# These columns were read as float64; convert them to integers.
ints = ['id', 'host_id', 'calculated_host_listings_count', 'availability_365']
for i in ints:
    print(f"Converting {i}")
    try:
        df[i] = df[i].astype('int')
    except ValueError:
        # A plain numpy int column cannot hold NaN. Fall back to pandas'
        # nullable Int64: a 16-bit unsigned type tops out at 65,535 and is far
        # too small for listing/host ids.
        print("\tConverting to nullable 64-bit integer.")
        df[i] = df[i].astype(pd.Int64Dtype())

# Keep only hosts with at least two listings, then index by listing id.
df = df[df['calculated_host_listings_count'] >= 2]
df = df.set_index('id')

Converting id
Converting host_id
Converting calculated_host_listings_count
Converting availability_365


In [4]:
# Number of listings left after cleaning and the multi-listing-host filter.
df.shape[0]

32320

#### To filter or not to filter serviced apartments?

Serviced apartments are an established type of tourist accommodation with similar characteristics to ghost hotels: their units are typically located in the same building, and units listed on Airbnb would likely have similar descriptions. However, established serviced apartments may have planning permission to operate as such, unlike ghost hotels. 

I experimented with filtering out listings that are likely to be legitimate serviced apartments before proceeding with text similarity analysis, but ultimately decided against it for the following reasons:
1. Listings described as 'serviced apartments' or 'serviced accommodation' account for a small percentage of commercially-run Airbnb listings.
2. Listings in buildings classed as 'residential' are still sometimes described as serviced apartments for marketing purposes, and there is no way to distinguish between these listings and legitimate serviced apartment listings.

In [5]:
# Widen text columns so enough of each description is visible in the preview.
pd.set_option('display.max_colwidth', 280)

# Listings whose description mentions "serviced apartment(s)" or
# "serviced accommodation" (case-insensitive).
serviced_mask = df.description.str.contains(r'(?:serviced (?:apartments?|accommodation)+)', regex=True, flags=re.IGNORECASE)
serviced = df.loc[serviced_mask, ['description', 'name', 'calculated_host_listings_count', 'availability_365']]
print(f'{(serviced.shape[0]/df.shape[0])*100:.2f}% of commercially-run Airbnb listings are described as serviced apartments.')

# Fixed seed so the preview is the same on every run; `n` is capped so the cell
# does not raise when fewer than five listings match.
serviced.sample(n=min(5, len(serviced)), random_state=42)

1.46% of commercially-run Airbnb listings are described as serviced apartments.


Unnamed: 0_level_0,description,name,calculated_host_listings_count,availability_365
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
33878786,"Welcome to Central Hoxton Shoreditch the home of serviced apartments in Shoreditch, London. We promise to provide you with the value you deserve and space comfort and flexibility. Whether you are planning a weekend break in London or visiting for business purpose and familiy ...",Double Budget Room 2 Adults,9,365
41019108,"Our beautiful studio flats are ideal to comfortably sleep 2 people, with fully-equipped kitchenettes, luxuriously bathrooms and beautifully designed interiors. <br />Complimentary Wi-Fi, Central heating, Freeview TV, DVD Player, Kettle, Safe, Toaster, Iron, Hair dryer, Clothe...",Fantastic Studio Apartment Chelsea Green - CGA3,259,346
17432188,We have several units in the same building. Please contact us if you cannot find availability for your requested dates.<br /><br />Moments away from Borough & Southwark Stations these smart serviced apartments are an ideal choice for both business and leisure travellers. You ...,SLG Southwark Executive 1 Bedroom Apartment - B,111,127
41922994,Self-isolation unit for key workers and anyone else looking for self isolation. Professionally cleaned by our in-house staff who are trained in proper techniques and will disinfect before your stay. Our staff are fully trained and will deep clean your apartment before and aft...,High Street Kensington apartment for 4,33,0
21879577,Our Stylish three bedrooms are situated in the vibrant area of Victoria. These elegant apartments are designed with with warm beige tones and bright windows. With a fully-equipped kitchen. Everything is so centrally located and accessible for our guests to London. These pro...,Stylish & Modern 3 Bedroom in Central London,13,90


### Text similarity

In [6]:
# One normalised description per listing, in the same row order as `df`.
corpus = df['description_norm'].values

#### Creating TF/IDF vectorizer

In [7]:
# TF/IDF over unigrams and bigrams, ignoring terms that appear in more than
# 50% or fewer than 5% of documents.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.5,
    use_idf=True,
)

# fit      -> builds the list of words/ngrams and their IDF scores
# transform -> builds the matrix of TF/IDF scores per word/ngram
# toarray  -> dense array, needed for the cosine distance step
tcorpus = vectorizer.fit_transform(corpus).toarray()

#### Creating cosine distance matrix

In [None]:
# Dimensions of the dense TF/IDF matrix (one row per description in `corpus`).
tcorpus.shape

In [8]:
def reduce_func(D_chunk, start, threshold=0.2):
    """Reduce a chunk of pairwise distances to the near neighbours of each row.

    Written to be passed as ``reduce_func`` to ``pairwise_distances_chunked``.

    Parameters
    ----------
    D_chunk : 2-D array
        A block of rows of the pairwise distance matrix (cosine distance in
        this notebook).
    start : int
        Offset of the chunk's first row in the full matrix. Unused here, but
        part of the callback signature.
    threshold : float, default 0.2
        A column counts as a neighbour when its distance is strictly below
        this value.

    Returns
    -------
    list of ndarray
        For each row of ``D_chunk``, the column positions whose distance is
        below ``threshold``. A row's own position is included whenever its
        self-distance is below the threshold.
    """
    return [np.flatnonzero(row < threshold) for row in D_chunk]

#for chunk in pairwise_distances_chunked(tcorpus, metric='cosine', reduce_func=reduce_func):
    #find index positions of rows with len(neigh) > 2

In [9]:
# Take only the first chunk produced by the generator: for each row in that
# chunk, the positions of the listings within the distance threshold.
D_chunk = next(
    pairwise_distances_chunked(
        tcorpus,
        reduce_func=reduce_func,
        metric='cosine',
    )
)

In [14]:
# Neighbour positions found for the fourth row of the first chunk.
D_chunk[3]

array([  3, 624])

In [None]:
# Label rows and columns with listing ids. `tcorpus` was built from
# df.description_norm, so its rows are in the same order as df.index.
keys = df.index

# Full square matrix of pairwise cosine distances between listings.
# NOTE: this is n x n float64; if `df` still has the 32,320 rows reported
# earlier, that is roughly 8 GB for the square matrix alone.
cosine_distances = pd.DataFrame(
    squareform(pdist(tcorpus, metric='cosine')),
    index=keys,
    columns=keys,
)

In [None]:
# Preview the first five rows of the distance matrix.
cosine_distances.head(5)

#### Identifying listings that have multiple low cosine distances

My initial approach was to try to identify likely ghost hotels, but given that listings can appear in multiple clusters of varying sizes, it is simpler to identify the individual _listings_ that are likely to be part of ghost hotels.

In [None]:
# For every listing with more than two distances below 0.2 (its own zero
# self-distance counts as one of them), collect the ids of all listings
# within that distance.
ghost_listings = (
    cosine_distances[(cosine_distances < 0.2).sum(axis=1) > 2]
    .apply(lambda dists: dists[dists < 0.2].index.tolist(), axis=1)
    .tolist()
)

In [None]:
def FindMaxLength(lst):
    """Return the longest element of ``lst`` together with its length.

    Parameters
    ----------
    lst : iterable of sized objects
        For example a list of lists, or a set of frozensets. It is consumed in
        a single pass, so one-shot iterators are supported.

    Returns
    -------
    tuple
        ``(longest, len(longest))``. On ties the first longest element
        encountered is returned. For empty input the result is ``(None, 0)``.
    """
    maxList = max(lst, key=len, default=None)
    if maxList is None:
        return None, 0
    # The length follows from the winner; no second scan of the input needed.
    return maxList, len(maxList)
  
# Largest cluster and its size. This uses `ghost_listings`, which already
# exists at this point; `ghost_hotels` is only created in a later cell, so
# referencing it here fails on a fresh top-to-bottom run.
print(FindMaxLength(ghost_listings))

In [None]:
# Size of each cluster, in the same order as `ghost_listings`, so that a
# position in `counts` can be used to index back into `ghost_listings`.
# (`ghost_hotels` is an unordered set that is only created in a later cell.)
counts = [len(cluster) for cluster in ghost_listings]

# Five largest cluster sizes.
sorted(counts, reverse=True)[:5]

In [None]:
# Show the first cluster whose size is exactly 60.
# NOTE(review): 60 is presumably one of the sizes printed by the previous cell;
# list.index raises ValueError if no cluster has that size - confirm after re-running.
ghost_listings[counts.index(60)]

In [None]:
# Collapse identical clusters: each cluster becomes a frozenset so that
# duplicates (the same members found from different starting listings) merge.
ghost_hotels = {frozenset(cluster) for cluster in ghost_listings}
# Notes:
# - the frozensets show that clusters of listings in different areas by the
#   same host can have high cosine similarity
# - need to find a better threshold
# - but there are some ghost hotels that have cosine distances just under 0.2
# - TODO: maybe draw buffers around each listing and ... (note left unfinished)

In [None]:
# Number of distinct clusters after de-duplication.
len(ghost_hotels)

#### Different approach - creating a set of individual listings likely to be part of ghost hotels

In [None]:
# Ids of listings that have more than two distances below 0.2
# (the listing's own zero self-distance is included in that count).
ghost_listings2 = (
    cosine_distances[(cosine_distances < 0.2).sum(axis=1) > 2]
    .index
    .tolist()
)

In [None]:
# Descriptions and hosts of every listing within cosine distance 0.2 of
# listing 40187901.
# It's picking up on ghost hotels with multiple associated accounts!
df.loc[
    cosine_distances[cosine_distances[40187901] < 0.2].index,
    ['description', 'host_id'],
]