# Reproducible Analysis

In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import re

import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist, squareform

## Identifying ghost hotels

### Loading and cleaning data

In [2]:
# Both inputs are gzip-compressed CSVs fetched from the jreades/i2p GitHub repository.
norm_url = 'https://github.com/jreades/i2p/blob/master/data/clean/2020-08-24-listings-with-nlp-subset.csv.gz?raw=true'
ref_url = 'https://github.com/jreades/i2p/blob/master/data/src/2020-08-24-listings.csv.gz?raw=true'

# Listings with their raw and normalised descriptions.
# NOTE(review): ids are read as floats, presumably so rows with missing ids still load -- confirm.
norm_dtypes = {'id':np.float64, 'listing_url':str, 'name':str, 'description':str, 'description_norm':str}
norm_df = pd.read_csv(norm_url, compression='gzip', dtype=norm_dtypes)

# Reference listings: only the id, room type and host columns are loaded.
ref_dtypes = {'id':np.float64, 'room_type':str, 'calculated_host_listings_count':np.float64, 'host_id':np.float64}
ref_df = pd.read_csv(ref_url, compression='gzip',
                     usecols=['id', 'room_type', 'calculated_host_listings_count', 'host_id'],
                     dtype=ref_dtypes)

#### Joining the normalised-description data frame to the room-type data frame, then dropping hotel rooms, serviced apartments, and rows with a missing id, host listings count, description, or normalised description

In [3]:
# Inner join (the pandas default) keeps only listings present in both frames.
df = norm_df.merge(ref_df, on='id')

# Rows to discard: hotel rooms, plus anything missing a value we rely on later.
unusable = (
    (df['room_type'] == 'Hotel room')
    | df['id'].isna()
    | df['calculated_host_listings_count'].isna()
    | df['description'].isna()
    | df['description_norm'].isna()
)
df = df.drop(index=df.index[unusable])

# Serviced apartments are identified by the word "serviced" (any case) in the description.
mentions_serviced = df['description'].str.contains(r'serviced', regex=True, flags=re.IGNORECASE)
df = df.drop(index=df.index[mentions_serviced])

#### Converting host listings count to integer and dropping listings where the host only has one listing
(Airbnb has systems in place to detect multiple accounts and ban them.)

In [4]:
# Columns that were read as floats but hold whole-number identifiers/counts.
ints = ['id', 'host_id', 'calculated_host_listings_count']
for i in ints:
    print(f"Converting {i}")
    try:
        # Explicit 64-bit width: a bare 'int' is platform-dependent on older numpy.
        df[i] = df[i].astype('int64')
    except ValueError:
        # A plain integer cast fails when the column still contains NaN/inf.
        # Fall back to pandas' nullable 64-bit integer type; an unsigned 16-bit
        # type tops out at 65535 and cannot represent listing or host ids.
        print("\tConverting to nullable 64-bit integer.")
        df[i] = df[i].astype(pd.Int64Dtype())

# Keep only listings whose host has at least two listings.
df.drop(df[df['calculated_host_listings_count'] < 2].index, inplace=True)

Converting id
Converting host_id
Converting calculated_host_listings_count


In [6]:
# Index the listings by their id so later lookups can use df.loc[<listing id>].
df = df.set_index('id')

#### Creating separate data frames for entire home and private room listings

In [7]:
# Split the remaining listings by room type.
room_type = df['room_type']
entire_homes = df.loc[room_type == 'Entire home/apt']
private_rooms = df.loc[room_type == 'Private room']

In [8]:
# Report how many listings of each room type are left after filtering.
summary = f'There are {entire_homes.shape[0]} entire home listings and {private_rooms.shape[0]} private room listings run by hosts with multiple listings.'
print(summary)

There are 18232 entire home listings and 13060 private room listings run by hosts with multiple listings.


### Text similarity - testing with private room listings

In [10]:
# The documents to vectorise: one normalised description per private-room listing.
corpus = private_rooms['description_norm'].values

#### Creating TF/IDF vectorizer

In [11]:
# Unigram + bigram TF/IDF features. Terms found in more than 50% of the
# documents (max_df) or in fewer than 5% of them (min_df) are ignored.
vectorizer = TfidfVectorizer(
    use_idf=True,
    ngram_range=(1,2),
    max_df=0.5,
    min_df=0.05,
)

# fit: builds the list of words/ngrams and their IDF scores.
# transform: builds the matrix of TF/IDF scores per word/ngram for each document.
doc_term = vectorizer.fit_transform(corpus)

# Convert to a dense array for the cosine-distance step.
tcorpus = doc_term.toarray()

#### Creating cosine distance matrix

In [13]:
# Listing ids (the index of private_rooms); used below as both the row and
# column labels of the cosine distance matrix.
keys = private_rooms.index.values

In [12]:
# Sanity check on the TF/IDF matrix: one row per description in `corpus`,
# one column per word/ngram kept by the vectorizer.
tcorpus.shape

(13060, 262)

In [14]:
# pdist yields the pairwise cosine distances in condensed form; squareform
# expands them into a full square matrix, labelled by listing id on both axes.
cosine_distances = pd.DataFrame(
    data=squareform(pdist(tcorpus, metric='cosine')),
    index=keys,
    columns=keys,
)

In [15]:
# Preview the first five rows of the distance matrix (head() defaults to 5).
cosine_distances.head()

Unnamed: 0,13913,17506,25123,38605,38950,40228,41311,41712,42001,43129,...,45063293,45063367,45065675,45066548,45067269,45077803,45081264,45081587,45083401,45085490
13913,0.0,0.747515,0.928287,0.784346,0.687085,0.672891,0.928287,0.721652,0.751964,0.869853,...,0.772143,0.772143,0.759286,0.907606,0.781843,0.846305,0.931093,0.911915,0.797081,0.8795
17506,0.747515,0.0,0.84087,0.822943,0.89672,0.874247,0.84087,0.837818,0.770506,0.895026,...,0.927199,0.927199,0.816514,0.903399,1.0,0.868653,0.887034,0.903285,0.90864,0.904626
25123,0.928287,0.84087,0.0,0.797349,0.768436,0.784954,0.0,0.898682,0.778578,0.750546,...,0.887773,0.887773,0.764521,0.837112,0.928732,0.840263,0.903731,0.766158,0.941996,0.918433
38605,0.784346,0.822943,0.797349,0.0,0.71546,0.83109,0.797349,0.833245,0.82943,0.776295,...,0.8737,0.8737,0.842239,0.817071,0.984336,0.84302,0.957226,0.958102,0.909318,0.939964
38950,0.687085,0.89672,0.768436,0.71546,0.0,0.198865,0.768436,0.846727,0.80939,0.782725,...,0.878677,0.878677,0.720726,0.849885,0.738523,0.875978,0.89519,0.823111,0.838009,0.82871


#### Identifying listings that have multiple low cosine distances

In [69]:
def _close_listing_ids(row):
    """Return the ids (row labels) whose cosine distance in `row` is below 0.2."""
    return row[row < 0.2].index.tolist()


# Keep listings with more than two distances below 0.2 in their row. The zero
# self-distance on the diagonal counts as one of them. For each such listing,
# collect the ids of every listing that falls under the threshold.
ghost_listings = (
    cosine_distances
    .loc[(cosine_distances < 0.2).sum(axis=1) > 2]
    .apply(_close_listing_ids, axis=1)
)

In [90]:
# Each entry of ghost_listings is a list of listing ids; storing them as
# frozensets in a set keeps one copy of each distinct cluster.
ghost_hotels = {frozenset(cluster) for cluster in ghost_listings}

# Notes:
# - the frozensets show that clusters of listings in different areas by the same
#   host can have high cosine similarity
# - need to find a better threshold
# - but there are some ghost hotels that have cosine distances just under 0.2
# - maybe draw buffers around each listing and ... (note left unfinished)

#### Different approach - creating a set of individual listings likely to be part of ghost hotels

In [66]:
# Number of distances below 0.2 in each listing's row (the zero self-distance included).
close_counts = (cosine_distances < 0.2).sum(axis=1)

# Ids of the listings with more than two such distances.
ghost_listings2 = set(close_counts[close_counts > 2].index.tolist())

In [67]:
# The previous version of this cell looped over ghost_listings2 and, for every
# id, ran a column-wise apply over the whole distance matrix (equivalent to
# reading one row) and then discarded the result. That is a very expensive
# no-op, and it cannot produce the single number recorded as this cell's output.
# NOTE(review): replaced with the count of flagged listings, which is what the
# recorded output appears to be -- confirm this was the intent.
len(ghost_listings2)

2965

In [29]:
# Show full descriptions instead of truncating them (applies to all later output too).
pd.set_option('display.max_colwidth', None)

# Listings whose description is within 0.2 cosine distance of listing 45063218.
is_near = cosine_distances[45063218] < 0.2
near_ids = is_near[is_near].index

# it's picking up on ghost hotels with multiple associated accounts!
df.loc[near_ids, ['description', 'host_id']]

Unnamed: 0,description,host_id
44942288,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief. <br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings. <br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><b",2331446
44943419,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief. <br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings. <br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><b",2331446
44943988,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief. <br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings. <br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><b",248034547
44944231,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief. <br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings. <br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><b",248034547
45062758,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief.<br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings.<br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><br",207477735
45062822,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief.<br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings.<br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><br",207477735
45062895,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief.<br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings.<br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><br",152064486
45062953,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief.<br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings.<br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><br",152064486
45063035,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief.<br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings.<br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><br",258709744
45063091,"Our self contained modern and spacious double bedroom in Notting Hill/Westbourne Park minutes walk from the Underground.<br />The apartment is one of five purposely-built double and triple room lets in this brand new block of apartments.<br />The building has a communal shower/bathroom, which is deep cleaned daily along with all communal areas.<br /><br /><b>The space</b><br />The apartments are all designed by ‘Slim Interior Design’ with a chic, modern minimalist brief.<br />The flat has been finished to an excellent standard and comes fully furnished with a modern bathroom with shower & bath, laminate floors throughout, modern furniture and fittings.<br />We have also taking preventative measures to ensure guests have a safe stay with us with CCTV in communal areas, wireless check-in, daily communal cleans.<br />These double rooms are suitable for couples or single guests.<br /><br /><b>Guest access</b><br />Guest have access to their own room as well as the communal areas.<br /><br",258709744


In [39]:
# Look up a single listing in the reference data (ids are still floats in ref_df).
ref_df.loc[ref_df['id'].eq(4482189)]

Unnamed: 0,id,host_id,room_type,calculated_host_listings_count
5005,4482189.0,23255766.0,Entire home/apt,2.0
