# Reproducible Analysis

In [None]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import re
from pysal.lib import weights

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances_chunked
from itertools import chain

## Identifying ghost hotels

### Loading and cleaning data

In [None]:
# Listings file that carries the normalised description column.
norm_url = 'https://github.com/jreades/i2p/blob/master/data/clean/2020-08-24-listings-with-nlp-subset.csv.gz?raw=true'
norm_dtypes = {
    'id': np.float64,
    'listing_url': str,
    'name': str,
    'description': str,
    'description_norm': str,
}
norm_df = pd.read_csv(norm_url, compression='gzip', dtype=norm_dtypes)

# Source listings file: only the columns named in ref_dtypes are read.
# Numeric columns are loaded as floats (and price as text) and converted later.
ref_url = 'https://github.com/jreades/i2p/blob/master/data/src/2020-08-24-listings.csv.gz?raw=true'
ref_dtypes = {
    'id': np.float64,
    'room_type': str,
    'calculated_host_listings_count': np.float64,
    'calculated_host_listings_count_private_rooms': np.float64,
    'host_id': np.float64,
    'host_name': str,
    'availability_365': np.float64,
    'latitude': np.float64,
    'longitude': np.float64,
    'price': str,
    'number_of_reviews_ltm': np.float64,
}
ref_df = pd.read_csv(ref_url, compression='gzip', usecols=list(ref_dtypes), dtype=ref_dtypes)

#### Joining df with normalised description to df with room type, then dropping hotel rooms and incomplete rows (serviced apartments are kept; null normalised descriptions are excluded from the candidate set)

In [None]:
# Inner join (the pd.merge default) keeps only listings present in both frames.
df = pd.merge(norm_df, ref_df, on='id')

# Drop hotel rooms and rows missing an id, a host listing count, or a latitude.
unusable = (
    (df.room_type == 'Hotel room')
    | df.id.isna()
    | df.calculated_host_listings_count.isna()
    | df.latitude.isna()
)
df.drop(df[unusable].index, inplace=True)

#Converting columns
# regex=False makes the replacements literal: as a regex, '$' is the
# end-of-string anchor, and the default for `regex` differs between pandas versions.
df['price'] = (
    df['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('float')
)
ints = ['id', 'host_id', 'calculated_host_listings_count', 'calculated_host_listings_count_private_rooms', 'availability_365', 'number_of_reviews_ltm']
for col in ints:
    print(f"Converting {col}")
    try:
        df[col] = df[col].astype('int')
    except ValueError:
        # Plain ints cannot hold missing values. Fall back to pandas' nullable
        # 64-bit integer: a 16-bit type is too small for listing and host ids.
        print("\tConverting to nullable 64-bit integer.")
        df[col] = df[col].astype(pd.Int64Dtype())

#Setting custom index
df.set_index('id', inplace=True)

#Dropping non-revenue generating listings (no reviews in the last twelve months)
df.drop(df[df['number_of_reviews_ltm'] < 1].index, inplace=True)

# Ghost-listing candidates: private rooms whose host lists three or more
# private rooms, and that have a normalised description to compare.
private_rooms = df[(df['calculated_host_listings_count_private_rooms'] >= 3) &
                   (df['room_type'] == 'Private room') &
                   (df['description_norm'].notna())].copy()

In [None]:
# Report the candidate count and its share of all remaining (revenue-generating) listings.
n_candidates = private_rooms.shape[0]
candidate_share = (n_candidates / df.shape[0]) * 100
print(f'There are {n_candidates} candidates for ghost listings, which make up {candidate_share:.2f}% of revenue-generating listings in London.')

In [None]:
# Preview the first five candidate listings.
private_rooms.head(5)

#### To filter or not to filter serviced apartments?

Serviced apartments are an established type of tourist accommodation with similar characteristics to ghost hotels: their units are typically located in the same building, and units listed on Airbnb would likely have similar descriptions. However, established serviced apartments may have planning permission to operate as such, unlike ghost hotels. 

I experimented with filtering out listings that are likely to be legitimate serviced apartments before proceeding with text similarity analysis, but ultimately decided against it for the following reasons:
1. Listings described as 'serviced apartments' or 'serviced accommodation' account for a small percentage of commercially-run Airbnb listings.
2. Listings in buildings classed as 'residential' are still sometimes described as serviced apartments for marketing purposes, and there is no way to distinguish between these listings and legitimate serviced apartment listings.

In [None]:
# Listings whose description mentions "serviced apartment(s)" or
# "serviced accommodation" (case-insensitive).
# na=False counts a missing description as "no match"; without it the mask
# contains NaN and cannot be used for boolean indexing.
serviced_mask = df.description.str.contains(
    r'(?:serviced (?:apartments?|accommodation)+)',
    regex=True,
    flags=re.IGNORECASE,
    na=False,
)
serviced = df.loc[serviced_mask, ['description', 'name', 'calculated_host_listings_count', 'availability_365']]
print(f'{(serviced.shape[0]/df.shape[0])*100:.2f}% of commercially-run Airbnb listings are described as serviced apartments.')

### Text similarity

In [None]:
# One normalised description per candidate listing, in private_rooms row order.
corpus = private_rooms['description_norm'].values

#### Creating TF/IDF vectorizer

In [None]:
# Unigrams and bigrams, ignoring terms that appear in over 50% or less than 5%
# of documents.
vectorizer = TfidfVectorizer(
    use_idf=True,
    ngram_range=(1, 2),
    max_df=0.5,
    min_df=0.05,
)
# fit: builds the list of words/ngrams and their IDF scores
# transform: builds the matrix of TF/IDF scores per word/ngram
tfidf_scores = vectorizer.fit_transform(corpus)
# dense array for the cosine similarity step
tcorpus = tfidf_scores.toarray()

#### Creating cosine distance matrix

In [None]:
def reduce_func(D_chunk, start, threshold=0.2):
    """Return, for each row of a distance chunk, the columns below a threshold.

    Written to be passed as ``reduce_func`` to ``pairwise_distances_chunked``,
    which calls it as ``reduce_func(D_chunk, start)``.

    Parameters
    ----------
    D_chunk : iterable of 1-D arrays
        Rows of pairwise distances.
    start : int
        Offset of the chunk's first row. Accepted because the caller supplies
        it; not used here.
    threshold : float, default 0.2
        Distances strictly below this value count as "similar".

    Returns
    -------
    list of numpy.ndarray
        One array per row, holding the column positions whose distance is
        below ``threshold``.
    """
    return [np.flatnonzero(row < threshold) for row in D_chunk]

ghost_idx = [] #row positions (within tcorpus) of flagged listings
validation = [] #list of arrays, each containing int positions of similar listings
offset = 0 #number of rows already consumed by earlier chunks
for chunk in pairwise_distances_chunked(tcorpus, metric='cosine', reduce_func=reduce_func): #each chunk is a list of arrays
    #enumerate restarts at 0 for every chunk, so start from the running offset
    #to get positions that are valid across the whole of tcorpus
    for idx, d in enumerate(chunk, start=offset):
        #d holds the positions of listings within the distance threshold of this one
        #(normally including the listing itself, whose distance to itself is 0);
        #three or more entries marks the listing as a likely ghost hotel
        if d.shape[0] >= 3:
            ghost_idx.append(idx)
            validation.append(d)
    offset += len(chunk)

In [None]:
# Number of flagged listings (validation and ghost_idx are appended to together).
len(validation)

In [None]:
# Show full description text instead of truncating it.
pd.set_option('display.max_colwidth', None)
# validation holds row positions within tcorpus, which was built from
# private_rooms, so the lookup has to go through private_rooms rather than df.
# 938 is a hand-picked example; it raises IndexError if fewer listings were flagged.
private_rooms.iloc[validation[938]][['description', 'host_id', 'name', 'room_type', 'availability_365']]

Some issues:
- Some listing clusters really are multiple spare rooms in someone's house (tend to be 3-4 listings in these clusters, e.g. no. 42 + 46 for validation list, when length of array is set to greater than 3)
- Some listing clusters are hotel rooms listed as private rooms (Crawford Suites, Central Hoxton)
- Some really big listing clusters are just property management companies with rooms in a variety of locations (Sonder)

In [None]:
# Report how many listings were flagged by the text-similarity pass.
n_suspected = len(ghost_idx)
print(f'{n_suspected} listings are suspected ghost hotels.')

In [None]:
# ghost_idx holds row positions within tcorpus, which was built from
# private_rooms, so select from private_rooms (df has different rows).
ghost_listings = private_rooms.iloc[ghost_idx, :]

In [None]:
# Show full description text, then eyeball five random flagged listings.
pd.set_option('display.max_colwidth', None)
preview_cols = ['description', 'host_id', 'calculated_host_listings_count']
ghost_listings.sample(5)[preview_cols]

In [None]:
# How many flagged listings belong to host 33889201?
from_host = ghost_listings['host_id'] == 33889201
len(ghost_listings[from_host])

#### Previous approaches

In [None]:
# Full square cosine-distance matrix, labelled on both axes by listing id.
keys = private_rooms.index
condensed = pdist(tcorpus, metric='cosine')  # condensed pairwise distances
square = squareform(condensed)               # expanded to a square matrix
cosine_distances = pd.DataFrame(square, index=keys, columns=keys)

In [None]:
# Preview the first five rows of the distance matrix.
cosine_distances.head(5)

#### Identifying listings that have multiple low cosine distances

My initial approach was to try to identify likely ghost hotels, but given that listings can appear in multiple clusters of varying lengths, it would be simpler to just identify _listings_ that are likely to be in ghost hotels.

In [None]:
#For every listing with more than 2 low-distance entries in its row, collect
#the ids of the listings it is similar to.
#More than 2 because one entry is always the listing itself, and a second
#could be another spare room in the same apartment.
is_similar = cosine_distances < 0.2
has_enough = is_similar.sum(axis=1) > 2
ghost_clusters = (
    cosine_distances[has_enough]
    .apply(lambda row: row[row < 0.2].index.tolist(), axis=1)
    .tolist()
)

In [None]:
#Ids of the individual listings whose row has more than 2 distances under 0.2
meets_criteria = (cosine_distances < 0.2).sum(axis=1) > 2
ghost_listings = cosine_distances.index[meets_criteria].tolist()

In [None]:
# Report how many private-room listings met the similarity criteria.
n_flagged = len(ghost_listings)
print(f'{n_flagged} private-room listings could be in ghost hotels.')

In [None]:
ghost_listings[0:5] #first five flagged listing ids; the lists of listings each one is textually similar to are in ghost_clusters

In [None]:
#Deduplicate clusters: every member of a cluster produces the same list of ids,
#so store each as a frozenset and keep one copy.
#Iterate ghost_clusters (lists of ids) rather than ghost_listings, which holds
#single ids that cannot be turned into a frozenset.
ghost_hotels = {frozenset(cluster) for cluster in ghost_clusters}
#the frozensets show that clusters of listings in different areas by the same host can have high cosine similarity
#need to find a better threshold
#but there are some ghost hotels that have cosine distances just under 0.2

In [None]:
len(ghost_hotels) #number of distinct clusters; to access indexes of clusters, use list(ghost_hotels)

#### Mapping potential ghost hotels

In [None]:
# Build point geometries from the longitude/latitude columns (EPSG:4326) and
# attach them to the listings table.
listing_points = gpd.points_from_xy(df['longitude'], df['latitude'], crs='epsg:4326')
gdf = gpd.GeoDataFrame(df, geometry=listing_points)

In [None]:
# Load the borough boundaries and reproject the listings to the same CRS so
# the two layers line up.
boros_url = 'https://github.com/jreades/i2p/blob/master/data/src/Boroughs.gpkg?raw=true'
boros = gpd.read_file(boros_url)
gdf = gdf.to_crs(crs=boros.crs)

In [None]:
# Hand-picked listing ids to plot over the borough outlines.
# NOTE(review): presumably one suspected cluster; confirm where these came from.
example_ids = [
    14669947, 14670320, 25006184, 25006355, 25006549, 25783428,
    25783658, 25990877, 28128269, 28666876, 28691606, 28691717,
    29650854, 29651426, 29651702, 29652062, 30300299, 30300857,
    30300980, 30954567, 31016260, 31016553, 32759229, 32759618,
    34011187, 34011556, 34904405, 34971588, 35140390, 35420290,
    35994552, 38383575, 38624690, 40187901,
]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 10))
# Boroughs as a light outline layer, listings drawn on top.
boros.plot(ax=ax, facecolor='white', edgecolor='grey', linewidth=0.5)
gdf.loc[example_ids].plot(ax=ax)

In [None]:
# Select the flagged listings by id. The original referenced ghost_listings2,
# which is not defined anywhere in this notebook; ghost_listings (the list of
# flagged listing ids) is used instead.
# NOTE(review): confirm this is the intended selection.
ghost_gdf = gdf.loc[ghost_listings]

In [None]:
# ghost_listings is assigned more than once above (a DataFrame slice, then a
# list of ids); check which one is currently bound.
type(ghost_listings)

In [None]:
# Distance-band spatial weights over the flagged listings with a threshold of 300.
# NOTE(review): the threshold is in the units of ghost_gdf's CRS; presumably
# metres after reprojecting to the boroughs CRS. Confirm.
test_dbw = weights.DistanceBand.from_dataframe(ghost_gdf, 300)

In [None]:
# Check what kind of object the weights constructor returned.
type(test_dbw)

In [None]:
# Display the weights object.
test_dbw

In [None]:
#Use pairwise_distances_chunked to find the most similar listings to each listing,
#then select only the ones with more than 2 other similar listings - array

#For each listing with more than 2 other similar listings, calculate the distance band weights

#Sort array of listings with more than 2 other textually similar listings based on the number of similar listings
#Next step will be easier if you start with the clusters that are more likely to contain subsets

#For each listing, compare distance band weights to list of similar listings, and extract clusters