# Inside Airbnb data load
Initial data load from Inside Airbnb.

The oldest data available has been used as this is the closest date to some of the other base datasets that will be used.  In this instance the data was published by Inside Airbnb on 10th December 2023 and accessed on 27th November 2024.

In [73]:
# load required packages
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import requests
import zipfile
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import fcluster, linkage
from dateutil.relativedelta import relativedelta
from datetime import datetime
from PIL import Image
import requests
from io import BytesIO
from shapely.geometry import Point

pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 200

In [41]:
# Build the download URL for the London listings snapshot.
# The date should be chosen by reviewing the Inside Airbnb Get Data page
# and identifying the date of the required data.
date = "2023-12-10"
url = (
    "https://data.insideairbnb.com/united-kingdom/england/london/"
    f"{date}/data/listings.csv.gz"
)

# Read the raw, gzip-compressed Inside Airbnb listings straight from the URL
df = pd.read_csv(url, low_memory=False, compression='gzip')

In [42]:
# Local destination: data/raw/<file name taken from the end of the URL>
path = os.path.join('data', 'raw')
fn = url.rsplit('/', 1)[-1]
print(f"Writing to: {fn}")

# Make sure the target directory exists before anything is written
if not os.path.exists(path):
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

# Write the dataframe only when there is no local copy yet
if os.path.exists(os.path.join(path, fn)):
    print("Data previously downloaded")
else:
    df.to_csv(os.path.join(path, fn), index=False)
    print("Done.")

Writing to: listings.csv.gz
Data previously downloaded


### Cleaning the data

In [57]:
# Column selection
# Names of the columns that are read from the raw file for further processing.

# Wider candidate list kept for reference:
# cols = ['id', 'listing_url', 'last_scraped', 'name', 'host_id', 'host_since', 'host_location', 'host_listings_count', 'host_total_listings_count', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews', 'first_review', 'last_review', 'reviews_per_month']

# Currently a minimal set, used to make sure the process works end to end;
# amend this list when more columns are required.
cols = [
    'id',
    'listing_url',
    'host_id',
    'host_listings_count',
    'host_total_listings_count',
    'property_type',
    'room_type',
    'price',
    'minimum_nights',
    'maximum_nights',
    'availability_365',
    'number_of_reviews',
    'latitude',
    'longitude',
    'last_review',
    'beds',
    'bedrooms',
]

In [58]:
# Development toggle: when True only the first 10,000 rows of the saved file are read
testing = False

# The two modes differ only in the row limit and the low_memory setting
read_opts = {'low_memory': True, 'nrows': 10000} if testing else {'low_memory': False}
df = pd.read_csv(os.path.join(path, fn), usecols=cols, **read_opts)

print(f"Data frame is {df.shape[0]:,} x {df.shape[1]}")

Data frame is 91,778 x 16


In [59]:
def pick_active_listings(df, date='str', col="str"):
    """
    Keep only the recently active listings.

    The review-date column is parsed to datetime and a listing is kept when its
    last review is later than one year before the scrape date, or when it has no
    review date at all (these might be new listings without reviews).

    The input dataframe is not modified; a filtered copy is returned.

    Arguments:
    df: dataframe of listings
    date: string, scrape date in the format "%Y-%m-%d"
    col: string, name of the column holding the last review date
         ("last_review" or its alternative)

    The 'str' defaults are placeholders only: both date and col must be supplied.

    Returns:
    A new dataframe with the retained rows, in which col is datetime-typed.

    Raises:
    KeyError if col is not a column of df.
    ValueError if date does not match "%Y-%m-%d".
    """
    # Parse into a separate Series so the caller's dataframe is left untouched
    review_dates = pd.to_datetime(df[col])

    # Listings must have been reviewed strictly after this point to count as active
    cutoff = pd.Timestamp(datetime.strptime(date, "%Y-%m-%d")) - pd.DateOffset(years=1)

    # Filter on the column named by `col` (not a hard-coded 'last_review')
    keep = (review_dates > cutoff) | review_dates.isna()

    # Explicit copy: later column assignments on the result must not warn about
    # (or write into) a slice of the original frame
    active = df.loc[keep].copy()
    active[col] = review_dates.loc[keep]

    print(f"Data frame after filtering last review date is {active.shape[0]:,} x {active.shape[1]}")
    return active

In [60]:
# Filter df on the "last_review" column, using the scrape date set in `date`
df = pick_active_listings(df, date, "last_review")

Data frame after filtering last review date is 70,872 x 16


In [61]:
# Inspect a random sample of 10 rows; random_state is fixed so the sample is repeatable
df.sample(10, random_state=5)

Unnamed: 0,id,listing_url,host_id,host_listings_count,host_total_listings_count,latitude,longitude,property_type,room_type,beds,price,minimum_nights,maximum_nights,availability_365,number_of_reviews,last_review
86914,54029818,https://www.airbnb.com/rooms/54029818,299567125,1.0,2.0,51.38276,-0.2702,Private room in home,Private room,1.0,$30.00,2,3,0,19,2023-06-05
21407,978713774038141794,https://www.airbnb.com/rooms/978713774038141794,70639521,1.0,1.0,51.464928,-0.164343,Entire rental unit,Entire home/apt,2.0,$81.00,2,365,0,3,2023-12-10
31336,690003753982456300,https://www.airbnb.com/rooms/690003753982456300,474229060,1.0,1.0,51.521223,-0.052744,Private room in bed and breakfast,Private room,1.0,$70.00,1,365,1,8,2023-09-15
56336,1033208398324948011,https://www.airbnb.com/rooms/1033208398324948011,545568298,3.0,3.0,51.56339,-0.10689,Private room in condo,Private room,1.0,$97.00,1,365,262,0,NaT
21310,964346815169608296,https://www.airbnb.com/rooms/964346815169608296,35076388,1.0,1.0,51.569197,-0.115616,Entire rental unit,Entire home/apt,1.0,$160.00,30,30,35,4,2023-10-24
88096,9391077,https://www.airbnb.com/rooms/9391077,41955791,1.0,1.0,51.52318,-0.07173,Private room in home,Private room,1.0,$50.00,1,1125,0,0,NaT
47432,684543466631317302,https://www.airbnb.com/rooms/684543466631317302,46879394,15.0,30.0,51.52329,-0.167,Entire condo,Entire home/apt,5.0,$586.00,3,365,312,1,2023-07-11
13708,693850662406124112,https://www.airbnb.com/rooms/693850662406124112,181058008,2.0,2.0,51.48932,-0.26362,Private room in condo,Private room,1.0,$134.00,1,365,352,13,2023-09-15
11175,6160430,https://www.airbnb.com/rooms/6160430,31959132,1.0,1.0,51.46023,-0.10733,Private room in home,Private room,1.0,$33.00,1,1125,0,0,NaT
90248,817325648660736368,https://www.airbnb.com/rooms/817325648660736368,4209940,48.0,54.0,51.520889,-0.154358,Entire rental unit,Entire home/apt,1.0,$240.00,14,365,111,1,2023-10-10


In [62]:
# Show the rows whose price does not start with '$' (missing prices count as not matching)
df.loc[~df['price'].str.startswith('$', na=False)]

Unnamed: 0,id,listing_url,host_id,host_listings_count,host_total_listings_count,latitude,longitude,property_type,room_type,beds,price,minimum_nights,maximum_nights,availability_365,number_of_reviews,last_review
23,592844,https://www.airbnb.com/rooms/592844,2926478,1.0,1.0,51.649330,-0.198220,Private room in home,Private room,1.0,,1,1125,0,0,NaT
120,3238731,https://www.airbnb.com/rooms/3238731,16141357,1.0,1.0,51.465130,-0.135460,Private room in rental unit,Private room,1.0,,2,14,0,0,NaT
130,6933655,https://www.airbnb.com/rooms/6933655,36346281,1.0,2.0,51.407630,-0.125740,Private room in home,Private room,1.0,,1,1125,0,0,NaT
183,4674140,https://www.airbnb.com/rooms/4674140,24178327,1.0,2.0,51.546200,-0.180400,Private room in rental unit,Private room,1.0,,3,1125,0,0,NaT
184,7595146,https://www.airbnb.com/rooms/7595146,39828462,,,51.487300,-0.014280,Private room in home,Private room,1.0,,1,1125,0,0,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91600,859120270786735012,https://www.airbnb.com/rooms/859120270786735012,455166063,1.0,1.0,51.597456,-0.099655,Entire rental unit,Entire home/apt,2.0,,4,13,96,0,NaT
91610,867190736097834006,https://www.airbnb.com/rooms/867190736097834006,509540333,1.0,1.0,51.532997,0.012144,Entire condo,Entire home/apt,1.0,,1,14,0,0,NaT
91646,1034909916684860323,https://www.airbnb.com/rooms/1034909916684860323,548584783,1.0,1.0,51.616612,-0.170569,Private room in rental unit,Private room,1.0,,1,365,260,0,NaT
91762,946202828259425107,https://www.airbnb.com/rooms/946202828259425107,483131444,1.0,1.0,51.512192,-0.068339,Private room in rental unit,Private room,1.0,,1,365,12,0,NaT


In [63]:
# Checked column data for na values
# df[df.price.isna()]

In [64]:
# Currency columns: remove '$' and ',' characters, then cast the result to float
money = ['price']
df[money] = df[money].apply(
    lambda column: column.str.replace(r'[$,]', '', regex=True).astype('float')
)

In [65]:
# Display the dataframe to check the converted price column
df

Unnamed: 0,id,listing_url,host_id,host_listings_count,host_total_listings_count,latitude,longitude,property_type,room_type,beds,price,minimum_nights,maximum_nights,availability_365,number_of_reviews,last_review
0,198258,https://www.airbnb.com/rooms/198258,967537,1.0,1.0,51.534300,0.081780,Private room in rental unit,Private room,1.0,67.0,2,100,363,41,2023-03-16
2,42010,https://www.airbnb.com/rooms/42010,157884,2.0,4.0,51.585900,-0.164340,Private room in home,Private room,1.0,65.0,4,365,208,556,2023-12-03
4,89870,https://www.airbnb.com/rooms/89870,54730,3.0,5.0,51.567920,-0.111250,Entire rental unit,Entire home/apt,1.0,149.0,1,60,357,133,2023-12-03
5,326146,https://www.airbnb.com/rooms/326146,1667975,1.0,2.0,51.571390,-0.031310,Private room in rental unit,Private room,1.0,120.0,7,21,0,0,NaT
6,96052,https://www.airbnb.com/rooms/96052,448154,2.0,2.0,51.559300,-0.224970,Private room in condo,Private room,1.0,52.0,3,180,352,80,2023-11-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91773,950589815013504257,https://www.airbnb.com/rooms/950589815013504257,501508071,32.0,43.0,51.487936,-0.167639,Entire rental unit,Entire home/apt,2.0,500.0,7,365,333,0,NaT
91774,951178246905249947,https://www.airbnb.com/rooms/951178246905249947,211074697,7.0,13.0,51.480161,-0.114375,Entire rental unit,Entire home/apt,3.0,289.0,3,20,25,2,2023-09-10
91775,951188392382129035,https://www.airbnb.com/rooms/951188392382129035,526113148,2.0,2.0,51.478320,-0.180640,Entire home,Entire home/apt,4.0,400.0,1,365,227,2,2023-10-08
91776,951192793768996976,https://www.airbnb.com/rooms/951192793768996976,56125082,1.0,1.0,51.585379,-0.163443,Private room in rental unit,Private room,1.0,60.0,1,365,269,0,NaT


In [66]:
# Columns that should hold whole numbers
ints  = ['id', 'host_id', 'host_total_listings_count', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews']

for i in ints:
    print(f"Converting {i}")
    # Columns that are already integer-typed are cast directly. Routing them through
    # float64 would silently round any value above 2**53, which includes the long
    # 18-19 digit listing ids. Only non-integer columns take the float route.
    source = df[i] if pd.api.types.is_integer_dtype(df[i]) else df[i].astype('float')
    try:
        df[i] = source.astype('int')
    except ValueError:
        # Raised when the column holds missing values, which a plain int column
        # cannot represent; fall back to the nullable unsigned 16-bit dtype.
        print("  - !!!Converting to unsigned 16-bit integer!!!")
        df[i] = source.astype(pd.UInt16Dtype())

Converting id
Converting host_id
Converting host_total_listings_count
  - !!!Converting to unsigned 16-bit integer!!!
Converting minimum_nights
Converting maximum_nights
Converting availability_365
Converting number_of_reviews


In [67]:
# Check column dtypes and non-null counts after the conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70872 entries, 0 to 91777
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   id                         70872 non-null  int64         
 1   listing_url                70872 non-null  object        
 2   host_id                    70872 non-null  int64         
 3   host_listings_count        70867 non-null  float64       
 4   host_total_listings_count  70867 non-null  UInt16        
 5   latitude                   70872 non-null  float64       
 6   longitude                  70872 non-null  float64       
 7   property_type              70872 non-null  object        
 8   room_type                  70872 non-null  object        
 9   beds                       70021 non-null  float64       
 10  price                      68146 non-null  float64       
 11  minimum_nights             70872 non-null  int64         
 12  maximum_n

In [68]:
# split the data into valid prices and NaN data
df_valid = df[~df['price'].isna()] # rows where price is valid
df_nan = df[df['price'].isna()] # rows where price is NaN

In [71]:
def find_duplicates(df_valid, max_distance_m=300):
    """
    Flag listings that are likely duplicates of another listing by the same host.

    Listings of hosts with more than one listing are grouped into clusters: listings
    of the same host that are linked by gaps of at most max_distance_m metres
    (single-linkage clustering). The default of 300m is twice the 150m maximum
    location scattering described for Airbnb's anonymisation algorithm.
    The number of genuine homes within each cluster is then estimated.

    Arguments:
    df_valid: dataframe of listings with the columns host_id, host_listings_count,
              latitude, longitude, room_type and beds. It is not modified.
    max_distance_m: clustering distance threshold in metres (default 300)

    Returns a gdf (EPSG:27700) holding only the listings that belong to a cluster of
    two or more listings and have a non-null "beds" value, with new columns:
    'easting', 'northing' - projected coordinates used in calculating the proximity
        of listings (they keep the original position even where geometry is amended)
    'cluster_id' - "<host_id>-<n>", each unique value is a cluster
    'prvt_rms_in_cluster', 'entr_hms_in_cluster' - N of private rooms and entire homes
        within the cluster (counted before rows with null "beds" are dropped)
    'true_nrooms' - "beds", or the number of listings in the cluster when the cluster
        has no entire homes
    'genuine' - boolean, with the following assumptions:

    If the cluster has 0 entire homes, and N rooms, the first room is "converted" to an
    entire home (genuine=True) and the rest are marked as duplicates. The geometry of
    that first room is amended to the centre point of all rooms in the cluster.
    If the cluster has rooms and entire homes, the homes are treated as genuine homes,
    and the rooms as duplicates of the genuine home.
    With the current timeframes, further enquiries into uniqueness of homes vs rooms
    inside each cluster is not feasible.

    NOTE: nothing is written to disk here; persisting the result is left to the caller.
    """
    # We are looking for duplicates among multi-listing hosts only.
    # Copy so the columns added below never write into a slice of the caller's frame.
    multi = df_valid[df_valid["host_listings_count"] > 1].copy()
    print(f"There are {multi.shape[0]} listings from multi-lister hosts")

    # Build points from lon/lat (EPSG:4326) and reproject to EPSG:27700
    # (British National Grid), whose coordinates are in metres.
    gdf = gpd.GeoDataFrame(
        multi,
        geometry=gpd.points_from_xy(multi.longitude, multi.latitude, crs='EPSG:4326'),
    ).to_crs('EPSG:27700')
    gdf['easting'] = gdf.geometry.x
    gdf['northing'] = gdf.geometry.y

    # Cluster each host's listings separately. pdist gives the condensed pairwise
    # distance matrix; single linkage cut at max_distance_m joins listings that are
    # linked by gaps no larger than the threshold. The dataset is large, but each
    # host group is small, so this stays light.
    cluster_parts = []
    for host_id, group in gdf.groupby("host_id"):
        if len(group) < 2:
            continue  # a single point cannot form a cluster
        coords = group[['easting', 'northing']].to_numpy()
        labels = fcluster(linkage(pdist(coords), method='single'),
                          t=max_distance_m, criterion='distance')
        cluster_parts.append(
            pd.Series([f"{host_id}-{label}" for label in labels], index=group.index)
        )
    # Listings that were not clustered end up with a missing cluster_id
    gdf['cluster_id'] = pd.concat(cluster_parts) if cluster_parts else None

    # Keep only listings that share their cluster with at least one other listing
    gdf = gdf[gdf['cluster_id'].notna()]
    gdf = gdf[gdf.duplicated(subset='cluster_id', keep=False)].copy()

    print(f"Found {len(set(gdf.cluster_id))} clusters with {len(gdf)} listings within these clusters.")

    # N of private rooms and entire homes in each cluster.
    # Limitation found during EDA: the "bedrooms" column in InsideAirbnb differs between
    # scrapes ("2023-12-10" has NaN values, "2024-09-06" has full data), so "beds" is
    # used as a substitute: airbnb2024.bedrooms.median()/airbnb2024.beds.median() = 1
    by_cluster = gdf['cluster_id']
    gdf['prvt_rms_in_cluster'] = (gdf['room_type'] == 'Private room').groupby(by_cluster).transform('sum')
    gdf['entr_hms_in_cluster'] = (gdf['room_type'] == 'Entire home/apt').groupby(by_cluster).transform('sum')

    # null values in N of beds will interfere with our analysis, therefore we drop them
    gdf = gdf[gdf.beds.notna()].copy()
    print(f"Data frame after filtering bedrooms Nans is {len(gdf)} listings long")

    # Clusters without any entire home are assumed to combine into one entire home
    # (or to be a large-scale commercial let): their room count is the number of
    # listings left in the cluster. Everything else keeps its "beds" value.
    listings_in_cluster = gdf['cluster_id'].map(gdf['cluster_id'].value_counts())
    rooms_only = gdf['entr_hms_in_cluster'] == 0
    gdf['true_nrooms'] = gdf['beds'].where(~rooms_only, listings_in_cluster)

    large = gdf[gdf['true_nrooms'] > 6]
    print(f"There are {large.shape[0]} listings that are within clusters with more than 6 listings in each. These are likely to be flats converted to holiday accommodation. We are leaving them in for the purpose of the exercise. They belong to {len(set(large.host_id))} unique host IDs.")

    # Genuine homes vs duplicates. Entire homes are always genuine; in a cluster that
    # also has entire homes, every other listing is a duplicate of the genuine home.
    # The count is taken again here because rows with null "beds" were dropped above.
    is_entire_home = gdf['room_type'] == 'Entire home/apt'
    homes_left = is_entire_home.groupby(gdf['cluster_id']).transform('sum')
    gdf['genuine'] = is_entire_home

    # Clusters with no entire home left: the first room becomes the genuine home...
    for _, group in gdf[homes_left == 0].groupby("cluster_id"):
        first_room_idx = group.index[0]
        gdf.at[first_room_idx, 'genuine'] = True
        # ... but its geometry is replaced with the centroid of all rooms in the
        # cluster. Without this the point and its associated rooms could be aggregated
        # into different spatial units, given the scale of the location approximation
        # (up to 150m from origin).
        centroid = group.geometry.union_all().centroid
        gdf.at[first_room_idx, 'geometry'] = Point(centroid.x, centroid.y)

    print(f"The clustering exercise identified {(~gdf['genuine']).sum()} listings that are potentially duplicates of other listings.")

    return gdf

In [74]:
# Run find_duplicates on the priced listings; the result carries the 'genuine' column
# inspected in the next cell
df_valid_nodups = find_duplicates(df_valid)

There are 41847 listings from multi-lister hosts
Found 8629 clusters with 29253 listings within these clusters.
Data frame after filtering bedrooms Nans is 28874 listings long
There are 2065 listings that are within clusters with more than 6 listings in each. These are likely to be flats converted to holiday accommodation. We are leaving them in for the purpose of the exercise. They belong to 267 unique host IDs.
The clustering exercise identified 9121 listings that are potentially duplicates of other listings.


In [75]:
# Count the listings for each value of the 'genuine' column
df_valid_nodups[['genuine']].value_counts()

genuine
True       19753
False       9121
Name: count, dtype: int64

At this point, the dataframe of invalid (missing) prices could be investigated further to look for a common theme and see whether a price could be inferred. However, in this instance we will only be analysing the valid dataframe.

In [15]:
    """
    Will not be needed once we join filles
    """
# # write valid data to be used in further analysis
# fn = "inside_airbnb_clean.csv"
# path = os.path.join('data','inside_airbnb')

# if not os.path.exists(path):
#     print(f"Creating {path} under {os.getcwd()}")
#     os.makedirs(path)
    
# df_valid.to_csv(os.path.join(path,fn), index=False)
# print("Done.")

Done.
