# Inside Airbnb data load
Intial data load from Inside Airbnb.

The oldest data available has been used as this is the closest date to some of the other base datasets that will be used.  In this instance the data was published by Inside Airbnb on 10th December 2023 and accessed on 27th November 2024.

In [35]:
# load required packages
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import requests
import zipfile
import matplotlib.cm as cm
import matplotlib.pyplot as plt

In [36]:
# Set download URL for London data
# the date should be choosen by reviewing the Inside Airbnb Get Data page
# and identifying the date of the required data
date  = "2023-12-10"
url  = f"https://data.insideairbnb.com/united-kingdom/england/london/{date}/data/listings.csv.gz"

# create a dataframe of the raw Inside Airbnb data
df = pd.read_csv(url, compression='gzip', low_memory=False)

In [37]:
path = os.path.join('data','raw') # location to save the data
fn   = url.split('/')[-1] # return the file name
print(f"Writing to: {fn}")

if not os.path.exists(path):      # create directory if does not exist
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

if not os.path.exists(os.path.join(path,fn)):  
    df.to_csv(os.path.join(path,fn), index=False)
    print("Done.")

else:
    print("Data previously downloaded")

Writing to: listings.csv.gz
Data previously downloaded


### Cleaning the data

In [48]:
# Column selection
# create list of columns required for further processing

# this is a list of all the columns in the dataset
# cols = ['id', 'listing_url', 'last_scraped', 'name', 'host_id', 'host_since', 'host_location', 'host_listings_count', 'host_total_listings_count', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews', 'first_review', 'last_review', 'reviews_per_month']

# this needs to be amended to contain the required columns
# currently only using minimal columns to ensure the process works
# define required columns
cols = ['id', 'listing_url', 'host_id', 'host_total_listings_count', 'property_type', 'room_type', 'price', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews', 'latitude', 'longitude']

In [49]:
# check a testing data subset to work with
testing = True

if testing:
    df = pd.read_csv(os.path.join(path,fn), 
                low_memory=True, nrows=10000, usecols=cols)
else:
    df = pd.read_csv(os.path.join(path,fn), 
                low_memory=False,usecols=cols)

print(f"Data frame is {df.shape[0]:,} x {df.shape[1]}")

Data frame is 10,000 x 13


In [50]:
df.sample(10, random_state=5)

Unnamed: 0,id,listing_url,host_id,host_total_listings_count,latitude,longitude,property_type,room_type,price,minimum_nights,maximum_nights,availability_365,number_of_reviews
7054,52398945,https://www.airbnb.com/rooms/52398945,15418217,11.0,51.60239,-0.27366,Private room in home,Private room,$42.00,14,365,343,11
442,16376993,https://www.airbnb.com/rooms/16376993,83740964,144.0,51.53679,-0.18695,Entire rental unit,Entire home/apt,$124.00,4,320,173,39
3954,52065471,https://www.airbnb.com/rooms/52065471,8080044,10.0,51.55146,-0.04694,Entire condo,Entire home/apt,$153.00,12,150,158,7
2288,965267788108514911,https://www.airbnb.com/rooms/965267788108514911,9986669,11.0,51.53088,-0.12044,Entire rental unit,Entire home/apt,$123.00,5,365,178,5
3196,18650844,https://www.airbnb.com/rooms/18650844,129149641,1.0,51.55756,0.00786,Private room in home,Private room,,1,14,0,2
6178,19882152,https://www.airbnb.com/rooms/19882152,33224924,1.0,51.52529,-0.01958,Entire rental unit,Entire home/apt,$110.00,2,1125,0,0
8351,944020521144543305,https://www.airbnb.com/rooms/944020521144543305,269308503,350.0,51.478583,-0.016159,Entire rental unit,Entire home/apt,$188.00,14,1125,275,3
5658,1039968884446041972,https://www.airbnb.com/rooms/1039968884446041972,405749100,1.0,51.55123,-0.129671,Private room in rental unit,Private room,$33.00,2,10,41,0
2065,868207557073972851,https://www.airbnb.com/rooms/868207557073972851,108829187,5.0,51.491703,-0.199734,Entire rental unit,Entire home/apt,$144.00,2,1125,127,26
413,13274166,https://www.airbnb.com/rooms/13274166,74801427,1.0,51.48895,-0.28555,Private room in home,Private room,$70.00,2,1125,83,183


In [51]:
df[~(df.price.str.startswith('$', na=False))]

Unnamed: 0,id,listing_url,host_id,host_total_listings_count,latitude,longitude,property_type,room_type,price,minimum_nights,maximum_nights,availability_365,number_of_reviews
23,592844,https://www.airbnb.com/rooms/592844,2926478,1.0,51.64933,-0.19822,Private room in home,Private room,,1,1125,0,0
33,5034202,https://www.airbnb.com/rooms/5034202,25987924,1.0,51.48861,-0.13938,Private room in rental unit,Private room,,2,1125,0,2
119,6821240,https://www.airbnb.com/rooms/6821240,850206,1.0,51.49733,-0.08615,Private room in rental unit,Private room,,7,30,0,3
120,3238731,https://www.airbnb.com/rooms/3238731,16141357,1.0,51.46513,-0.13546,Private room in rental unit,Private room,,2,14,0,0
130,6933655,https://www.airbnb.com/rooms/6933655,36346281,2.0,51.40763,-0.12574,Private room in home,Private room,,1,1125,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9966,31236638,https://www.airbnb.com/rooms/31236638,233676141,1.0,51.41545,0.12871,Private room in rental unit,Private room,,1,1125,0,0
9970,31288308,https://www.airbnb.com/rooms/31288308,234153863,1.0,51.49843,-0.04970,Entire rental unit,Entire home/apt,,2,30,0,1
9976,31579671,https://www.airbnb.com/rooms/31579671,236720647,1.0,51.48753,0.07607,Private room in bungalow,Private room,,1,15,0,0
9981,780379779860978702,https://www.airbnb.com/rooms/780379779860978702,491177425,1.0,51.35193,-0.06167,Private room in bed and breakfast,Private room,,1,365,365,0


In [23]:
# Checked column data for na values
# df[df.price.isna()]

Unnamed: 0,id,listing_url,host_id,latitude,longitude,property_type,room_type,price
23,592844,https://www.airbnb.com/rooms/592844,2926478,51.64933,-0.19822,Private room in home,Private room,
33,5034202,https://www.airbnb.com/rooms/5034202,25987924,51.48861,-0.13938,Private room in rental unit,Private room,
119,6821240,https://www.airbnb.com/rooms/6821240,850206,51.49733,-0.08615,Private room in rental unit,Private room,
120,3238731,https://www.airbnb.com/rooms/3238731,16141357,51.46513,-0.13546,Private room in rental unit,Private room,
130,6933655,https://www.airbnb.com/rooms/6933655,36346281,51.40763,-0.12574,Private room in home,Private room,
...,...,...,...,...,...,...,...,...
9966,31236638,https://www.airbnb.com/rooms/31236638,233676141,51.41545,0.12871,Private room in rental unit,Private room,
9970,31288308,https://www.airbnb.com/rooms/31288308,234153863,51.49843,-0.04970,Entire rental unit,Entire home/apt,
9976,31579671,https://www.airbnb.com/rooms/31579671,236720647,51.48753,0.07607,Private room in bungalow,Private room,
9981,780379779860978702,https://www.airbnb.com/rooms/780379779860978702,491177425,51.35193,-0.06167,Private room in bed and breakfast,Private room,


In [52]:
money = ['price']
for m in money:
    df[m] = df[m].str.replace(r'[$,]', '', regex=True).astype('float')

In [34]:
df

Unnamed: 0,id,listing_url,host_id,latitude,longitude,property_type,room_type,price
0,198258,https://www.airbnb.com/rooms/198258,967537,51.53430,0.08178,Private room in rental unit,Private room,67.0
1,33332,https://www.airbnb.com/rooms/33332,144444,51.46410,-0.32498,Private room in home,Private room,140.0
2,42010,https://www.airbnb.com/rooms/42010,157884,51.58590,-0.16434,Private room in home,Private room,65.0
3,284603,https://www.airbnb.com/rooms/284603,1481851,51.51464,-0.20004,Entire rental unit,Entire home/apt,297.0
4,89870,https://www.airbnb.com/rooms/89870,54730,51.56792,-0.11125,Entire rental unit,Entire home/apt,149.0
...,...,...,...,...,...,...,...,...
9995,783238802930999062,https://www.airbnb.com/rooms/783238802930999062,407049596,51.53292,-0.05565,Entire rental unit,Entire home/apt,345.0
9996,33044960,https://www.airbnb.com/rooms/33044960,57724494,51.57870,-0.09712,Entire rental unit,Entire home/apt,65.0
9997,783278373241527416,https://www.airbnb.com/rooms/783278373241527416,491670753,51.49410,-0.23097,Entire rental unit,Entire home/apt,150.0
9998,783781873554521739,https://www.airbnb.com/rooms/783781873554521739,25423352,51.54036,-0.13785,Entire serviced apartment,Entire home/apt,251.0


In [53]:
ints  = ['id', 'host_id', 'host_total_listings_count', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews']

for i in ints:
    print(f"Converting {i}")
    try:
        df[i] = df[i].astype('float').astype('int')
    except ValueError as e:
        print("  - !!!Converting to unsigned 16-bit integer!!!")
        df[i] = df[i].astype('float').astype(pd.UInt16Dtype())

Converting id
Converting host_id
Converting host_total_listings_count
  - !!!Converting to unsigned 16-bit integer!!!
Converting minimum_nights
Converting maximum_nights
Converting availability_365
Converting number_of_reviews


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         10000 non-null  int64  
 1   listing_url                10000 non-null  object 
 2   host_id                    10000 non-null  int64  
 3   host_total_listings_count  9999 non-null   UInt16 
 4   latitude                   10000 non-null  float64
 5   longitude                  10000 non-null  float64
 6   property_type              10000 non-null  object 
 7   room_type                  10000 non-null  object 
 8   price                      9525 non-null   float64
 9   minimum_nights             10000 non-null  int64  
 10  maximum_nights             10000 non-null  int64  
 11  availability_365           10000 non-null  int64  
 12  number_of_reviews          10000 non-null  int64  
dtypes: UInt16(1), float64(3), int64(6), object(3)
m

In [61]:
# split the data into valid prices and NaN data
df_valid = df[~df['price'].isna()] # rows where price is valid
df_nan = df[df['price'].isna()] # rows where price is NaN

Unnamed: 0,id,listing_url,host_id,host_total_listings_count,latitude,longitude,property_type,room_type,price,minimum_nights,maximum_nights,availability_365,number_of_reviews
0,198258,https://www.airbnb.com/rooms/198258,967537,1,51.53430,0.08178,Private room in rental unit,Private room,67.0,2,100,363,41
1,33332,https://www.airbnb.com/rooms/33332,144444,2,51.46410,-0.32498,Private room in home,Private room,140.0,2,21,365,20
2,42010,https://www.airbnb.com/rooms/42010,157884,4,51.58590,-0.16434,Private room in home,Private room,65.0,4,365,208,556
3,284603,https://www.airbnb.com/rooms/284603,1481851,2,51.51464,-0.20004,Entire rental unit,Entire home/apt,297.0,14,365,0,6
4,89870,https://www.airbnb.com/rooms/89870,54730,5,51.56792,-0.11125,Entire rental unit,Entire home/apt,149.0,1,60,357,133
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,783238802930999040,https://www.airbnb.com/rooms/783238802930999062,407049596,112,51.53292,-0.05565,Entire rental unit,Entire home/apt,345.0,2,365,337,5
9996,33044960,https://www.airbnb.com/rooms/33044960,57724494,4,51.57870,-0.09712,Entire rental unit,Entire home/apt,65.0,1,30,0,40
9997,783278373241527424,https://www.airbnb.com/rooms/783278373241527416,491670753,1,51.49410,-0.23097,Entire rental unit,Entire home/apt,150.0,1,1125,359,7
9998,783781873554521728,https://www.airbnb.com/rooms/783781873554521739,25423352,87,51.54036,-0.13785,Entire serviced apartment,Entire home/apt,251.0,1,365,9,0


At this point the invalid prices dataframe could be investigated further to look for a common theme and see if an inference in price could be made.  However in this instance we will only be analysing the valid dataframe.

In [62]:
# write valid data to be used in further analysis
fn = "inside_airbnb_clean.csv"
path = os.path.join('data','inside_airbnb')

if not os.path.exists(path):
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)
    
df_valid.to_csv(os.path.join(path,fn), index=False)
print("Done.")



Done.
