# Inside Airbnb data load
Intial data load from Inside Airbnb.

The oldest data available has been used as this is the closest date to some of the other base datasets that will be used.  In this instance the data was published by Inside Airbnb on 10th December 2023 and accessed on 27th November 2024.

In [2]:
# load required packages
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import requests
import zipfile
import matplotlib.cm as cm
import matplotlib.pyplot as plt

In [3]:
# Set download URL for London data
# the date should be choosen by reviewing the Inside Airbnb Get Data page
# and identifying the date of the required data
date  = "2023-12-10"
url  = f"https://data.insideairbnb.com/united-kingdom/england/london/{date}/data/listings.csv.gz"

# create a dataframe of the raw Inside Airbnb data
df = pd.read_csv(url, compression='gzip', low_memory=False)

In [4]:
path = os.path.join('data','raw') # location to save the data
fn   = url.split('/')[-1] # return the file name
print(f"Writing to: {fn}")

if not os.path.exists(path):      # create directory if does not exist
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)

if not os.path.exists(os.path.join(path,fn)):  
    df.to_csv(os.path.join(path,fn), index=False)
    print("Done.")

else:
    print("Data previously downloaded")

Writing to: listings.csv.gz
Data previously downloaded


### Cleaning the data

In [5]:
# Column selection
# create list of columns required for further processing

# this is a list of all the columns in the dataset
# cols = ['id', 'listing_url', 'last_scraped', 'name', 'host_id', 'host_since', 'host_location', 'host_listings_count', 'host_total_listings_count', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews', 'first_review', 'last_review', 'reviews_per_month']

# this needs to be amended to contain the required columns
# currently only using minimal columns to ensure the process works
# define required columns
cols = ['id', 'listing_url', 'host_id', 'host_total_listings_count', 'property_type', 'room_type', 'price', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews', 'latitude', 'longitude']

In [6]:
# check a testing data subset to work with
testing = False

if testing:
    df = pd.read_csv(os.path.join(path,fn), 
                low_memory=True, nrows=10000, usecols=cols)
else:
    df = pd.read_csv(os.path.join(path,fn), 
                low_memory=False,usecols=cols)

print(f"Data frame is {df.shape[0]:,} x {df.shape[1]}")

Data frame is 91,778 x 13


In [7]:
df.sample(10, random_state=5)

Unnamed: 0,id,listing_url,host_id,host_total_listings_count,latitude,longitude,property_type,room_type,price,minimum_nights,maximum_nights,availability_365,number_of_reviews
67090,15914972,https://www.airbnb.com/rooms/15914972,103249702,1.0,51.50847,-0.23109,Private room in home,Private room,$40.00,4,1125,0,0
65657,41165527,https://www.airbnb.com/rooms/41165527,135724273,2.0,51.54112,-0.1903,Private room in home,Private room,$30.00,1,1125,0,6
25249,36143235,https://www.airbnb.com/rooms/36143235,243612726,5.0,51.54261,0.00809,Private room in home,Private room,$65.00,1,4,0,141
90854,36360312,https://www.airbnb.com/rooms/36360312,46827741,3.0,51.52227,-0.36956,Private room in home,Private room,$68.00,1,1125,362,45
49145,13355288,https://www.airbnb.com/rooms/13355288,75734116,1.0,51.40103,-0.16966,Private room in home,Private room,$69.00,1,1125,0,0
51957,5811467,https://www.airbnb.com/rooms/5811467,8433288,2.0,51.53803,-0.06325,Private room in rental unit,Private room,$43.00,1,1125,0,0
83744,31666020,https://www.airbnb.com/rooms/31666020,4086714,37.0,51.54855,-0.16436,Entire rental unit,Entire home/apt,$88.00,5,1125,290,64
23968,998607177304130892,https://www.airbnb.com/rooms/998607177304130892,493890758,4.0,51.41347,-0.08278,Private room in home,Private room,$75.00,1,365,269,0
65609,39363515,https://www.airbnb.com/rooms/39363515,29685898,1.0,51.51195,-0.29713,Private room in rental unit,Private room,$45.00,1,15,0,7
89834,643112693782870487,https://www.airbnb.com/rooms/643112693782870487,32464822,3.0,51.545892,-0.080819,Private room in home,Private room,$99.00,1,7,171,31


In [8]:
df[~(df.price.str.startswith('$', na=False))]

Unnamed: 0,id,listing_url,host_id,host_total_listings_count,latitude,longitude,property_type,room_type,price,minimum_nights,maximum_nights,availability_365,number_of_reviews
23,592844,https://www.airbnb.com/rooms/592844,2926478,1.0,51.649330,-0.198220,Private room in home,Private room,,1,1125,0,0
33,5034202,https://www.airbnb.com/rooms/5034202,25987924,1.0,51.488610,-0.139380,Private room in rental unit,Private room,,2,1125,0,2
119,6821240,https://www.airbnb.com/rooms/6821240,850206,1.0,51.497330,-0.086150,Private room in rental unit,Private room,,7,30,0,3
120,3238731,https://www.airbnb.com/rooms/3238731,16141357,1.0,51.465130,-0.135460,Private room in rental unit,Private room,,2,14,0,0
130,6933655,https://www.airbnb.com/rooms/6933655,36346281,2.0,51.407630,-0.125740,Private room in home,Private room,,1,1125,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91600,859120270786735012,https://www.airbnb.com/rooms/859120270786735012,455166063,1.0,51.597456,-0.099655,Entire rental unit,Entire home/apt,,4,13,96,0
91610,867190736097834006,https://www.airbnb.com/rooms/867190736097834006,509540333,1.0,51.532997,0.012144,Entire condo,Entire home/apt,,1,14,0,0
91646,1034909916684860323,https://www.airbnb.com/rooms/1034909916684860323,548584783,1.0,51.616612,-0.170569,Private room in rental unit,Private room,,1,365,260,0
91762,946202828259425107,https://www.airbnb.com/rooms/946202828259425107,483131444,1.0,51.512192,-0.068339,Private room in rental unit,Private room,,1,365,12,0


In [23]:
# Checked column data for na values
# df[df.price.isna()]

Unnamed: 0,id,listing_url,host_id,latitude,longitude,property_type,room_type,price
23,592844,https://www.airbnb.com/rooms/592844,2926478,51.64933,-0.19822,Private room in home,Private room,
33,5034202,https://www.airbnb.com/rooms/5034202,25987924,51.48861,-0.13938,Private room in rental unit,Private room,
119,6821240,https://www.airbnb.com/rooms/6821240,850206,51.49733,-0.08615,Private room in rental unit,Private room,
120,3238731,https://www.airbnb.com/rooms/3238731,16141357,51.46513,-0.13546,Private room in rental unit,Private room,
130,6933655,https://www.airbnb.com/rooms/6933655,36346281,51.40763,-0.12574,Private room in home,Private room,
...,...,...,...,...,...,...,...,...
9966,31236638,https://www.airbnb.com/rooms/31236638,233676141,51.41545,0.12871,Private room in rental unit,Private room,
9970,31288308,https://www.airbnb.com/rooms/31288308,234153863,51.49843,-0.04970,Entire rental unit,Entire home/apt,
9976,31579671,https://www.airbnb.com/rooms/31579671,236720647,51.48753,0.07607,Private room in bungalow,Private room,
9981,780379779860978702,https://www.airbnb.com/rooms/780379779860978702,491177425,51.35193,-0.06167,Private room in bed and breakfast,Private room,


In [9]:
money = ['price']
for m in money:
    df[m] = df[m].str.replace(r'[$,]', '', regex=True).astype('float')

In [10]:
df

Unnamed: 0,id,listing_url,host_id,host_total_listings_count,latitude,longitude,property_type,room_type,price,minimum_nights,maximum_nights,availability_365,number_of_reviews
0,198258,https://www.airbnb.com/rooms/198258,967537,1.0,51.534300,0.081780,Private room in rental unit,Private room,67.0,2,100,363,41
1,33332,https://www.airbnb.com/rooms/33332,144444,2.0,51.464100,-0.324980,Private room in home,Private room,140.0,2,21,365,20
2,42010,https://www.airbnb.com/rooms/42010,157884,4.0,51.585900,-0.164340,Private room in home,Private room,65.0,4,365,208,556
3,284603,https://www.airbnb.com/rooms/284603,1481851,2.0,51.514640,-0.200040,Entire rental unit,Entire home/apt,297.0,14,365,0,6
4,89870,https://www.airbnb.com/rooms/89870,54730,5.0,51.567920,-0.111250,Entire rental unit,Entire home/apt,149.0,1,60,357,133
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91773,950589815013504257,https://www.airbnb.com/rooms/950589815013504257,501508071,43.0,51.487936,-0.167639,Entire rental unit,Entire home/apt,500.0,7,365,333,0
91774,951178246905249947,https://www.airbnb.com/rooms/951178246905249947,211074697,13.0,51.480161,-0.114375,Entire rental unit,Entire home/apt,289.0,3,20,25,2
91775,951188392382129035,https://www.airbnb.com/rooms/951188392382129035,526113148,2.0,51.478320,-0.180640,Entire home,Entire home/apt,400.0,1,365,227,2
91776,951192793768996976,https://www.airbnb.com/rooms/951192793768996976,56125082,1.0,51.585379,-0.163443,Private room in rental unit,Private room,60.0,1,365,269,0


In [11]:
ints  = ['id', 'host_id', 'host_total_listings_count', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews']

for i in ints:
    print(f"Converting {i}")
    try:
        df[i] = df[i].astype('float').astype('int')
    except ValueError as e:
        print("  - !!!Converting to unsigned 16-bit integer!!!")
        df[i] = df[i].astype('float').astype(pd.UInt16Dtype())

Converting id
Converting host_id
Converting host_total_listings_count
  - !!!Converting to unsigned 16-bit integer!!!
Converting minimum_nights
Converting maximum_nights
Converting availability_365
Converting number_of_reviews


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91778 entries, 0 to 91777
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         91778 non-null  int64  
 1   listing_url                91778 non-null  object 
 2   host_id                    91778 non-null  int64  
 3   host_total_listings_count  91773 non-null  UInt16 
 4   latitude                   91778 non-null  float64
 5   longitude                  91778 non-null  float64
 6   property_type              91778 non-null  object 
 7   room_type                  91778 non-null  object 
 8   price                      87598 non-null  float64
 9   minimum_nights             91778 non-null  int64  
 10  maximum_nights             91778 non-null  int64  
 11  availability_365           91778 non-null  int64  
 12  number_of_reviews          91778 non-null  int64  
dtypes: UInt16(1), float64(3), int64(6), object(3)


In [13]:
# split the data into valid prices and NaN data
df_valid = df[~df['price'].isna()] # rows where price is valid
df_nan = df[df['price'].isna()] # rows where price is NaN

At this point the invalid prices dataframe could be investigated further to look for a common theme and see if an inference in price could be made.  However in this instance we will only be analysing the valid dataframe.

In [15]:
# write valid data to be used in further analysis
fn = "inside_airbnb_clean.csv"
path = os.path.join('data','inside_airbnb')

if not os.path.exists(path):
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)
    
df_valid.to_csv(os.path.join(path,fn), index=False)
print("Done.")

Done.
