In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the raw Airbnb listings export into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='airbnb/listings_march16_2023.csv')

In [5]:
# Preview the first rows of the raw listings table to see which columns are available.
data.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,5456,https://www.airbnb.com/rooms/5456,20230316044216,2023-03-16,city scrape,"Walk to 6th, Rainey St and Convention Ctr",Great central location for walking to Convent...,My neighborhood is ideally located if you want...,https://a0.muscache.com/pictures/14084884/b5a3...,8028,...,4.82,4.73,4.79,,f,2,2,0,0,3.69
1,5769,https://www.airbnb.com/rooms/5769,20230316044216,2023-03-16,previous scrape,NW Austin Room,<b>The space</b><br />Looking for a comfortabl...,Quiet neighborhood with lots of trees and good...,https://a0.muscache.com/pictures/23822033/ac94...,8186,...,4.94,4.76,4.92,,f,1,0,1,0,1.75
2,6413,https://www.airbnb.com/rooms/6413,20230316044216,2023-03-16,previous scrape,Gem of a Studio near Downtown,"Great studio apartment, perfect a single perso...",Travis Heights is one of the oldest neighborho...,https://a0.muscache.com/pictures/miso/Hosting-...,13879,...,4.98,4.87,4.93,,f,1,1,0,0,0.76
3,6448,https://www.airbnb.com/rooms/6448,20230316044216,2023-03-16,city scrape,"Secluded Studio @ Zilker - King Bed, Bright & ...","Clean, private space with everything you need ...",The neighborhood is fun and funky (but quiet)!...,https://a0.muscache.com/pictures/4513152/4ffc1...,14156,...,4.97,4.97,4.9,,t,1,1,0,0,2.1
4,8502,https://www.airbnb.com/rooms/8502,20230316044216,2023-03-17,city scrape,Woodland Studio Lodging,Studio rental on lower level of home located i...,,https://a0.muscache.com/pictures/miso/Hosting-...,25298,...,4.86,4.64,4.57,,f,1,1,0,0,0.3


In [6]:
# Column labels retained for the analysis, listed in the order they are selected.
# NOTE: despite the name, this is a list of column labels, not a boolean mask.
mask1 = [
    # identifiers
    'id', 'host_id',
    # neighbourhood labels
    'neighbourhood_cleansed', 'host_neighbourhood',
    # listing characteristics
    'property_type', 'room_type', 'price',
    # coordinates
    'latitude', 'longitude',
    # review score and availability
    'review_scores_rating', 'availability_365',
]

In [7]:
# Build the working frame: keep only the columns named in `mask1`, then give
# three of them shorter / consistently spelled names.
df = (
    data.loc[:, mask1]
    .rename(
        columns={
            'neighbourhood_cleansed': "zip_code",
            'host_neighbourhood': 'host_neighborhood',
            'review_scores_rating': 'rating',
        }
    )
)
df.head()

Unnamed: 0,id,host_id,zip_code,host_neighborhood,property_type,room_type,price,latitude,longitude,rating,availability_365
0,5456,8028,78702,East Downtown,Entire guesthouse,Entire home/apt,$176.00,30.26057,-97.73441,4.84,298
1,5769,8186,78729,SW Williamson Co.,Private room in home,Private room,$42.00,30.45697,-97.78422,4.9,0
2,6413,13879,78704,Travis Heights,Entire guesthouse,Entire home/apt,$109.00,30.24885,-97.73587,4.97,0
3,6448,14156,78704,Zilker,Entire guesthouse,Entire home/apt,$240.00,30.26034,-97.76487,4.97,133
4,8502,25298,78741,East Riverside,Entire guest suite,Entire home/apt,$85.00,30.23466,-97.73682,4.53,36


In [8]:
# (rows, columns) of the working frame before any cleaning.
df.shape

(14368, 11)

In [9]:
# Per-column dtype and non-null count: shows which columns need a type
# conversion and which ones contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14368 entries, 0 to 14367
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 14368 non-null  int64  
 1   host_id            14368 non-null  int64  
 2   zip_code           14368 non-null  int64  
 3   host_neighborhood  12537 non-null  object 
 4   property_type      14368 non-null  object 
 5   room_type          14368 non-null  object 
 6   price              14368 non-null  object 
 7   latitude           14368 non-null  float64
 8   longitude          14368 non-null  float64
 9   rating             11337 non-null  float64
 10  availability_365   14368 non-null  int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 1.2+ MB


Drop duplicates based on host id, longitude, and latitude: listings from the same host at the same longitude and latitude are treated as the same property.

In [10]:
# Count of missing values in each column.
df.isna().sum()

id                      0
host_id                 0
zip_code                0
host_neighborhood    1831
property_type           0
room_type               0
price                   0
latitude                0
longitude               0
rating               3031
availability_365        0
dtype: int64

In [11]:
# De-duplicate listings that share host_id + longitude + latitude, keeping the
# highest-priced listing of each group.
#
# `price` is still a formatted string here (e.g. "$1,200.00"), so sorting on it
# directly would be lexicographic ("$98.00" would rank above "$176.00").
# Sort on a temporary numeric key instead; the `price` column itself is left
# untouched so later cleaning steps see the same values as before.
_price_sort_key = pd.to_numeric(
    # `$` and `,` are literal inside a character class, on every pandas version.
    df['price'].astype(str).str.replace(r'[$,]', '', regex=True),
    errors='coerce',  # unparseable prices become NaN and sort last
)
df = (
    df.assign(_price_sort_key=_price_sort_key)
      .sort_values(by=['host_id', '_price_sort_key'], ascending=[True, False])
      .drop_duplicates(['host_id', 'longitude', 'latitude'], keep="first")
      .drop(columns='_price_sort_key')
)

In [12]:
# Inspect the frame after sorting and de-duplication. The index is not reset,
# so row labels are the original (pre-sort) labels.
df.head()

Unnamed: 0,id,host_id,zip_code,host_neighborhood,property_type,room_type,price,latitude,longitude,rating,availability_365
4732,32666944,23,78744,McKinney,Private room in home,Private room,$98.00,30.17937,-97.75119,4.77,241
3728,23629432,23,78744,McKinney,Private room in home,Private room,$80.00,30.17879,-97.7508,4.46,265
5450,39056933,23,78744,McKinney,Tent,Entire home/apt,$40.00,30.18005,-97.75093,4.94,148
5968,42625339,796,78725,Austins' Colony,Entire cottage,Entire home/apt,$138.00,30.24147,-97.57637,,125
2533,17074156,2466,78702,East Downtown,Entire guesthouse,Entire home/apt,$135.00,30.27651,-97.71234,4.68,80


Remove the '$' sign and the comma (,) thousands separator from the price, then convert the column to an integer datatype.

In [18]:
# (rows, columns) after de-duplication.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom to confirm the figure.
df.shape

(13980, 11)

In [13]:
# Strip the currency formatting from `price` and store it as a whole-number int32.
#
# Both replacements are literal (regex=False). With regex=True a lone '$' is
# only treated as a literal character by pandas < 2.0; on pandas >= 2.0 it is
# the end-of-string anchor, nothing is removed, and the float conversion fails.
df['price'] = (
    df['price']
    .astype('str')
    .str.replace('$', '', regex=False)   # drop the currency symbol
    .str.replace(',', '', regex=False)   # drop thousands separators
    .astype(float)                       # parse e.g. "176.00"
    .astype('int32')                     # truncate any fractional part
)

In [14]:
# Downcast the integer-valued columns to int32.
for int_col in ('zip_code', 'availability_365'):
    df[int_col] = df[int_col].astype('int32')

In [15]:
# Summary statistics for the numeric columns; useful for spotting implausible
# minimum / maximum values (e.g. in price) before further analysis.
df.describe()

Unnamed: 0,id,host_id,zip_code,price,latitude,longitude,rating,availability_365
count,13980.0,13980.0,13980.0,13980.0,13980.0,13980.0,11107.0,13980.0
mean,2.660776e+17,133603600.0,78724.740844,306.371316,30.281102,-97.750496,4.786699,150.282904
std,3.487903e+17,149301000.0,20.648271,743.301475,0.064789,0.063001,0.524681,136.765878
min,5456.0,23.0,78701.0,0.0,30.07887,-98.05663,0.0,0.0
25%,23217140.0,15782490.0,78704.0,107.0,30.24174,-97.770385,4.78,0.0
50%,49458370.0,62926110.0,78723.0,180.0,30.268675,-97.74084,4.93,123.0
75%,6.718016e+17,222390500.0,78745.0,320.0,30.309495,-97.716928,5.0,298.0
max,8.473047e+17,505184800.0,78759.0,65155.0,30.51225,-97.56062,5.0,365.0


In [16]:
# Tukey-style outlier fences for price: 1.5 * IQR below Q1 and above Q3.
quant = df['price'].quantile([0.25, 0.75])
q1 = quant.loc[0.25]
q3 = quant.loc[0.75]
print(q1, q3)

IQR = q3 - q1
low, high = q1 - 1.5 * IQR, q3 + 1.5 * IQR
print(low, high)

107.0 320.0
-212.5 639.5


Export the data to a new file.

In [17]:
# Write the cleaned frame to disk (relative path, next to the notebook's working directory).
# No `index` argument is passed, so the row index is written as the first column.
df.to_csv(path_or_buf="airbnb_cleaned.csv")