In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas deprecation / FutureWarning messages.
warnings.filterwarnings('ignore')

In [3]:
# Load the raw listings export; the path is relative to the working directory.
data = pd.read_csv('data/listings_march16_2023.csv')

In [4]:
# Preview the first rows of the raw listings.
data.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,5456,https://www.airbnb.com/rooms/5456,20230316044216,2023-03-16,city scrape,"Walk to 6th, Rainey St and Convention Ctr",Great central location for walking to Convent...,My neighborhood is ideally located if you want...,https://a0.muscache.com/pictures/14084884/b5a3...,8028,...,4.82,4.73,4.79,,f,2,2,0,0,3.69
1,5769,https://www.airbnb.com/rooms/5769,20230316044216,2023-03-16,previous scrape,NW Austin Room,<b>The space</b><br />Looking for a comfortabl...,Quiet neighborhood with lots of trees and good...,https://a0.muscache.com/pictures/23822033/ac94...,8186,...,4.94,4.76,4.92,,f,1,0,1,0,1.75
2,6413,https://www.airbnb.com/rooms/6413,20230316044216,2023-03-16,previous scrape,Gem of a Studio near Downtown,"Great studio apartment, perfect a single perso...",Travis Heights is one of the oldest neighborho...,https://a0.muscache.com/pictures/miso/Hosting-...,13879,...,4.98,4.87,4.93,,f,1,1,0,0,0.76
3,6448,https://www.airbnb.com/rooms/6448,20230316044216,2023-03-16,city scrape,"Secluded Studio @ Zilker - King Bed, Bright & ...","Clean, private space with everything you need ...",The neighborhood is fun and funky (but quiet)!...,https://a0.muscache.com/pictures/4513152/4ffc1...,14156,...,4.97,4.97,4.9,,t,1,1,0,0,2.1
4,8502,https://www.airbnb.com/rooms/8502,20230316044216,2023-03-17,city scrape,Woodland Studio Lodging,Studio rental on lower level of home located i...,,https://a0.muscache.com/pictures/miso/Hosting-...,25298,...,4.86,4.64,4.57,,f,1,1,0,0,0.3


In [5]:
# Columns of the raw listings frame that are kept for the analysis.
mask1 = [
    # identifiers
    "id",
    "host_id",
    # location labels
    "neighbourhood_cleansed",
    "host_neighbourhood",
    # property description
    "property_type",
    "room_type",
    "accommodates",
    "bedrooms",
    "beds",
    # booking terms, rating and price
    "minimum_nights",
    "review_scores_rating",
    "price",
    # coordinates
    "longitude",
    "latitude",
    # activity
    "number_of_reviews",
    "availability_365",
]

In [6]:
# Keep only the selected columns and give four of them shorter names.
column_renames = {
    "neighbourhood_cleansed": "zip_code",
    "host_neighbourhood": "neighborhood",
    "review_scores_rating": "rating",
    "number_of_reviews": "total_reviews",
}
df = data[mask1].rename(columns=column_renames)
df.head()

Unnamed: 0,id,host_id,zip_code,neighborhood,property_type,room_type,accommodates,bedrooms,beds,minimum_nights,rating,price,longitude,latitude,total_reviews,availability_365
0,5456,8028,78702,East Downtown,Entire guesthouse,Entire home/apt,3,1.0,2.0,2,4.84,$176.00,-97.73441,30.26057,630,298
1,5769,8186,78729,SW Williamson Co.,Private room in home,Private room,2,1.0,1.0,1,4.9,$42.00,-97.78422,30.45697,275,0
2,6413,13879,78704,Travis Heights,Entire guesthouse,Entire home/apt,2,,1.0,30,4.97,$109.00,-97.73587,30.24885,122,0
3,6448,14156,78704,Zilker,Entire guesthouse,Entire home/apt,2,1.0,2.0,3,4.97,$240.00,-97.76487,30.26034,295,133
4,8502,25298,78741,East Riverside,Entire guest suite,Entire home/apt,2,1.0,1.0,4,4.53,$85.00,-97.73682,30.23466,48,36


Drop the duplicates based on host id, longitude, and latitude. The same longitude and latitude for the same host means it is the same property.

In [7]:
# For each (host, longitude, latitude) keep the listing with the highest price.
# `price` is still a string such as '$1,200.00' at this point, so sorting on it
# directly would compare text ('$99.00' > '$1,000.00'). Sort on a temporary
# numeric copy instead and drop it again afterwards.
_price_numeric = pd.to_numeric(
    df['price'].astype(str).str.replace(r'[$,]', '', regex=True),
    errors='coerce',  # unparsable prices become NaN and sort last
)
df = (
    df.assign(_price_numeric=_price_numeric)
      .sort_values(by=['host_id', '_price_numeric'], ascending=[True, False])
      .drop_duplicates(['host_id', 'longitude', 'latitude'], keep="first")
      .drop(columns='_price_numeric')
)

In [8]:
# Rows and columns left after removing duplicate properties.
df.shape

(13980, 16)

Remove the '$' sign and the thousands separator (,) from the price, then convert it to an integer.

In [9]:
# Turn price strings such as '$1,200.00' into whole-dollar integers.
# The replacements are literal (regex=False): as a regular expression '$' is
# the end-of-string anchor, so on pandas >= 2.0 `regex=True` would leave the
# dollar sign in place and the float conversion would fail.
df['price'] = (
    df['price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)     # parse the decimal text first ...
    .astype('int32')   # ... then truncate to whole dollars
)

In [10]:
# Store the zip code and the yearly availability as 32-bit integers.
df = df.astype({'zip_code': 'int32', 'availability_365': 'int32'})

In [11]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,id,host_id,zip_code,accommodates,bedrooms,beds,minimum_nights,rating,price,longitude,latitude,total_reviews,availability_365
count,13980.0,13980.0,13980.0,13980.0,13259.0,13852.0,13980.0,11107.0,13980.0,13980.0,13980.0,13980.0,13980.0
mean,2.660776e+17,133603600.0,78724.740844,4.816524,2.068029,2.634926,7.352575,4.786699,306.371316,-97.750496,30.281102,36.088412,150.282904
std,3.487903e+17,149301000.0,20.648271,3.160675,1.295426,2.524686,24.894146,0.524681,743.301475,0.063001,0.064789,77.671647,136.765878
min,5456.0,23.0,78701.0,0.0,1.0,1.0,1.0,0.0,0.0,-98.05663,30.07887,0.0,0.0
25%,23217140.0,15782490.0,78704.0,2.0,1.0,1.0,1.0,4.78,107.0,-97.770385,30.24174,1.0,0.0
50%,49458370.0,62926110.0,78723.0,4.0,2.0,2.0,2.0,4.93,180.0,-97.74084,30.268675,8.0,123.0
75%,6.718016e+17,222390500.0,78745.0,6.0,3.0,3.0,3.0,5.0,320.0,-97.716928,30.309495,34.0,298.0
max,8.473047e+17,505184800.0,78759.0,16.0,23.0,132.0,1100.0,5.0,65155.0,-97.56062,30.51225,1099.0,365.0


In [12]:
# Disabled: 1.5 * IQR lower/upper fences for `price` (kept for reference only).
# quant = df.price.quantile([0.25,0.75])
# q1, q3 = quant[0.25], quant[0.75]
# print(q1,q3)
# IQR = q3-q1
# low  = q1-1.5*IQR
# high = q3+1.5*IQR
# print(low,high)

In [13]:
# Count the missing values in every column.
df.isna().sum()

id                     0
host_id                0
zip_code               0
neighborhood        1811
property_type          0
room_type              0
accommodates           0
bedrooms             721
beds                 128
minimum_nights         0
rating              2873
price                  0
longitude              0
latitude               0
total_reviews          0
availability_365       0
dtype: int64

There must be at least 1 bed even if the property type is a studio, so missing values in both the beds and bedrooms features are set to 1. Missing ratings are filled with the median rating.

In [14]:
# Fill the gaps column by column: the median for the rating, 1 for bedrooms
# and beds.
fill_values = {
    'rating': df['rating'].median(),
    'bedrooms': 1,
    'beds': 1,
}
df.fillna(value=fill_values, inplace=True)

Now, let's find each missing value in the neighborhood column and get its corresponding zip code. We will assign a neighborhood based on its zip code.

But first, let's see which zip codes are missing neighborhood values.

In [15]:
# Number of listings without a neighborhood, per zip code (most frequent first).
# The column names are set explicitly instead of renaming the defaults produced
# by `value_counts().reset_index()`, because those defaults differ between
# pandas 1.x ('index', 'zip_code') and pandas 2.x ('zip_code', 'count').
x = (
    df.loc[df['neighborhood'].isna(), 'zip_code']
      .value_counts()
      .rename_axis('zip_code')
      .reset_index(name='Missing')
)
x

Unnamed: 0,zip_code,Missing
0,78704,128
1,78734,117
2,78744,111
3,78702,106
4,78724,104
5,78745,90
6,78737,88
7,78748,87
8,78758,59
9,78754,59


Based on the zip code, corresponding neighborhood is mapped below.

In [16]:
# Zip code -> neighborhood name used to fill in missing neighborhoods.
# Entries are listed in ascending zip-code order.
mapping = {
    78701: "Downtown Austin",
    78702: "East Cesar Chavez",
    78703: "Clarksville",
    78704: "Zilker",
    78705: "University of Texas at Austin",
    78717: "Avery Ranch",
    78719: "Moores Crossings",
    78721: "MLK",
    78722: "Cherrywood, Delwood",
    78723: "Windsor Park",
    78724: "Colony Park, East End",
    78725: "Hornsby Bend",
    78726: "Steiner Ranch, River Place",
    78727: "North Austin, Scofield Farms",
    78728: "Wells Branch",
    78729: "Anderson Mill",
    78730: "West Austin, Four Points",
    78731: "Northwest Hills, Balcones Park",
    78732: "Steiner Ranch, River Place",
    78733: "Rob Roy",
    78734: "Lakeway",
    78735: "Oak Hill, Barton Creek",
    78736: "Oak Hill",
    78737: "Bear Creek, Barton Creek",
    78738: "Lake Pointe",
    78739: "Circle C Ranch, Shady Hollow",
    78741: "Montopolis, Parker Lane",
    78742: "Del Valle",
    78744: "McKinney",
    78745: "Cherry Creek, Westgate",
    78746: "West Lake Hills",
    78747: "Onion Creek, Bluff Springs",
    78748: "Shady Hollow, Tanglewood Forest",
    78749: "Circle C",
    78750: "Anderson Mill, Jollyville",
    78751: "Hyde Park, North Loop",
    78752: "St. Johns",
    78753: "Heritage Hills",
    78754: "Pioneer Hill, Copperfield",
    78756: "Rosedale, Allandale",
    78757: "Brentwood, Crestview",
    78758: "North Burnet",
    78759: "Great Hills",
}

Now all the missing neighborhood values will be replaced with a name based on its zip code.

In [17]:
# Fill every missing neighborhood with the name mapped to the row's zip code.
# This is done in one vectorized step: the row loop it replaces re-scanned the
# whole frame for each missing row and detected NaN with an `is np.nan`
# identity test, which misses other missing-value markers.
# Zip codes absent from `mapping` map to NaN, so those rows stay missing.
missing_neighborhood = df['neighborhood'].isna()
df.loc[missing_neighborhood, 'neighborhood'] = (
    df.loc[missing_neighborhood, 'zip_code'].map(mapping)
)

In [18]:
# Re-count the missing values after the fills above.
df.isna().sum()

id                  0
host_id             0
zip_code            0
neighborhood        0
property_type       0
room_type           0
accommodates        0
bedrooms            0
beds                0
minimum_nights      0
rating              0
price               0
longitude           0
latitude            0
total_reviews       0
availability_365    0
dtype: int64

Replace different names that refer to the same location with one common name.

In [19]:
# Variant neighborhood label -> common label. Several variants may share one
# target so that they end up under a single name.
mapping2 = {
    'Allendale': 'Allandale',
    'Anderson Mill Village': 'Anderson Mill',
    'Anderson Mill Village South': 'Anderson Mill',
    'Armstrong Park': 'Armstrong_Park_Point',
    'Armstrong Point': 'Armstrong_Park_Point',
    'Austin Hills': 'Austin_Hills_Lake_Estates',
    'Austin Lake Estates': 'Austin_Hills_Lake_Estates',
    'Austin Lake Hills': 'Austin_Hills_Lake_Estates',
    'Barton Creek Highlands': 'Barton Creek',
    'Bear Creek, Barton Creek': 'Barton Creek',
    'Brentwood, Crestview': 'Brentwood',
    'Cherry Creek, Westgate': 'Cherry Creek',
    'Circle C Ranch': 'Circle C',
    'Circle C Ranch, Shady Hollow': 'Circle C',
    'Colony Park, East End': 'Colony Park',
    'Greenslopes At Lake Creek': 'Greenslopes',
    'Hillside/University Meadows/Ridge Wood Park/North Stonewall Terrace': 'Hillside',
    'MLK & 183': 'MLK 183',
    'MLK-183': 'MLK 183',
    'Hyde Park, North Loop': 'Hyde Park',
    'Montopolis, Parker Lane': 'Montopolis',
    'North': 'North Side',
    'Northside': 'North Side',
    'Northwest Hills, Balcones Park': 'Northwest Hills',
    'Northwest Hills Northwest Oaks': 'Northwest Hills',
    'Oak Hill, Barton Creek': 'Oak Hill',
    'Onion Creek, Bluff Springs': 'Onion Creek',
    'Pacifica': 'Pacific',
    'Pioneer Hill, Copperfield': 'Pioneer Hill',
    'Rosedale, Allandale': 'Rosedale',
    'Shady Hollow, Tanglewood Forest': 'Shady Hollow',
    'Southside': 'South Side',
    'St. John\'s, North Loop': 'St. Johns',
    'Steiner Ranch, River Place': 'Steiner Ranch',
    # Target spelling fixed ('Autin' -> 'Austin') so it matches the
    # 'University of Texas at Austin' label used elsewhere in the notebook.
    'University of Texas': 'University of Texas at Austin',
    'West Austin, Four Points': 'West Austin',
    'Downtown': 'Downtown Austin',
}

# Whole-value replacement: only cells that exactly equal a key are changed.
df['neighborhood'] = df['neighborhood'].replace(mapping2)

In [20]:
# Display the cleaned frame.
df

Unnamed: 0,id,host_id,zip_code,neighborhood,property_type,room_type,accommodates,bedrooms,beds,minimum_nights,rating,price,longitude,latitude,total_reviews,availability_365
4732,32666944,23,78744,McKinney,Private room in home,Private room,2,1.0,1.0,2,4.77,98,-97.751190,30.179370,13,241
3728,23629432,23,78744,McKinney,Private room in home,Private room,2,1.0,1.0,1,4.46,80,-97.750800,30.178790,13,265
5450,39056933,23,78744,McKinney,Tent,Entire home/apt,2,1.0,1.0,1,4.94,40,-97.750930,30.180050,18,148
5968,42625339,796,78725,Austins' Colony,Entire cottage,Entire home/apt,4,2.0,4.0,1,4.93,138,-97.576370,30.241470,0,125
2533,17074156,2466,78702,East Downtown,Entire guesthouse,Entire home/apt,3,1.0,1.0,30,4.68,135,-97.712340,30.276510,31,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14267,844180241447942467,504557324,78759,North Lamar,Private room in home,Private room,3,1.0,1.0,1,4.93,39,-97.708730,30.413301,0,211
14268,844181974117248791,504557324,78758,North Lamar,Private room in home,Private room,3,1.0,1.0,1,4.93,39,-97.686790,30.390184,0,206
14270,844198837125110871,504764759,78741,East Riverside - Oltorf,Private room in rental unit,Private room,1,1.0,1.0,1,4.93,97,-97.722979,30.242562,0,40
14337,845760861665800487,505094563,78741,East Riverside - Oltorf,Private room in rental unit,Private room,2,1.0,1.0,1,4.93,120,-97.723852,30.241474,0,364


Export the data to a new file.

In [21]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv("data/airbnb_cleaned.csv", index=False)