In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

In [10]:
airbnb_data = pd.read_csv('AB_NYC_2019.csv')
airbnb_df = pd.DataFrame(airbnb_data)
airbnb_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [11]:
len(airbnb_df['id'])

48895

In [13]:
del airbnb_df['name'],airbnb_df['host_name']

In [14]:
airbnb_df.head()

Unnamed: 0,id,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


we can find out how many months the listing has been active by dividing number_of_reviews by reviews_per_month

last_review and reviews_per_month are missing values compared to the rest of the data
likely due to listings have zero reviews...

In [15]:
airbnb_df.count()

id                                48895
host_id                           48895
neighbourhood_group               48895
neighbourhood                     48895
latitude                          48895
longitude                         48895
room_type                         48895
price                             48895
minimum_nights                    48895
number_of_reviews                 48895
last_review                       38843
reviews_per_month                 38843
calculated_host_listings_count    48895
availability_365                  48895
dtype: int64

In [None]:
airbnb_df 

In [17]:
airbnb_df['reviews_per_month'] = airbnb_df['reviews_per_month'].fillna(0)
airbnb_df.head()

Unnamed: 0,id,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,0.0,1,365
3,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [28]:
airbnb_df['last_review'] = airbnb_df['last_review'].replace(np.nan, '9999-12-31') # last_review is a string
airbnb_df.head()

Unnamed: 0,id,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,9999-12-31,0.0,1,365
3,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [33]:
airbnb_df.loc[:,['price','minimum_nights','number_of_reviews','availability_365']].describe()

Unnamed: 0,price,minimum_nights,number_of_reviews,availability_365
count,48895.0,48895.0,48895.0,48895.0
mean,152.720687,7.029962,23.274466,112.781327
std,240.15417,20.51055,44.550582,131.622289
min,0.0,1.0,0.0,0.0
25%,69.0,1.0,1.0,0.0
50%,106.0,3.0,5.0,45.0
75%,175.0,5.0,24.0,227.0
max,10000.0,1250.0,629.0,365.0


Create bins on reviews/price and use groupby to try and formualte ideas about certain trends

In [60]:
#review_bins = [-1,0.9,3.9,10.9,50.9,100.9,650]
#review_bin_names = ['0','1-3','4-10','11-50','51-100','>101']
#airbnb_df['review range'] = pd.cut(airbnb_df['number_of_reviews'],review_bins,labels=review_bin_names)
#airbnb_df.head()

In [59]:
#airbnb_df['review range'].value_counts()

In [58]:
#neighbourhood_by_review = airbnb_df[['neighbourhood_group','review range','price','minimum_nights','availability_365','number_of_reviews']].groupby(['neighbourhood_group','review range']).mean()
#neighbourhood_by_review

In [61]:
price_bins = [0,49.9,99.9,149.9,249.9,499.9,10001]
price_bin_names = ['0-49','50-99','100-149','150-249','250-499','>500']
airbnb_df['price range'] = pd.cut(airbnb_df['price'],price_bins,labels=price_bin_names)
neighbourhood_by_price = airbnb_df[['neighbourhood_group','price range','number_of_reviews','minimum_nights','availability_365']].groupby(['neighbourhood_group','price range']).mean()
neighbourhood_by_price

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_reviews,minimum_nights,availability_365
neighbourhood_group,price range,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bronx,0-49,25.153355,4.690096,151.897764
Bronx,50-99,28.552063,5.084479,168.787819
Bronx,100-149,26.736527,3.91018,169.520958
Bronx,150-249,15.048387,2.774194,154.129032
Bronx,250-499,16.333333,2.366667,255.366667
Bronx,>500,2.444444,2.444444,192.444444
Brooklyn,0-49,16.468248,7.787226,98.220803
Brooklyn,50-99,23.92336,5.472348,94.243286
Brooklyn,100-149,29.251578,6.199126,96.766634
Brooklyn,150-249,27.20118,5.760326,105.750492


In [62]:
room_type_by_price = airbnb_df[['room_type','price range','number_of_reviews','minimum_nights','availability_365']].groupby(['room_type','price range']).mean()
room_type_by_price

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_reviews,minimum_nights,availability_365
room_type,price range,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Entire home/apt,0-49,29.652174,7.930435,84.495652
Entire home/apt,50-99,27.560927,8.957952,86.208524
Entire home/apt,100-149,26.096371,8.720405,93.694081
Entire home/apt,150-249,22.687189,8.30604,114.808209
Entire home/apt,250-499,17.032173,8.414095,137.505581
Entire home/apt,>500,12.269524,7.921905,180.060952
Private room,0-49,20.236228,7.892857,101.781046
Private room,50-99,26.099252,4.99599,107.513072
Private room,100-149,26.688312,3.885281,119.673779
Private room,150-249,17.505155,3.639968,130.020619
