In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

In [0]:
# file of data
file_loc = 'http://data.insideairbnb.com/united-states/dc/washington-dc/2015-10-03/data/listings.csv.gz'

In [9]:
df = pd.read_csv('listingsairbnb.csv')
print(df.shape)
print(df.describe())
df.head()

(3723, 11)
       accomodates    bathrooms  ...  maximum_nights  review_scores_rating
count  3723.000000  3696.000000  ...    3.723000e+03            2855.00000
mean      3.195004     1.256358  ...    5.803069e+05              93.42627
std       2.012216     0.585539  ...    3.519552e+07               7.77311
min       1.000000     0.000000  ...    1.000000e+00              30.00000
25%       2.000000     1.000000  ...    1.200000e+02              90.00000
50%       2.000000     1.000000  ...    1.125000e+03              95.00000
75%       4.000000     1.000000  ...    1.125000e+03             100.00000
max      16.000000     8.000000  ...    2.147484e+09             100.00000

[8 rows x 8 columns]


Unnamed: 0,host_since,state,accomodates,bathrooms,bedrooms,beds,square_feet,price,minimum_nights,maximum_nights,review_scores_rating
0,5/21/14,DC,4,1.0,1.0,2.0,,$160.00,1,1125,
1,3/5/13,DC,6,3.0,3.0,3.0,,$350.00,2,30,94.0
2,12/9/11,MD,1,2.0,1.0,1.0,,$50.00,2,1125,
3,6/18/14,DC,2,1.0,1.0,1.0,,$95.00,1,1125,
4,3/31/15,MD,4,1.0,1.0,1.0,,$50.00,7,1125,


In [10]:
# test finding abs value of diff between observed vs predicted
acc_val = 3
first_living_space = df.loc[0,'accomodates']
first_distance = np.abs(first_living_space - acc_val)
print(first_distance)

1


In [11]:
df['distance'] = np.abs(df.accomodates - acc_val)
df.distance.value_counts().sort_index()

0      461
1     2294
2      503
3      279
4       35
5       73
6       17
7       22
8        7
9       12
10       2
11       4
12       6
13       8
Name: distance, dtype: int64

In [12]:
# select random sample of accomodate data, random-state to reproduce same data
df = df.sample(frac=1, random_state=0)
df = df.sort_values('distance')
df.price.head()


2645     $75.00 
2825    $120.00 
2145     $90.00 
2541     $50.00 
3349    $105.00 
Name: price, dtype: object

In [13]:
# clean price column
df['price'] = df.price.str.replace("\$|,",'').astype(float)
mean = df.price.iloc[:5].mean()
print(mean)

88.0


In [0]:
# split data training and test sets
df.drop('distance',axis=1)
train_df = df.copy().iloc[:2792]
test_df = df.copy().iloc[2792:]

In [0]:
# naive price prediction
def predict_price(new_listing_value, feature_column):
  temp_df = train_df
  temp_df['distance'] = np.abs(df[feature_column] - new_listing_value)
  temp_df = temp_df.sort_values('distance') # sort to get 5 closest neighbors
  knn_5 = temp_df.price.iloc[:5]
  pred_price = knn_5.mean()
  return(pred_price)

In [0]:
# apply function on df and new col for predicted prices computed
test_df['predicted_price'] = test_df.accomodates.apply(predict_price, feature_column='accomodates')

In [19]:
# find rmse, sbsolute diff between predicted and actual prices
test_df['squared_error'] = (test_df['predicted_price'] - test_df['price'])**(2)
mse = test_df['squared_error'].mean()
rmse = mse **(1/2)
rmse

212.98927967051543

In [22]:
# add other predicted variables
for feature in ['accomodates', 'bedrooms','bathrooms','review_scores_rating']:
  test_df['predicted_price'] = test_df.accomodates.apply(predict_price, feature_column=feature)
  test_df['squared_error'] = (test_df['predicted_price'] - test_df['price'])**(2)
  mse = test_df['squared_error'].mean()
  rmse = mse ** (1/2)
  print("RMSE for the {} column: {}".format(feature,rmse))

RMSE for the accomodates column: 212.98927967051543
RMSE for the bedrooms column: 216.49048609414763
RMSE for the bathrooms column: 216.89419042215684
RMSE for the review_scores_rating column: 266.2897943312584


In [23]:
df.shape

(3723, 12)

In [24]:
df.index

Int64Index([2645, 2825, 2145, 2541, 3349, 2228, 1122, 2311,  625, 2312,
            ...
            2106, 2757,  611, 1818, 1402,  763, 2560, 1594, 1224, 1596],
           dtype='int64', length=3723)

In [25]:
df.head()

Unnamed: 0,host_since,state,accomodates,bathrooms,bedrooms,beds,square_feet,price,minimum_nights,maximum_nights,review_scores_rating,distance
2645,12/22/12,DC,3,1.0,1.0,1.0,,75.0,7,180,85.0,0
2825,9/12/15,DC,3,2.0,3.0,2.0,,120.0,1,1125,,0
2145,5/24/14,DC,3,2.0,1.0,2.0,,90.0,1,1125,98.0,0
2541,11/3/12,DC,3,1.0,1.0,1.0,,50.0,1,1125,100.0,0
3349,3/7/15,DC,3,1.0,1.0,1.0,,105.0,1,1125,91.0,0


In [38]:
for c in df.columns:
  print(c)
  print(df[c].isnull().sum())

host_since
0
state
0
accomodates
0
bathrooms
27
bedrooms
21
beds
11
square_feet
3641
price
0
minimum_nights
0
maximum_nights
0
review_scores_rating
868
distance
0


AttributeError: ignored