In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import math

In [3]:
! ls $PWD

 Columns.txt   derby.log		   metastore_db
 Data	       Inside-Airbnb-EP17-master  'Preprocessing Listings.csv.ipynb'


In [4]:
calendarOr = pd.read_csv('./Data/calendar.csv')
listingsOr = pd.read_csv('./Data/listings.csv')
reviewsOr = pd.read_csv('./Data/reviews.csv')

# We start studying and cleaning listings df

In [6]:
#Potential columns to drop:
columns_to_drop = ['listing_url','scrape_id', 'last_scraped', 'scrape_id', 'last_scraped', 'name', 
                   'experiences_offered', 'picture_url','host_url', 'host_name','host_thumbnail_url', 
                   'host_picture_url', 'host_location','summary', 'space', 'description', 'neighbourhood', 
                   'neighbourhood_group_cleansed','city','state', 'zipcode', 'market', 'smart_location',
                   'neighborhood_overview','notes', 'transit', 'access', 'interaction', 'house_rules', 
                   'host_id', 'host_about', 'license', 'requires_license', 'host_neighbourhood','host_verifications',
                   'street', 'country_code', 'is_location_exact', 'guests_included', 'calendar_updated', 
                   'has_availability','calendar_last_scraped', 'host_total_listings_count', 'maximum_nights','country']
#remove for Nan
rows_to_drop=['host_since','bedrooms','bathrooms','beds','review_scores_value', 'latitude', 'longitude', 'neighbourhood_cleansed']

In [7]:
#We delete all columns that are all NaN
listings=listingsOr.dropna(axis=1,how='all')

In [8]:
#for column in columns_to_drop:
listings=listings.drop(columns_to_drop,axis=1)

In [9]:
listings=listings.dropna(axis=0,how='any',subset=rows_to_drop)

In [10]:
listings.shape

(13030, 48)

### We make sure there are no null values

In [11]:
listings.isnull().sum()

id                                      0
host_since                              0
host_response_time                   1224
host_response_rate                   1224
host_is_superhost                       0
host_listings_count                     0
host_has_profile_pic                    0
host_identity_verified                  0
neighbourhood_cleansed                  0
latitude                                0
longitude                               0
property_type                           0
room_type                               0
accommodates                            0
bathrooms                               0
bedrooms                                0
beds                                    0
bed_type                                0
amenities                               0
square_feet                         12619
price                                   0
weekly_price                        10459
monthly_price                       10611
security_deposit                  

In [12]:
#We supose null values are no superhost
listings['host_is_superhost'].fillna('f', inplace=True)
#listings['host_is_superhost']=np.where(listings['host_is_superhost']=='f', 0, 1)

In [13]:
#We supose null values are no picture
listings['host_has_profile_pic'].fillna('f', inplace=True)
#listings['host_has_profile_pic']=np.where(listings['host_has_profile_pic']=='f', 0, 1)

In [14]:
#We supose null values are no identity verified
listings['host_identity_verified'].fillna('f', inplace=True)
#listings['host_identity_verified']=np.where(listings['host_identity_verified']=='f', 0, 1)

In [15]:
#We remove square_feet column as there are too many missing values
listings=listings.drop('square_feet', axis=1)

In [16]:
#We convert weekly price and monthly price to boolean, as there are too many missing values,
#and being missing might mean that they do not have a different price
listings['weekly_price']=(listings['weekly_price'].str.replace(r'[^\d.]','').astype(float))
listings['weekly_price'].fillna(0, inplace=True)
listings['weekly_price']=np.where(listings['weekly_price']>0, 't', 'f')

In [17]:
listings['monthly_price']=(listings['monthly_price'].str.replace(r'[^\d.]','').astype(float))
listings['monthly_price'].fillna(0, inplace=True)
listings['monthly_price']=np.where(listings['monthly_price']>0, 't', 'f')

In [18]:
#We supose having missing fee might mean there is no fee
listings['security_deposit']=(listings['security_deposit'].str.replace(r'[^\d.]','').astype(float))
listings['security_deposit'].fillna(0, inplace=True)
listings['cleaning_fee']=(listings['cleaning_fee'].str.replace(r'[^\d.]','').astype(float))
listings['cleaning_fee'].fillna(0, inplace=True)

In [19]:
#We remove all rows where any of the "scores" columns is missing
reviewScoresColums = listings.filter(regex='review_scores.*').columns
listings=listings.dropna(axis=0,how='any', subset=reviewScoresColums)

In [20]:
#We supose if there is no information about the number of houses rented by the host, it is at least for sure 1
listings['host_listings_count'].fillna(1, inplace=True)

In [21]:
#We fill the missing values with the mean and the mode
listings['host_response_rate']=(listings['host_response_rate'].str.replace("%",'').astype(float))/100
listings['host_response_rate'].fillna(listings['host_response_rate'].median(),inplace=True)
listings['host_response_time'].fillna('within an hour',inplace=True)

In [22]:
#There are no missing values now
listings.isnull().sum()

id                                  0
host_since                          0
host_response_time                  0
host_response_rate                  0
host_is_superhost                   0
host_listings_count                 0
host_has_profile_pic                0
host_identity_verified              0
neighbourhood_cleansed              0
latitude                            0
longitude                           0
property_type                       0
room_type                           0
accommodates                        0
bathrooms                           0
bedrooms                            0
beds                                0
bed_type                            0
amenities                           0
price                               0
weekly_price                        0
monthly_price                       0
security_deposit                    0
cleaning_fee                        0
extra_people                        0
minimum_nights                      0
availability

### We check the data types are right

In [23]:
listings.dtypes

id                                    int64
host_since                           object
host_response_time                   object
host_response_rate                  float64
host_is_superhost                    object
host_listings_count                 float64
host_has_profile_pic                 object
host_identity_verified               object
neighbourhood_cleansed               object
latitude                            float64
longitude                           float64
property_type                        object
room_type                            object
accommodates                          int64
bathrooms                           float64
bedrooms                            float64
beds                                float64
bed_type                             object
amenities                            object
price                                object
weekly_price                         object
monthly_price                        object
security_deposit                

#### Numeric variables

In [56]:
numeric_features = ['id','latitude', 'longitude','host_response_rate','host_listings_count', 'accommodates', 'bathrooms',
       'bedrooms', 'beds', 'price', 'security_deposit', 'cleaning_fee', 'extra_people',
       'minimum_nights', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews','review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value','calculated_host_listings_count', 'reviews_per_month']

In [25]:
listings['price']=(listings['price'].str.replace(r'[^\d.]','').astype(float))

In [26]:
listings['extra_people']=(listings['extra_people'].str.replace(r'[^\d.]','').astype(float))

#### Amenities

In [27]:
listings['amenities'] = listings['amenities'].map(
    lambda amns: "|".join([amn.replace("}", "").replace("{", "").replace('"', "")\
                           for amn in amns.split(",")])
)

In [60]:
amenities = np.unique(np.concatenate(listings['amenities'].map(lambda amns: amns.split("|")).values))[1:-2] 
#First amenity is empty string and last 2 say "translation missing"
amenities_matrix = np.array([listings['amenities'].map(lambda amns: 1 if amn in amns else 0) for amn in amenities])

In [61]:
Amenities_columns =np.array(["Amen_" + amenity  for amenity in amenities])

In [62]:
features = pd.concat([listings[numeric_features], pd.DataFrame(data=amenities_matrix.T, columns=Amenities_columns)], axis=1)

In [63]:
features.shape

(15093, 141)

In [66]:
listings[numeric_features]

Unnamed: 0,id,latitude,longitude,host_response_rate,host_listings_count,accommodates,bathrooms,bedrooms,beds,price,...,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,reviews_per_month
0,18628,40.424715,-3.698638,0.60,1.0,2,1.0,0.0,1.0,54.0,...,37,89.0,9.0,9.0,9.0,10.0,10.0,9.0,1,0.40
1,19864,40.413418,-3.706838,1.00,1.0,2,1.0,0.0,1.0,65.0,...,56,91.0,9.0,9.0,10.0,10.0,10.0,9.0,1,0.81
2,21512,40.424920,-3.713446,1.00,12.0,2,1.0,0.0,1.0,40.0,...,36,79.0,9.0,8.0,9.0,9.0,10.0,8.0,7,0.39
3,21853,40.403410,-3.740842,1.00,2.0,1,1.0,1.0,1.0,17.0,...,26,90.0,9.0,9.0,9.0,10.0,8.0,9.0,2,0.65
4,23021,40.423417,-3.712456,1.00,12.0,10,3.0,4.0,5.0,90.0,...,55,80.0,8.0,8.0,9.0,9.0,9.0,8.0,7,0.93
5,24805,40.422022,-3.703954,0.83,3.0,3,1.0,0.0,2.0,55.0,...,2,100.0,8.0,8.0,10.0,10.0,10.0,10.0,2,0.05
6,24836,40.419951,-3.697637,1.00,1.0,4,1.0,2.0,3.0,115.0,...,39,98.0,10.0,10.0,10.0,10.0,10.0,10.0,1,0.48
7,26816,40.391665,-3.688685,1.00,3.0,3,2.0,2.0,2.0,100.0,...,7,95.0,8.0,10.0,10.0,10.0,8.0,8.0,3,0.07
8,26823,40.391161,-3.688234,1.00,3.0,2,1.0,1.0,1.0,42.0,...,27,87.0,10.0,10.0,10.0,9.0,9.0,9.0,3,0.30
9,26825,40.389850,-3.690108,1.00,3.0,1,1.0,1.0,1.0,25.0,...,91,94.0,10.0,10.0,10.0,9.0,9.0,9.0,3,0.97


In [65]:
pd.DataFrame(data=amenities_matrix.T, columns=Amenities_columns)

Unnamed: 0,Amen_ toilet,Amen_24-hour check-in,Amen_Accessible-height bed,Amen_Accessible-height toilet,Amen_Air conditioning,Amen_BBQ grill,Amen_Baby bath,Amen_Baby monitor,Amen_Babysitter recommendations,Amen_Bathtub,...,Amen_Waterfront,Amen_Well-lit path to entrance,Amen_Wheelchair accessible,Amen_Wide clearance to bed,Amen_Wide clearance to shower,Amen_Wide doorway,Amen_Wide entryway,Amen_Wide hallway clearance,Amen_Window guards,Amen_Wireless Internet
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
5,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


#### Boolean categories

In [53]:
for tf_feature in ['host_is_superhost', 'host_identity_verified', 'host_has_profile_pic',
                   'instant_bookable','require_guest_profile_picture', 'require_guest_phone_verification', 
                   'weekly_price', 'monthly_price', 'is_business_travel_ready', 
                   'require_guest_profile_picture', 'require_guest_phone_verification']:
    features[tf_feature] = listings[tf_feature].map(lambda s: 0 if s == "f" else 1)

#### Categorical variables

In [87]:
categorical_features = ['neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type']
listings_categorical = pd.get_dummies(listings[categorical_features], columns=categorical_features, prefix=["Neigh", "Property", "Room", "Bed"])

In [91]:
features = pd.concat([features, listings_categorical], axis=1)

In [94]:
%pylab inline
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

Populating the interactive namespace from numpy and matplotlib


In [95]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X_enc = pca.fit_transform(features)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
py.iplot([go.Scatter3d(x=X_enc[:,0],y=X_enc[:,1], z=X_enc[:,2], mode='markers', marker=dict(color=y, size=2))])