In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import math

In [3]:
! ls $PWD

 Clustering.ipynb   Data  'Preprocessing Listings.csv2.ipynb'   README.md


In [4]:
#calendarOr = pd.read_csv('./Airbnb-Project/Data/calendar.csv')
listingsOr = pd.read_csv('./Data/listings.csv')
#reviewsOr = pd.read_csv('./Airbnb-Project/Data/reviews.csv')

# We start studying and cleaning listings df

In [5]:
#Potential columns to drop:
columns_to_drop = ['listing_url','scrape_id', 'last_scraped', 'scrape_id', 'last_scraped', 'name', 
                   'experiences_offered', 'picture_url','host_url', 'host_name','host_thumbnail_url', 
                   'host_picture_url', 'host_location','summary', 'space', 'description', 'neighbourhood', 
                   'neighbourhood_group_cleansed','city','state', 'zipcode', 'market', 'smart_location',
                   'neighborhood_overview','notes', 'transit', 'access', 'interaction', 'house_rules', 
                   'host_id', 'host_about', 'license', 'requires_license', 'host_neighbourhood','host_verifications',
                   'street', 'country_code', 'is_location_exact', 'guests_included', 'calendar_updated', 
                   'has_availability','calendar_last_scraped', 'host_total_listings_count', 'maximum_nights','country']
#remove for Nan
rows_to_drop=['host_since','bedrooms','bathrooms','beds','review_scores_value', 'latitude', 'longitude', 'neighbourhood_cleansed']

In [6]:
#We delete all columns that are all NaN
listings=listingsOr.dropna(axis=1,how='all')

In [7]:
#for column in columns_to_drop:
listings=listings.drop(columns_to_drop,axis=1)

In [8]:
listings=listings.dropna(axis=0,how='any',subset=rows_to_drop)

In [9]:
listings.shape

(13030, 48)

### We make sure there are no null values

In [10]:
listings.isnull().sum()

id                                      0
host_since                              0
host_response_time                   1224
host_response_rate                   1224
host_is_superhost                       0
host_listings_count                     0
host_has_profile_pic                    0
host_identity_verified                  0
neighbourhood_cleansed                  0
latitude                                0
longitude                               0
property_type                           0
room_type                               0
accommodates                            0
bathrooms                               0
bedrooms                                0
beds                                    0
bed_type                                0
amenities                               0
square_feet                         12619
price                                   0
weekly_price                        10459
monthly_price                       10611
security_deposit                  

In [11]:
#We supose null values are no superhost
listings['host_is_superhost'].fillna('f', inplace=True)
#listings['host_is_superhost']=np.where(listings['host_is_superhost']=='f', 0, 1)

In [12]:
#We supose null values are no picture
listings['host_has_profile_pic'].fillna('f', inplace=True)
#listings['host_has_profile_pic']=np.where(listings['host_has_profile_pic']=='f', 0, 1)

In [13]:
#We supose null values are no identity verified
listings['host_identity_verified'].fillna('f', inplace=True)
#listings['host_identity_verified']=np.where(listings['host_identity_verified']=='f', 0, 1)

In [14]:
#We remove square_feet column as there are too many missing values
listings=listings.drop('square_feet', axis=1)

In [15]:
#We convert weekly price and monthly price to boolean, as there are too many missing values,
#and being missing might mean that they do not have a different price
listings['weekly_price']=(listings['weekly_price'].str.replace(r'[^\d.]','').astype(float))
listings['weekly_price'].fillna(0, inplace=True)
listings['weekly_price']=np.where(listings['weekly_price']>0, 't', 'f')

In [16]:
listings['monthly_price']=(listings['monthly_price'].str.replace(r'[^\d.]','').astype(float))
listings['monthly_price'].fillna(0, inplace=True)
listings['monthly_price']=np.where(listings['monthly_price']>0, 't', 'f')

In [17]:
#We supose having missing fee might mean there is no fee
listings['security_deposit']=(listings['security_deposit'].str.replace(r'[^\d.]','').astype(float))
listings['security_deposit'].fillna(0, inplace=True)
listings['cleaning_fee']=(listings['cleaning_fee'].str.replace(r'[^\d.]','').astype(float))
listings['cleaning_fee'].fillna(0, inplace=True)

In [18]:
#We remove all rows where any of the "scores" columns is missing
reviewScoresColums = listings.filter(regex='review_scores.*').columns
listings=listings.dropna(axis=0,how='any', subset=reviewScoresColums)

In [19]:
#We supose if there is no information about the number of houses rented by the host, it is at least for sure 1
listings['host_listings_count'].fillna(1, inplace=True)

In [20]:
#We fill the missing values with the mean and the mode
listings['host_response_rate']=(listings['host_response_rate'].str.replace("%",'').astype(float))/100
listings['host_response_rate'].fillna(listings['host_response_rate'].median(),inplace=True)
listings['host_response_time'].fillna('within an hour',inplace=True)

In [21]:
print(listings['room_type'].unique())
#We only keep Entire home/apt
listings = listings[listings['room_type']=='Entire home/apt'].drop('room_type', axis=1)

['Entire home/apt' 'Private room' 'Shared room']


In [40]:
#There are no missing values now
sum(listings.isnull().sum())

0

In [41]:
listings = listings.reset_index()

### We check the data types are right

In [25]:
listings.dtypes

index                                 int64
id                                    int64
host_since                           object
host_response_time                   object
host_response_rate                  float64
host_is_superhost                    object
host_listings_count                 float64
host_has_profile_pic                 object
host_identity_verified               object
neighbourhood_cleansed               object
latitude                            float64
longitude                           float64
property_type                        object
accommodates                          int64
bathrooms                           float64
bedrooms                            float64
beds                                float64
bed_type                             object
amenities                            object
price                                object
weekly_price                         object
monthly_price                        object
security_deposit                

#### Numeric variables

In [26]:
numeric_features = ['id','latitude', 'longitude','host_response_rate','host_listings_count', 'accommodates', 'bathrooms',
       'bedrooms', 'beds', 'price', 'security_deposit', 'cleaning_fee', 'extra_people',
       'minimum_nights', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews','review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value','calculated_host_listings_count', 'reviews_per_month']

In [27]:
listings['price']=(listings['price'].str.replace(r'[^\d.]','').astype(float))

In [28]:
listings['extra_people']=(listings['extra_people'].str.replace(r'[^\d.]','').astype(float))

In [51]:
listings.shape

(8570, 48)

#### Amenities

In [29]:
listings['amenities'] = listings['amenities'].map(
    lambda amns: "|".join([amn.replace("}", "").replace("{", "").replace('"', "")\
                           for amn in amns.split(",")])
)

In [30]:
amenities = np.unique(np.concatenate(listings['amenities'].map(lambda amns: amns.split("|")).values))[1:-2] 
#First amenity is empty string and last 2 say "translation missing"
amenities_matrix = np.array([listings['amenities'].map(lambda amns: 1 if amn in amns else 0) for amn in amenities])

In [31]:
Amenities_columns =np.array(["Amen_" + amenity  for amenity in amenities])

#### Categorical variables

In [42]:
categorical_features = ['neighbourhood_cleansed', 'property_type', 'bed_type']

In [47]:
features = pd.concat([listings[numeric_features],listings[categorical_features],pd.DataFrame(data=amenities_matrix.T, columns=Amenities_columns)], axis=1)

In [48]:
features.shape

(8570, 142)

#### Boolean categories

In [49]:
for tf_feature in ['host_is_superhost', 'host_identity_verified', 'host_has_profile_pic',
                   'instant_bookable','require_guest_profile_picture', 'require_guest_phone_verification', 
                   'weekly_price', 'monthly_price', 'is_business_travel_ready', 
                   'require_guest_profile_picture', 'require_guest_phone_verification']:
    features[tf_feature] = listings[tf_feature].map(lambda s: 0 if s == "f" else 1)

In [50]:
features.shape

(8570, 151)

In [53]:
features.to_csv('./Data/listings_clean.csv', index=False)