In [30]:
import pandas as pd
import numpy as np

#### Reading the file

In [31]:
# Load the listings CSV into the working DataFrame.
df = pd.read_csv("data/boston.csv")
# Preview three random rows; the fixed seed keeps the preview identical
# across Restart & Run All.
df.sample(3, random_state=0)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
798,12602632,https://www.airbnb.com/rooms/12602632,20191204162830,2019-12-04,Lovely and Warm 2 Bedroom Condo (Close to the T),Enjoy our historic 1905 home in the wonderful ...,"The space if ideal for solo travelers, couples...",Enjoy our historic 1905 home in the wonderful ...,none,"Jamaica Plain (affectionately called ""JP"") has...",...,f,f,moderate,f,f,1,1,0,0,0.54
3348,39604562,https://www.airbnb.com/rooms/39604562,20191204162830,2019-12-04,"Over-sized historic one bedroom MGH,BU,MIT",Over sized one bedroom condo located in pristi...,,Over sized one bedroom condo located in pristi...,none,95 walk score and 96 transportation score.,...,t,f,strict_14_with_grace_period,f,f,10,10,0,0,
1804,24080196,https://www.airbnb.com/rooms/24080196,20191204162830,2019-12-04,A room,My house locates outside the Savin Hill statio...,,My house locates outside the Savin Hill statio...,none,,...,f,f,flexible,f,f,4,0,4,0,5.92


In [32]:
# Report (rows, columns) of the frame as loaded, before any column is dropped.
df.shape

(3507, 106)

In [33]:
# # Drop rows whose host location is not Boston (currently disabled)
# df = df[df["host_location"] == "Boston, Massachusetts, United States"]


#### columns to drop

|Column|Reason|
|-----|-------|
|id<br/>host_id<br/>scrape_id<br/>country_code<br/>country<br/>state<br/>zipcode|not relevant/same value|
|name<br/>listing_url<br/>summary<br/>space<br/>description<br/>neighborhood_overview<br/>notes<br/>transit<br/>access<br/>interaction<br/>house_rules<br/>picture_url<br/>host_about<br/>host_response_time<br/>host_picture_url<br/>host_verifications<br/>host_url<br/>host_name<br/>host_thumbnail_url<br/>amenities<br/>calendar_updated<br/>license<br/>jurisdiction_names<br/>host_location<br/>street<br/>market<br/>smart_location|Textual data|
|neighbourhood_cleansed<br/>calendar_last_scraped<br/>host_neighbourhood<br/>host_total_listings_count|duplicate of another column|
|experiences_offered<br/>thumbnail_url<br/>medium_url<br/>xl_picture_url<br/>host_acceptance_rate<br/>neighbourhood_group_cleansed|null columns|
|square_feet<br/>weekly_price<br/>monthly_price|not enough values|



In [34]:
# Columns to remove from `df`, grouped by the reason for dropping them.

# Not relevant, or holding the same value for every row.
irrelevant_cols = [
    "id", "scrape_id", "host_id",
    "country_code", "country", "state", "zipcode",
]

# Textual data.
textual_cols = [
    "name", "listing_url",
    "summary", "space",
    "description", "neighborhood_overview",
    "notes", "transit", "access", "interaction",
    "house_rules", "picture_url", "host_about",
    "host_response_time", "host_picture_url",
    "host_verifications", "host_url", "host_name",
    "host_thumbnail_url", "amenities", "calendar_updated",
    "license", "jurisdiction_names",
    "host_location", "street", "market",
    "smart_location",
]

# Duplicates of another column.
duplicate_cols = [
    "neighbourhood_cleansed",
    "calendar_last_scraped", "host_neighbourhood",
    "host_total_listings_count",
]

# Null columns.
null_cols = [
    "experiences_offered", "thumbnail_url",
    "medium_url", "xl_picture_url", "host_acceptance_rate",
    "neighbourhood_group_cleansed",
]

# Columns with not enough values.
not_enough_cols = ["square_feet", "weekly_price", "monthly_price"]

dropped_cols = (
    textual_cols
    + duplicate_cols
    + irrelevant_cols
    + null_cols
    + not_enough_cols
)

# Rebind `df` to the reduced frame instead of mutating it with inplace=True;
# `columns=` states the axis explicitly. A KeyError is still raised if any
# listed column is missing, so typos in the lists above are not hidden.
df = df.drop(columns=dropped_cols)

In [35]:
# Report (rows, columns) again, now that the unwanted columns have been dropped.
df.shape

(3507, 59)

#### Handling Categorical data

|Column|Categories|
|-----|-------|
|last_scraped|2019-12-04<br/>2019-12-04|
|neighbourhood|Allston-Brighton<br/>Dorchester<br/>Back Bay|
|city|Boston<br/>Dorchester<br/>Allston|
|property_type|Apartment<br/>House<br/>Condominium|
|room_type|Entire home/apt<br/>Private room<br/>Hotel room|
|bed_type|Real Bed<br/>Futon<br/>Airbed|

In [36]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns whose values are replaced by LabelEncoder codes.
label_encoded_cols = [
    "last_scraped",
    "neighbourhood",
    "city",
    "property_type",
    "room_type",
    "bed_type",
]

# A single encoder is re-fitted for each column in turn, so after the loop
# `le` only holds the classes of the last column ("bed_type").
le = LabelEncoder()
for col_name in label_encoded_cols:
    df[col_name] = le.fit_transform(df[col_name])

#### Turning t/f into boolean

In [37]:

# Flag columns that hold "t"/"f" markers and are turned into booleans.
tf_flag_cols = [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "is_location_exact",
]

# Only the exact value "t" becomes True; anything else, including missing
# values, becomes False.
for flag_col in tf_flag_cols:
    df[flag_col] = df[flag_col] == "t"
