Cleaning code for Bangkok 25 August, 2021 database of http://insideairbnb.com/get-the-data.html

In [4]:
import pandas as pd
import os
import warnings

warnings.filterwarnings("ignore")

In [5]:
data_in = "../data/"
data_out = "../data/"

In [6]:
data = pd.read_csv(data_in + "listings.csv", delimiter=",", dtype="unicode")
data.shape

(17581, 74)

In [7]:
drops = [
    "host_thumbnail_url",
    "host_picture_url",
    "listing_url",
    "picture_url",
    "host_url",
    "last_scraped",
    "description",
    "neighborhood_overview",
    "host_about",
    "host_response_time",
    "name",
    "host_location",
]
data.drop(columns=drops, inplace=True)
data.to_csv(data_in + "airbnb_bangkok_listing.csv", index=False)

In [8]:
# opening dataset
df = pd.read_csv(data_in + "airbnb_bangkok_listing.csv", delimiter=",",encoding="UTF-8")

In [9]:
# drop broken lines - where id is not a character of numbers
df.id = pd.to_numeric(df.id, errors="coerce")
df = df[df.id.notna()]

In [10]:
# display the class and type of each columns
df.dtypes

id                                                int64
scrape_id                                         int64
host_id                                           int64
host_name                                        object
host_since                                       object
                                                 ...   
calculated_host_listings_count                    int64
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
Length: 62, dtype: object

In [11]:
#####################
# formatting columns
for perc in ["host_response_rate", "host_acceptance_rate"]:
    df[perc] = pd.to_numeric(df[perc], errors="coerce")

In [12]:
# remove percentage signs
for pricevars in ["price"]:
    df[pricevars] = df[pricevars].str.replace("\\$", "")
    df[pricevars] = pd.to_numeric(df[pricevars], errors="coerce")

In [13]:
# format binary variables
for binary in [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "instant_bookable",
]:
    df[binary] = df[binary].map({"t": True, "f": False})

In [14]:
# amenities
df["amenities"] = (
    df["amenities"]
    .str.strip("{}")
    .str.replace('"', "")
    .str.replace("[", "")
    .str.replace("]", "")
    .str.replace("–", "")
    .str.replace("-", "")
    .str.replace("\\u", "'", regex=False)
    .str.replace("\d", "", regex=True)
    .str.replace("'", "")
    .str.lower()
    .str.replace("mbps", "",regex=False)
    .str.strip()
    .str.split(",")
)

In [15]:
# generate dummies by amenities
df["amenities"].explode().str.strip().value_counts()

air conditioning                  16871
long term stays allowed           16639
essentials                        15633
wifi                              15391
hangers                           13973
                                  ...  
watson shampoo                        1
samsung bluetooth sound system        1
razr bluetooth sound system           1
local brand conditioner               1
private indoor pool                   1
Name: amenities, Length: 351, dtype: int64

In [17]:
# maybe get those, that has 500 or more count ? not sure
amenities_keep = df["amenities"].explode().str.strip().value_counts().loc[lambda x: x>500]

amenities_keep.to_pickle("amenities_to_keep.pkl")

In [19]:
dummies = pd.get_dummies(df.amenities.apply(pd.Series).stack().str.strip()).sum(level=0)
df = pd.concat([df, dummies], axis=1)

In [20]:
drops = ["amenities"]
df.drop(columns=drops, inplace=True)

In [21]:
df.shape

(17581, 412)

In [22]:
# write csv
df.to_csv(data_out + "airbnb_bangkok_cleaned.csv", index=False)

In [23]:
df.isna().sum().loc[lambda x: x > 0]

host_name                           2
host_since                          2
host_response_rate              17581
host_acceptance_rate            17581
host_is_superhost                   2
host_neighbourhood               6415
host_listings_count                 2
host_total_listings_count           2
host_has_profile_pic                2
host_identity_verified              2
neighbourhood                    7395
neighbourhood_group_cleansed    17581
bathrooms                       17581
bathrooms_text                     53
bedrooms                         1940
beds                              355
price                            9796
minimum_minimum_nights              1
maximum_minimum_nights              1
minimum_maximum_nights              1
maximum_maximum_nights              1
minimum_nights_avg_ntm              1
maximum_nights_avg_ntm              1
calendar_updated                17581
first_review                     7142
last_review                      7142
review_score