# Case Study AirBnB Clustering
This notebook walks through how to segment AirBnB spaces into natural affinity clusters.

In [6]:
# Data Representation
import numpy as np
import pandas as pd

# Processing & Modeling
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


from sklearn import set_config
set_config(display='diagram')   

import statsmodels.api as sm

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.graphics.regressionplots import plot_partregress_grid, plot_regress_exog

# Allow up to 100 rows when pandas renders a long Series/DataFrame.
pd.options.display.max_rows = 100

# Fixed seed handed to stochastic estimators so reruns give the same result.
random_state = 42

In [3]:
# Google Drive share link for the listings CSV.
link = 'https://drive.google.com/file/d/1tT0lNiDHwGQPLa3N0zSdhZcJZaa5aqF3/view?usp=sharing'
# The file id is the second-to-last '/'-separated segment of the share link;
# it is appended to the "uc?export=download" URL to form the download path.
path = f"https://drive.google.com/uc?export=download&id={link.split('/')[-2]}"

In [4]:
# Download the listings CSV and index it by the listing `id` column.
listings_df = pd.read_csv(path).set_index('id')
# Peek at the first row to see which columns are available.
listings_df.head(1)

Unnamed: 0_level_0,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07


In [7]:
# Transpose the first row so each column name/value pair reads as one line.
listings_df.head(1).transpose()

id,241032
listing_url,https://www.airbnb.com/rooms/241032
scrape_id,20160104002432
last_scraped,2016-01-04
name,Stylish Queen Anne Apartment
summary,
space,Make your self at home in this charming one-be...
description,Make your self at home in this charming one-be...
experiences_offered,none
neighborhood_overview,
notes,


## First Clustering Example - By Location

In [9]:
# Count listings per jurisdiction to judge whether the column can separate listings.
listings_df['jurisdiction_names'].value_counts() # useless: only one distinct value, so it carries no information

WASHINGTON    3818
Name: jurisdiction_names, dtype: int64

In [10]:
# Count listings per neighbourhood (value_counts() leaves out NaN by default).
listings_df['neighbourhood'].value_counts() # interesting! many distinct values with varied counts

Capitol Hill                 351
Ballard                      213
Belltown                     204
Minor                        192
Queen Anne                   187
Fremont                      148
Wallingford                  143
University District          107
First Hill                   105
Stevens                       98
North Beacon Hill             95
Lower Queen Anne              83
Central Business District     81
Greenwood                     79
Columbia City                 61
Ravenna                       59
Phinney Ridge                 56
Magnolia                      55
Green Lake                    52
Atlantic                      50
North Admiral                 48
Mount Baker                   46
Leschi                        44
Eastlake                      41
Maple Leaf                    41
Madrona                       40
Pike Place Market             39
The Junction                  36
Bryant                        32
Seward Park                   32
Genesee   

In [11]:
# Location-related columns kept for the first clustering example.
columns_to_use = [
    "zipcode",
    "latitude",
    "longitude",
    "neighbourhood",
]

In [12]:
# Keep only the location columns. The explicit .copy() makes the result an
# independent DataFrame: later cells assign into it (cleaning `zipcode`, adding
# the cluster labels), and assigning into a bare column selection triggers
# pandas' SettingWithCopyWarning.
listings_df = listings_df[columns_to_use].copy()
listings_df.head()

Unnamed: 0_level_0,zipcode,latitude,longitude,neighbourhood
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
241032,98119,47.636289,-122.371025,Queen Anne
953595,98119,47.639123,-122.365666,Queen Anne
3308979,98119,47.629724,-122.369483,Queen Anne
7421966,98119,47.638473,-122.369279,Queen Anne
278830,98119,47.632918,-122.372471,Queen Anne


In [13]:
# (rows, columns) of the frame at this point in the notebook.
listings_df.shape

(3818, 4)

In [14]:
# Share of NaN values per column, as a fraction in [0, 1] (not a percentage):
# count() tallies the non-null entries, so 1 - count / n_rows is the missing share.
missing_series = 1 - listings_df.count().div(len(listings_df))
missing_series

zipcode          0.001833
latitude         0.000000
longitude        0.000000
neighbourhood    0.108958
dtype: float64

In [15]:
# Column dtypes: shows which columns are stored as generic objects (e.g. strings) vs. numbers.
listings_df.dtypes

zipcode           object
latitude         float64
longitude        float64
neighbourhood     object
dtype: object

In [28]:
# List the distinct zipcode values to scan for malformed entries.
listings_df['zipcode'].unique()

array(['98119', '98109', '98107', '98117', nan, '98103', '98105', '98115',
       '98101', '98122', '98112', '98144', '99\n98122', '98121', '98102',
       '98199', '98104', '98134', '98136', '98126', '98146', '98116',
       '98177', '98118', '98108', '98133', '98106', '98178', '98125'],
      dtype=object)

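> What is that '99\n98122' item? It looks like a malformed zipcode (two values joined by a newline), so it needs to be cleaned before the column can be used.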

In [27]:
# Frequency of each zipcode value, to see how many rows carry the malformed entry.
listings_df['zipcode'].value_counts()

98122        420
98103        394
98102        339
98105        225
98109        202
98101        201
98144        200
98121        196
98107        180
98115        178
98112        167
98117        164
98118        154
98119        143
98116        112
98104         95
98125         71
98199         66
98126         63
98106         58
98108         56
98133         46
98136         44
98177         19
98178          7
98146          5
98134          5
99\n98122      1
Name: zipcode, dtype: int64

In [36]:
# Treat the malformed '99\n98122' zipcode as missing.
listings_df['zipcode'] = listings_df['zipcode'].replace(
    to_replace='99\n98122',
    value=np.nan,
)
# Verify that the malformed value no longer appears among the distinct zipcodes.
listings_df['zipcode'].unique()

array(['98119', '98109', '98107', '98117', nan, '98103', '98105', '98115',
       '98101', '98122', '98112', '98144', '98121', '98102', '98199',
       '98104', '98134', '98136', '98126', '98146', '98116', '98177',
       '98118', '98108', '98133', '98106', '98178', '98125'], dtype=object)

In [37]:
# Preprocessing for the categorical (string) columns: impute, then one-hot encode.
#
# The imputer must run BEFORE the encoder. Placed after it, the imputer only
# ever sees the encoder's output, never the NaNs in the raw columns, and a
# "mean" strategy cannot be applied to string data in any case. Filling with a
# constant keeps "missing" as its own explicit category instead of guessing a
# zipcode or neighbourhood for those rows.
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant", fill_value="missing")),
        ("cat", OneHotEncoder(handle_unknown='ignore')),  # all-zero row for categories unseen at fit time
    ])
cat_pipeline

In [51]:
# Route the two categorical columns through the categorical pipeline.
# No `remainder` is given, so columns not listed here (latitude, longitude)
# fall under ColumnTransformer's default handling.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", cat_pipeline, ['zipcode', 'neighbourhood']),
    ],
)
preprocessor

In [50]:
# Sanity check: fit the preprocessor on the listings and inspect the encoded matrix it produces.
preprocessor.fit_transform(listings_df)

<3818x110 sparse matrix of type '<class 'numpy.float64'>'
	with 7636 stored elements in Compressed Sparse Row format>

### Note on Train/Test Splitting
> You do not always need to split the data for an unsupervised learning task like clustering. It depends on whether you have labeled data (often you do not) and on what your objective is.

### Clustering w/K-Means

In [45]:
from sklearn.cluster import KMeans

In [46]:
# K-Means model with 10 clusters; the shared seed keeps centroid initialisation reproducible.
kmeans = KMeans(n_clusters=10, random_state=random_state)

# End-to-end estimator: encode the categorical columns, then cluster the result.
kmeans_estimator = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("kmeans", kmeans),
    ]
)
kmeans_estimator

In [47]:
# Re-check the column dtypes right before fitting the pipeline.
listings_df.dtypes

zipcode           object
latitude         float64
longitude        float64
neighbourhood     object
dtype: object

In [52]:
# Fit the whole pipeline (preprocessing + K-Means) on all listings; no train/test split is used here.
kmeans_estimator.fit(listings_df)

In [71]:
# Attach the cluster label of each listing, stored as generic objects rather
# than integers (cast in the same step as the assignment).
listings_df['kmeans_clusters'] = kmeans.labels_.astype('object')

In [64]:
# Inspect the dtype of the cluster-label column.
# NOTE(review): this cell's execution count is lower than that of the cell which
# casts the column to 'object', so the recorded output may predate the cast.
# Re-run top to bottom to confirm.
listings_df['kmeans_clusters'].dtype

dtype('int32')

In [60]:
# Plain longitude/latitude scatter coloured by neighbourhood; listings with no
# neighbourhood are left out.
px.scatter(
    data_frame=listings_df[listings_df['neighbourhood'].notna()],
    x="longitude",
    y="latitude",
    color="neighbourhood",
)

In [77]:
# Same points on an OpenStreetMap base layer, coloured by neighbourhood and
# centred on the mean coordinates of all listings.
px.scatter_mapbox(
    data_frame=listings_df[listings_df['neighbourhood'].notna()],
    lat="latitude",
    lon="longitude",
    color="neighbourhood",
    mapbox_style='open-street-map',
    zoom=9,
    center=dict(
        lat=listings_df['latitude'].mean(),
        lon=listings_df['longitude'].mean(),
    ),
)

In [76]:
# Map of every listing coloured by its K-Means cluster label, centred on the
# mean coordinates of all listings.
px.scatter_mapbox(
    data_frame=listings_df,
    lat="latitude",
    lon="longitude",
    color="kmeans_clusters",
    mapbox_style='open-street-map',
    zoom=9,
    center=dict(
        lat=listings_df['latitude'].mean(),
        lon=listings_df['longitude'].mean(),
    ),
    title="K-Means Clustering of AirBnB Spaces",
)