In [1]:
#Imports 
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import csv
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
#Read in Airbnb data (training split hosted on GitHub)
AIRBNB_TRAIN_URL = 'https://raw.githubusercontent.com/anguyen-07/DS7331-ML_Labs/master/data/airbnb_train.csv'
df = pd.read_csv(AIRBNB_TRAIN_URL)

In [3]:
#Show the entire dataframe without cutting off columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,description,first_review,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,last_review,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,NYC,"Beautiful, sunlit brownstone 1-bedroom in the ...",2016-06-18,t,t,,2012-03-26,f,2016-07-18,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,NYC,Enjoy travelling during your stay in Manhattan...,2017-08-05,t,f,100%,2017-06-19,t,2017-09-23,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,NYC,The Oasis comes complete with a full backyard ...,2017-04-30,t,t,100%,2016-10-25,t,2017-09-14,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,SF,This light-filled home-away-from-home is super...,,t,t,,2015-04-19,f,,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,DC,"Cool, cozy, and comfortable studio located in ...",2015-05-12,t,t,100%,2015-03-01,t,2017-01-22,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
id                        74111 non-null int64
log_price                 74111 non-null float64
property_type             74111 non-null object
room_type                 74111 non-null object
amenities                 74111 non-null object
accommodates              74111 non-null int64
bathrooms                 73911 non-null float64
bed_type                  74111 non-null object
cancellation_policy       74111 non-null object
cleaning_fee              74111 non-null bool
city                      74111 non-null object
description               74111 non-null object
first_review              58247 non-null object
host_has_profile_pic      73923 non-null object
host_identity_verified    73923 non-null object
host_response_rate        55812 non-null object
host_since                73923 non-null object
instant_bookable          74111 non-null object
last_review               582

# Feature Creation

In [5]:
#create unlogged price variable
#log_price is assumed to hold the natural log of the price (TODO confirm), so
#the inverse transform is exp(); applying log() again would log the value twice
df['price'] = np.exp(df['log_price'])
#defensively turn any non-finite values into missing values
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [6]:
#create grade variable: bucket the review score into letter grades
#(bins are closed on the left, so e.g. 90 <= score < 101 maps to 'A')
grade_edges = [0, 60, 70, 80, 90, 101]
grade_letters = ['F', 'D', 'C', 'B', 'A']
df['grade'] = pd.cut(df['review_scores_rating'], bins=grade_edges,
                     labels=grade_letters, right=False)
df[['grade', 'review_scores_rating']].head()

Unnamed: 0,grade,review_scores_rating
0,A,100.0
1,A,93.0
2,A,92.0
3,,
4,F,40.0


In [7]:
#create A / not A variable
# Alternative target variable grade_grouped: 'A' stays 'A', every lower grade becomes '<A'
new_grades = {
    letter: ('A' if letter == 'A' else '<A')
    for letter in ('A', 'B', 'C', 'D', 'F')
}
df['grade_grouped'] = df['grade'].map(new_grades)
df[['grade', 'grade_grouped']].head(20)

Unnamed: 0,grade,grade_grouped
0,A,A
1,A,A
2,A,A
3,,
4,F,<A
5,A,A
6,A,A
7,A,A
8,A,A
9,A,A


In [8]:
#create superuser variable: True when the review score is at least 96
#(a missing score compares as False)
df['superuser'] = df['review_scores_rating'].ge(96)
df[['review_scores_rating', 'superuser']].head()

Unnamed: 0,review_scores_rating,superuser
0,100.0,True
1,93.0,False
2,92.0,False
3,,False
4,40.0,False


In [9]:
#create description length variable (number of characters per description)
df['description_length'] = df['description'].map(len)

In [None]:
#create amenities length variable
import re
# Strip the braces and quotes from the '{a,"b c",d}' literal and lowercase it.
amenities_text = df['amenities'].map(lambda raw: re.sub(r'[{}"]', '', raw).lower())
# Split on commas and discard empty tokens, so a listing with no amenities
# ('{}') yields an empty list (length 0) instead of [''] (length 1).
df['amenities_new'] = amenities_text.map(
    lambda text: [item for item in text.split(',') if item]
)
df = df.reset_index(drop=True)
df['length_amenities'] = df['amenities_new'].map(len)
df[['amenities_new', 'length_amenities']].head()

In [None]:
#create variables indicating whether individual amenities are present
# Each flag is a case-insensitive substring test against the raw amenities text
amenities_lower = df['amenities'].str.lower()
amenity_keywords = {
    'internet': 'internet',
    'TV': 'tv',
    'air_conditioning': 'air conditioning',
    'kitchen': 'kitchen',
    'pool': 'pool',
    'parking': 'parking',
}
for flag_column, keyword in amenity_keywords.items():
    df[flag_column] = amenities_lower.str.contains(keyword, regex=False)

In [None]:
#create number of days of being a host variable
# Whole days between each host_since date and the fixed reference date below
date_published = datetime.datetime(2018,3,14)
df['host_since'] = pd.to_datetime(df['host_since'])
df['host_since_days'] = (date_published - df['host_since']).dt.days
df['host_since_days'].head()

In [None]:
#clean up host_response_rate: strip the trailing '%' and store the number as float
response_rate_text = df['host_response_rate'].str.rstrip('%')
df['host_response_rate'] = response_rate_text.astype(np.float64)

# Imputation

In [None]:
df_imputed = df.copy()

In [None]:
df_imputed.info()

In [None]:
#impute bathrooms, bedrooms and beds with the median of listings that share the
#same property type and the same number of people it can accommodate
# transform('median') returns one value per original row on the same index as
# df_imputed, so fillna lines up row by row. The earlier
# groupby(...).apply(lambda x: x.fillna(x.median())) form depends on apply()
# keeping the original index, which recent pandas versions no longer do (the
# group keys are added as index levels), so the fill would not align.
for room_column in ["bathrooms", "bedrooms", "beds"]:
    group_median = (
        df_imputed
        .groupby(["property_type", "accommodates"])[room_column]
        .transform("median")
    )
    df_imputed[room_column] = df_imputed[room_column].fillna(group_median)

#impute host response rate with the mean rate of listings that have the same number of reviews
review_group_mean = (
    df_imputed
    .groupby(["number_of_reviews"])["host_response_rate"]
    .transform("mean")
)
df_imputed["host_response_rate"] = df_imputed["host_response_rate"].fillna(review_group_mean)

In [None]:
df_imputed.info()

# Dropping data

In [None]:
#drop columns that don't matter and redundant data
columns_to_drop = [
    'id', 'amenities', 'description', 'zipcode',
    'latitude', 'longitude', 'name', 'neighbourhood',
    'review_scores_rating', 'thumbnail_url',
    'amenities_new', 'host_since',
    'first_review', 'last_review', 'log_price',
]
df_imputed = df_imputed.drop(columns=columns_to_drop)

In [None]:
# Remove factor with unnecessarily large number of levels
df_imputed = df_imputed.drop(columns=['property_type'])

In [None]:
#Drop duplicates
df_imputed = df_imputed.drop_duplicates()

In [None]:
len(df_imputed)

In [None]:
len(df_imputed.dropna())

In [None]:
#drop every row that still has a missing value in any column
df_imputed = df_imputed.dropna()

# Performing PCA

In [None]:
#only keep numeric data types and our response variables (grade, grade_grouped, superuser)
#for pca
pca_columns = [
    'accommodates', 'bathrooms', 'host_response_rate',
    'number_of_reviews', 'bedrooms', 'beds', 'price',
    'grade', 'grade_grouped', 'description_length', 'length_amenities',
    'host_since_days', 'superuser',
]
df_pca = df_imputed[pca_columns]
df_pca.head()

### Standardize the Data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
#separating out the features (everything except the three response variables)
response_columns = ['grade', 'grade_grouped', 'superuser']
x = df_pca.drop(columns=response_columns).to_numpy()
x

In [None]:
#separating out the first target
y = df_pca.loc[:, 'grade_grouped'].values
y

In [None]:
# Standardizing the features
x = StandardScaler().fit_transform(x)
x

### Projection onto 2D

In [None]:
from sklearn.decomposition import PCA
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(principalComponents, columns=['PCA1', 'PCA2'])

In [None]:
principalDf.head()

### Correlation with Target1?

In [None]:
target1 = df_pca.loc[:, 'grade_grouped']
target1.head()

In [None]:
# principalDf has a fresh 0..n-1 index, while target1 still carries the row
# labels that survived drop_duplicates()/dropna(). Assigning the Series itself
# would realign on index and pair components with the wrong (or missing)
# labels, so assign the underlying values by position instead.
principalDf['target1'] = target1.values
principalDf.head()

In [None]:
# Scatter of the two principal components, colored by the A / <A target
pastel_pair = sns.color_palette('pastel', n_colors=2)
ax = sns.scatterplot(x='PCA1', y='PCA2', hue='target1', data=principalDf,
                     palette=pastel_pair, legend='full')
ax.legend(bbox_to_anchor=(1, 1))
plt.show()

* PCA1 and PCA2 do not separate out our target variable of A or not A, which suggests our dataset may not yield a great model. However, this does not take our categorical variables into account.

In [None]:
pca.explained_variance_ratio_

* Also, our two principal components only explain about 49% of the variation, so we may have lost too much information in reducing our data to 2D to support the above statement.

## Correlation with target2?

In [None]:
target2 = df_pca.loc[:, 'grade']
target2.head()

In [None]:
# Assign by position: principalDf is indexed 0..n-1 but target2 keeps the
# original row labels, so index alignment would attach grades to the wrong rows.
# .values keeps the categorical dtype (and so the grade ordering) intact.
principalDf['target2'] = target2.values
principalDf.head()

In [None]:
# Scatter of the two principal components, colored by letter grade
pastel_five = sns.color_palette('pastel', n_colors=5)
ax = sns.scatterplot(x='PCA1', y='PCA2', hue='target2', data=principalDf,
                     palette=pastel_five, legend='full')
ax.legend(bbox_to_anchor=(1, 1))
plt.show()

* Our two principal components do not seem to correlate with the normal grade either

### Correlation with target 3?

In [None]:
target3 = df_pca.loc[:, 'superuser']
target3.head()

In [None]:
# Assign by position: principalDf is indexed 0..n-1 but target3 keeps the
# original row labels, so index alignment would attach flags to the wrong rows.
principalDf['target3'] = target3.values
principalDf.head()

In [None]:
# Scatter of the two principal components, colored by the superuser flag
pastel_pair = sns.color_palette('pastel', n_colors=2)
ax = sns.scatterplot(x='PCA1', y='PCA2', hue='target3', data=principalDf,
                     palette=pastel_pair, legend='full')
ax.legend(bbox_to_anchor=(1, 1))
plt.show()

* Again, superuser does not seem to be separated out well by our first two principal components

# K Means Clustering

In [None]:
df_cluster = df_pca.copy()
df_cluster.head()

In [None]:
df_cluster_x = df_pca.drop(['grade', 'grade_grouped', 'superuser'], axis=1)

In [None]:
df_cluster_x.head()

# Elbow Plot

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Within-cluster sum of squares (KMeans inertia) for k = 1..10
# NOTE(review): df_cluster_x holds the raw, unstandardized columns, so
# wide-range features may dominate the distances — confirm this is intended.
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300,
                    n_init=10, random_state=0)
    wcss.append(kmeans.fit(df_cluster_x).inertia_)

plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

* optimal k is two

## Getting two Kmeans clusters

In [None]:
# Fit the final 2-cluster KMeans on df_cluster_x and keep one cluster label per row.
# random_state=0 pins the centroid initialization so re-runs give the same labels.
kmeans = KMeans(n_clusters=2,init='k-means++',max_iter=300,n_init=10,random_state=0) 
clusters = kmeans.fit_predict(df_cluster_x)
clusters

In [None]:
principalDf.head()

In [None]:
principalDf['clusters'] = clusters

### Do clusters from K means separate out well in PCA plot?

In [None]:
principalDf.head()

In [None]:
# Scatter of the two principal components, colored by KMeans cluster
pastel_pair = sns.color_palette('pastel', n_colors=2)
ax = sns.scatterplot(x='PCA1', y='PCA2', hue='clusters', data=principalDf,
                     palette=pastel_pair, legend='full')
ax.legend(bbox_to_anchor=(1, 1))
plt.show()

* The clusters do seem to separate out when looking at the PCA plot!

# Interpret KMeans Clusters

In [None]:
#create new dataframe 
df_explore = df_imputed.copy()

In [None]:
df_imputed.head()

In [None]:
df_explore['clusters'] = clusters

In [None]:
df_explore.head()

In [None]:
#There are two clusters with the value counts below
df_explore['clusters'].value_counts()

In [None]:
#Countplot of the clusters
sns.countplot(data=df_explore, x='clusters')
plt.show()

In [None]:
sns.countplot(data=df_explore, x='clusters', hue='grade')
plt.show()

* Like what we saw from PCA, grade seems to be represented equally well in both clusters. This implies that our KMeans clusters will not be a good predictor to add to our model

In [None]:
sns.countplot(data=df_explore, x='clusters', hue='grade_grouped')
plt.show()

* The same applies for our grade grouped variable

In [None]:
sns.countplot(data=df_explore, x='clusters', hue='superuser')
plt.show()

* The same applies for superuser

# Cluster EDA

### Clusters and numerical variables

In [None]:
numerics = list(df_explore.select_dtypes(include=['int64', 'float64']).columns)
numerics

In [None]:
# One boxplot per numeric column, split by cluster.
# The loop variable is deliberately not named `x`: that name holds the
# standardized feature matrix used by the PCA cells and must not be overwritten.
for numeric_column in numerics:
    sns.boxplot(data=df_explore, x='clusters', y=numeric_column)
    plt.title(numeric_column)
    plt.show()

* Cluster 1 seems to include AirBnb hosts that:
    1. have been hosts for longer
    2. have longer descriptions
    3. have more reviews
    

### Clusters and categorical variables

In [None]:
df_explore.info()

In [None]:
def make_percent_stacked_boxplot(x, data=None):
    '''Draw a 100%-stacked bar chart of a categorical column within each cluster.

    Despite the historical name, this draws a stacked bar plot, not a boxplot.
    Each bar is one cluster; the segment heights are the share of that
    cluster's rows falling in each category (crosstab rows are normalized to
    sum to 1).

    Parameters
    ----------
    x : str
        Name of the categorical column to break down.
    data : pandas.DataFrame, optional
        Frame holding a 'clusters' column and column `x`. Defaults to the
        notebook-level `df_explore`, which keeps existing one-argument calls
        working unchanged.
    '''
    if data is None:
        data = df_explore
    table = pd.crosstab(data['clusters'], data[x], normalize='index')
    ax = table.plot(kind='bar', stacked=True)
    # Label the figure so each plot is identifiable when many are drawn in a loop.
    ax.set_title(x)
    ax.set_ylabel('Share of cluster')
    plt.show()

In [None]:
make_percent_stacked_boxplot('city')

In [None]:
categoricals = list(df_explore.select_dtypes(['object', 'bool', 'category']).columns)

In [None]:
# One percent-stacked bar chart per categorical column.
# The loop variable is not named `x` so the standardized feature matrix `x`
# used by the PCA cells is not overwritten.
for categorical_column in categoricals:
    make_percent_stacked_boxplot(categorical_column)

* Our clusters do not seem to separate out very well on the categorical variables
* This makes sense because we could not use categorical variables in the clustering process
* Oddly, the only differences that seem significant are that cluster 1 contains properties that overall are 
    1. less likely to be instant bookable
    2. more likely to have the host's identity verified
* We also saw that cluster 1 contained hosts that had been using AirBnb for longer, so these may be traits of long-time AirBnb users

# DBSCAN

In [None]:
df_cluster.head()

In [None]:
#create feature dataframe without the grade / grade_grouped label variables
# NOTE(review): 'superuser' is also derived from review_scores_rating and is
# dropped as a response variable in the PCA/KMeans cells, but it is kept as a
# feature here — confirm whether that is intended.
X = df_cluster.drop(['grade', 'grade_grouped'], axis=1)

In [None]:
#scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
#rescale each row (sample) of the standardized data with sklearn's normalize
# NOTE(review): with its default arguments normalize scales every row to unit
# L2 norm; it does not make the columns follow a normal distribution.
from sklearn.preprocessing import normalize 
X_normalized = normalize(X_scaled) 

In [None]:
#grab the feature names: every df_cluster column except the two grade labels
label_columns = {'grade', 'grade_grouped'}
names = [column for column in df_cluster.columns if column not in label_columns]
names

In [None]:
#convert np array back to dataframe
X_normalized = pd.DataFrame(X_normalized , columns=names) 
X_normalized.head()

In [None]:
#perform PCA for visualization: keep the first two components as P1 / P2
pca = PCA(n_components=2)
X_principal = pd.DataFrame(pca.fit_transform(X_normalized), columns=['P1', 'P2'])
print(X_principal.head())

In [None]:
#fit the DBSCAN clustering on the two principal components
from sklearn.cluster import DBSCAN
db_default = DBSCAN(eps=0.0375, min_samples=3)
db_default.fit(X_principal)
labels = db_default.labels_
labels

In [None]:
#DBSCAN found 6 distinct labels
# NOTE(review): scikit-learn's DBSCAN uses the label -1 for noise points, so if
# -1 is among these values this is 5 clusters plus noise — confirm the wording.
np.unique(labels)

In [None]:
#Build label to color mapping
color_map = {-1:'blue', 0:'green', 1:'black', 2:'purple', 3:'red', 4:'yellow'}

In [None]:
label_colors = np.vectorize(color_map.get)(labels)
label_colors

In [None]:
X_principal['labels'] = labels

In [None]:
X_principal['label_colors'] = label_colors

In [None]:
X_principal.head()

In [None]:
import seaborn as sns

In [None]:
# Size the palette from the labels actually present, so the plot keeps working
# if DBSCAN returns a different number of clusters on a re-run.
n_labels = X_principal['labels'].nunique()
sns.scatterplot(data=X_principal, x='P1', y='P2', hue='labels',
                legend='full', palette=sns.color_palette('pastel', n_colors=n_labels))
plt.legend(bbox_to_anchor=(1, 1))
plt.show()

In [None]:
#NOTE: FIT DBSCAN ON OUR PRINCIPAL COMPONENTS AND NOT ON OUR DATA
#(OPPOSITE OF ABOVE)

# Experiment with DBSCAN

In [None]:
#Fit DBSCAN on the normalized feature columns rather than the principal components
from sklearn.cluster import DBSCAN
db_default = DBSCAN(eps=0.0375, min_samples=3)
db_default.fit(X_normalized)
labels = db_default.labels_
labels

In [None]:
np.unique(labels) #58 clusters

In [None]:
del X_principal['labels']
del X_principal['label_colors']

In [None]:
X_principal.head()

In [None]:
X_principal['labels'] = labels

In [None]:
X_principal.head()

In [None]:
#MESS: far too many clusters to read on a single plot
# Size the palette from the labels actually present instead of hard-coding 58,
# so the cell still runs if the number of DBSCAN labels changes.
n_labels = X_principal['labels'].nunique()
sns.scatterplot(data=X_principal, x='P1', y='P2', hue='labels',
                legend='full', palette=sns.color_palette('pastel', n_colors=n_labels))
plt.legend(bbox_to_anchor=(1, 1))
plt.show()