In [60]:
# Set up
import altair as alt
import matplotlib
import matplotlib.pyplot as plt # plotting
import numpy as np
import pandas as pd
import seaborn as sns # for visualization
import statsmodels.api as sm
import statsmodels.formula.api as smf # linear modeling
from scipy.stats import ttest_ind # t-tests
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

alt.renderers.enable('notebook') # enable altair rendering
matplotlib.style.use('ggplot')
%matplotlib inline


In [61]:
# Read csv
listings_raw = pd.read_csv('./data/with_amenities_and_estimated_income.csv', low_memory = False)

# Drop columns that aren't related to income or not feasible to capture from user
columns_to_drop = ['Unnamed: 0', 'id', 'scrape_id', 'host_id', 'host_total_listings_count',
                  'latitude', 'longitude', 'availability_30', 'availability_60', 'availability_90',
                  'availability_365', 'number_of_reviews', 'calculated_host_listings_count', 
                  'reviews_per_month', 'Other', 'listing_url', 'last_scraped', 'host_name',
                  'experiences_offered', 'picture_url', 'name', 'host_url', 'host_since',
                  'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_listings_count',
                  'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street',
                  'city', 'neighbourhood_group_cleansed', 'smart_location', 'country_code',
                  'country', 'is_location_exact', 'amenities', 'price', 'calendar_updated', 'has_availability',
                  'calendar_last_scraped', 'first_review', 'last_review', 'requires_license',
                  'jurisdiction_names', 'instant_bookable', 'is_business_travel_ready',
                  'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification',
                  'translation missing: en.hosting_amenity_49', 'summary', 'space', 'description',
                  'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules',
                  'thumbnail_url', 'medium_url', 'xl_picture_url', 'host_location', 'host_about',
                  'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'state',
                  'neighbourhood_cleansed', 'host_neighbourhood', 'license', 'review_scores_rating',
                  'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
                  'review_scores_communication', 'review_scores_location', 'review_scores_value',
                  'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'market']

# Build the cleaned frame in one chain so every step works on a fresh copy
# (no in-place edits on a filtered slice, and the raw frame stays available).
listings_df = (
    listings_raw
    # square_feet goes too: 7450 out of 7712 (97%) rows are null
    .drop(columns=columns_to_drop + ['square_feet'])
    # Remove rows that don't have an estimated income per month
    .dropna(subset=['estimated_income_per_month'])
    # Fill values going forward
    # NOTE(review): forward fill copies values from the preceding listing;
    # confirm this imputation is intended.
    .ffill()
)

# Convert zipcode to string rather than float
listings_df['zipcode'] = listings_df['zipcode'].astype('int').astype('str')

# Convert $ amount for extra people from string to float.
# Strip the currency symbol and any thousands separators ("$1,000.00").
listings_df['extra_people'] = (
    listings_df['extra_people']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('float')
)

# Preview only; avoid rendering the entire frame
listings_df.head(10)

Unnamed: 0,neighbourhood,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,guests_included,...,Alfresco bathtub,Safety card,Cooking basics,Flat path to front door,Free street parking,Lock on bedroom door,Lake access,Jetted tub,Cat(s),estimated_income_per_month
0,Madrona,98122,House,Entire home/apt,8,2.5,4.0,4.0,Real Bed,8,...,0,0,0,0,0,0,0,0,0,789.0
1,Roosevelt,98115,House,Private room,2,1.0,1.0,1.0,Real Bed,1,...,0,0,0,0,0,0,0,0,0,1312.0
2,South Delridge,98106,Guest suite,Entire home/apt,2,1.0,0.0,1.0,Real Bed,1,...,0,0,0,1,1,0,0,0,0,1008.0
3,Wallingford,98103,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,2,...,0,0,0,0,0,0,0,0,0,1740.0
4,Georgetown,98108,Apartment,Private room,2,3.0,1.0,1.0,Real Bed,1,...,0,1,1,0,1,1,0,0,0,1365.0
5,First Hill,98101,Condominium,Private room,2,1.0,1.0,1.0,Real Bed,1,...,0,1,0,0,0,1,0,0,0,1680.0
6,The Junction,98136,House,Entire home/apt,4,1.0,2.0,5.0,Real Bed,2,...,0,1,0,0,0,0,0,0,0,1128.0
7,The Junction,98136,Guest suite,Entire home/apt,3,1.0,2.0,2.0,Real Bed,2,...,0,0,1,1,0,0,0,0,0,854.0
8,Wallingford,98103,Apartment,Entire home/apt,4,1.0,1.0,6.0,Real Bed,2,...,0,0,1,0,1,0,0,0,0,1700.0
9,The Junction,98126,Guest suite,Entire home/apt,4,1.0,2.0,2.0,Real Bed,4,...,0,0,1,0,1,0,0,0,0,1062.0


In [62]:
# One-hot encode the remaining categorical columns, then separate the target
# (estimated monthly income) from the feature matrix.
encoded_listings = pd.get_dummies(listings_df)
estimated_income = encoded_listings['estimated_income_per_month']
listings_df = encoded_listings.drop(columns=['estimated_income_per_month'])

In [63]:
# Hold out 20% of the listings for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(listings_df, estimated_income, test_size=0.20, random_state=11)
train_features, test_features, train_outcome, test_outcome = split_parts

In [64]:
# Backward selection: recursive feature elimination around a linear model,
# keeping 75 features. n_features_to_select must be passed by keyword;
# current scikit-learn releases reject it as a positional argument.
model = LinearRegression()
rfe = RFE(model, n_features_to_select=75)
rfe = rfe.fit(train_features, train_outcome)

# Boolean mask over the training columns: True for the features RFE kept
filterparam = rfe.support_

# Names of the selected columns
totalparams = train_features.columns
iqmodelparams = totalparams[filterparam]
iqmodelparams

Index(['accommodates', 'Outdoor seating', 'Pool', 'EV charger',
       'Warming drawer', 'Heat lamps', 'Rain shower', 'Sound system',
       'Projector and screen', 'En suite bathroom', 'Bathroom essentials',
       'Washer / Dryer', 'Waterfront', 'Beach view', 'Shared hot tub',
       'Hangers', 'Terrace', 'Doorman', 'Printer', 'Breakfast table',
       'Shared pool', 'Stand alone steam shower', 'Ground floor access',
       'Toilet paper', 'Hot water kettle', 'Ski-in/Ski-out', 'Amazon Echo',
       'Fax machine', 'Baby monitor', 'Heated towel rack', 'Mountain view',
       'Formal dining area', 'Private hot tub', 'Standing valet',
       'Bedroom comforts', 'Day bed', 'Beachfront', 'Central air conditioning',
       'Body soap', 'Alfresco bathtub', 'Jetted tub', 'neighbourhood_Belltown',
       'neighbourhood_Brighton', 'neighbourhood_Cedar Park',
       'neighbourhood_Central Business District', 'neighbourhood_Fauntleroy',
       'neighbourhood_Industrial District', 'neighbourhood_L

In [66]:
# Keep only the RFE-selected features and attach the target as the last column.
# The column names are read from `iqmodelparams` instead of a list pasted from
# the selection output, so this frame always matches the selection.
# `listings_df` is deliberately left untouched: writing the target back into it
# would leak the target into the features if the earlier split cell were re-run.
top_features_df = (
    listings_df[list(iqmodelparams)]
    .assign(estimated_income_per_month=estimated_income)
)

In [67]:
# Re-split using only the selected features (80/20, fixed seed).
selected_features = top_features_df.drop(columns=['estimated_income_per_month'])
selected_outcome = top_features_df['estimated_income_per_month']
train_features, test_features, train_outcome, test_outcome = train_test_split(
    selected_features, selected_outcome, test_size=0.20, random_state=11
)

### Decision Tree

In [68]:
# define a pipeline
pipe = make_pipeline(MinMaxScaler(), DecisionTreeClassifier())
folds = KFold(n_splits = 10, shuffle = True)

# defines grid to search through
param_grid = {'decisiontreeclassifier__max_features': range(1,4)}

# performs a grid search of pipeline
dtgrid = GridSearchCV(pipe, param_grid =param_grid, cv=folds, scoring = 'neg_mean_absolute_error')  
dtmodel = dtgrid.fit(train_features, train_outcome) 


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [69]:
accuracy_score(dtmodel.predict(test_features), test_outcome)


0.03888528839922229

In [70]:
dtmodel.score(test_features, test_outcome)

-1230.6338302009074

### Gaussian Naive Bayes

In [55]:
# define a pipeline
# NOTE(review): GaussianNB is a classifier, so each distinct income value is
# treated as a separate class even though the target is a continuous amount.
# scikit-learn has no naive Bayes regressor; confirm whether this model
# belongs in the comparison.
pipe = make_pipeline(MinMaxScaler(), GaussianNB())
# Seed the shuffle so the folds (and the CV score) are reproducible
folds = KFold(n_splits = 10, shuffle = True, random_state = 11)

# defines grid to search through (empty: GaussianNB is evaluated with its defaults)
param_grid = {}

# performs a grid search of pipeline
gnbgrid = GridSearchCV(pipe, param_grid =param_grid, cv=folds, scoring = 'neg_mean_absolute_error')  
gnbmodel = gnbgrid.fit(train_features, train_outcome)


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [56]:
# Share of held-out listings whose predicted income equals the actual value exactly
gnb_predictions = gnbmodel.predict(test_features)
accuracy_score(test_outcome, gnb_predictions)

0.029812054439403757

In [57]:
# Score with the grid search's own metric: negative mean absolute error on the held-out set
gnbmodel.score(X=test_features, y=test_outcome)

-1277.4562540505508

### Gradient Boosting 

In [16]:
# define a pipeline
pipe = make_pipeline(MinMaxScaler(), GradientBoostingClassifier())
folds = KFold(n_splits = 10, shuffle = True)
# defines grid to search through
param_grid = {'gradientboostingclassifier__max_depth':[1,2,4,5]}

# performs a grid search of pipeline
gbgrid = GridSearchCV(pipe, param_grid =param_grid, cv=folds, scoring = 'neg_mean_absolute_error')  
gbmodel = gbgrid.fit(train_features, train_outcome)


  return self.partial_fit(X, y)


KeyboardInterrupt: 

In [None]:
accuracy_score(gbmodel.predict(test_features), test_outcome)

In [None]:
gbmodel.score(test_features, test_outcome)

### Multilayer Perceptron

In [None]:
# define a pipeline
pipe = make_pipeline(VarianceThreshold(.1), MinMaxScaler(), MLPClassifier())
folds = KFold(n_splits = 10, shuffle = True)

# defines grid to search through
param_grid = {'mlpclassifier__learning_rate':['adaptive'],
             'mlpclassifier__activation':['identity', 'logistic', 'tanh', 'relu']}

# performs a grid search of pipeline
mpgrid = GridSearchCV(pipe, param_grid =param_grid, cv=folds, scoring = 'neg_mean_absolute_error')  
mpmodel = mpgrid.fit(train_features, train_outcome)


In [None]:
accuracy_score(mpmodel.predict(test_features), test_outcome)

In [None]:
mpmodel.score(test_features, test_outcome)