In [227]:
import pandas as pd
import numpy as np

# Read csv
listings_df = pd.read_csv('./data/with_amenities_and_estimated_income.csv', low_memory = False)

# Drop columns that aren't related to income or not feasible to capture from user
columns_to_drop = ['Unnamed: 0', 'id', 'scrape_id', 'host_id', 'host_total_listings_count',
                  'latitude', 'longitude', 'availability_30', 'availability_60', 'availability_90',
                  'availability_365', 'number_of_reviews', 'calculated_host_listings_count', 
                  'reviews_per_month', 'Other', 'listing_url', 'last_scraped', 'host_name',
                  'experiences_offered', 'picture_url', 'name', 'host_url', 'host_since',
                  'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_listings_count',
                  'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street',
                  'city', 'neighbourhood_group_cleansed', 'smart_location', 'country_code',
                  'country', 'is_location_exact', 'amenities', 'price', 'calendar_updated', 'has_availability',
                  'calendar_last_scraped', 'first_review', 'last_review', 'requires_license',
                  'jurisdiction_names', 'instant_bookable', 'is_business_travel_ready',
                  'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification',
                  'translation missing: en.hosting_amenity_49', 'summary', 'space', 'description',
                  'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules',
                  'thumbnail_url', 'medium_url', 'xl_picture_url', 'host_location', 'host_about',
                  'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'state',
                  'neighbourhood_cleansed', 'host_neighbourhood', 'license', 'review_scores_rating',
                  'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
                  'review_scores_communication', 'review_scores_location', 'review_scores_value',
                  'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'market']
for col in columns_to_drop:
    listings_df.drop([col], axis=1, inplace=True)

# Remove rows that don't have an estimated income per month
listings_df = listings_df[~pd.isna(listings_df['estimated_income_per_month'])]

# Dropping square feet because 7450 out of 7712 (97%) rows are null
listings_df.drop(['square_feet'], axis=1, inplace=True)

# Fill values going forward
listings_df.fillna(method ='ffill', inplace=True)

# Convert zipcode to string rather than float
listings_df['zipcode'] = listings_df['zipcode'].astype('int').astype('str')

# Convert $ amount for extra people from string to float
listings_df['extra_people'] = listings_df['extra_people'].apply(lambda s: s[1:]).astype('float')

listings_df

Unnamed: 0,neighbourhood,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,guests_included,...,Alfresco bathtub,Safety card,Cooking basics,Flat path to front door,Free street parking,Lock on bedroom door,Lake access,Jetted tub,Cat(s),estimated_income_per_month
0,Madrona,98122,House,Entire home/apt,8,2.5,4.0,4.0,Real Bed,8,...,0,0,0,0,0,0,0,0,0,789.0
1,Roosevelt,98115,House,Private room,2,1.0,1.0,1.0,Real Bed,1,...,0,0,0,0,0,0,0,0,0,1312.0
2,South Delridge,98106,Guest suite,Entire home/apt,2,1.0,0.0,1.0,Real Bed,1,...,0,0,0,1,1,0,0,0,0,1008.0
3,Wallingford,98103,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,2,...,0,0,0,0,0,0,0,0,0,1740.0
4,Georgetown,98108,Apartment,Private room,2,3.0,1.0,1.0,Real Bed,1,...,0,1,1,0,1,1,0,0,0,1365.0
5,First Hill,98101,Condominium,Private room,2,1.0,1.0,1.0,Real Bed,1,...,0,1,0,0,0,1,0,0,0,1680.0
6,The Junction,98136,House,Entire home/apt,4,1.0,2.0,5.0,Real Bed,2,...,0,1,0,0,0,0,0,0,0,1128.0
7,The Junction,98136,Guest suite,Entire home/apt,3,1.0,2.0,2.0,Real Bed,2,...,0,0,1,1,0,0,0,0,0,854.0
8,Wallingford,98103,Apartment,Entire home/apt,4,1.0,1.0,6.0,Real Bed,2,...,0,0,1,0,1,0,0,0,0,1700.0
9,The Junction,98126,Guest suite,Entire home/apt,4,1.0,2.0,2.0,Real Bed,4,...,0,0,1,0,1,0,0,0,0,1062.0


In [228]:
listings_df = pd.get_dummies(listings_df)

estimated_income = listings_df['estimated_income_per_month']
listings_df = listings_df.drop(['estimated_income_per_month'], axis=1)

In [225]:
from sklearn.model_selection import train_test_split # splitting data

train_features, test_features, train_outcome, test_outcome = train_test_split(
    listings_df,
    estimated_income,
    test_size=0.30, 
    random_state=11
)

array(['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included',
       'extra_people', 'minimum_nights', 'maximum_nights',
       'Outdoor seating', 'Pets allowed', 'Pool',
       'Electric profiling bed', 'EV charger', 'Beach essentials',
       'High chair', 'Warming drawer', 'Fixed grab bars for toilet',
       'Heat lamps', 'Bed linens', 'Dishwasher', 'Breakfast', 'Hot tub',
       'Kitchenette', 'Long term stays allowed', 'Rain shower',
       'Sound system', 'Well-lit path to entrance', 'Bathtub',
       'Handheld shower head', 'Espresso machine', 'Game console',
       'Wide doorway', 'Accessible-height bed', 'Washer',
       'Single level home', 'Cleaning before checkout', 'Coffee maker',
       'Building staff', 'Hair dryer', 'Laptop friendly workspace',
       'Firm mattress', 'Projector and screen', 'Shampoo',
       'En suite bathroom', 'Smart lock', 'Shared gym',
       'Fireplace guards', 'Bathroom essentials',
       'Wide hallway clearance', 'TV',
       'tra

In [216]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import Pipeline                # for making pipelines
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold

threshold = VarianceThreshold(.1)
scaler = MinMaxScaler()
select = SelectPercentile()

# define a pipeline
pipe = Pipeline([('scaler', scaler), ('KNeighborsRegressor', KNeighborsRegressor())])

folds = KFold(n_splits = 10, shuffle = True)

# # defines a grid to search through
param_grid = {
#     'selectpercentile__percentile':range(10, 30, 5),
    'KNeighborsRegressor__n_neighbors': [3, 75, 211],
    'KNeighborsRegressor__weights':["uniform", "distance"]
}

# performs a grid search of pipeline
knngrid = GridSearchCV(pipe, param_grid,cv = folds, scoring="neg_mean_absolute_error")
knniqwholemodel = knngrid.fit(test_features, test_outcome)
knniqwholemodel.best_score_

-1108.8857414463887

In [217]:
knniqwholemodel.best_estimator_

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('KNeighborsRegressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=75, p=2,
          weights='distance'))])

In [206]:
# selected_threshold = knniqwholemodel.best_estimator_.named_steps['threshold'].get_support()
# selected_threshold_columns = list()
# for i in range(len(selected_threshold)):
#     if (selected_threshold[i]):
#         selected_threshold_columns.append(test_features.columns.values[i])
        
# selected_threshold_columns

selected_columns = knniqwholemodel.best_estimator_.named_steps['selectpercentile'].get_support()
selected_k_best_columns = list()
for i in range(len(selected_columns)):
    if (selected_columns[i]):
        selected_k_best_columns.append(test_features.columns.values[i])

selected_k_best_columns

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


['accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'guests_included',
 'minimum_nights',
 'Pool',
 'High chair',
 'Warming drawer',
 'Heat lamps',
 'Sound system',
 'Espresso machine',
 'Building staff',
 'Hair dryer',
 'Shampoo',
 'En suite bathroom',
 'Balcony',
 'Soaking tub',
 'Crib',
 'Table corner guards',
 'Beach view',
 'Wine cooler',
 'Smoke detector',
 'Hangers',
 'Terrace',
 'DVD player',
 'Hammock',
 'Heating',
 'Gas oven',
 'Printer',
 'Smart TV',
 'Elevator',
 'Buzzer/wireless intercom',
 'Outdoor parking',
 'Hot water kettle',
 'Walk-in shower',
 'Changing table',
 'Ceiling fan',
 'Ski-in/Ski-out',
 'Amazon Echo',
 'Private gym',
 'Baby monitor',
 'Formal dining area',
 'Bathtub with bath chair',
 'Private bathroom',
 'Hot water',
 'Beachfront',
 'Essentials',
 'Exercise equipment',
 'Gym',
 'Sun loungers',
 'neighbourhood_Arbor Heights',
 'neighbourhood_Belltown',
 'neighbourhood_Cedar Park',
 'neighbourhood_Central Business District',
 'neighbourhood_First Hill',
 '

In [207]:
knniqwholemodel.best_score_

-1306.9369057908384