In [None]:
import pandas as pd
import numpy as np

# Read csv
listings_df = pd.read_csv('./data/with_amenities_and_estimated_income.csv', low_memory = False)

# Drop columns that aren't related to income or not feasible to capture from user
columns_to_drop = ['Unnamed: 0', 'id', 'scrape_id', 'host_id', 'host_total_listings_count',
                  'latitude', 'longitude', 'availability_30', 'availability_60', 'availability_90',
                  'availability_365', 'number_of_reviews', 'calculated_host_listings_count', 
                  'reviews_per_month', 'Other', 'listing_url', 'last_scraped', 'host_name',
                  'experiences_offered', 'picture_url', 'name', 'host_url', 'host_since',
                  'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_listings_count',
                  'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street',
                  'city', 'neighbourhood_group_cleansed', 'smart_location', 'country_code',
                  'country', 'is_location_exact', 'amenities', 'price', 'calendar_updated', 'has_availability',
                  'calendar_last_scraped', 'first_review', 'last_review', 'requires_license',
                  'jurisdiction_names', 'instant_bookable', 'is_business_travel_ready',
                  'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification',
                  'translation missing: en.hosting_amenity_49', 'summary', 'space', 'description',
                  'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules',
                  'thumbnail_url', 'medium_url', 'xl_picture_url', 'host_location', 'host_about',
                  'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'state',
                  'neighbourhood_cleansed', 'host_neighbourhood', 'license', 'review_scores_rating',
                  'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
                  'review_scores_communication', 'review_scores_location', 'review_scores_value',
                  'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'market']
for col in columns_to_drop:
    listings_df.drop([col], axis=1, inplace=True)

# Remove rows that don't have an estimated income per month
listings_df = listings_df[~pd.isna(listings_df['estimated_income_per_month'])]

# Dropping square feet because 7450 out of 7712 (97%) rows are null
listings_df.drop(['square_feet'], axis=1, inplace=True)

# Fill values going forward
listings_df.fillna(method ='ffill', inplace=True)

# Convert zipcode to string rather than float
listings_df['zipcode'] = listings_df['zipcode'].astype('int').astype('str')

# Convert $ amount for extra people from string to float
listings_df['extra_people'] = listings_df['extra_people'].apply(lambda s: s[1:]).astype('float')

listings_df

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold, SelectPercentile, RFE
from sklearn.linear_model import LinearRegression

amenities = listings_df.iloc[:, 13:-1]
y = np.ravel(listings_df.iloc[:, [-1]])

# Select 20 top amenities
select = RFE(LinearRegression(), 20).fit(amenities, y)
cols_rfe = [col for i, col in enumerate(amenities.columns.values) if select.get_support()[i]]
print(cols_rfe)

# Remove amenities that weren't selected
remove_cols = [col for i, col in enumerate(amenities.columns.values) if not select.get_support()[i]]
for col in remove_cols:
    listings_df.drop([col], axis=1, inplace=True)

In [None]:
listings_df = pd.get_dummies(listings_df)

estimated_income = listings_df['estimated_income_per_month']
listings_df = listings_df.drop(['estimated_income_per_month'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split # splitting data

train_features, test_features, train_outcome, test_outcome = train_test_split(
    listings_df,
    estimated_income,
    test_size=0.20, 
    random_state=11
)

In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import Pipeline                # for making pipelines
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold

scaler = MinMaxScaler()

# define a pipeline
pipe = Pipeline([('scaler', scaler), ('KNeighborsRegressor', KNeighborsRegressor())])

folds = KFold(n_splits = 10, shuffle = True)

# defines a grid to search through
param_grid = {
    'KNeighborsRegressor__n_neighbors': range(101, 302, 50),
    'KNeighborsRegressor__weights': ["distance"]
}

# performs a grid search of pipeline
knn_grid = GridSearchCV(pipe, param_grid, cv=folds, scoring="neg_mean_absolute_error", verbose=3)
knn_model = knn_grid.fit(train_features, train_outcome)

print('Training MAE:', knn_model.best_score_)
print('Testing MAE:', knn_model.score(test_features, test_outcome))
print(knn_model.best_estimator_)

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance 
[CV]  KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance, score=-997.9379270849647, total=   1.0s
[CV] KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.6s remaining:    0.0s


[CV]  KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance, score=-1102.4515517308314, total=   0.9s
[CV] KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.8s remaining:    0.0s


[CV]  KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance, score=-1015.4654125537527, total=   1.0s
[CV] KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance 
[CV]  KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance, score=-1019.2027604699304, total=   1.0s
[CV] KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance 
[CV]  KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance, score=-977.016804122237, total=   1.0s
[CV] KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance 
[CV]  KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance, score=-1078.0157141152033, total=   1.0s
[CV] KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance 
[CV]  KNeighborsRegressor__n_neighbors=101, KNeighborsRegressor__weights=distance, score=-968.7526963049239, total=   0.9s
[CV] KNeighborsRegressor__n_neighbors=101, KNeighborsRe

[CV]  KNeighborsRegressor__n_neighbors=301, KNeighborsRegressor__weights=distance, score=-1017.4904218711555, total=   1.0s
[CV] KNeighborsRegressor__n_neighbors=301, KNeighborsRegressor__weights=distance 
[CV]  KNeighborsRegressor__n_neighbors=301, KNeighborsRegressor__weights=distance, score=-1029.284264629153, total=   1.1s
[CV] KNeighborsRegressor__n_neighbors=301, KNeighborsRegressor__weights=distance 
[CV]  KNeighborsRegressor__n_neighbors=301, KNeighborsRegressor__weights=distance, score=-969.2836516655444, total=   1.1s
[CV] KNeighborsRegressor__n_neighbors=301, KNeighborsRegressor__weights=distance 
[CV]  KNeighborsRegressor__n_neighbors=301, KNeighborsRegressor__weights=distance, score=-1090.9053464308627, total=   1.1s
[CV] KNeighborsRegressor__n_neighbors=301, KNeighborsRegressor__weights=distance 
[CV]  KNeighborsRegressor__n_neighbors=301, KNeighborsRegressor__weights=distance, score=-971.9474779097752, total=   0.9s
[CV] KNeighborsRegressor__n_neighbors=301, KNeighborsRe

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  8.0min finished


Testing MAE: -1060.8400526414678
Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('KNeighborsRegressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=101, p=2,
          weights='distance'))])


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import Pipeline                # for making pipelines
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold

scaler = MinMaxScaler()

# define a pipeline
pipe = Pipeline([('scaler', scaler), ('LogisticRegression', LogisticRegression())])

folds = KFold(n_splits = 10, shuffle = True)

# defines a grid to search through
param_grid = {
    'LogisticRegression__penalty': ['l1', 'l2']
}

# performs a grid search of pipeline
log_grid = GridSearchCV(pipe, param_grid, cv=folds, scoring="neg_mean_absolute_error", verbose=3)
log_model = log_grid.fit(train_features, train_outcome)

print('Training MAE:', log_model.best_score_)
print('Testing MAE:', log_model.score(test_features, test_outcome))
print(log_model.best_estimator_)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV] LogisticRegression__penalty=l1 ..................................
[CV]  LogisticRegression__penalty=l1, score=-1116.1069692058347, total=  13.6s
[CV] LogisticRegression__penalty=l1 ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.7s remaining:    0.0s


[CV]  LogisticRegression__penalty=l1, score=-1226.4311183144246, total=  13.2s
[CV] LogisticRegression__penalty=l1 ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   27.0s remaining:    0.0s


[CV]  LogisticRegression__penalty=l1, score=-1236.871961102107, total=  13.5s
[CV] LogisticRegression__penalty=l1 ..................................
[CV]  LogisticRegression__penalty=l1, score=-1147.6077795786061, total=  14.1s
[CV] LogisticRegression__penalty=l1 ..................................
[CV]  LogisticRegression__penalty=l1, score=-1194.4132901134521, total=  13.8s
[CV] LogisticRegression__penalty=l1 ..................................
[CV]  LogisticRegression__penalty=l1, score=-1160.7893030794166, total=  14.0s
[CV] LogisticRegression__penalty=l1 ..................................
[CV]  LogisticRegression__penalty=l1, score=-1129.640194489465, total=  13.6s
[CV] LogisticRegression__penalty=l1 ..................................
[CV]  LogisticRegression__penalty=l1, score=-1225.611021069692, total=  13.7s
[CV] LogisticRegression__penalty=l1 ..................................
[CV]  LogisticRegression__penalty=l1, score=-1295.3792544570501, total=  13.6s
[CV] LogisticRegression_

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  4.9min finished


Training MAE: -1185.2476900632194
Testing MAE: -1226.8139987038237
Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('LogisticRegression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import Pipeline                # for making pipelines
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold

scaler = MinMaxScaler()

# define a pipeline
pipe = Pipeline([('scaler', scaler), ('RandomForestRegressor', RandomForestRegressor())])

folds = KFold(n_splits = 10, shuffle = True)

# defines a grid to search through
param_grid = {
    'RandomForestRegressor__n_estimators': [1,10,100,1000]
}

# performs a grid search of pipeline
rand_forest_grid = GridSearchCV(pipe, param_grid, cv=folds, scoring="neg_mean_absolute_error", verbose=3)
rand_forest_model = rand_forest_grid.fit(train_features, train_outcome)

print('Training MAE:', rand_forest_model.best_score_)
print('Testing MAE:', rand_forest_model.score(test_features, test_outcome))
print(rand_forest_model.best_estimator_)

  from numpy.core.umath_tests import inner1d
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] RandomForestRegressor__n_estimators=1 ...........................
[CV]  RandomForestRegressor__n_estimators=1, score=-1249.3511213932268, total=   0.1s
[CV] RandomForestRegressor__n_estimators=1 ...........................
[CV]  RandomForestRegressor__n_estimators=1, score=-1154.122423836704, total=   0.1s
[CV] RandomForestRegressor__n_estimators=1 ...........................
[CV]  RandomForestRegressor__n_estimators=1, score=-1314.3189742996065, total=   0.1s
[CV] RandomForestRegressor__n_estimators=1 ...........................
[CV]  RandomForestRegressor__n_estimators=1, score=-1285.7237860678788, total=   0.1s
[CV] RandomForestRegressor__n_estimators=1 ...........................
[CV]  RandomForestRegressor__n_estimators=1, score=-1318.0186754156164, total=   0.1s
[CV] RandomForestRegressor__n_estimators=1 ...........................
[CV]  RandomForestRegressor__n_estimators=1, score=-1252.6465560670515, total=   0.1

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  8.3min finished


Training MAE: -944.1216121578994
Testing MAE: -994.8194224508549
Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('RandomForestRegressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])


In [13]:
test_features[:15]

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,guests_included,extra_people,minimum_nights,maximum_nights,Heat lamps,Sound system,...,property_type_Villa,property_type_Yurt,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
5592,2,1.0,1.0,1.0,2,40.0,4,1125,0,0,...,0,0,1,0,0,0,0,0,0,1
3967,4,2.0,2.0,2.0,4,50.0,3,1125,0,0,...,0,0,1,0,0,0,0,0,0,1
2913,4,1.0,0.0,3.0,2,20.0,1,27,0,0,...,0,0,1,0,0,0,0,0,0,1
6564,2,1.0,1.0,1.0,1,0.0,2,1125,0,0,...,0,0,0,1,0,0,0,0,0,1
243,6,2.0,2.0,0.0,1,0.0,2,365,0,0,...,0,0,1,0,0,0,0,0,0,1
4480,3,1.0,0.0,1.0,2,30.0,21,1125,0,0,...,0,0,1,0,0,0,0,0,0,1
7333,4,1.0,1.0,1.0,2,15.0,1,31,0,0,...,0,0,1,0,0,0,0,0,0,1
7329,6,2.0,4.0,7.0,1,0.0,7,14,0,0,...,0,0,1,0,0,0,0,0,0,1
5110,1,2.0,1.0,1.0,1,0.0,1,120,0,0,...,0,0,0,1,0,0,0,0,0,1
5042,2,1.5,1.0,1.0,2,10.0,2,45,0,0,...,0,0,0,1,0,0,0,0,0,1


In [25]:
print('Actual    :', list(test_outcome[:15].astype('int')))
print('Predicted :', list(rand_forest_model.predict(test_features[:15]).astype('int')))

Actual    : [1355, 1850, 3339, 1640, 2101, 3339, 2520, 1051, 798, 394, 2079, 11025, 3675, 630, 1067]
Predicted : [2305, 2874, 2357, 995, 3642, 3470, 2088, 1019, 807, 1160, 1390, 9809, 3946, 650, 1311]
