# New Owner Model Exploration
I want create a Streamlit app where new owners can use to help them set the price of their rental.

This notebook will create a fairly basic model using numerous different regression techniques.
From here I can try to gridsearch and hone-in on the best fitting model, that I will then pickle and add to the app. 

In [1]:
# !pip install xgboost

In [1]:
#imports
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb
from sklearn.svm import SVR

# Import

In [2]:
df = pd.read_csv('../data/1500_sentiment_api.csv')
df.head(2)

Unnamed: 0,id,latitude,longitude,price,shared_status,accommodates,accomodation_group,bathrooms,bedrooms,beds_adjusted,neighborhood,has_neighborhood_overview,has_reviews,listing_url,host_in_CO,host_lives_in_neighborhood,host_id,host_name,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,has_host_about,years_hosting,years_of_reviews,license_listed,response_time,host_lives_in_neighborhood.1,name,host_about,description,neighborhood_overview,amenities,minimum_nights,maximum_nights,min_stay_group,max_stay_group,has_availability,instant_bookable,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,bike_scores,walk_scores,property_page_links,transit_scores,name_sent_compound,name_sent_pos,name_sent_neg,name_sent_neu,host_sent_compound,host_sent_pos,host_sent_neg,host_sent_neu,description_sent_compound,description_sent_pos,description_sent_neg,description_sent_neu,neighborhood_sent_compound,neighborhood_sent_pos,neighborhood_sent_neg,neighborhood_sent_neu,25*k
0,177,39.69551,-104.92489,79.0,entire home,2,0-2,1.0,1.0,1.0,Virginia Village,1,1,https://www.airbnb.com/rooms/177,1,1,615,Joe,within an hour,1.0,1.0,1.0,Virginia Village,2.0,2.0,1.0,0.0,1,15.0,6.0,1,0.0,1,Tiny Home in the Heart of the City- ECO FRIENDLY,"I'm originally from Oklahoma, but have lived i...","160 sq ft + 80 sq ft loft for sleeping, Sleeps...","Quiet neighborhood next to park, creeks and bi...","[""Hot water"", ""Long term stays allowed"", ""Coff...",29,400,month,long_term,1,0,120,24,0,4.85,4.96,4.89,4.97,4.94,4.85,4.78,1.56,84.0,51.0,https://www.walkscore.com/score/loc/lat=39.695...,37.0,0.6037,0.304,0.0,0.696,0.6948,0.192,0.0,0.808,0.9476,0.153,0.069,0.778,0.0,0.0,0.0,1.0,4.0
1,360,39.76758,-105.00316,133.0,entire home,3,3-4,1.0,2.0,2.0,Highland,1,1,https://www.airbnb.com/rooms/360,1,1,666,Jennifer & Giovanni,within an hour,1.0,0.87,1.0,Highland,4.0,4.0,1.0,1.0,1,15.0,4.0,1,0.0,1,Sit in the Peaceful Garden of the Chickadee Co...,We are artists and tinkerers.\r\n \r\nWe enjoy...,Enjoy the famous Colorado weather and unplug i...,The cottage is located in the center of Lower ...,"[""Coffee maker"", ""Washer"", ""Bedroom comforts"",...",29,35,month,2_months,1,0,174,7,1,4.99,4.99,4.96,5.0,5.0,5.0,4.91,3.26,93.0,90.0,https://www.walkscore.com/score/loc/lat=39.767...,49.0,0.4939,0.242,0.0,0.758,0.9694,0.141,0.023,0.835,0.9814,0.271,0.019,0.709,0.1531,0.025,0.018,0.956,6.0


### Creating a model

In [3]:
X = df[[
    'shared_status', 
    'accommodates',
    'bathrooms', 
    'bedrooms',
    'beds_adjusted',
    'neighborhood', 
    'has_neighborhood_overview', 
    'has_reviews',
    'instant_bookable',
    'host_lives_in_neighborhood',
     'host_is_superhost', 
    'host_has_profile_pic',
    'host_identity_verified', 
    'has_host_about', 
    'years_hosting',
    'license_listed', 
    'minimum_nights',
    'maximum_nights',
     'number_of_reviews',
     'bike_scores',
    'walk_scores',
    'transit_scores',
    'host_sent_compound', 
    'description_sent_compound', 
    'neighborhood_sent_compound']]

X = pd.get_dummies(columns=[ 
                            
                            'shared_status',
                            'neighborhood'
                           ], 
                   drop_first=True, data=X)

y = df['price']

In [4]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=2023)

In [5]:
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [6]:
models = [
    { 'name': 'Extra Trees', 'model': ExtraTreesRegressor(), 'params': { 
         'n_estimators':[100, 150, 200], 'max_depth': [None, 1, 2, 3, 4, 5, 6,7]}},
     
     {'name': 'Dummy Regression', 'model': DummyRegressor(strategy='mean'), 'params': {}},

    {'name': 'Linear Regression', 'model': LinearRegression(), 'params': {}},
    
    {'name': 'ADABoost', 'model': AdaBoostRegressor(), 'params': {
        'n_estimators' : [100, 200, 300, 500], 'learning_rate' : [0.001, 0.01, 0.1]}},
    
    {'name': 'Gradient Boost', 'model': GradientBoostingRegressor(), 'params': {
        'n_estimators' : [100, 200, 300, 500], 'learning_rate': [0.1, 0.5, 1], 'max_depth' : [1, 3, 5, 7, 10]}},
    
    {'name': 'KNeighbors' , 'model' : KNeighborsRegressor(), 'params': {
        'n_neighbors' : [3, 5, 7]}},
    
    {'name': 'Decision Tree', 'model': DecisionTreeRegressor(), 'params': {
        'max_depth' : [3, 5, 7, 10]}},
    
    {'name': 'Random Forest', 'model' : RandomForestRegressor(), 'params': {
        'n_estimators' : [100, 200, 300, 400, 500], 'max_depth' : [3, 5, 7, 10]}},
    
    {'name': 'Bagging Trees', 'model': BaggingRegressor(), 'params': {
        'n_estimators' : [100, 200, 300, 400, 500], 'max_samples' : [.1, .3, .5, .7, 1]}},
    
    
    {'name': 'SVR', 'model': SVR(), 'params': {}},
    
    {'name': 'XGBoost', 'model': xgb.XGBRegressor(), 'params': {}}
    
    
    
]



In [7]:
# Perform grid search and print results
for model in models:
    rgs = GridSearchCV(model['model'], model['params'], scoring='neg_root_mean_squared_error',
                      cv=5, n_jobs=-1, return_train_score=True)
    rgs.fit(Z_train, y_train)
    print(f"{model['name']} Training RMSE: {-rgs.score(Z_train, y_train)}")
    print(f"{model['name']} Test RMSE: {-rgs.best_score_}")
    print(f"{model['name']} Best Params: {rgs.best_params_}")
    print("=" * 30)

Extra Trees Training RMSE: 0.5990065102284418
Extra Trees Test RMSE: 92.74092425335616
Extra Trees Best Params: {'max_depth': None, 'n_estimators': 200}
Dummy Regression Training RMSE: 133.53722201162032
Dummy Regression Test RMSE: 133.47913443457543
Dummy Regression Best Params: {}
Linear Regression Training RMSE: 92.88905260034892
Linear Regression Test RMSE: 96.18479381714529
Linear Regression Best Params: {}
ADABoost Training RMSE: 94.69299384148708
ADABoost Test RMSE: 100.41163043035081
ADABoost Best Params: {'learning_rate': 0.01, 'n_estimators': 100}
Gradient Boost Training RMSE: 43.25220222941627
Gradient Boost Test RMSE: 88.69832135439971
Gradient Boost Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
KNeighbors Training RMSE: 83.55261935800621
KNeighbors Test RMSE: 105.06644064835275
KNeighbors Best Params: {'n_neighbors': 5}
Decision Tree Training RMSE: 85.76948638329672
Decision Tree Test RMSE: 105.6800603554581
Decision Tree Best Params: {'max_depth

Deserve a closer look

Extra Trees- decent scores, not a bad bias-varience trade off.

Gradient Boost- a bit more overfit than the extra trees. 

Random Forest- yet more overfit. 

Bagging Trees- Overfit, but also has the lowest test score.  

# Logarithmic transformation?
The price column is fairly skewed, so I want to see if a transformation might help

In [14]:
X = df[[
    'shared_status', 
    'accommodates',
    'bathrooms', 
    'bedrooms',
    'beds_adjusted',
    'neighborhood', 
    'has_neighborhood_overview', 
    'has_reviews',
    'instant_bookable',
    'host_lives_in_neighborhood',
     'host_is_superhost', 
    'host_has_profile_pic',
    'host_identity_verified', 
    'has_host_about', 
    'years_hosting',
    'license_listed', 
    'minimum_nights',
    'maximum_nights',
     'number_of_reviews',
     'bike_scores',
    'walk_scores',
    'host_sent_compound', 
    'description_sent_compound', 
    'neighborhood_sent_compound']]


X = pd.get_dummies(columns=[ 
                            
                            'shared_status',
                            'neighborhood'
                           ], 
                   drop_first=True, data=X)

y = np.log(df['price'])

In [15]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=2023)

In [16]:
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [17]:
models2 = [
    { 'name': 'Extra Trees', 'model': ExtraTreesRegressor(), 'params': { 
         'n_estimators':[100, 150, 200], 'max_depth': [None, 1, 2, 3, 4, 5, 6,7]}},
     
     {'name': 'Dummy Regression', 'model': DummyRegressor(strategy='mean'), 'params': {}},

    {'name': 'Linear Regression', 'model': LinearRegression(), 'params': {}},
    
    {'name': 'ADABoost', 'model': AdaBoostRegressor(), 'params': {
        'n_estimators' : [100, 200, 300, 500], 'learning_rate' : [0.001, 0.01, 0.1]}},
    
    {'name': 'Gradient Boost', 'model': GradientBoostingRegressor(), 'params': {
        'n_estimators' : [100, 200, 300, 500], 'learning_rate': [0.1, 0.5, 1], 'max_depth' : [1, 3, 5, 7, 10]}},
    
    {'name': 'KNeighbors' , 'model' : KNeighborsRegressor(), 'params': {
        'n_neighbors' : [3, 5, 7]}},
    
    {'name': 'Decision Tree', 'model': DecisionTreeRegressor(), 'params': {
        'max_depth' : [3, 5, 7, 10]}},
    
    {'name': 'Random Forest', 'model' : RandomForestRegressor(), 'params': {
        'n_estimators' : [100, 200, 300, 400, 500], 'max_depth' : [3, 5, 7, 10]}},
    
    {'name': 'Bagging Trees', 'model': BaggingRegressor(), 'params': {
        'n_estimators' : [100, 200, 300, 400, 500], 'max_samples' : [.1, .3, .5, .7, 1]}},

    
    {'name': 'SVR', 'model': SVR(), 'params': {}},
    
    {'name': 'XGBoost', 'model': xgb.XGBRegressor(), 'params': {}}
    
    
    
]



In [18]:
# Perform grid search and print results
for model in models2:
    rgs = GridSearchCV(model['model'], model['params'], scoring='neg_root_mean_squared_error',
                      cv=5, n_jobs=-1, return_train_score=True)
    rgs.fit(Z_train, y_train)
    
    y_pred_power = rgs.predict(Z_test)
    y_preds = [math.exp(y_pred_i) for y_pred_i in y_pred_power]
    
    y_train_pred_power = rgs.predict(Z_train)
    y_train_preds = [math.exp(y_pred_i) for y_pred_i in y_train_pred_power]
    
    y_test_regular = [math.exp(y_test_i) for y_test_i in y_test]
    y_train_regular = [math.exp(y_train_i) for y_train_i in y_train]
    
    rmse = (mean_squared_error(y_test_regular, y_preds))**0.5
    
    print(f"{model['name']} Training RMSE: {(mean_squared_error(y_train_preds, y_train_regular)**0.5)}")
    print(f"{model['name']} Test RMSE: {rmse}")
    print(f"{model['name']} Best Params: {rgs.best_params_}")
    print("=" * 30)

Extra Trees Training RMSE: 0.5991347744058276
Extra Trees Test RMSE: 92.51217506153208
Extra Trees Best Params: {'max_depth': None, 'n_estimators': 200}
Dummy Regression Training RMSE: 137.1265603453392
Dummy Regression Test RMSE: 139.86294855338232
Dummy Regression Best Params: {}
Linear Regression Training RMSE: 93.9996413536984
Linear Regression Test RMSE: 107.58682857248132
Linear Regression Best Params: {}
ADABoost Training RMSE: 99.83614566746938
ADABoost Test RMSE: 105.96187026616786
ADABoost Best Params: {'learning_rate': 0.01, 'n_estimators': 300}
Gradient Boost Training RMSE: 65.35959111783367
Gradient Boost Test RMSE: 91.38242332003512
Gradient Boost Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
KNeighbors Training RMSE: 90.06280513177806
KNeighbors Test RMSE: 104.60235916053506
KNeighbors Best Params: {'n_neighbors': 5}
Decision Tree Training RMSE: 72.4993193814558
Decision Tree Test RMSE: 97.42024230123904
Decision Tree Best Params: {'max_depth':

Also a closer look

Random Forest- a bit overfit, but not much worse than above. 
Bagging Trees- same