# Airbnb price prediction

### Outline <a name = 'outline'></a>
* [Helpful functions](#helpful_functions)
* [Data exploratory](#data_exploratory)
* [Data preprocessing](#data_preprocessing)
* [Regression models](#regression_models)

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import PolynomialFeatures,MaxAbsScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder, KBinsDiscretizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer 

from sklearn.model_selection import cross_val_score, train_test_split as split

from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomTreesEmbedding

from sklearn.metrics import roc_curve, mean_absolute_error, make_scorer

import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor


### Helpful functions <a name = "helpful_functions"/>

In [2]:
# Data preprocessing: column groups referenced by the helper functions below.

# Free-text listing fields. preprocessing_text_and_nan_to_bool turns them into
# presence flags; preprocessing_no_text drops them.
long_text_cols = ['name', 'summary', 'space', 'description', 
                  'neighborhood_overview', 'notes', 'transit',
                 'access', 'interaction', 'house_rules', 
                  'host_about']
# Categorical fields (inspected in the exploration cells).
categoric_cols = ['host_response_time', 'property_type', 'bed_type', 
                  'cancellation_policy','host_is_superhost', 
                  'host_identity_verified',
                  'is_location_exact','require_guest_profile_picture', 
                  'require_guest_phone_verification', 'room_type']
# Numeric fields.
# NOTE(review): number_cols, special_cols, lat_long_cols and list_cols are not
# referenced anywhere in the visible part of the notebook -- confirm whether
# they are still needed.
number_cols = ['latitude', 'longitude', 'accommodates', 'bathrooms',
              'bedrooms', 'beds', 'guests_included', 'extra_people',
              'minimum_nights']
special_cols = ['security_deposit', 'cleaning_fee', 'price']
# Columns removed by preprocessing_no_useless_cols.
useless_cols = ['host_has_profile_pic', 'host_since',  
                'experiences_offered', 'square_feet']
# Identifier columns removed by preprocessing_no_id.
id_cols = ['id', 'zipcode', 'host_id']
lat_long_cols = ['latitude', 'longitude']
list_cols = ['amenities']


def str_to_bool(s):
    """Return True only for the flag string 't'; anything else (NaN included) is False."""
    return bool(s == 't')
    
def str_to_bool_2(s):
    """Return True when a value is present and False when it is NaN.

    Relies on NaN comparing unequal to itself; every other value the
    notebook passes in (strings, and also None) compares equal to itself.
    """
    is_present = (s == s)
    return is_present
    

def str_to_rate(s):
    """Convert a percentage string such as '95%' to the float 95.0.

    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged so they can be imputed later.
    """
    if pd.isnull(s):
        return s
    return float(s.replace('%', ''))

def extract_list_val(s):
    """Split a raw amenities string like '{TV,"Cable TV"}' into clean tokens.

    Braces and double quotes are removed, separator-like characters become
    underscores, the misspelling 'matress' is corrected, and the result is
    split on commas.
    """
    to_underscore = "/: -.&)('"
    table = str.maketrans(to_underscore, '_' * len(to_underscore), '{}"')
    cleaned = s.translate(table).replace('matress', 'mattress')
    return cleaned.split(',')

def preprocessing_no_useless_cols(data):
    """Drop the columns listed in ``useless_cols`` from ``data``.

    The frame is modified in place and the same object is returned so the
    call can be chained with the other preprocessing steps.
    """
    data.drop(columns=useless_cols,inplace =True)
    return data
    
def preprocessing_property_type(data):
    """Collapse the many raw ``property_type`` values into five groups.

    Raw values are mapped to 'Apartment', 'House', 'Hotel1', 'Hotel2' or
    'Other'; values that are not listed (including values that already are a
    group name) are left unchanged.

    The column is reassigned on ``data`` and the same frame is returned.
    Assigning the result back, instead of calling ``replace(inplace=True)`` on
    the selected column, keeps the update effective when pandas
    Copy-on-Write is active, where an in-place call on a selected column does
    not reach the parent frame.
    """
    groups = {
        'Apartment': ['Condominium', 'Timeshare', 'Loft',
                      'Serviced apartment', 'Guest suite'],
        'House': ['Vacation home', 'Villa', 'Townhouse', 'In-law',
                  'Casa particular', 'Cottage',
                  'Casa particular (Cuba)'],
        'Hotel1': ['Dorm', 'Hostel', 'Guesthouse', 'Hotel',
                   'Aparthotel'],
        'Hotel2': ['Boutique hotel', 'Bed and breakfast'],
        'Other': ['Island', 'Castle', 'Yurt', 'Hut', 'Chalet',
                  'Treehouse', 'Earth House', 'Tipi', 'Cave',
                  'Train', 'Parking Space', 'Lighthouse',
                  'Tent', 'Boat', 'Cabin', 'Camper/RV', 'Bungalow',
                  'Tiny house', 'Houseboat', 'Earth house',
                  'Barn', 'Farm stay', 'Nature lodge',
                  'Ryokan (Japan)', 'Bus',
                  'Shepherd\'s hut (U.K., France)', 'Resort',
                  'Dome house'],
    }
    # Invert {group: [raw values]} into {raw value: group} for Series.replace.
    raw_to_group = {raw: group for group, raws in groups.items() for raw in raws}
    data['property_type'] = data['property_type'].replace(raw_to_group)
    return data
    

def preprocessing_general(data):
    """Normalise raw column formats on ``data`` in place and return it.

    * spaces in ``room_type`` / ``bed_type`` become underscores,
    * ``host_response_rate`` percentage strings become floats,
    * 't'/'f' flag columns become booleans (anything but 't' is False),
    * ``amenities`` becomes one space-separated string of cleaned tokens.
    """
    for col in ('room_type', 'bed_type'):
        data[col] = data[col].str.replace(' ', '_')

    data['host_response_rate'] = data['host_response_rate'].apply(str_to_rate)

    for col in ('host_identity_verified', 'host_is_superhost'):
        data[col] = data[col].apply(str_to_bool)

    amenity_tokens = data['amenities'].apply(extract_list_val)
    data['amenities'] = amenity_tokens.str.join(' ')

    for col in ('require_guest_phone_verification',
                'require_guest_profile_picture',
                'is_location_exact'):
        data[col] = data[col].apply(str_to_bool)

    return data

def preprocessing_no_text(data):
    """Drop every column in ``long_text_cols`` from ``data`` in place and return it."""
    data.drop(columns=long_text_cols, inplace = True)
    return data

def preprocessing_text_and_nan_to_bool(data):
    """Replace each column in ``long_text_cols`` with a presence flag.

    Every value is passed through ``str_to_bool_2``, so a NaN cell becomes
    False and a cell holding text becomes True. ``data`` is modified in place
    and returned.
    """
    for c in long_text_cols:
        data[c]=data[c].apply(str_to_bool_2)
    return data
        

def preprocessing_no_id(data):
    """Drop the identifier columns in ``id_cols`` from ``data`` in place and return it."""
    data.drop(columns = id_cols, inplace = True)
    return data

def preprocessing_1(data):
    """Run the standard preprocessing chain on ``data`` and return the result.

    Steps, in order: group property types, turn the long text columns into
    presence flags, normalise the general column formats, and drop the
    columns in ``useless_cols``. Each step modifies the frame in place.
    ``preprocessing_no_id`` is not part of the chain, so the id columns are
    kept.
    """
    data = preprocessing_property_type(data)
    data = preprocessing_text_and_nan_to_bool(data)
    data = preprocessing_general(data)
    data = preprocessing_no_useless_cols(data)
    return data

# def preprocessing_no_lat_long(data):
#     data=data[data['neighbourhood'].isna()==False]
#     data.drop(columns=['id', 'name','host_has_profile_pic',
#                            'host_since', 'description',
#                            'zipcode','longitude','latitude'],inplace =True)
#     data['property_type']=data['property_type'].replace(dict2)
#     data['room_type']=data['room_type'].str.replace(' ','_')
#     data['bed_type']=data['bed_type'].str.replace(' ','_')
#     data['host_response_rate'] =data['host_response_rate'].apply(str_to_rate)
#     data['host_identity_verified']=data['host_identity_verified'].apply(str_to_bool)
#     data['instant_bookable']=data['instant_bookable'].apply(str_to_bool).astype(float)
#     data['cleaning_fee']=data['cleaning_fee'].apply(str_to_bool).astype(float)
#     data['amenities']=data['amenities'].apply(extract_list_val).str.join(' ')
#     data['neighbourhood']=data['neighbourhood'].str.lower().replace('castle hill ','castle hill')
#     return data


# scoring functions
def MAE(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    Thin wrapper around sklearn's ``mean_absolute_error``; it is the function
    wrapped by ``make_scorer`` to build ``MAE_scorer``.
    """
    return mean_absolute_error(y_true, y_pred)

def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (0.1 == 10%).

    Computed as mean(|1 - y_pred / y_true|).
    """
    ratio = y_pred / y_true
    relative_error = np.abs(1 - ratio)
    return np.mean(relative_error)


# Functions that select columns for the feature transformers
def get_col_to_fillna_most_frequent(df):
    """Return the columns of ``df`` named in the global ``col_to_fillna_most_frequent``."""
    return df[col_to_fillna_most_frequent]

def get_col_long_text_cols(df):
    """Return the 'description' column of ``df`` as a Series.

    Despite the name, only 'description' is selected, not every column in
    ``long_text_cols``.
    """
    text_column = 'description'
    return df[text_column]

def get_sum_long_text_cols(s):
    """Return a one-column DataFrame with the space-separated token count of each string in ``s``."""
    def count_tokens(text):
        return len(text.split(' '))

    counts = s.apply(count_tokens)
    return pd.DataFrame(counts)

def get_col_to_fillna_bool(df):
    """Return the columns of ``df`` named in ``long_text_cols``."""
    return df[long_text_cols]

def get_col_to_fillna_mean(df):
    """Return the columns of ``df`` named in the global ``col_to_fillna_mean``."""
    return df[col_to_fillna_mean]

def get_col_to_get_dummies(df):
    """Return the columns of ``df`` named in the global ``col_to_getdummies``."""
    return df[col_to_getdummies]


def get_col_to_get_dummies_NYC(df):
    """Return the columns of ``df`` named in the global ``col_to_getdummies_NYC``.

    NOTE(review): ``col_to_getdummies_NYC`` is not defined in the visible part
    of the notebook -- calling this will raise NameError unless it is defined
    elsewhere.
    """
    return df[col_to_getdummies_NYC]

def get_lat_long(df):
    """Return a two-column DataFrame with ``latitude`` and ``longitude``."""
    coordinate_cols = ['latitude', 'longitude']
    return df[coordinate_cols]

def get_amenities(df):
    """Return the ``amenities`` column of ``df`` as a Series."""
    amenities = df['amenities']
    return amenities

def get_amenities2(df):
    """Return the ``amenities`` column of ``df`` as a one-column DataFrame."""
    columns = ['amenities']
    return df[columns]

def get_col_no_change(df):
    """Return the columns of ``df`` named in the global ``col_no_change``."""
    return df[col_no_change]

def get_sum_amenities(s):
    """Return a one-column DataFrame with the number of space-separated tokens in each string of ``s``."""
    def count_tokens(amenities):
        return len(amenities.split(' '))

    counts = s.apply(count_tokens)
    return pd.DataFrame(counts)

#functions for model fit, predict and results
def time_convert(t):
    """Split a duration in seconds into an (hours, minutes, seconds) tuple."""
    hours, remainder = divmod(t, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds

def get_mean_cv_score(name, model, X_train, y_train, num_cv, n_verbose,scoring_param):
    """Cross-validate ``model`` and return the negated mean fold score.

    Runs ``cross_val_score`` with ``num_cv`` folds on all cores, prints the
    elapsed time, and returns minus the mean score rounded to 4 decimals.
    The sign is flipped because the scorers in this notebook are built with
    ``greater_is_better=False`` and therefore produce negated errors.
    """
    print('Model: ' + name)
    print('Begin CV fit')
    started = time.time()
    fold_scores = cross_val_score(
        model, X_train, y_train,
        scoring=scoring_param,
        cv=num_cv,
        verbose=n_verbose,
        n_jobs=-1,
    )
    finished = time.time()
    hours, minutes, seconds = time_convert(finished - started)
    print('CV ended. Elapsed time: {0:.0f} hours, {1:.0f} minutes and {2:.0f} seconds'.format(hours, minutes, seconds))
    rounded_mean = fold_scores.mean().round(4)
    return -rounded_mean


def get_results(pipes_dict, X_train, y_train, X_test, y_test, num_cv,
                n_verbose, scoring_param, df_data_and_results, df_all_results):
    """Cross-validate, fit and evaluate every pipeline in ``pipes_dict``.

    For each ``name -> model`` entry the function:
      1. computes the mean CV score on the training data,
      2. fits the model on the full training data (timed and printed),
      3. predicts on ``X_test`` and stores ``Price_pred_<name>`` and
         ``Price_diff_<name>`` (true minus predicted) on
         ``df_data_and_results`` (modified in place),
      4. adds one summary row to ``df_all_results``.

    Note on column names: 'CV_train_mean_MAPE_score' holds whatever metric
    ``scoring_param`` computes, and 'Test_MAPE_score' holds the test-set MAE
    (it is computed with ``MAE``). The labels are kept because the results
    frame is created with these column names by the caller.

    Returns:
        (df_data_and_results, df_all_results) -- the per-row frame and the
        summary frame with the new rows added.
    """
    for name, model in pipes_dict.items():
        mean_cv_score = get_mean_cv_score(name, model, X_train, y_train,
                                          num_cv, n_verbose, scoring_param)

        print('Begin '+name+' model fit')
        fit_started = time.time()
        model.fit(X_train, y_train)
        h, m, s = time_convert(time.time() - fit_started)
        print('Model fit ended. Elapsed time: {0:.0f} hours, {1:.0f} minutes and {2:.0f} seconds'.format(h,m,s))

        y_pred = model.predict(X_test)
        pred_col = 'Price_pred_' + name
        diff_col = 'Price_diff_' + name
        df_data_and_results[pred_col] = y_pred
        df_data_and_results[diff_col] = y_test - y_pred

        # Look the residual column up once and compute both deciles in one call.
        diff = df_data_and_results[diff_col]
        deciles = diff.quantile(q=[0.1, 0.9], interpolation='linear')

        row = {
            'Model': name,
            'CV_train_mean_MAPE_score': mean_cv_score,
            'Test_MAPE_score': round(MAE(y_test, y_pred), 4),
            'Min_diff': diff.min(),
            'Max_diff': diff.max(),
            'Mean_diff': diff.mean(),
            'Median_diff': diff.median(),
            'STD_diff': diff.std(),
            '10th percentile': deciles[0.1],
            '90th percentile': deciles[0.9],
        }
        # DataFrame.append was removed in pandas 2.0; concat is the supported
        # way to add a row and keeps the existing column order.
        df_all_results = pd.concat([df_all_results, pd.DataFrame([row])],
                                   ignore_index=True)
        print('======================================================================================\n')
    return df_data_and_results, df_all_results

#Graph generation functions:
def get_diff_hist(pipes_dict, df_data_and_results, df_all_results):
    """Draw one residual histogram per model in ``pipes_dict``.

    Each figure plots the ``Price_diff_<name>`` column. All figures share the
    same x-range, taken from the overall Min_diff / Max_diff in
    ``df_all_results``, so the models can be compared visually.
    """
    for name in pipes_dict:
        diff_column = 'Price_diff_' + name
        fig, ax = plt.subplots(figsize=(14, 5))
        sns.set_context(rc={"lines.linewidth": 3.5})
        sns.distplot(df_data_and_results[diff_column], ax=ax, color='red')
        plt.title(name + ' Price_diff histogram')
        lower = df_all_results['Min_diff'].min()
        upper = df_all_results['Max_diff'].max()
        ax.set_xlim(lower, upper)

def get_prediction_cluster_graph(pipes_dict, y_test, df_data_and_results):
    """Plot true values against log-predictions for every model, with an ideal line.

    One figure per model name in ``pipes_dict``: the points are
    ``(y_test, log(Price_pred_<name>))`` and the reference line runs from
    (2, 2) to (9, 9).

    All drawing goes through one explicit Axes. A bare ``plt.axes()`` call
    adds a new Axes to the figure rather than returning the current one, so
    the aspect ratio, legend and title would land on an empty overlay instead
    of the plotted data.

    NOTE(review): the predictions are log-transformed but ``y_test`` is
    plotted as given; the 2..9 ideal line suggests ``y_test`` is expected to
    be on a log scale already -- confirm against the caller.
    """
    for name in pipes_dict:
        fig, ax = plt.subplots()
        ax.plot(y_test, np.log(df_data_and_results[('Price_pred_'+name)]), '.', label = 'Result Data')
        ax.plot([2, 9], [2, 9], label = 'Ideal')
        ax.set_aspect('equal')
        ax.legend()
        ax.set_title(name+' (y_true,y_pred) vs. ideal')

### Data exploratory <a name = "data_exploratory"/>

In [3]:
data = pd.read_csv('data/train.csv')

In [4]:
data.head()

Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,...,square_feet,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,price
0,127860,Double bedroom in cottage Twickenham (sleeps 1-2),"One gorgeous, light-filled double bedroom (sle...","A beautiful, light-filled double bedroom is ou...","One gorgeous, light-filled double bedroom (sle...",none,"Very, very safe area, great transport links an...",We have a super-gorgeous cat,Twickenham Rugby Stadium is 15 mins walk Water...,"You'll have full use of the cottage, two recep...",...,,300.0,10.0,1,10.0,2,strict_14_with_grace_period,f,f,1000.0
1,325809,Big House for Olympics sleeps 6 to8,,"A beautiful, modern, art-filled and clean 4 st...","A beautiful, modern, art-filled and clean 4 st...",none,,,,,...,1400.0,771.0,,1,0.0,14,strict_14_with_grace_period,f,f,771.0
2,429045,The Old Coach House (Olympics),,"Fabulous, recently refurbished original Victor...","Fabulous, recently refurbished original Victor...",none,,,,,...,1800.0,600.0,,1,0.0,14,strict_14_with_grace_period,f,f,1500.0
3,473637,Brand New contemporary mews house,,Stunning contemporary mews available for let o...,Stunning contemporary mews available for let o...,none,,,,,...,2100.0,300.0,100.0,7,100.0,14,strict_14_with_grace_period,f,f,2000.0
4,533943,LUXURY APT. NEAR BUCKINGHAM PALACE,“The area around Westminster Cathedral has lon...,"This stunning, spacious, 4 bedroom, top floor ...",“The area around Westminster Cathedral has lon...,family,The proximity of all of London's main attracti...,,"Only 5 minutes walk to Victoria Station, the m...",,...,2220.0,2505.0,180.0,1,0.0,14,strict_14_with_grace_period,f,t,901.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51815 entries, 0 to 51814
Data columns (total 43 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                51815 non-null  int64  
 1   name                              51809 non-null  object 
 2   summary                           50018 non-null  object 
 3   space                             33350 non-null  object 
 4   description                       50798 non-null  object 
 5   experiences_offered               51815 non-null  object 
 6   neighborhood_overview             30176 non-null  object 
 7   notes                             19436 non-null  object 
 8   transit                           29762 non-null  object 
 9   access                            28386 non-null  object 
 10  interaction                       27110 non-null  object 
 11  house_rules                       28071 non-null  object 
 12  host

In [6]:
pd.set_option('display.max_colwidth', 0)
data[categoric_cols].agg(['unique']).transpose()

Unnamed: 0,unique
host_response_time,"[nan, within an hour, within a day, within a few hours, a few days or more]"
property_type,"[House, Apartment, Loft, Townhouse, Boat, Boutique hotel, Serviced apartment, Condominium, Bed and breakfast, Hostel, Other, Cabin, Bungalow, Yurt, Tiny house, Guest suite, Guesthouse, Aparthotel, Cottage, Lighthouse, Barn, Villa, Houseboat, Hotel, Ryokan (Japan), Casa particular (Cuba), Resort, Chalet, Island, Nature lodge, Hut, Camper/RV, Earth house, Treehouse, Farm stay, Bus, Shepherd's hut (U.K., France), Tipi, Tent, Dome house]"
bed_type,"[Real Bed, Pull-out Sofa, Futon, Couch, Airbed]"
cancellation_policy,"[strict_14_with_grace_period, flexible, moderate, super_strict_30, super_strict_60, strict]"
host_is_superhost,"[f, t, nan]"
host_identity_verified,"[t, f, nan]"
is_location_exact,"[t, f]"
require_guest_profile_picture,"[f, t]"
require_guest_phone_verification,"[f, t]"
room_type,"[Private room, Entire home/apt, Shared room]"


In [7]:
data['property_type'].value_counts()

Apartment                        35260
House                            9878 
Townhouse                        1880 
Serviced apartment               1855 
Condominium                      683  
Loft                             446  
Bed and breakfast                413  
Guest suite                      230  
Guesthouse                       205  
Other                            168  
Hostel                           156  
Boutique hotel                   119  
Hotel                            117  
Bungalow                         90   
Cottage                          60   
Villa                            46   
Boat                             41   
Aparthotel                       41   
Cabin                            19   
Casa particular (Cuba)           17   
Tiny house                       17   
Camper/RV                        16   
Houseboat                        15   
Chalet                           8    
Earth house                      8    
Barn                     

### Data preprocessing <a name = "data_preprocessing"/>

In [8]:
# Preview of the property-type grouping. The same mapping is defined inside
# preprocessing_property_type; this cell only displays the resulting counts.
# The replace() result is not assigned, so `data` itself is left unchanged.
dict1 = {'Apartment':['Condominium','Timeshare','Loft','Serviced apartment','Guest suite'],
         'House':['Vacation home','Villa','Townhouse','In-law','Casa particular','Cottage', 'Casa particular (Cuba)'],
         'Hotel1':['Dorm','Hostel','Guesthouse', 'Hotel', 'Aparthotel'],
         'Hotel2':['Boutique hotel','Bed and breakfast'],
         'Other':['Island','Castle','Yurt','Hut','Chalet','Treehouse',
                  'Earth House','Tipi','Cave','Train','Parking Space','Lighthouse',
                 'Tent','Boat','Cabin','Camper/RV','Bungalow', 'Tiny house', 'Houseboat', 'Earth house', 'Barn',
                 'Farm stay', 'Nature lodge', 'Ryokan (Japan)',
                 'Bus', 'Shepherd\'s hut (U.K., France)', 'Resort', 'Dome house']
        }
# Invert {group: [raw values]} into {raw value: group}.
dict2 = {i : k for k, v in dict1.items() for i in v}
data['property_type'].replace(dict2).value_counts()

Apartment    38474
House        11881
Hotel2       532  
Hotel1       519  
Other        409  
Name: property_type, dtype: int64

In [9]:
# Keep listings priced strictly between 20 and 500. The .copy() matters:
# preprocessing_1 modifies its argument in place and must not touch `data`.
data_clean_all=data[(data['price'] > 20) & (data['price'] < 500)].copy()
data_clean_all=preprocessing_1(data_clean_all)
data_clean_all.head()

Unnamed: 0,id,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,...,amenities,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,price
66,13562302,True,True,False,True,False,False,False,False,False,...,Wifi Kitchen Elevator Washer Smoke_detector Carbon_monoxide_detector Essentials translation_missing__en_hosting_amenity_49 translation_missing__en_hosting_amenity_50,,,1,0.0,1,flexible,False,False,210.0
67,13562381,True,True,True,True,True,True,True,True,True,...,Internet Wifi Heating Washer Dryer Smoke_detector Essentials Shampoo Hair_dryer Iron Laptop_friendly_workspace translation_missing__en_hosting_amenity_49 translation_missing__en_hosting_amenity_50 Hot_water Bed_linens Refrigerator,3884.0,5.0,1,8.0,1,strict_14_with_grace_period,False,False,35.0
68,13562628,True,True,False,True,False,False,False,False,False,...,Wifi Kitchen Heating Family_kid_friendly Washer Smoke_detector Essentials Lock_on_bedroom_door Hangers Laptop_friendly_workspace translation_missing__en_hosting_amenity_50,,10.0,1,0.0,3,flexible,False,False,50.0
69,13562725,True,True,True,True,True,True,True,True,True,...,Wifi Kitchen Heating Family_kid_friendly Washer Dryer Smoke_detector First_aid_kit Essentials Lock_on_bedroom_door Hair_dryer Iron translation_missing__en_hosting_amenity_50,,12.0,1,10.0,1,strict_14_with_grace_period,False,False,45.0
70,13563023,True,True,True,True,True,True,False,True,True,...,Wifi Kitchen Smoking_allowed Heating Family_kid_friendly Washer Smoke_detector Essentials Hair_dryer Laptop_friendly_workspace translation_missing__en_hosting_amenity_50,,,1,0.0,4,moderate,False,False,46.0


In [10]:
data_clean_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49788 entries, 66 to 51814
Data columns (total 39 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                49788 non-null  int64  
 1   name                              49788 non-null  bool   
 2   summary                           49788 non-null  bool   
 3   space                             49788 non-null  bool   
 4   description                       49788 non-null  bool   
 5   neighborhood_overview             49788 non-null  bool   
 6   notes                             49788 non-null  bool   
 7   transit                           49788 non-null  bool   
 8   access                            49788 non-null  bool   
 9   interaction                       49788 non-null  bool   
 10  house_rules                       49788 non-null  bool   
 11  host_id                           49788 non-null  int64  
 12  hos

In [11]:
data_clean_all['host_response_time'].dtype

dtype('O')

In [12]:
data_clean_all['host_response_time'].value_counts()

within an hour        20269
within a few hours    5143 
within a day          3576 
a few days or more    903  
Name: host_response_time, dtype: int64

In [13]:
data_clean_all['name'].isna().sum()

0

In [14]:
# Missing response times become their own category 'None' for one-hot encoding.
# The result is assigned back: fillna(inplace=True) on a selected column does
# not update the frame when pandas Copy-on-Write is active.
data_clean_all['host_response_time'] = data_clean_all['host_response_time'].fillna('None')
# for c in long_text_cols:
#     data_clean_all[c].fillna('None', inplace = True)

### Regression models <a name = 'regression_models'/>

In [15]:
#columns to fill Nan with most frequent
col_to_fillna_most_frequent=['beds','bedrooms',
                             'bathrooms', 
                             'host_identity_verified',
                             'require_guest_phone_verification',
                             'minimum_nights',
                             'require_guest_profile_picture',
                             'is_location_exact',
                             'name', 'summary', 'space', 'description', 
                  'neighborhood_overview', 'notes', 'transit',
                 'access', 'interaction', 'house_rules', 
                  'host_about'
                            ]

#columns to fill Nan with mean
col_to_fillna_mean=['host_response_rate', 'security_deposit',
                   'cleaning_fee']

#columns for get_dummies (one hot encoding)
col_to_getdummies=['property_type','room_type','bed_type',
                   'cancellation_policy', 'host_response_time',
                  'host_is_superhost', 'neighbourhood_cleansed',
                  'host_response_time']

#columns that won't be changed
col_no_change=['accommodates', 'guests_included',
              'latitude', 'longitude']

# ammenities?, 

We decided to use 2 regression models: Linear Regression (LR) and Random Forest regressor (RF), for predicting price. 
In addition, KNN regressor was used to create a price grouping feature based on lat-long info.

LR and RF were run with 2 types of transformers: with and without polynomial features.

In [16]:
#scoring functions
# Both metrics are errors (lower is better), so the scorers are built with
# greater_is_better=False; get_mean_cv_score flips the sign of the CV mean back.
MAE_scorer = make_scorer(MAE, greater_is_better=False)
MAPE_scorer = make_scorer(MAPE, greater_is_better=False)

In [17]:
# Hyperparameters and cross-validation settings shared by the cells below.
KNN_neighbors=200
RF_n_estimators=50
RF_min_samples_split=50
TSVD_n_components=10
num_cv=5
n_verbose = 3
scoring_param=MAE_scorer

# Estimator instances reused by the transformer and model pipelines.
# NOTE(review): Pipeline stores these objects as given, so every pipeline
# built from the same instance shares one estimator.
PCA_features=PCA()
TruncatedSVD_features=TruncatedSVD(n_components=TSVD_n_components)
lin_regressor= LinearRegression(n_jobs = -1)
RF_regressor = RandomForestRegressor(n_estimators=RF_n_estimators, 
                                     min_samples_split=RF_min_samples_split,
                                     n_jobs = -1
                                    )
KNN_Reg = KNNR(n_neighbors=KNN_neighbors)
# NOTE(review): CatBoost documents a maximum tree depth of 16, so depth=50 is
# expected to be rejected when this model is fitted -- confirm before use.
cbr = CatBoostRegressor(iterations=2,
                          learning_rate=1,
                          depth=50)

A class created for the KNN regressor for lat-long-price:

In [18]:
class MyTransformer(TransformerMixin, BaseEstimator):
    """Turn a predictive model into a feature transformer.

    ``fit`` trains the wrapped ``model`` on (X, y); ``transform`` returns the
    model's predictions for X as a DataFrame, so they can be used as a
    feature inside a FeatureUnion (here: a KNN regressor on latitude and
    longitude).
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Fit the wrapped model on (X, y) and return this transformer."""
        self.model.fit(X, y)
        return self

    def transform(self, X):
        """Return the wrapped model's predictions for X wrapped in a DataFrame."""
        predictions = self.model.predict(X)
        return pd.DataFrame(predictions)

Transformers:

In [19]:
# Select the columns in col_to_fillna_most_frequent, fill NaN with each
# column's most frequent value, then cast everything to float (the selection
# includes boolean flag columns).
Transformer_fillna_most_frequent =\
Pipeline([('Select_col_to_fillna_most_frequent', 
           FunctionTransformer(func=get_col_to_fillna_most_frequent, 
                               validate=False)),
          ('Fill_Null', 
           SimpleImputer(missing_values=np.nan, 
                         strategy='most_frequent')),
          ('To_float_transformer',FunctionTransformer(
              func=lambda x: x.astype(float) ,validate=False))
         ])

In [20]:
# Select the columns in col_to_fillna_mean and fill NaN with the column mean.
Transformer_fillna_mean =\
Pipeline([('Select_col_to_fillna_mean', 
           FunctionTransformer(func=get_col_to_fillna_mean, 
                               validate=False)),
          ('Fill_Null', 
           SimpleImputer(missing_values=np.nan, strategy='mean'))
         ])

In [21]:
# One-hot encode the columns in col_to_getdummies. handle_unknown='ignore'
# keeps transform from failing on categories that were not seen during fit.
Transformer_OneHotEncoder =\
Pipeline([('Select_col_to_get_dummies', 
           FunctionTransformer(func=get_col_to_get_dummies, validate=False)),
          ('OneHotEncoder_transform',
           OneHotEncoder(handle_unknown='ignore'))
         ])

In [22]:
# Bag-of-words over the amenities string (min_df=0.02: tokens must appear in
# at least 2% of listings), reduced with the shared TruncatedSVD_features
# instance (TSVD_n_components components).
Transformer_amenities =\
Pipeline([('Select_col_to_get_amenities',  
           FunctionTransformer(func=get_amenities, validate=False)),
          ('CountVectorizer_transform', CountVectorizer(min_df=0.02)),
          ('Feature_extractor_TSVD', TruncatedSVD_features)
         ])

In [23]:
# Bag-of-words over the 'description' column, reduced with truncated SVD.
# This pipeline gets its own TruncatedSVD instance: Pipeline does not copy
# its steps, so reusing the object that Transformer_amenities holds would
# let fitting one pipeline overwrite the other's fitted components.
# NOTE(review): after preprocessing_1 'description' is a boolean flag, not
# text, so this pipeline needs raw text input to be usable -- it is currently
# commented out of the FeatureUnion.
Transformer_text =\
Pipeline([('Select_col_to_get_long_text_cols',
           FunctionTransformer(func=get_col_long_text_cols, validate=False)),
          ('CountVectorizer_transform', CountVectorizer(min_df=0.02)),
          ('Feature_extractor_TSVD',
           TruncatedSVD(n_components=TSVD_n_components))
         ])

In [24]:
# Single numeric feature: the number of amenity tokens per listing.
Transformer_sum_amenities=\
Pipeline([('Select_col_to_get_amenities',FunctionTransformer(
    func=get_amenities, validate=False)),
          ('Get_sum_amenities', FunctionTransformer(
              func=get_sum_amenities, validate=False)),
         ])

In [25]:
# Single numeric feature: the space-separated token count of 'description'.
# Currently commented out of the FeatureUnion below.
Transformer_sum_text=\
Pipeline([('Select_col_to_get_long_text_cols',FunctionTransformer(
    func=get_col_long_text_cols, validate=False)),
          ('Get_sum_long_text_cols', FunctionTransformer(
              func=get_sum_long_text_cols, validate=False)),
         ])

In [26]:
# Pass the columns in col_no_change through untouched.
Transformer_get_columns =\
Pipeline ([('Select_col_no_change', 
            FunctionTransformer(func=get_col_no_change, validate=False))
          ])

In [27]:
# Location feature: a KNN regressor fitted on (latitude, longitude) against
# the target; its prediction is used as a feature.
# NOTE(review): during fitting the KNN predicts for the same rows it was
# trained on, so each row's own price is among its neighbours -- consider
# whether that leakage is acceptable.
Transformer_lat_long =\
Pipeline ([('Select_col_lat_long_price', 
            FunctionTransformer(func=get_lat_long, validate=False)),                                
           ('MyTransformer', MyTransformer(KNN_Reg))    
          ])

In [28]:
get_col_long_text_cols(data_clean_all)

66       True 
67       True 
68       True 
69       True 
70       True 
         ...  
51810    True 
51811    True 
51812    False
51813    True 
51814    False
Name: description, Length: 49788, dtype: bool

In [29]:
# Concatenate the outputs of all feature pipelines side by side.
# The two text pipelines are left out (commented below).
FeatureUnionTransformer =\
FeatureUnion([('FTfillna_frequent',   Transformer_fillna_most_frequent),
              ('FTfillna_mean',       Transformer_fillna_mean),
              ('FTget_OneHotEncoder', Transformer_OneHotEncoder),
              ('FTamenities',         Transformer_amenities),
              ('FT_sum_amenities',    Transformer_sum_amenities),
#               ('FTtext',              Transformer_text),
#               ('FT_sum_text',         Transformer_sum_text),
              ('FT_lat_long',         Transformer_lat_long),
              ('FT_get_columns',      Transformer_get_columns)
             ], n_jobs = -1)

In [30]:
# Transformer with polynomial features (degree-2 interaction terms only).
# Note: the step named 'Min_Max_Transformer' actually holds a MaxAbsScaler.
Full_Transformer_poly =\
Pipeline([('Feature_Engineering', FeatureUnionTransformer),
          ('Polynomial_Transformer', 
           PolynomialFeatures(degree=2, interaction_only=True)),
          ('Min_Max_Transformer', MaxAbsScaler())
         ])

# Transformer without polynomial features.
# Both transformers reference the same FeatureUnionTransformer object.
Full_Transformer =\
Pipeline([('Feature_Engineering', FeatureUnionTransformer),
          ('Min_Max_Transformer', MaxAbsScaler())
         ])

Regression model pipelines:

In [31]:
from sklearn.base import clone

# Every pipeline gets its own unfitted copies of the feature transformer and
# the regressor. Pipeline stores the objects it is given, so without clone()
# the plain and polynomial variants would share one regressor and one feature
# union, and fitting one pipeline would silently refit (and invalidate) the
# steps of the others.
lin_reg_pipe=\
Pipeline([('Feature_transformer', clone(Full_Transformer)),
          ('Linear_regressor', clone(lin_regressor))
         ])

lin_reg_poly_pipe=\
Pipeline([('Feature_transformer_poly', clone(Full_Transformer_poly)),
          ('Linear_regressor', clone(lin_regressor))
         ])

RF_pipe=\
Pipeline([('Feature_transformer', clone(Full_Transformer)),
          ('RFE_regressor', clone(RF_regressor))
         ])

RF_poly_pipe=\
Pipeline([('Feature_transformer_poly', clone(Full_Transformer_poly)),
          ('RFE_regressor', clone(RF_regressor))
         ])

# The step keeps the name 'RFE_regressor' although it holds the CatBoost model.
CBR_pipe=\
Pipeline([('Feature_transformer', clone(Full_Transformer)),
          ('RFE_regressor', clone(cbr))
         ])

Split the data to train and test:

In [32]:
# 70/30 train/test split with a fixed seed; the target is the 'price' column
# and the features are every other column.
X_train_all, X_test_all, y_train_all, y_test_all =\
split(data_clean_all.drop(axis=1, columns=['price']), 
      data_clean_all['price'], 
      test_size =0.3, random_state=123)

Run all the regression models

In [33]:
#all regressors
pipes_dict_all = {
#      'LG_poly':lin_reg_poly_pipe,
     'RF':RF_pipe
#      'RF_poly':RF_poly_pipe
}
#                   'RF_poly':RF_poly_pipe}

#reset DF to collect results
df_all_results_all=pd.DataFrame(
    columns=['Model','CV_train_mean_MAPE_score','Test_MAPE_score',
             'Min_diff','Max_diff','Mean_diff','Median_diff',
             'STD_diff','10th percentile','90th percentile'])

df_data_and_results_all=X_test_all.copy()
df_data_and_results_all['Price_true']=y_test_all

In [34]:
#Run models
df_data_and_results_all, df_all_results_all =\
get_results(pipes_dict_all, X_train_all, y_train_all, X_test_all, 
            y_test_all, num_cv, n_verbose, scoring_param,
            df_data_and_results_all, df_all_results_all)

Model: RF
Begin CV fit


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.2min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.2min finished


CV ended. Elapsed time: 0 hours, 1 minutes and 12 seconds
Begin RF model fit
Model fit ended. Elapsed time: 0 hours, 0 minutes and 21 seconds



In [1425]:
df_all_results_all

Unnamed: 0,Model,CV_train_mean_MAPE_score,Test_MAPE_score,Min_diff,Max_diff,Mean_diff,Median_diff,STD_diff,10th percentile,90th percentile


In [1426]:
test_test = pd.read_csv('data/test.csv')

In [1427]:
test_test_clean_all=test_test.copy()
test_test_clean_all=preprocessing_1(test_test_clean_all)

In [1428]:
# Same treatment as the training data: missing response times become the
# category 'None'. Assigned back because fillna(inplace=True) on a selected
# column does not update the frame when pandas Copy-on-Write is active.
test_test_clean_all['host_response_time'] = test_test_clean_all['host_response_time'].fillna('None')

In [1429]:
test_test['price'] = RF_pipe.predict(test_test_clean_all)


NotFittedError: This SimpleImputer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
test_test[['id', 'price']].to_csv('answer_9.csv', index=False)

In [None]:
data['price'].describe()

In [None]:
test_test['price'].describe()

In [None]:
data[data['price'] > 250].shape

In [1430]:
test_test['price'].describe()

KeyError: 'price'

In [1431]:
X_train_all.head()


Unnamed: 0,id,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,...,bed_type,amenities,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
12258,18500903,True,True,True,True,True,True,True,True,True,...,Real_Bed,TV Wifi Kitchen Free_parking_on_premises Heating Smoke_detector First_aid_kit Fire_extinguisher Essentials Shampoo Lock_on_bedroom_door Hangers Hair_dryer Iron Laptop_friendly_workspace High_chair Children’s_books_and_toys Crib Pack_’n_Play_travel_crib Hot_water Bed_linens Extra_pillows_and_blankets Microwave Refrigerator Dishwasher Cooking_basics Oven Stove Patio_or_balcony Garden_or_backyard Well_lit_path_to_entrance Step_free_access Wide_clearance_to_bed Accessible_height_bed Step_free_access Accessible_height_toilet Step_free_access Host_greets_you Handheld_shower_head,,,1,20.0,2,strict_14_with_grace_period,False,False
34266,25004717,True,True,True,True,True,True,True,False,False,...,Real_Bed,TV Wifi Kitchen Heating Family_kid_friendly Washer Dryer Essentials Shampoo Iron Hot_water Bed_linens Microwave Refrigerator Dishwasher Dishes_and_silverware Oven Stove Garden_or_backyard,495.0,450.0,12,0.0,1,strict_14_with_grace_period,False,False
8979,17343595,True,True,True,True,True,True,True,True,True,...,Real_Bed,Wifi Kitchen Free_parking_on_premises Breakfast Heating Family_kid_friendly Suitable_for_events Washer Dryer Smoke_detector First_aid_kit Safety_card Essentials Shampoo Hangers Hair_dryer Iron Private_entrance,0.0,10.0,1,20.0,2,moderate,False,False
27830,23017728,True,True,True,True,True,False,True,True,True,...,Real_Bed,TV Wifi Kitchen Heating Washer Smoke_detector Essentials Shampoo Hangers Iron Laptop_friendly_workspace Self_check_in Lockbox Bathtub Hot_water Bed_linens Microwave Refrigerator Dishes_and_silverware Cooking_basics Oven Stove,85.0,60.0,1,0.0,3,strict_14_with_grace_period,False,False
9839,17649915,True,True,False,True,False,False,False,False,False,...,Real_Bed,Wifi Kitchen Free_parking_on_premises Heating Washer Dryer Smoke_detector Carbon_monoxide_detector Essentials Lock_on_bedroom_door Hangers Iron translation_missing__en_hosting_amenity_49 translation_missing__en_hosting_amenity_50 Private_living_room,,,1,0.0,1,flexible,False,False


### Data from reviews

In [1432]:
# Load the reviews table from data/reviews.csv.
reviews = pd.read_csv('data/reviews.csv')

In [1375]:
# Preview the first rows of reviews.
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,9554,1184025,2012-04-26,1809049,Hana,"I stayed in London for a month to study, explore the city, and meet people along the way. \r\n\r\nI think the area is a great place to set your base. The neighborhood is lively and friendly, with an eclectic mix of ethic restaurants, colorful grocery stores and a happening pub that stays open late. There’s a big, green park nearby which is great for running or having a coffee. Central London is twenty minutes away by Tube.\r\n\r\nThe apartment is charming, ceiling to floor. It actually does look even better than in the photos! The guest-room is bright and well heated, with a beautiful work desk and a spacious closet. The kitchen is spacious and smells of fresh espresso in the morning. \r\n\r\nGuy is a friendly, reliable, easy- going host who knows (and loves!) London. Make sure to ask about the must-go London pubs and places to eat! \r\n\r\nAll in all, I’d wholeheartedly recommend the apartment and the neighborhood, not to mention the curry house down the street! Enjoy !!! \r\n"
1,9554,1206322,2012-04-30,2237488,Rishi,"My bnb request was very last minute and i was surprised with a prompt and kind reply from Guy. He scheduled around his day to accommodate the time that i was arriving , and was greeted with an ace cup of coffee. \r\n\r\nHad no complaints at all during my stay and was also give n a spare key to come in and out as i wish ! Thank you once again"
2,9554,1258541,2012-05-10,2150467,Panee,"First time as airbnb!First time to London! I couldn’t have asked for better location and host.He promptly sent me welcome message right after I made reservation.Also detailed and thoughtul answers to all my questions before my trip.\r\nHost:Guy,he is very nice person,friendly and wiling to help.He is a bit busy but If I need information or help always help me.\r\nLocation:The place is a great location and easy to find (Website hidden by Airbnb) close to train ans buses which reached to the center of London easier .Also It’s close to everything you could possibly need like grocery stores and Turkish dises(from his advising).\r\nThe flat:great location,clean,quiet and comfortable living room and wifi.He freely share his kitchen faclities and spare bathroom.\r\nwe stayed with him 9 days .When we arrived to the house he offered us for coffee and he bought us supper meal day before leaving and eating together That was so delicious and very nice .I would recommended staying with Guy for anyone\r\nThank you Airbnb team and Guy .\r\n"
3,9554,1405284,2012-06-03,1864672,Simone Cristina,"A wonderful experience! The house is very well located, near the subway station, the room and the house Guy is very cozy. Guy was really helpful and let me not miss anything. I would say that was one of the best trips to London. Guy loved to meet you! I hope to return soon to London!\r\n\r\nBeijos do Brasil!"
4,9554,1475969,2012-06-13,2438453,Sondra,"We are so grateful that we trusted our intuition and went with Guy for our stay in London. He was a truly fantastic host, who went out of his way to be accomodating to us. He offered to pick us up from the train station in the middle of the night when we arrived, and ended up having to stay and wait quite a long time for us as the train wasn't running as frequently at that time. When we was home, he was quiet and discrete. He's a very kind and considerate person who genuinely loves meeting new people and wants people to enjoy themselves in London. He even treated us to some amazing curry on our last night's stay! Thank you Guy for really making us feel at home!"


In [1376]:
# Number of non-null review comments per listing, with the listing key
# renamed from 'listing_id' to 'id'.
listing_comments = (
    reviews.groupby("listing_id")["comments"]
    .count()
    .reset_index()
    .rename(columns={"listing_id": "id"})
)
listing_comments.head()

Unnamed: 0,id,comments
0,9554,132
1,11076,3
2,13913,14
3,17402,39
4,24328,92


In [1377]:
# Left-join the per-listing comment counts onto the cleaned listings by 'id'.
# Being a left join, listings whose id is absent from listing_comments are
# kept and get NaN in 'comments'.
clean_data_with_comments_count = pd.merge(
    left=data_clean_all,
    right=listing_comments,
    on='id',
    how='left',
)

In [1378]:
# Remove the listing identifier from the merged frame. `columns=` already
# names the axis, so the redundant `axis=1` is omitted.
clean_data_with_comments_count = clean_data_with_comments_count.drop(columns=['id'])

# Preview last: a notebook only renders the final expression of a cell, so a
# head() call placed before the assignment is computed and thrown away.
clean_data_with_comments_count.head()

In [1379]:
# 70/30 train/test split of the listings enriched with the comment count.
# Features and target are both taken from clean_data_with_comments_count so
# they share one index. Taking the target from data_clean_all (the pre-merge
# frame) only lines up by position and can carry different index labels, which
# would misalign later index-based assignments such as adding y_test_all as a
# column of a copy of X_test_all.
X_train_all, X_test_all, y_train_all, y_test_all = split(
    clean_data_with_comments_count.drop(columns=['price']),
    clean_data_with_comments_count['price'],
    test_size=0.3,
    random_state=123,
)

In [1380]:
# Regressors evaluated in this run, keyed by the label used in the results.
# The polynomial variants ('LG_poly', 'RF_poly') are not part of this run.
pipes_dict_all = dict(LG=lin_reg_pipe, RF=RF_pipe)

# Empty summary table: one column per reported metric.
df_all_results_all = pd.DataFrame(
    columns=[
        'Model',
        'CV_train_mean_MAPE_score',
        'Test_MAPE_score',
        'Min_diff',
        'Max_diff',
        'Mean_diff',
        'Median_diff',
        'STD_diff',
        '10th percentile',
        '90th percentile',
    ]
)

# Copy of the held-out features with the true price attached as 'Price_true'.
df_data_and_results_all = X_test_all.copy()
df_data_and_results_all['Price_true'] = y_test_all

In [1381]:
# Evaluate every pipeline in pipes_dict_all; get_results returns the updated
# per-row frame and the updated summary frame, which replace the inputs.
(df_data_and_results_all, df_all_results_all) = get_results(
    pipes_dict_all,
    X_train_all, y_train_all,
    X_test_all, y_test_all,
    num_cv, n_verbose, scoring_param,
    df_data_and_results_all, df_all_results_all,
)

Model: LG
Begin CV fit


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.4s remaining:    3.7s


CV ended. Elapsed time: 0 hours, 0 minutes and 3 seconds
Begin LG model fit


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.3s finished


Model fit ended. Elapsed time: 0 hours, 0 minutes and 2 seconds

Model: RF
Begin CV fit


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   58.7s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   59.4s finished


CV ended. Elapsed time: 0 hours, 0 minutes and 59 seconds
Begin RF model fit
Model fit ended. Elapsed time: 0 hours, 0 minutes and 17 seconds



In [1382]:
# Display the per-model summary table returned by get_results.
df_all_results_all

Unnamed: 0,Model,CV_train_mean_MAPE_score,Test_MAPE_score,Min_diff,Max_diff,Mean_diff,Median_diff,STD_diff,10th percentile,90th percentile
0,LG,21.7858,21.8934,-128.388593,184.265107,0.470443,-3.251449,29.667233,-31.238123,38.835722
1,RF,20.3967,20.4666,-103.176156,163.270961,0.165031,-3.619558,28.063369,-29.151263,35.286625


In [1265]:
# Display the training feature frame.
X_train_all

Unnamed: 0,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,...,bed_type,amenities,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
23539,True,True,True,True,True,False,False,True,False,True,...,Real_Bed,TV Wifi Kitchen Elevator Heating Washer Smoke_detector Carbon_monoxide_detector First_aid_kit Fire_extinguisher Essentials Shampoo Lock_on_bedroom_door Hangers Hair_dryer Iron Laptop_friendly_workspace Private_living_room,150.0,15.0,1,15.0,2,strict_14_with_grace_period,False,False
15770,True,True,False,True,False,False,False,False,False,True,...,Real_Bed,TV Wifi Kitchen Breakfast Heating Family_kid_friendly Washer Dryer Smoke_detector Carbon_monoxide_detector First_aid_kit Safety_card Essentials Shampoo Hangers Iron Laptop_friendly_workspace Hot_water Other,0.0,40.0,2,30.0,1,flexible,False,False
50347,True,True,True,True,True,True,True,True,False,True,...,Real_Bed,TV Wifi Kitchen Breakfast Heating Washer Dryer Smoke_detector Carbon_monoxide_detector First_aid_kit Safety_card Fire_extinguisher Essentials Shampoo Hangers Hair_dryer Iron Laptop_friendly_workspace Self_check_in Lockbox Hot_water Bed_linens,200.0,50.0,2,35.0,1,flexible,False,False
12842,True,True,True,True,True,True,True,True,True,False,...,Real_Bed,TV Cable_TV Wifi Kitchen Paid_parking_off_premises Heating Family_kid_friendly Washer Dryer Smoke_detector Carbon_monoxide_detector First_aid_kit Fire_extinguisher Essentials Shampoo Hangers Hair_dryer Iron Laptop_friendly_workspace Self_check_in Lockbox Bathtub Baby_bath High_chair Children’s_books_and_toys Crib Hot_water Bed_linens Extra_pillows_and_blankets Refrigerator Dishes_and_silverware Cooking_basics Oven Stove Luggage_dropoff_allowed Long_term_stays_allowed Wide_hallway_clearance Wide_doorway Well_lit_path_to_entrance Step_free_access Step_free_access Accessible_height_toilet Step_free_access Handheld_shower_head,0.0,67.0,4,10.0,1,strict_14_with_grace_period,False,False
5547,True,True,True,True,False,False,False,False,False,False,...,Real_Bed,TV Wifi Kitchen Free_parking_on_premises Breakfast Heating Washer Dryer Smoke_detector Carbon_monoxide_detector First_aid_kit Essentials Shampoo Hangers Hair_dryer Iron,500.0,70.0,1,0.0,4,strict_14_with_grace_period,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8852,True,True,True,True,False,False,True,True,True,True,...,Real_Bed,Wifi Pets_live_on_this_property Cat_s_ Heating Family_kid_friendly Smoke_detector Carbon_monoxide_detector First_aid_kit Fire_extinguisher Essentials Shampoo Hangers Hair_dryer Iron Laptop_friendly_workspace,81.0,15.0,2,20.0,1,moderate,False,False
17582,True,True,True,True,True,True,True,True,True,False,...,Real_Bed,TV Internet Wifi Heating Family_kid_friendly Washer Dryer Essentials Shampoo Lock_on_bedroom_door Hangers Hair_dryer Iron Laptop_friendly_workspace Self_check_in Smart_lock Hot_water Bed_linens Microwave Coffee_maker Refrigerator Patio_or_balcony Garden_or_backyard,,60.0,1,0.0,1,strict_14_with_grace_period,False,False
20273,True,True,True,True,True,True,True,True,True,False,...,Real_Bed,TV Wifi Kitchen Heating Washer Dryer Smoke_detector Carbon_monoxide_detector First_aid_kit Essentials Shampoo Lock_on_bedroom_door Hangers Hair_dryer Iron Laptop_friendly_workspace Self_check_in Keypad Hot_water Bed_linens Extra_pillows_and_blankets Microwave Coffee_maker Refrigerator Dishes_and_silverware Cooking_basics Oven Stove,100.0,15.0,1,5.0,1,strict_14_with_grace_period,False,False
32377,True,True,False,True,False,False,False,False,True,True,...,Real_Bed,TV Wifi Kitchen Heating Family_kid_friendly Washer Smoke_detector Essentials Shampoo Lock_on_bedroom_door Hangers Hair_dryer Iron Laptop_friendly_workspace Hot_water Long_term_stays_allowed Host_greets_you,150.0,35.0,2,10.0,1,strict_14_with_grace_period,False,False


In [1387]:
# Attach the per-listing comment counts to the preprocessed test listings
# (left join on 'id', so listings without an entry get NaN in 'comments') and
# then remove the identifier column, as was done for the training frame.
# The drop result is assigned back: DataFrame.drop returns a new frame, so an
# unassigned call only changes what the cell displays and leaves 'id' in
# test_test_clean_all.
test_test_clean_all = pd.merge(
    test_test_clean_all,
    listing_comments,
    how='left',
    on=['id'],
).drop(columns=['id'])

# Render the resulting frame as the cell output.
test_test_clean_all

Unnamed: 0,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,...,amenities,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,comments
0,True,True,True,True,True,False,True,True,True,True,...,TV Internet Wifi Kitchen Smoking_allowed Breakfast Buzzer_wireless_intercom Heating Family_kid_friendly Washer Dryer Smoke_detector Carbon_monoxide_detector Essentials Shampoo 24_hour_check_in Hair_dryer Laptop_friendly_workspace,,7.0,1,15.0,1,strict_14_with_grace_period,True,False,132.0
1,True,True,True,True,True,False,True,True,True,True,...,TV Cable_TV Internet Wifi Breakfast Pets_live_on_this_property Dog_s_ Heating Family_kid_friendly Fire_extinguisher Shampoo,,,2,35.0,2,strict_14_with_grace_period,False,False,3.0
2,True,True,True,True,True,True,True,True,True,True,...,TV Cable_TV Wifi Kitchen Paid_parking_off_premises Smoking_allowed Free_street_parking Buzzer_wireless_intercom Heating Family_kid_friendly Washer Dryer Smoke_detector Carbon_monoxide_detector Fire_extinguisher Essentials Shampoo Lock_on_bedroom_door Hangers Hair_dryer Iron Laptop_friendly_workspace Outlet_covers Bathtub Children’s_books_and_toys Babysitter_recommendations Crib Pack_’n_Play_travel_crib Room_darkening_shades Children’s_dinnerware Hot_water Bed_linens Extra_pillows_and_blankets Ethernet_connection Coffee_maker Refrigerator Dishes_and_silverware Cooking_basics Oven Stove Patio_or_balcony Luggage_dropoff_allowed Long_term_stays_allowed Step_free_access Wide_doorway Wide_clearance_to_bed Accessible_height_bed Step_free_access Wide_doorway Bathtub_with_bath_chair Accessible_height_toilet Host_greets_you Handheld_shower_head Roll_in_shower,100.0,15.0,1,15.0,1,moderate,False,False,14.0
3,True,True,True,True,True,True,True,True,True,True,...,TV Wifi Kitchen Paid_parking_off_premises Elevator Buzzer_wireless_intercom Heating Family_kid_friendly Washer Dryer Smoke_detector Essentials Shampoo Hangers Hair_dryer Iron Laptop_friendly_workspace Hot_water Bed_linens Microwave Coffee_maker Refrigerator Dishwasher Dishes_and_silverware Cooking_basics Oven Stove Long_term_stays_allowed Other,350.0,65.0,4,10.0,3,strict_14_with_grace_period,False,False,39.0
4,True,True,True,True,True,True,True,True,True,True,...,TV Cable_TV Internet Wifi Kitchen Free_parking_on_premises Pets_allowed Pets_live_on_this_property Indoor_fireplace Heating Family_kid_friendly Washer Dryer Smoke_detector Carbon_monoxide_detector Essentials Shampoo 24_hour_check_in Hangers Hair_dryer Iron Laptop_friendly_workspace Self_check_in Lockbox Hot_water Long_term_stays_allowed,250.0,70.0,2,15.0,90,strict_14_with_grace_period,True,True,92.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22995,True,True,False,True,False,False,False,False,False,True,...,TV Wifi Air_conditioning Kitchen Breakfast Heating Washer Smoke_detector Essentials Shampoo Hangers Hair_dryer Iron Laptop_friendly_workspace translation_missing__en_hosting_amenity_49 translation_missing__en_hosting_amenity_50,500.0,40.0,1,0.0,3,flexible,False,False,
22996,True,True,True,True,True,True,False,True,True,True,...,TV Cable_TV Internet Wifi Kitchen Buzzer_wireless_intercom Heating Family_kid_friendly Washer Dryer Smoke_detector Carbon_monoxide_detector Essentials Shampoo 24_hour_check_in Hangers Hair_dryer Iron Laptop_friendly_workspace Hot_water Bed_linens Refrigerator Dishes_and_silverware Oven Stove,99.0,0.0,2,0.0,1,strict_14_with_grace_period,False,False,52.0
22997,True,True,False,True,False,False,False,False,False,False,...,TV Wifi Kitchen Free_parking_on_premises Indoor_fireplace Heating Family_kid_friendly Washer Dryer Smoke_detector Carbon_monoxide_detector Essentials Shampoo Hangers Hair_dryer Iron,75.0,35.0,1,0.0,4,flexible,False,False,
22998,True,True,False,True,False,False,False,False,False,True,...,TV Cable_TV Internet Wifi Kitchen Free_parking_on_premises Heating Family_kid_friendly Washer Smoke_detector Carbon_monoxide_detector First_aid_kit Essentials Shampoo 24_hour_check_in Hangers Hair_dryer Iron Laptop_friendly_workspace,250.0,40.0,1,0.0,2,strict_14_with_grace_period,False,False,26.0


In [35]:
### Saving model

import joblib

In [36]:
# Serialize the linear-regression pipeline to pipeline.pkl.
# NOTE(review): the recorded run raised PicklingError because the pipeline
# holds a reference to get_col_to_fillna_most_frequent that is no longer the
# object bound to that name in __main__ - presumably the function's cell was
# re-executed after the pipeline was built; rebuilding the pipeline (or moving
# the helper functions into an importable module) before dumping should be
# verified as the remedy.
joblib.dump(lin_reg_pipe, 'pipeline.pkl')

PicklingError: Can't pickle <function get_col_to_fillna_most_frequent at 0x7fe652e0e820>: it's not the same object as __main__.get_col_to_fillna_most_frequent