# Airbnb price prediction

### Outline <a name = 'outline'></a>
* [Helpful functions](#helpful_functions)
* [Data exploratory](#data_exploratory)
* [Data preprocessing](#data_preprocessing)
* [Regression models](#regression_models)

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import PolynomialFeatures,MaxAbsScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder, KBinsDiscretizer
from sklearn.decomposition import PCA

from sklearn.model_selection import cross_val_score, train_test_split as split

from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomTreesEmbedding

from sklearn.metrics import roc_curve, mean_absolute_error, make_scorer

from data_preparation.preprocessing import *
from model_training.transformation import Transformer_fillna_most_frequent,\
KNN_neighbors, Transformer_fillna_mean, Transformer_OneHotEncoder,\
Transformer_lat_long,\
Transformer_get_columns

import warnings
warnings.filterwarnings('ignore')

### Helpful functions <a name = "helpful_functions"></a>

In [2]:
# Data utility functions

# scoring functions
def MAE(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    Thin wrapper around ``sklearn.metrics.mean_absolute_error`` so the metric
    can be passed to ``make_scorer`` under a short name.
    """
    return mean_absolute_error(y_true, y_pred)

def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (0.1 == 10 %).

    Both inputs are converted to float arrays and compared element by element
    in positional order. This keeps the result independent of any pandas index
    the inputs may carry (two Series with different indexes would otherwise be
    aligned by label and yield NaN) and lets plain lists be passed in.

    Parameters
    ----------
    y_true : array-like
        Observed target values. A zero entry makes the corresponding ratio
        infinite or NaN; no epsilon is applied.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of ``|1 - y_pred / y_true|``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(1 - predicted / actual))


#functions for model fit, predict and results
def time_convert(t):
    """Split a duration ``t`` given in seconds into (hours, minutes, seconds)."""
    hours = t // 3600
    leftover = t % 3600
    minutes = leftover // 60
    seconds = leftover % 60
    return hours, minutes, seconds

def get_mean_cv_score(name, model, X_train, y_train, num_cv, n_verbose, scoring_param):
    """Cross-validate ``model`` on the training data and return its mean score.

    Progress and the elapsed wall-clock time are printed. The mean of the fold
    scores is rounded to four decimals and then negated before being returned,
    so a scorer that reports negative losses yields a positive number here.
    """
    print('Model: ' + name)
    print('Begin CV fit')
    started = time.time()
    fold_scores = cross_val_score(
        model, X_train, y_train,
        cv=num_cv,
        verbose=n_verbose,
        scoring=scoring_param,
        n_jobs=-1,
    )
    finished = time.time()
    hours, minutes, seconds = time_convert(finished - started)
    report = 'CV ended. Elapsed time: {0:.0f} hours, {1:.0f} minutes and {2:.0f} seconds'
    print(report.format(hours, minutes, seconds))
    rounded_mean = fold_scores.mean().round(4)
    return -rounded_mean


def get_results(pipes_dict, X_train, y_train, X_test, y_test, num_cv,
                n_verbose, scoring_param, df_data_and_results, df_all_results):
    """Cross-validate, fit and evaluate every pipeline in ``pipes_dict``.

    For each ``name -> model`` entry the model is cross-validated on the
    training data, refitted on the full training set and used to predict
    ``X_test``. Predictions and residuals are written into
    ``df_data_and_results`` (modified in place) as ``Price_pred_<name>`` and
    ``Price_diff_<name>``; one summary row per model is added to
    ``df_all_results``.

    Parameters
    ----------
    pipes_dict : dict
        Mapping of model name to an estimator exposing ``fit`` and ``predict``.
    X_train, y_train, X_test, y_test
        Train/test features and targets.
    num_cv, n_verbose, scoring_param
        Forwarded unchanged to ``get_mean_cv_score``.
    df_data_and_results : pandas.DataFrame
        Per-row results table; receives the prediction and residual columns.
    df_all_results : pandas.DataFrame
        Per-model summary table; it is not modified, a new frame is returned.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_data_and_results, df_all_results)`` with the new results.

    Notes
    -----
    * The value stored under ``'Test_MAPE_score'`` is computed with ``MAE``
      and the ``'CV_train_mean_MAPE_score'`` value uses whatever
      ``scoring_param`` measures; the column labels are kept as they are
      because existing result tables are built with them.
    * Summary rows are collected in a list and turned into a frame once,
      instead of calling ``DataFrame.append`` per model (that method no longer
      exists in pandas 2.0 and later).
    """
    summary_rows = []
    for name, model in pipes_dict.items():
        mean_cv_score = get_mean_cv_score(name, model, X_train, y_train,
                                          num_cv, n_verbose, scoring_param)
        print('Begin ' + name + ' model fit')
        t1 = time.time()
        model.fit(X_train, y_train)
        t2 = time.time()
        h, m, s = time_convert(t2 - t1)
        print('Model fit ended. Elapsed time: {0:.0f} hours, {1:.0f} minutes and {2:.0f} seconds'.format(h, m, s))
        y_pred = model.predict(X_test)

        pred_col = 'Price_pred_' + name
        diff_col = 'Price_diff_' + name
        df_data_and_results[pred_col] = y_pred
        df_data_and_results[diff_col] = y_test - y_pred

        # Read the residuals back from the frame so the statistics describe
        # exactly what was stored, and compute both percentiles in one pass.
        diff = df_data_and_results[diff_col]
        percentiles = diff.quantile(q=[0.1, 0.9], interpolation='linear')
        summary_rows.append({
            'Model': name,
            'CV_train_mean_MAPE_score': mean_cv_score,
            'Test_MAPE_score': round(MAE(y_test, y_pred), 4),
            'Min_diff': diff.min(),
            'Max_diff': diff.max(),
            'Mean_diff': diff.mean(),
            'Median_diff': diff.median(),
            'STD_diff': diff.std(),
            '10th percentile': percentiles[0.1],
            '90th percentile': percentiles[0.9],
        })
        print('======================================================================================\n')

    if not summary_rows:
        # Nothing was evaluated; hand the inputs back untouched.
        return df_data_and_results, df_all_results

    new_rows = pd.DataFrame(summary_rows)
    if len(df_all_results) == 0:
        # An empty table only contributes its column order; concatenating it
        # would needlessly drag its placeholder dtypes into the result.
        ordered = list(df_all_results.columns)
        ordered += [col for col in new_rows.columns if col not in ordered]
        df_all_results = new_rows.reindex(columns=ordered)
    else:
        df_all_results = pd.concat([df_all_results, new_rows],
                                   ignore_index=True, sort=False)
    return df_data_and_results, df_all_results

#Graph generation functions:
def get_diff_hist(pipes_dict, df_data_and_results, df_all_results):
    """Draw one residual histogram per model in ``pipes_dict``.

    Each figure shows the ``Price_diff_<name>`` column of
    ``df_data_and_results``. Every figure uses the same x-range, spanning the
    smallest ``Min_diff`` to the largest ``Max_diff`` in ``df_all_results``,
    so the plots can be compared side by side.
    """
    for model_name, _ in pipes_dict.items():
        figure, axis = plt.subplots(figsize=(14, 5))
        sns.set_context(rc={"lines.linewidth": 3.5})
        residuals = df_data_and_results['Price_diff_' + model_name]
        sns.distplot(residuals, ax=axis, color='red')
        plt.title(model_name + ' Price_diff histogram')
        lower_bound = df_all_results['Min_diff'].min()
        upper_bound = df_all_results['Max_diff'].max()
        axis.set_xlim(lower_bound, upper_bound)

def get_prediction_cluster_graph(pipes_dict, y_test, df_data_and_results):
    """Scatter true targets against (log) predictions, one figure per model.

    For every model name in ``pipes_dict`` a new figure is created showing
    ``y_test`` on the x-axis against the natural log of the model's
    ``Price_pred_<name>`` column, together with a reference diagonal from
    (2, 2) to (9, 9).

    All drawing happens on one explicit Axes per figure. Calling
    ``plt.axes()`` to fetch the current Axes is avoided because it adds a new
    Axes on current matplotlib, leaving the aspect ratio, legend and title on
    an empty overlay instead of the plot.

    NOTE(review): only the predictions are log-transformed. The diagonal is an
    "ideal" line only if ``y_test`` is already on a log scale — confirm against
    the caller.
    """
    for name in pipes_dict:
        fig, ax = plt.subplots()
        ax.plot(y_test, np.log(df_data_and_results['Price_pred_' + name]),
                '.', label='Result Data')
        ax.plot([2, 9], [2, 9], label='Ideal')
        ax.set_aspect('equal')
        ax.legend()
        ax.set_title(name + ' (y_true,y_pred) vs. ideal')

### Data exploration <a name = "data_exploratory"></a>

In [3]:
# Load the training CSV (path is relative to the notebook's working directory).
data = pd.read_csv('data/train.csv')

In [4]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,...,square_feet,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,price
0,127860,Double bedroom in cottage Twickenham (sleeps 1-2),"One gorgeous, light-filled double bedroom (sle...","A beautiful, light-filled double bedroom is ou...","One gorgeous, light-filled double bedroom (sle...",none,"Very, very safe area, great transport links an...",We have a super-gorgeous cat,Twickenham Rugby Stadium is 15 mins walk Water...,"You'll have full use of the cottage, two recep...",...,,300.0,10.0,1,10.0,2,strict_14_with_grace_period,f,f,1000.0
1,325809,Big House for Olympics sleeps 6 to8,,"A beautiful, modern, art-filled and clean 4 st...","A beautiful, modern, art-filled and clean 4 st...",none,,,,,...,1400.0,771.0,,1,0.0,14,strict_14_with_grace_period,f,f,771.0
2,429045,The Old Coach House (Olympics),,"Fabulous, recently refurbished original Victor...","Fabulous, recently refurbished original Victor...",none,,,,,...,1800.0,600.0,,1,0.0,14,strict_14_with_grace_period,f,f,1500.0
3,473637,Brand New contemporary mews house,,Stunning contemporary mews available for let o...,Stunning contemporary mews available for let o...,none,,,,,...,2100.0,300.0,100.0,7,100.0,14,strict_14_with_grace_period,f,f,2000.0
4,533943,LUXURY APT. NEAR BUCKINGHAM PALACE,“The area around Westminster Cathedral has lon...,"This stunning, spacious, 4 bedroom, top floor ...",“The area around Westminster Cathedral has lon...,family,The proximity of all of London's main attracti...,,"Only 5 minutes walk to Victoria Station, the m...",,...,2220.0,2505.0,180.0,1,0.0,14,strict_14_with_grace_period,f,t,901.0


In [5]:
# Column names, non-null counts and dtypes of the raw data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51815 entries, 0 to 51814
Data columns (total 43 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                51815 non-null  int64  
 1   name                              51809 non-null  object 
 2   summary                           50018 non-null  object 
 3   space                             33350 non-null  object 
 4   description                       50798 non-null  object 
 5   experiences_offered               51815 non-null  object 
 6   neighborhood_overview             30176 non-null  object 
 7   notes                             19436 non-null  object 
 8   transit                           29762 non-null  object 
 9   access                            28386 non-null  object 
 10  interaction                       27110 non-null  object 
 11  house_rules                       28071 non-null  object 
 12  host

In [6]:
# Column width is set to 0 so the long lists of unique values below are shown in full.
pd.set_option('display.max_colwidth', 0)
# List the distinct values of every categorical column, one row per column.
# NOTE(review): categoric_cols is not defined in this notebook; presumably it
# comes from the star import of data_preparation.preprocessing — confirm.
data[categoric_cols].agg(['unique']).transpose()

Unnamed: 0,unique
host_response_time,"[nan, within an hour, within a day, within a few hours, a few days or more]"
property_type,"[House, Apartment, Loft, Townhouse, Boat, Boutique hotel, Serviced apartment, Condominium, Bed and breakfast, Hostel, Other, Cabin, Bungalow, Yurt, Tiny house, Guest suite, Guesthouse, Aparthotel, Cottage, Lighthouse, Barn, Villa, Houseboat, Hotel, Ryokan (Japan), Casa particular (Cuba), Resort, Chalet, Island, Nature lodge, Hut, Camper/RV, Earth house, Treehouse, Farm stay, Bus, Shepherd's hut (U.K., France), Tipi, Tent, Dome house]"
bed_type,"[Real Bed, Pull-out Sofa, Futon, Couch, Airbed]"
cancellation_policy,"[strict_14_with_grace_period, flexible, moderate, super_strict_30, super_strict_60, strict]"
host_is_superhost,"[f, t, nan]"
host_identity_verified,"[t, f, nan]"
is_location_exact,"[t, f]"
require_guest_profile_picture,"[f, t]"
require_guest_phone_verification,"[f, t]"
room_type,"[Private room, Entire home/apt, Shared room]"


In [7]:
# Number of rows per property type, most frequent first.
data['property_type'].value_counts()

Apartment                        35260
House                            9878 
Townhouse                        1880 
Serviced apartment               1855 
Condominium                      683  
Loft                             446  
Bed and breakfast                413  
Guest suite                      230  
Guesthouse                       205  
Other                            168  
Hostel                           156  
Boutique hotel                   119  
Hotel                            117  
Bungalow                         90   
Cottage                          60   
Villa                            46   
Aparthotel                       41   
Boat                             41   
Cabin                            19   
Casa particular (Cuba)           17   
Tiny house                       17   
Camper/RV                        16   
Houseboat                        15   
Earth house                      8    
Chalet                           8    
Barn                     

### Data preprocessing <a name = "data_preprocessing"></a>


In [8]:
# Keep only rows whose price lies strictly between 20 and 500, then run the
# project's first preprocessing stage on an independent copy of that subset.
price_in_range = data['price'].gt(20) & data['price'].lt(500)
data_clean_all = preprocessing_1(data.loc[price_in_range].copy())
data_clean_all.head()

Unnamed: 0,name,summary,space,description,neighborhood_overview,notes,transit,access,house_rules,host_about,...,is_location_exact,bed_type,security_deposit,cleaning_fee,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,price
66,True,True,False,True,False,False,False,False,False,False,...,True,Real_Bed,,,0.0,1,flexible,False,False,210.0
67,True,True,True,True,True,True,True,True,False,False,...,True,Pull-out_Sofa,3884.0,5.0,8.0,1,strict_14_with_grace_period,False,False,35.0
68,True,True,False,True,False,False,False,False,False,False,...,True,Real_Bed,,10.0,0.0,3,flexible,False,False,50.0
69,True,True,True,True,True,True,True,True,False,True,...,True,Real_Bed,,12.0,10.0,1,strict_14_with_grace_period,False,False,45.0
70,True,True,True,True,True,True,False,True,True,False,...,False,Real_Bed,,,0.0,4,moderate,False,False,46.0


In [9]:
# Column names, non-null counts and dtypes after filtering and preprocessing.
data_clean_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49788 entries, 66 to 51814
Data columns (total 27 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   name                              49788 non-null  bool   
 1   summary                           49788 non-null  bool   
 2   space                             49788 non-null  bool   
 3   description                       49788 non-null  bool   
 4   neighborhood_overview             49788 non-null  bool   
 5   notes                             49788 non-null  bool   
 6   transit                           49788 non-null  bool   
 7   access                            49788 non-null  bool   
 8   house_rules                       49788 non-null  bool   
 9   host_about                        49788 non-null  bool   
 10  host_response_time                29891 non-null  object 
 11  host_response_rate                29888 non-null  float64
 12  hos

In [10]:
# Check the dtype of host_response_time before filling its missing values.
data_clean_all['host_response_time'].dtype

dtype('O')

In [11]:
# Frequency of each host_response_time category (missing values are not counted).
data_clean_all['host_response_time'].value_counts()

within an hour        20269
within a few hours    5143 
within a day          3576 
a few days or more    903  
Name: host_response_time, dtype: int64

In [12]:
# Count missing values in the 'name' column of the preprocessed data.
data_clean_all['name'].isna().sum()

0

In [13]:
# Treat a missing host_response_time as its own 'None' category.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form acts on
# an intermediate object and is not guaranteed to update the frame (it has no
# effect when pandas copy-on-write is enabled).
data_clean_all['host_response_time'] = data_clean_all['host_response_time'].fillna('None')
# for c in long_text_cols:
#     data_clean_all[c] = data_clean_all[c].fillna('None')

### Regression models <a name = 'regression_models'></a>

We decided to use two regression models to predict price: Linear Regression (LR) and a Random Forest regressor (RF).
In addition, a KNN regressor was used to create a price-grouping feature based on latitude/longitude information.

LR and RF were each run with two types of transformers: with and without polynomial features.

In [14]:
#scoring functions
# Both metrics are errors (lower is better), hence greater_is_better=False.
# get_mean_cv_score negates the cross-validation mean again so the reported
# value is a positive error.
MAE_scorer = make_scorer(MAE, greater_is_better=False)
MAPE_scorer = make_scorer(MAPE, greater_is_better=False)

In [15]:
# Random-forest hyper-parameters and cross-validation settings.
RF_n_estimators=50
RF_min_samples_split=50
# Seed for the forest's bootstrap/feature sampling so that re-running the
# notebook reproduces the same scores (same value as the train/test split seed).
RF_random_state=123
num_cv=5
n_verbose = 3
# Metric used for cross-validation inside get_results.
scoring_param=MAE_scorer

PCA_features=PCA()
lin_regressor= LinearRegression(n_jobs = -1)
RF_regressor = RandomForestRegressor(n_estimators=RF_n_estimators, 
                                     min_samples_split=RF_min_samples_split,
                                     n_jobs = -1,
                                     random_state=RF_random_state
                                    )
# KNN_neighbors is imported from model_training.transformation.
KNN_Reg = KNNR(n_neighbors=KNN_neighbors)

In [16]:
# Feature-engineering stage: the outputs of the project transformers below are
# computed in parallel and concatenated column-wise.
# Currently disabled branches: Transformer_amenities, Transformer_sum_amenities,
# Transformer_text, Transformer_sum_text.
FeatureUnionTransformer = FeatureUnion(
    transformer_list=[
        ('FTfillna_frequent', Transformer_fillna_most_frequent),
        ('FTfillna_mean', Transformer_fillna_mean),
        ('FTget_OneHotEncoder', Transformer_OneHotEncoder),
        ('FT_lat_long', Transformer_lat_long),
        ('FT_get_columns', Transformer_get_columns),
    ],
    n_jobs=-1,
)

In [17]:
#Transformer with polynomial features
# Variant with pairwise interaction terms (degree-2, interaction-only
# polynomial features) added before scaling.
# Note: the final step is labelled 'Min_Max_Transformer' but applies MaxAbsScaler.
Full_Transformer_poly = Pipeline(
    steps=[
        ('Feature_Engineering', FeatureUnionTransformer),
        ('Polynomial_Transformer',
         PolynomialFeatures(degree=2, interaction_only=True)),
        ('Min_Max_Transformer', MaxAbsScaler()),
    ]
)

# Variant without polynomial features: feature engineering followed by scaling.
# Both variants hold the same FeatureUnionTransformer object.
Full_Transformer = Pipeline(
    steps=[
        ('Feature_Engineering', FeatureUnionTransformer),
        ('Min_Max_Transformer', MaxAbsScaler()),
    ]
)

Regression model pipelines:

In [18]:
# End-to-end random-forest model: non-polynomial feature transformer followed
# by the regressor. The step name 'RFE_regressor' is used later to pull the
# fitted forest back out of the pipeline.
RF_pipe = Pipeline(
    steps=[
        ('Feature_transformer', Full_Transformer),
        ('RFE_regressor', RF_regressor),
    ]
)

Split the data to train and test:

In [19]:
# Hold out 30 % of the rows for testing; 'price' is the target, every other
# column is a feature. The fixed seed keeps the split reproducible.
X_train_all, X_test_all, y_train_all, y_test_all = split(
    data_clean_all.drop(columns=['price']),
    data_clean_all['price'],
    test_size=0.3,
    random_state=123,
)

Run all the regression models

In [20]:
#all regressors
# Pipelines to evaluate, keyed by the short name used in result column labels.
pipes_dict_all = {'RF': RF_pipe}

# Empty per-model summary table; get_results adds one row per pipeline.
df_all_results_all = pd.DataFrame(
    columns=[
        'Model',
        'CV_train_mean_MAPE_score',
        'Test_MAPE_score',
        'Min_diff',
        'Max_diff',
        'Mean_diff',
        'Median_diff',
        'STD_diff',
        '10th percentile',
        '90th percentile',
    ]
)

# Per-row results table: a copy of the test features plus the true price;
# prediction and residual columns are added by get_results.
df_data_and_results_all = X_test_all.assign(Price_true=y_test_all)

In [21]:
#Run models
# Cross-validate, fit and score every pipeline; both result tables are
# replaced by the versions returned from get_results.
df_data_and_results_all, df_all_results_all = get_results(
    pipes_dict_all,
    X_train_all, y_train_all,
    X_test_all, y_test_all,
    num_cv, n_verbose, scoring_param,
    df_data_and_results_all, df_all_results_all,
)

Model: RF
Begin CV fit


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   26.6s remaining:   39.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   26.7s finished


CV ended. Elapsed time: 0 hours, 0 minutes and 27 seconds
Begin RF model fit
Model fit ended. Elapsed time: 0 hours, 0 minutes and 8 seconds



In [22]:
# Per-model summary. With scoring_param set to MAE_scorer both score columns
# hold MAE values, despite the 'MAPE' in their labels.
df_all_results_all

Unnamed: 0,Model,CV_train_mean_MAPE_score,Test_MAPE_score,Min_diff,Max_diff,Mean_diff,Median_diff,STD_diff,10th percentile,90th percentile
0,RF,35.9745,36.2569,-231.260099,406.186696,0.845261,-8.804124,53.960446,-48.648534,61.090832


In [34]:
### Saving model

import dill
import os

In [35]:
# Serialise the fitted pipeline to pipeline.pkl in the current working directory.
with open(os.path.join(os.getcwd(), 'pipeline.pkl'), mode='wb') as pkl_file:
    dill.dump(RF_pipe, pkl_file)

In [23]:
# Pull the fitted forest out of the pipeline and rank the transformed features
# by impurity-based importance, highest first.
forest = RF_pipe['RFE_regressor']
importances = forest.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
# Spread of each feature's importance across the individual trees
# (only needed for the error bars of the disabled plot below).
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
print(indices)

for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

# NOTE(review): the disabled plot sizes its axis with X_train_all.shape[1], the
# number of raw input columns, while `importances` has one entry per transformed
# feature — use len(importances) if it is re-enabled.
# # Plot the impurity-based feature importances of the forest
# plt.figure()
# plt.title("Feature importances")
# plt.bar(range(X_train_all.shape[1]), importances[indices],
#         color="r", yerr=std[indices], align="center")
# plt.xticks(range(X_train_all.shape[1]), indices)
# plt.xlim([-1, X_train_all.shape[1]])
# plt.show()

Feature ranking:
[17 67 16 69 68  2 15  6 32 14 27 12 28 11 13 23  0  7 10 26 31 24  4 33
 39 61  9 29 55 37 63  1 45 53 40  8 66 54 60 57 58 52 46 65 42 51 35 44
 47 48 59 30 20 56  3 64 38 50 41 34 62  5 43 36 22 21 49 19 18 25]
1. feature 17 (0.511499)
2. feature 67 (0.229507)
3. feature 16 (0.055983)
4. feature 69 (0.043780)
5. feature 68 (0.036442)
6. feature 2 (0.022251)
7. feature 15 (0.011655)
8. feature 6 (0.008595)
9. feature 32 (0.005115)
10. feature 14 (0.004434)
11. feature 27 (0.004247)
12. feature 12 (0.004171)
13. feature 28 (0.003776)
14. feature 11 (0.003773)
15. feature 13 (0.003506)
16. feature 23 (0.003381)
17. feature 0 (0.003010)
18. feature 7 (0.002835)
19. feature 10 (0.002783)
20. feature 26 (0.002676)
21. feature 31 (0.002427)
22. feature 24 (0.002356)
23. feature 4 (0.002286)
24. feature 33 (0.001805)
25. feature 39 (0.001794)
26. feature 61 (0.001706)
27. feature 9 (0.001634)
28. feature 29 (0.001373)
29. feature 55 (0.001266)
30. feature 37 (0.001166)
31. 

In [33]:
# Frequency of each cancellation policy in the cleaned data.
data_clean_all['cancellation_policy'].value_counts()

strict_14_with_grace_period    20796
flexible                       16772
moderate                       11697
super_strict_30                455  
super_strict_60                67   
strict                         1    
Name: cancellation_policy, dtype: int64