# AirBnB Price Prediction

In [None]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# sklearn :: evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Use seaborn's 'whitegrid' style for the figures drawn in this notebook
sns.set_style('whitegrid')

# Load the CSV Data Files into DataFrames

In [None]:
# Read the training and test CSV files, then report both shapes
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)
print(df_train.shape, df_test.shape)

# Handling Missing Values & Converting Data Types

In [None]:
# List the column labels, then display the first rows of the training frame
print(df_train.columns)
df_train.head()

In [None]:
# Data type of every column in the training frame
df_train.dtypes

In [None]:
# Count the missing values in each column of the training frame
print(df_train.isnull().sum())

In [None]:
# Subset of columns to inspect; filter() keeps only the listed labels that exist
df_missing = df_train.filter(
    items=[
        'bathrooms',
        'first_review',
        'last_review',
        'host_has_profile_pic',
        'host_identity_verified',
        'host_response_rate',
        'host_since',
        'neighbourhood',
        'review_scores_rating',
        'zipcode',
    ]
)
df_missing

In [None]:
# Parse the date columns (read as strings) into datetime values
df_train = df_train.assign(
    **{
        date_col: pd.to_datetime(df_train[date_col])
        for date_col in ('first_review', 'last_review', 'host_since')
    }
)

In [None]:
# Derive the year in which the host joined from the parsed host_since date
df_train['host_since_year'] = df_train['host_since'].dt.year

# Rounded mean year over the non-missing rows (mean() skips NaN by default)
host_since_year_mean = round(df_train['host_since_year'].mean())
print(host_since_year_mean)

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the selected column: the chained in-place form does not update df_train when
# pandas Copy-on-Write is active.
df_train['host_since_year'] = df_train['host_since_year'].fillna(host_since_year_mean)

In [None]:
# Replace NaN in `bathrooms` with the rounded column mean.
# The result is assigned back rather than filled in place on the selected
# column, because the chained in-place form does not update df_train when
# pandas Copy-on-Write is active.
df_train['bathrooms'] = df_train['bathrooms'].fillna(round(df_train['bathrooms'].mean()))

In [None]:
# Replace NaN in `bedrooms` with the rounded column mean.
# The result is assigned back rather than filled in place on the selected
# column, because the chained in-place form does not update df_train when
# pandas Copy-on-Write is active.
df_train['bedrooms'] = df_train['bedrooms'].fillna(round(df_train['bedrooms'].mean()))

In [None]:
# Replace NaN in `beds` with the rounded column mean.
# The result is assigned back rather than filled in place on the selected
# column, because the chained in-place form does not update df_train when
# pandas Copy-on-Write is active.
df_train['beds'] = df_train['beds'].fillna(round(df_train['beds'].mean()))

In [None]:
# Replace NaN in `review_scores_rating` with the rounded column mean.
# The result is assigned back rather than filled in place on the selected
# column, because the chained in-place form does not update df_train when
# pandas Copy-on-Write is active.
df_train['review_scores_rating'] = df_train['review_scores_rating'].fillna(
    round(df_train['review_scores_rating'].mean())
)

In [None]:
# Convert host_response_rate from percent strings (the '%' sign is stripped)
# to integers.
# Missing rates stay NaN while the mean is computed, so the mean reflects only
# the observed rates; previously NaN was filled with 0 first, which pulled the
# mean down and also caused genuine 0 values to be overwritten afterwards.
response_rate = pd.to_numeric(
    df_train['host_response_rate'].str.replace('%', '', regex=False)
)

# Rounded mean of the observed response rates (NaN is skipped by mean())
mean_host_response_rate = round(response_rate.mean())

# Impute the missing rates and store the column as integers
df_train['host_response_rate'] = response_rate.fillna(mean_host_response_rate).astype(int)

In [None]:
# Encode host_identity_verified as float: 't' -> 1.0, 'f' -> 0.0, NaN -> 0.0.
# The transformed column is assigned back in one step; the earlier chained
# mask(..., inplace=True) / fillna(..., inplace=True) calls on the selected
# column do not update df_train when pandas Copy-on-Write is active.
df_train['host_identity_verified'] = (
    df_train['host_identity_verified']
    .replace({'t': '1', 'f': '0'})
    .fillna('0')
    .astype(float)
)

In [None]:
# Encode host_has_profile_pic as float: 't' -> 1.0, 'f' -> 0.0, NaN -> 0.0.
# The transformed column is assigned back in one step; the earlier chained
# mask(..., inplace=True) / fillna(..., inplace=True) calls on the selected
# column do not update df_train when pandas Copy-on-Write is active.
df_train['host_has_profile_pic'] = (
    df_train['host_has_profile_pic']
    .replace({'t': '1', 'f': '0'})
    .fillna('0')
    .astype(float)
)

In [None]:
# Encode instant_bookable as integer: 't' -> 1, 'f' -> 0.
# No NaN handling is applied here (as before), so astype(int) raises if the
# column contains missing values.
# The transformed column is assigned back in one step; the earlier chained
# mask(..., inplace=True) calls on the selected column do not update df_train
# when pandas Copy-on-Write is active.
df_train['instant_bookable'] = (
    df_train['instant_bookable']
    .replace({'t': '1', 'f': '0'})
    .astype(int)
)

In [None]:
# Number of training listings per room_type value
df_train['room_type'].value_counts()

In [None]:
# Number of test listings per room_type value
df_test['room_type'].value_counts()

In [None]:
# Average log_price for each room_type in the training data
df_train['log_price'].groupby(df_train['room_type']).mean()

In [None]:
# Count the missing values per column again, after the cleaning cells above
print(df_train.isnull().sum())

# Feature Re-Engineering

In [None]:
#List unique values of a Feature / Column
# df_train['zipcode'].value_counts()

In [None]:
# One-hot encode `city` and append the indicator columns to the training frame
df_city = df_train['city'].pipe(pd.get_dummies)
df_train = pd.concat((df_train, df_city), axis='columns')

In [None]:
# One-hot encode `property_type` and append the indicator columns to the training frame
df_property_type = df_train['property_type'].pipe(pd.get_dummies)
df_train = pd.concat((df_train, df_property_type), axis='columns')

In [None]:
# One-hot encode `bed_type` and append the indicator columns to the training frame
df_bed_type = df_train['bed_type'].pipe(pd.get_dummies)
df_train = pd.concat((df_train, df_bed_type), axis='columns')

In [None]:
# One-hot encode `room_type` and append the indicator columns to the training frame
df_room_type = df_train['room_type'].pipe(pd.get_dummies)
df_train = pd.concat((df_train, df_room_type), axis='columns')

In [None]:
# Preview the first 10 rows, including the appended indicator columns
df_train.head(10)

In [None]:
# Pairwise correlation between log_price and a hand-picked set of features
df_temp = df_train.filter(
    items=[
        'log_price',
        'accommodates',
        'bathrooms',
        'bedrooms',
        'beds',
        'Couch',
        'Real Bed',
        'Shared room',
        'Entire home/apt',
        'Private room',
        'SF',
        'instant_bookable',
    ],
    axis='columns',
)
df_temp.corr()

In [None]:
# Feature columns fed to the models, and the target column
X_columns = [
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'Real Bed',
    'Shared room',
    'Entire home/apt',
    'Private room',
    'SF',
]
y_column = ['log_price']

In [None]:
# Keep only the selected features plus the target, then zero-fill remaining gaps.
# The shape is printed before and after the fill.
df_train = df_train.loc[:, X_columns + y_column]
print(df_train.shape)
df_train = df_train.fillna(value=0.0)  # blunt default; zero may not suit every column
print(df_train.shape)

# Experiment

In [None]:
# Split the data into training and hold-out sets using sklearn.
# `threshold` is the fraction of rows kept for training; the rest is held out.
# random_state is fixed so the same split (and therefore comparable metrics)
# is produced on every run.

threshold = 0.7
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1.0 - threshold, shuffle=True, random_state=42
)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

In [None]:
def model_training(model_name, model, X_train, y_train):
    """Fit ``model`` on the training data and return it.

    Parameters
    ----------
    model_name : label of the model; not used by this function.
    model : object exposing ``fit(X, y)``.
    X_train : training features, passed to ``fit`` unchanged.
    y_train : training target. A 2-D target with exactly one column is
        flattened to 1-D before fitting, so estimators that expect a 1-D
        target do not receive a column vector. Any other shape is passed
        through unchanged.

    Returns
    -------
    The ``model`` object that was passed in, after ``fit`` has been called.
    """
    target = y_train
    target_shape = np.shape(y_train)
    if len(target_shape) == 2 and target_shape[1] == 1:
        # Single-column frame/array -> 1-D array of the same values
        target = np.ravel(y_train)
    model.fit(X_train, target)
    return model
    
def model_prediction(model, X_test):
    """Return whatever ``model.predict`` produces for ``X_test``."""
    return model.predict(X_test)

def model_evaluation(model_name, y_test, y_pred):
    """Print the model label, its MAE and RMSE, then an empty line.

    ``y_test`` holds the true values and ``y_pred`` the predictions; both are
    handed to the sklearn metric functions as given.
    """
    print(model_name)
    mae = mean_absolute_error(y_test, y_pred)
    print('MAE', mae)
    mse = mean_squared_error(y_test, y_pred)
    print('RMSE', np.sqrt(mse))
    print('')

def run_experiment(model_name, model, X_train, y_train, X_test, y_true=None):
    """Train ``model``, predict on ``X_test`` and print its evaluation metrics.

    Parameters
    ----------
    model_name : label printed with the metrics.
    model : estimator to fit.
    X_train, y_train : training features and target.
    X_test : features to predict on.
    y_true : true target values for ``X_test``. When omitted, the notebook-level
        ``y_test`` is used, which is what this function always did before the
        parameter existed.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook-level hold-out target
        y_true = y_test
    train_model = model_training(model_name, model, X_train, y_train)
    predictions = model_prediction(train_model, X_test)
    model_evaluation(model_name, y_true, predictions)
    
# Benchmark several regressors on the same train / hold-out split.
# Hyper-parameters are passed by keyword, and every estimator that uses
# randomness gets a fixed random_state so the printed metrics are repeatable.
experiments = [
    ('Linear Regression', LinearRegression()),
    ('KNN 5', KNeighborsRegressor(n_neighbors=5)),
    ('KNN 2', KNeighborsRegressor(n_neighbors=2)),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest 10', RandomForestRegressor(n_estimators=10, random_state=42)),
    ('Random Forest 100', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
]
for experiment_name, estimator in experiments:
    run_experiment(experiment_name, estimator, X_train, y_train, X_test)


# Model Training

In [None]:
# Train the final model (gradient boosting) and predict on the hold-out set.
# random_state is fixed so the fitted model is repeatable; the single-column
# target frame is flattened to 1-D, the shape the estimator expects.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)

# Model Evaluation

In [None]:
# RMSE of the final model on the hold-out set, printed to two decimals
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE', round(rmse, 2))

# Predicted vs. true values, with a dashed red diagonal over 0..9 as reference
plt.scatter(y_test, y_pred, alpha=0.3)
plt.plot(range(10), range(10), '--r', alpha=0.3, label='Line1')
plt.gca().set(title='Gradient Boosting', xlabel='True Value', ylabel='Predict Value')
plt.show()

# Prepare submission

In [None]:
# One-hot encode the categorical columns of the test frame
df_city = pd.get_dummies(df_test['city'])
df_property_type = pd.get_dummies(df_test['property_type'])
df_bed_type = pd.get_dummies(df_test['bed_type'])
df_room_type = pd.get_dummies(df_test['room_type'])

# Append all indicator columns in the order city, property_type, bed_type, room_type.
# NOTE(review): assumes no category value equals one of the source column names.
df_test = pd.concat(
    [df_test, df_city, df_property_type, df_bed_type, df_room_type],
    axis=1,
)

In [None]:
# Build the model input with exactly the feature columns listed in X_columns,
# in that order. reindex() adds any indicator column that is absent from the
# test frame (filled with 0) instead of raising KeyError, which happens when a
# category seen in training never occurs in the test data.
df_prediction = df_test.reindex(columns=X_columns, fill_value=0).fillna(0.0)
df_test['log_price'] = model.predict(df_prediction)
df_test[['id', 'log_price']]

In [None]:
# Write the id / log_price pairs to the submission file, without the index
df_test.to_csv('Submission/AirBnB_Submission_1.csv', columns=['id', 'log_price'], index=False)