## Instructions {-}

- This is the template for the code and report on the Prediction Problem assignments.

- Your code in steps 1, 3, 4, and 5 will be executed sequentially, and must produce the RMSE / accuracy claimed on Kaggle.

- Your code in step 2 will also be executed, and must produce the optimal hyperparameter values used to train the model.

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc,precision_recall_curve, accuracy_score, \
recall_score, precision_score, confusion_matrix, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,AdaBoostRegressor,AdaBoostClassifier, \
RandomForestRegressor, GradientBoostingRegressor,VotingRegressor, StackingRegressor, VotingClassifier, StackingClassifier
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
import time as time
import xgboost as xgb
from datetime import datetime as dt

from skopt import BayesSearchCV
from catboost import CatBoostRegressor
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram, plot_convergence
import warnings
from IPython import display

## Read data

In [2]:
# Load the training and test sets from the local datasets folder
train = pd.read_csv(filepath_or_buffer='./datasets/train_regression.csv')
test = pd.read_csv(filepath_or_buffer='./datasets/test_regression.csv')

## 1) Data pre-processing

Put the data pre-processing code. You don't need to explain it. You may use the same code from last quarter.

In [3]:
# Collapse the raw property_type strings into a handful of coarse groups
def categorize_property(property_type):
    """Return the coarse accommodation group for a raw property type string.

    The keywords 'Entire', 'Private' and 'Shared' are checked as
    case-sensitive substrings, in that order, and the first one found decides
    the group. Otherwise an exact match against the hotel/boat names yields
    'Specialty Accommodations', and anything else is 'Other'.
    """
    keyword_groups = (
        ('Entire', 'Entire Home/Apartment'),
        ('Private', 'Private Room'),
        ('Shared', 'Shared Accommodation'),
    )
    for keyword, group in keyword_groups:
        if keyword in property_type:
            return group

    if property_type in ('Room in hotel', 'Room in boutique hotel', 'Boat'):
        return 'Specialty Accommodations'
    return 'Other'

In [4]:
# overall function to clean training and test data
def clean_data(df, reference_date=None):
    """Clean and feature-engineer a listings DataFrame in place.

    The frame is mutated in place and nothing is returned. The steps are:
    parse ``price`` (when present) and add ``log_price``; impute missing
    numeric values with the column median and missing object values with the
    column mode; add log-transformed count features; convert date columns to
    "days since" features; group ``property_type`` via
    ``categorize_property``; parse ``bathrooms_text`` into ``bath_numeric``;
    map the 't'/'f' flags to 1/0; and finally drop the raw and redundant
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings data containing the columns referenced below. ``price`` is
        optional, so the same function serves the training and test frames.
    reference_date : datetime-like, optional
        Date from which ``host_since_days`` (and the review day counts) are
        measured. Defaults to the current time, which makes those features
        depend on the day the code is run; pass a fixed date for reproducible
        results.
    """
    # Strip '$' and thousands separators from the response and convert to float
    if 'price' in df.columns:
        df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # replace missing values of numeric variables with the median
    numeric_columns = df.select_dtypes(include=['number']).columns
    df[numeric_columns] = df[numeric_columns].apply(lambda col: col.fillna(col.median()))

    # replace missing values of categorical variables with the mode
    categorical_columns = df.select_dtypes(include=['object']).columns
    df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])

    # log transform the response variable (training data only)
    if 'price' in df.columns:
        df['log_price'] = np.log(df['price'])

    # Source column -> name of its log-transformed feature. The dict order
    # fixes the order in which the new columns are appended to the frame.
    log_features = {
        'beds': 'log_beds',
        'accommodates': 'log_accommodates',
        'number_of_reviews': 'log_reviews',
        'reviews_per_month': 'log_reviews_per_month',
        'number_of_reviews_ltm': 'log_reviews_ltm',
        'number_of_reviews_l30d': 'log_reviews_l30d',
        'host_total_listings_count': 'log_host_total_listings_count',
        'host_listings_count': 'log_host_listings_count',
        'calculated_host_listings_count_private_rooms': 'log_host_listings_count_private_rooms',
        'calculated_host_listings_count_shared_rooms': 'log_host_listings_count_shared_rooms',
        'calculated_host_listings_count_entire_homes': 'log_host_listings_count_entire_homes',
    }
    for source_col, log_col in log_features.items():
        # replace 0 with 0.01 so the log is finite (log(0) would be -inf)
        df[source_col] = df[source_col].replace(0, .01)
        df[log_col] = np.log(df[source_col])

    # calculate the number of days since the host became a host
    current_date = dt.now() if reference_date is None else pd.Timestamp(reference_date)
    df['host_since'] = pd.to_datetime(df['host_since'])
    df['host_since_days'] = (current_date - df['host_since']).dt.days

    # calculate days since first/last review; unparseable dates become NaT
    for review_col in ('first_review', 'last_review'):
        df[review_col] = pd.to_datetime(df[review_col], errors='coerce')
        df[review_col + '_days'] = (current_date - df[review_col]).dt.days

    # make response_rate and acceptance_rate into numeric dtype
    for rate_col in ('host_response_rate', 'host_acceptance_rate'):
        df[rate_col] = df[rate_col].str.rstrip('%').astype('float')

    # subgroup property_type (similar levels as room_type so discard room predictor)
    df['property_cats'] = df['property_type'].apply(categorize_property)

    # extract the first number from 'bathrooms_text' (e.g. "1.5 baths" -> 1.5)
    df['bath_numeric'] = df['bathrooms_text'].str.extract(r'(\d+\.*\d*)', expand=False).astype(float)

    # handle "Half-bath" entries by assigning a numeric value of 0.5
    is_half_bath = df['bathrooms_text'].str.lower().str.contains('half', regex=False, na=False)
    df['bath_numeric'] = df['bath_numeric'].mask(is_half_bath, 0.5)

    # convert the 't'/'f' flags to 1/0
    for flag_col in ('host_is_superhost', 'host_identity_verified', 'host_has_profile_pic',
                     'has_availability', 'instant_bookable'):
        df[flag_col] = df[flag_col].replace({'t': 1, 'f': 0})

    # drop the modified/redundant columns
    df.drop(columns=['host_since', 'first_review', 'last_review', 'property_type', 'bathrooms_text',
                     'number_of_reviews', 'reviews_per_month', 'number_of_reviews_ltm',
                     'number_of_reviews_l30d', 'host_total_listings_count', 'host_listings_count',
                     'calculated_host_listings_count_private_rooms',
                     'calculated_host_listings_count_shared_rooms',
                     'calculated_host_listings_count_entire_homes', 'host_id'], inplace=True)

    # drop predictors that have low corr with log_price and high corr with others to help remove multi-collinearity
    df.drop(columns=['first_review_days', 'last_review_days', 'host_acceptance_rate', 'host_response_rate',
                     'availability_60', 'availability_90', 'minimum_minimum_nights',
                     'maximum_maximum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights',
                     'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm'], inplace=True)

In [5]:
# Apply the shared cleaning routine to both frames (each is modified in place)
for frame in (train, test):
    clean_data(frame)

In [6]:
# Mean-encode three high-cardinality categoricals: for each one, compute the
# average training price per category and join it back onto train as a new
# numeric column. The lookup tables are kept for reuse on the test frame.
# NOTE(review): each mean includes the row's own price, and the means are
# computed on the full training frame as it stands at this point.

# average price per host_location
host_loc_avg_prices = train.groupby('host_location', as_index=False)['price'].mean()
host_loc_avg_prices = host_loc_avg_prices.assign(
    avg_host_location_price=host_loc_avg_prices['price'])
train = train.merge(host_loc_avg_prices[['host_location', 'avg_host_location_price']],
                    how='left', on='host_location')

# average price per host_neighbourhood
host_neigh_avg_prices = train.groupby('host_neighbourhood', as_index=False)['price'].mean()
host_neigh_avg_prices = host_neigh_avg_prices.assign(
    avg_host_neighbourhood_price=host_neigh_avg_prices['price'])
train = train.merge(host_neigh_avg_prices[['host_neighbourhood', 'avg_host_neighbourhood_price']],
                    how='left', on='host_neighbourhood')

# average price per neighbourhood_cleansed
neigh_avg_prices = train.groupby('neighbourhood_cleansed', as_index=False)['price'].mean()
neigh_avg_prices = neigh_avg_prices.assign(
    avg_neighbourhood_price=neigh_avg_prices['price'])
train = train.merge(neigh_avg_prices[['neighbourhood_cleansed', 'avg_neighbourhood_price']],
                    how='left', on='neighbourhood_cleansed')

In [7]:
# Attach the training-derived average prices to the test frame. Left joins keep
# every test row; a category absent from the lookup table yields NaN.

# average price per host_location
test = test.merge(host_loc_avg_prices[['host_location', 'avg_host_location_price']],
                  how='left', on='host_location')

# average price per host_neighbourhood
test = test.merge(host_neigh_avg_prices[['host_neighbourhood', 'avg_host_neighbourhood_price']],
                  how='left', on='host_neighbourhood')

# average price per neighbourhood_cleansed
test = test.merge(neigh_avg_prices[['neighbourhood_cleansed', 'avg_neighbourhood_price']],
                  how='left', on='neighbourhood_cleansed')

In [8]:
# The three categoricals are now represented by their average-price columns,
# so remove the raw text columns from both frames
train = train.drop(['host_neighbourhood', 'neighbourhood_cleansed', 'host_location'], axis=1)
test = test.drop(['host_neighbourhood', 'neighbourhood_cleansed', 'host_location'], axis=1)

In [9]:
# Keep only listings priced below 10000; higher prices are treated as extreme outliers
train = train.loc[train['price'] < 10000]

In [10]:
# Remove the single most influential observation, identified by its index label.
# NOTE(review): the label is hard-coded, so it only refers to the intended row
# while the row order of the training frame is unchanged - confirm if the data changes.
train = train.drop(labels=2850, axis=0)

In [11]:
# One-hot encode the remaining categorical variables, starting with
# host_response_time; train and test are encoded independently
host_response_time_dummies = pd.get_dummies(train.host_response_time, prefix='host_response_time')
train = pd.concat(objs=[train, host_response_time_dummies], axis='columns')

host_response_time_dummies = pd.get_dummies(test.host_response_time, prefix='host_response_time')
test = pd.concat(objs=[test, host_response_time_dummies], axis='columns')

In [12]:
# One-hot encode host_verifications; train and test are encoded independently
host_verifications_dummies = pd.get_dummies(train.host_verifications, prefix='host_verifications')
train = pd.concat(objs=[train, host_verifications_dummies], axis='columns')

host_verifications_dummies = pd.get_dummies(test.host_verifications, prefix='host_verifications')
test = pd.concat(objs=[test, host_verifications_dummies], axis='columns')

In [13]:
# One-hot encode room_type; train and test are encoded independently
room_type_dummies = pd.get_dummies(train.room_type, prefix='room_type')
train = pd.concat(objs=[train, room_type_dummies], axis='columns')

room_type_dummies = pd.get_dummies(test.room_type, prefix='room_type')
test = pd.concat(objs=[test, room_type_dummies], axis='columns')

In [14]:
# One-hot encode the grouped property categories; train and test are encoded independently
property_cats_dummies = pd.get_dummies(train.property_cats, prefix='property_cats')
train = pd.concat(objs=[train, property_cats_dummies], axis='columns')

property_cats_dummies = pd.get_dummies(test.property_cats, prefix='property_cats')
test = pd.concat(objs=[test, property_cats_dummies], axis='columns')

In [15]:
# The original categorical columns are redundant once their dummy columns exist
train = train.drop(['host_response_time', 'host_verifications', 'room_type', 'property_cats'], axis=1)
test = test.drop(['host_response_time', 'host_verifications', 'room_type', 'property_cats'], axis=1)

In [16]:
# Replace spaces in column names with underscores (the dummy columns inherit
# spaces from their category labels)
train.columns = [name.replace(' ', '_') for name in train.columns]
test.columns = [name.replace(' ', '_') for name in test.columns]

In [17]:
# The average-price columns joined onto test can contain NaN; fill every
# numeric test column's missing values with that column's own mean
numeric_columns = test.select_dtypes(include=['number']).columns
for column in numeric_columns:
    test[column] = test[column].fillna(test[column].mean())

In [18]:
# set response and predictors, no need to transform response
y_train = train.price
X_train = train.drop(columns=['log_price', 'price', 'id'])

# The dummy columns were generated separately for train and test, so the two
# frames are not guaranteed to share the same columns in the same order. The
# later steps work on plain arrays and match features by position, so force
# the test predictors into the training layout: same columns, same order,
# with 0 for any dummy column that never occurred in the test data.
X_test = test.drop(columns=['id']).reindex(columns=X_train.columns, fill_value=0)

### Lasso

In [52]:
# Lasso is fitted against the log-transformed price
y_train_log = train['log_price']

In [49]:
# Degree-2 polynomial features (squares and pairwise interactions, no bias column).
# The transformer is fitted on the training predictors only and then applied
# unchanged to the test predictors, so a test frame whose feature count differs
# from training raises an error instead of silently producing mismatched columns.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [53]:
# Cross-validated Lasso over 200 log-spaced penalties between 1e-1 and 1e3,
# fitted on the polynomial features against log price
alphas = np.logspace(start=-1, stop=3, num=200)
lasso = LassoCV(alphas=alphas, cv=5)
lasso.fit(X_train_poly, y_train_log)

In [84]:
# Keep only the polynomial features to which Lasso assigned a non-zero coefficient
selected_features = lasso.coef_ != 0
X_train_cleaned = X_train_poly[:, selected_features]
X_test_cleaned = X_test_poly[:, selected_features]

## 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

It took me more than 30 attempts to tune the model.

### Which tuning method did you use (grid search / Bayes search / etc.)?

I used Grid Search.

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

There were fewer challenges in tuning the hyperparameters this time. One of them was making sure that the dataset was properly prepared for the models. Narrowing down the ranges was easier, as I used the same tuning methods as for the previous models.

### How many hours did you spend on hyperparameter tuning?

I spent about 10 hours at most on tuning.

**Paste the hyperparameter tuning code below. You must show at least one hyperparameter tuning procedure.**

In [1]:
#Hyperparameter tuning code

In [55]:
# Shuffled 5-fold splitter and the base CatBoost estimator used for tuning;
# both are seeded so the search is repeatable
cv = KFold(n_splits=5, shuffle=True, random_state=1)
model = CatBoostRegressor(thread_count=1, verbose=0, random_state=1)

In [57]:
# Candidate values for the exhaustive search: two options for each of five
# hyperparameters, i.e. 32 combinations
grid = dict(
    n_estimators=[1000, 1100],
    max_depth=[6, 7],
    learning_rate=[0.01, 0.1],
    subsample=[0.5, 0.6],
    reg_lambda=[0.1, 1],
)

# Score each combination by cross-validated RMSE on the raw price scale
gcv = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

gcv.fit(X_train_cleaned, y_train)

# best_score_ is the negated RMSE, so flip the sign before reporting it
print(-gcv.best_score_)
print(gcv.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
129.9453775408653
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'reg_lambda': 1, 'subsample': 0.5}


#### Linear model

In [85]:
# The linear model is fitted against the log-transformed price
y_train_lr = train['log_price']

In [92]:
# Degree-2 polynomial features (squares and pairwise interactions, no bias column).
# Fit on the training predictors only, then apply the fitted transformer to the
# test predictors so both matrices share one feature layout.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [93]:
# Standardise the polynomial features; the scaling statistics are learned from
# the training matrix and reused for the test matrix
scaler = StandardScaler().fit(X_train_poly)
X_train_scaled = scaler.transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

In [94]:
# Ridge regression on the scaled polynomial features, with the penalty chosen
# by 5-fold CV from 50 log-spaced values between 1e-1 and 1e3
alphas = np.logspace(start=-1, stop=3, num=50)
lr_model = RidgeCV(cv=5, alphas=alphas)

lr_model.fit(X_train_scaled, y_train_lr)

In [87]:
# CatBoost configured with the best parameter set reported by the grid search
cat_model = CatBoostRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.5,
    reg_lambda=1,
    random_state=1,
    thread_count=1,
    verbose=0,
)

In [27]:
# Bagged decision trees: 200 trees, each trained on a bootstrap sample of 90%
# of the rows and 75% of the features (features also drawn with replacement)
base_model = DecisionTreeRegressor(random_state=1)
bag_model = BaggingRegressor(
    estimator=base_model,
    n_estimators=200,
    max_samples=0.9,
    max_features=0.75,
    bootstrap=True,
    bootstrap_features=True,
    random_state=1,
)

I used the hyperparameters above, which gave the best scores for the previous models.

## 3) Model

Using the optimal model hyperparameters, train the model, and paste the code below.

In [96]:
# Voting ensemble of the bagged trees and CatBoost, trained on the
# Lasso-selected features against the raw price
model = VotingRegressor(
    estimators=[
        ('bag', bag_model),
        ('cat', cat_model),
    ],
    n_jobs=-1,
)

model.fit(X_train_cleaned, y_train)

In [101]:
# Stacked ensemble: CatBoost and bagged trees as base learners, combined by a
# linear regression fitted on their out-of-fold predictions (seeded, shuffled 5-fold CV)
en = StackingRegressor(
    estimators=[
        ('cat', cat_model),
        ('bag', bag_model),
    ],
    final_estimator=LinearRegression(),
    cv=KFold(n_splits=5, shuffle=True, random_state=1),
)

en.fit(X_train_cleaned, y_train)

## 4) Put any ad-hoc steps for further improving model accuracy
For example, scaling up or scaling down the predictions, capping predictions, etc.

Put code below.

#### average with LR

In [102]:
# Price predictions from the stacked ensemble on the Lasso-selected test features
y_pred = en.predict(X=X_test_cleaned)

In [103]:
# The ridge model predicts log price, so exponentiate to return to the price scale
log_pred_lr = lr_model.predict(X_test_scaled)
pred_lr = np.exp(log_pred_lr)

In [104]:
# Final prediction: equal-weight average of the ensemble and the ridge model
average_pred = 0.5 * (y_pred + pred_lr)

## 5) Export the predictions in the format required to submit on Kaggle
Put code below.

In [105]:
# Build the submission file: one row per test listing id with its predicted price.
# The ids are held in `test_ids` rather than `id` so the builtin id() is not shadowed.
test_ids = test['id'].values
predicted = average_pred
submission = pd.DataFrame({'id': test_ids, 'predicted': predicted})
submission = submission.reset_index(drop=True)
submission.to_csv('ensemble_regression_submission.csv', index=False)