In [168]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, \
cross_validate, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Ridge, Lasso
from scipy.stats import uniform
import seaborn as sns
import matplotlib.pyplot as plt

## Read data

In [236]:
# Load the raw training and test splits from the local datasets folder.
train, test = (
    pd.read_csv('./datasets/train_regression.csv'),
    pd.read_csv('./datasets/test_regression.csv'),
)

## 1) Data pre-processing

Put the data pre-processing code. You don't need to explain it. You may use the same code from last quarter.

In [98]:
# Collapse the free-text property type into a handful of coarse groups
def categorize_property(property_type):
    """Map a raw ``property_type`` string to one of five coarse categories.

    Keyword checks are case-sensitive substring matches and are tried in the
    order 'Entire', 'Private', 'Shared'; the first hit wins. If no keyword
    matches, an exact match against the specialty list is attempted, and
    everything else falls into 'Other'.

    Parameters
    ----------
    property_type : str
        Raw property type label.

    Returns
    -------
    str
        'Entire Home/Apartment', 'Private Room', 'Shared Accommodation',
        'Specialty Accommodations' or 'Other'.
    """
    keyword_to_category = (
        ('Entire', 'Entire Home/Apartment'),
        ('Private', 'Private Room'),
        ('Shared', 'Shared Accommodation'),
    )
    for keyword, category in keyword_to_category:
        if keyword in property_type:
            return category

    specialty_types = ('Room in hotel', 'Room in boutique hotel', 'Boat')
    return 'Specialty Accommodations' if property_type in specialty_types else 'Other'

In [99]:
# overall function to clean training and test data
def clean_data(df):
    """Clean a raw listings frame in place and add the engineered predictors.

    The frame is modified directly and nothing is returned, so callers keep
    using the object they passed in. Steps that involve ``price`` are skipped
    when the frame has no ``price`` column.

    Steps, in order:
      1. Strip ``$`` and ``,`` from ``price`` and cast it to float.
      2. Fill missing numeric values with the column median and missing
         object-typed values with the column mode (both computed from ``df``).
      3. Add ``log_price``; ``price`` itself is kept.
      4. Replace zeros with 0.01 in the count-like columns and add their logs.
      5. Add day counts since ``host_since`` / first review / last review,
         measured against the time of the call.
      6. Convert the percentage strings to floats, group ``property_type``,
         parse the number of bathrooms and map the 't'/'f' flags to 1/0.
      7. Drop the source columns that were transformed and a fixed list of
         predictors.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings data. Apart from the optional ``price`` column, every
        column referenced below must be present or a ``KeyError`` is raised.

    Returns
    -------
    None
    """
    has_price = 'price' in df.columns

    # Remove $ and thousands separators from the response and convert to float
    if has_price:
        df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # replace missing values of numeric variables with the median
    numeric_columns = df.select_dtypes(include=['number']).columns
    df[numeric_columns] = df[numeric_columns].apply(lambda x: x.fillna(x.median()))

    # replace missing values of categorical variables with the mode
    categorical_columns = df.select_dtypes(include=['object']).columns
    df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])

    # log transform the response; the raw price column is kept alongside it
    if has_price:
        df['log_price'] = np.log(df['price'])

    # source column -> name of its log-transformed counterpart
    log_columns = {
        'beds': 'log_beds',
        'accommodates': 'log_accommodates',
        'number_of_reviews': 'log_reviews',
        'reviews_per_month': 'log_reviews_per_month',
        'number_of_reviews_ltm': 'log_reviews_ltm',
        'number_of_reviews_l30d': 'log_reviews_l30d',
        'host_total_listings_count': 'log_host_total_listings_count',
        'host_listings_count': 'log_host_listings_count',
        'calculated_host_listings_count_private_rooms': 'log_host_listings_count_private_rooms',
        'calculated_host_listings_count_shared_rooms': 'log_host_listings_count_shared_rooms',
        'calculated_host_listings_count_entire_homes': 'log_host_listings_count_entire_homes',
    }
    for source_col, log_col in log_columns.items():
        # log(0) is -inf, so zeros are nudged to 0.01 before the transform
        df[source_col] = df[source_col].replace(0, .01)
        df[log_col] = np.log(df[source_col])

    # Reference time for all "days since" features. pd.Timestamp.now() is used
    # because no `dt` name is imported anywhere in this notebook.
    # NOTE(review): these features depend on the day the function is run.
    current_date = pd.Timestamp.now()

    # calculate the number of days since the host became a host
    df['host_since'] = pd.to_datetime(df['host_since'])
    df['host_since_days'] = (current_date - df['host_since']).dt.days

    # calculate days since first/last review (unparseable dates become NaT)
    df['first_review'] = pd.to_datetime(df['first_review'], errors='coerce')
    df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')
    df['first_review_days'] = (current_date - df['first_review']).dt.days
    df['last_review_days'] = (current_date - df['last_review']).dt.days

    # make response_rate and acceptance_rate into numeric dtype
    df['host_response_rate'] = df['host_response_rate'].str.rstrip('%').astype('float')
    df['host_acceptance_rate'] = df['host_acceptance_rate'].str.rstrip('%').astype('float')

    # subgroup property_type into the coarse categories of categorize_property
    df['property_cats'] = df['property_type'].apply(categorize_property)

    # extract the leading number from the bathrooms text, e.g. "1.5 baths" -> 1.5
    df['bath_numeric'] = df['bathrooms_text'].str.extract(r'(\d+\.*\d*)', expand=False).astype(float)

    # texts mentioning "half" (any case) count as 0.5 bathrooms
    is_half_bath = df['bathrooms_text'].str.lower().str.contains('half', regex=False, na=False)
    df['bath_numeric'] = df['bath_numeric'].mask(is_half_bath, 0.5)

    # 't'/'f' flags to 1/0
    flag_columns = ['host_is_superhost', 'host_identity_verified', 'host_has_profile_pic',
                    'has_availability', 'instant_bookable']
    for flag_col in flag_columns:
        df[flag_col] = df[flag_col].replace({'t': 1, 'f': 0})

    # drop the modified/redundant columns
    df.drop(columns = ['host_since', 'first_review', 'last_review', 'property_type', 'bathrooms_text',
                       'number_of_reviews', 'reviews_per_month', 'number_of_reviews_ltm',
                       'number_of_reviews_l30d', 'host_total_listings_count', 'host_listings_count',
                       'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms',
                       'calculated_host_listings_count_entire_homes', 'host_id'], inplace = True)

    # drop predictors that have low corr with log_price and high corr with others to help remove multi-collinearity
    df.drop(columns = ['first_review_days', 'last_review_days', 'host_acceptance_rate', 'host_response_rate',
                       'availability_60', 'availability_90', 'minimum_minimum_nights',
                       'maximum_maximum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights',
                       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm'], inplace = True)

In [237]:
# Clean both splits; clean_data is called for its in-place effect on each frame.
for raw_split in (train, test):
    clean_data(raw_split)

In [238]:
# Encode each location-like categorical by the mean training price of its level.
# The three lookup tables are kept so the same means can be joined onto the test set.
# NOTE(review): the means include each row's own price and are computed before the
# outlier filter applied further down.

# mean price per host_location
host_loc_avg_prices = (
    train.groupby('host_location')['price']
    .mean()
    .reset_index()
    .assign(avg_host_location_price=lambda means: means['price'])
)
train = train.merge(
    host_loc_avg_prices[['host_location', 'avg_host_location_price']],
    on='host_location',
    how='left',
)

# mean price per host_neighbourhood
host_neigh_avg_prices = (
    train.groupby('host_neighbourhood')['price']
    .mean()
    .reset_index()
    .assign(avg_host_neighbourhood_price=lambda means: means['price'])
)
train = train.merge(
    host_neigh_avg_prices[['host_neighbourhood', 'avg_host_neighbourhood_price']],
    on='host_neighbourhood',
    how='left',
)

# mean price per neighbourhood_cleansed
neigh_avg_prices = (
    train.groupby('neighbourhood_cleansed')['price']
    .mean()
    .reset_index()
    .assign(avg_neighbourhood_price=lambda means: means['price'])
)
train = train.merge(
    neigh_avg_prices[['neighbourhood_cleansed', 'avg_neighbourhood_price']],
    on='neighbourhood_cleansed',
    how='left',
)

In [239]:
# Join the training-set mean prices onto the test listings, one lookup per
# location-like categorical. Levels that never occur in the lookup tables get NaN
# from the left join.
test = (
    test
    # mean price of each host_location
    .merge(host_loc_avg_prices[['host_location', 'avg_host_location_price']],
           on='host_location', how='left')
    # mean price of each host_neighbourhood
    .merge(host_neigh_avg_prices[['host_neighbourhood', 'avg_host_neighbourhood_price']],
           on='host_neighbourhood', how='left')
    # mean price of each neighbourhood_cleansed
    .merge(neigh_avg_prices[['neighbourhood_cleansed', 'avg_neighbourhood_price']],
           on='neighbourhood_cleansed', how='left')
)

In [240]:
# The location categoricals are now represented by their mean-price columns,
# so remove the raw text columns from both splits.
encoded_location_columns = ['host_neighbourhood', 'neighbourhood_cleansed', 'host_location']
train = train.drop(columns=encoded_location_columns)
test = test.drop(columns=encoded_location_columns)

In [241]:
# Keep only training listings priced below 10000 (extreme outliers removed)
train = train.loc[train['price'] < 10000]

In [242]:
# Remove the single most influential training row, identified by index label.
# NOTE(review): 2850 is a hard-coded label; the call raises KeyError if that label
# is no longer present (e.g. after a change to the filtering above) — confirm it
# still points at the intended listing.
train = train.drop(labels=2850, axis=0)

In [243]:
# OHE the remaining categorical variables
# host_response_time: fit the indicator columns on train, then force the test
# indicators onto the same columns (same set, same order). Levels missing from
# test become all-zero columns; levels unseen in train are dropped.
host_response_time_dummies = pd.get_dummies(train['host_response_time'], prefix='host_response_time')
train = pd.concat([train, host_response_time_dummies], axis = 1)

host_response_time_dummies = (
    pd.get_dummies(test['host_response_time'], prefix='host_response_time')
    .reindex(columns=host_response_time_dummies.columns, fill_value=0)
)
test = pd.concat([test, host_response_time_dummies], axis = 1)

In [244]:
# host_verifications: indicator columns are defined by the training levels; the
# test indicators are re-indexed to those columns so both frames stay aligned
# (missing levels -> all-zero column, unseen levels -> dropped).
host_verifications_dummies = pd.get_dummies(train['host_verifications'], prefix='host_verifications')
train = pd.concat([train, host_verifications_dummies], axis = 1)

host_verifications_dummies = (
    pd.get_dummies(test['host_verifications'], prefix='host_verifications')
    .reindex(columns=host_verifications_dummies.columns, fill_value=0)
)
test = pd.concat([test, host_verifications_dummies], axis = 1)

In [245]:
# room_type: indicator columns are defined by the training levels; the test
# indicators are re-indexed to those columns so both frames stay aligned
# (missing levels -> all-zero column, unseen levels -> dropped).
room_type_dummies = pd.get_dummies(train['room_type'], prefix='room_type')
train = pd.concat([train, room_type_dummies], axis = 1)

room_type_dummies = (
    pd.get_dummies(test['room_type'], prefix='room_type')
    .reindex(columns=room_type_dummies.columns, fill_value=0)
)
test = pd.concat([test, room_type_dummies], axis = 1)

In [246]:
# property_cats: indicator columns are defined by the training levels; the test
# indicators are re-indexed to those columns so both frames stay aligned
# (missing levels -> all-zero column, unseen levels -> dropped).
property_cats_dummies = pd.get_dummies(train['property_cats'], prefix='property_cats')
train = pd.concat([train, property_cats_dummies], axis = 1)

property_cats_dummies = (
    pd.get_dummies(test['property_cats'], prefix='property_cats')
    .reindex(columns=property_cats_dummies.columns, fill_value=0)
)
test = pd.concat([test, property_cats_dummies], axis = 1)

In [247]:
# The one-hot indicators replace these text columns, so remove them from both splits.
one_hot_source_columns = ['host_response_time', 'host_verifications', 'room_type', 'property_cats']
train = train.drop(columns=one_hot_source_columns)
test = test.drop(columns=one_hot_source_columns)

In [248]:
# Replace spaces in column names with underscores (the dummy columns inherit
# spaces from their category labels).
train.columns = [name.replace(' ', '_') for name in train.columns]
test.columns = [name.replace(' ', '_') for name in test.columns]

In [249]:
# The left joins above can leave NaNs in the added mean-price columns of the test
# set; fill every numeric test column with that column's own (test-set) mean.
numeric_columns = test.select_dtypes(include=['number']).columns
test_numeric = test[numeric_columns]
test[numeric_columns] = test_numeric.fillna(test_numeric.mean())

In [250]:
# Separate the response (log price) from the predictors; identifiers and both
# price columns are excluded from the training predictors.
non_predictor_columns = ['log_price', 'price', 'id']
y_train = train['log_price']
X_train = train.drop(columns=non_predictor_columns)
X_test = test.drop(columns=['id'])

In [251]:
# poly features
# Expand the predictors to all polynomial/interaction terms up to degree 3.
# The transformer is fitted on the training predictors only; the test set is
# passed through `transform` so it is expanded with the layout learned from
# train instead of refitting the transformer on test data.
poly = PolynomialFeatures(degree = 3, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [252]:
# Standardise the polynomial features: statistics are learned from the training
# matrix and then applied unchanged to both matrices.
scaler = StandardScaler().fit(X_train_poly)
X_train_scaled = scaler.transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

## 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

It took me around 10 attempts to tune the model hyperparameters.

### Which tuning method did you use (grid search / Bayes search / etc.)?

I used Randomized Search to tune my model.

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

A challenge I faced while tuning the hyperparameters was figuring out the optimal ranges to test for `n_neighbors` and for the number `r` of selected features. I mainly used trial and error to address this, checking the best `n_neighbors` and the printed `r` feature results after each run. I also had to choose a variable selection method before running KNN, as unnecessary predictors would negatively influence KNN.

### How many hours did you spend on hyperparameter tuning?

The code usually took 30 minutes or less per run. In total, I spent around 2-3 hours on hyperparameter tuning, in addition to the time it took to implement Lasso.

### Variable Selection Step: Lasso

**Paste the hyperparameter tuning code below. You must show at least one hyperparameter tuning procedure.**

In [254]:
# Create DataFrames with polynomial features
X_train_poly_df = pd.DataFrame(X_train_scaled, columns=poly.get_feature_names_out(X_train.columns))
X_test_poly_df = pd.DataFrame(X_test_scaled, columns=poly.get_feature_names_out(X_test.columns))

# --- Step 1: candidate feature subsets from the Lasso path -------------------
# For each penalty strength, store the indices of the non-zero coefficients.
selected_coeffs = []
r = 0
alphas = np.logspace(-1, 3, 200)
kfold = KFold(n_splits = 10, shuffle = True, random_state = 1)

for alpha in alphas:
    lasso = Lasso(alpha = alpha)
    lasso.fit(X_train_scaled, y_train)
    # NOTE(review): `r` always equals len(selected_coeffs) here, so the second
    # condition is always true and a subset is stored for every alpha whose
    # number of zero coefficients exceeds the number of subsets stored so far.
    # Confirm this matches the intended selection rule.
    if ((lasso.coef_ == 0).sum() > r) and (len(selected_coeffs) <= r):
        selected_coeffs.append(np.where(lasso.coef_ != 0)[0])
        r = r + 1

# --- Step 2: tune KNN on each of the first (up to) 30 candidate subsets -----
model = KNeighborsRegressor()
# NOTE(review): 'minkowski' with KNN's default p is the same distance as
# 'euclidean', so those two settings are likely redundant — confirm.
grid = {'n_neighbors': np.arange(1, 41), 'weights': ['uniform', 'distance'], 'metric': ['manhattan', 'euclidean', 'minkowski']}

results = []

# Never index past the subsets that were actually stored above.
n_candidates = min(30, len(selected_coeffs))

for r in range(1, n_candidates + 1):

    selected_Xs = X_train_poly_df.iloc[:, selected_coeffs[r-1]]
    selected_predictors = selected_Xs.columns.tolist()

    if selected_Xs.shape[1] > 0:
        gcv = RandomizedSearchCV(model, grid, cv = kfold, n_iter = 180, random_state = 10,
                                 scoring = 'neg_root_mean_squared_error', n_jobs = -1)
        gcv.fit(selected_Xs, y_train)
        # The scorer already returns the negated RMSE, so flipping the sign gives
        # the RMSE itself; no further square root is needed.
        cv_rmse = -gcv.best_score_
        results.append({'r': r,
                        'selected_predictors': selected_predictors,
                        'best_params': gcv.best_params_,
                        'cv_rmse': cv_rmse})

    else:
        print("No features selected for r =", r)

if not results:
    raise ValueError("No candidate feature subset contained any predictors; nothing to tune.")

# Best subset / hyperparameter combination by cross-validated RMSE
optimal_r = min(results, key = lambda x: x['cv_rmse'])

In [260]:
# Predictor names stored with the best-scoring search result
selected_predictors = optimal_r['selected_predictors']

# Positions of those predictors among the polynomial feature columns
poly_columns = X_train_poly_df.columns
selected_predictor_indices = list(map(poly_columns.get_loc, selected_predictors))

# Column labels at those positions, used to subset the train and test frames
selected_predictor_names = poly_columns[selected_predictor_indices]

**Paste the optimal hyperparameter values below.**

In [270]:
# Show the KNN hyperparameters stored with the lowest-cv_rmse search result.
print(optimal_r['best_params'])

{'weights': 'distance', 'n_neighbors': 20, 'metric': 'manhattan'}


## 3) Model

Using the optimal model hyperparameters, train the model, and paste the code below.

In [266]:
# Refit KNN on the full training set with the tuned hyperparameters, using only
# the selected polynomial predictors.
knn_params = dict(optimal_r['best_params'])
model = KNeighborsRegressor(**knn_params)
model.fit(X_train_poly_df.loc[:, selected_predictor_names], y_train)

## 4) Put any ad-hoc steps for further improving model accuracy
For example, scaling up or scaling down the predictions, capping predictions, etc.

Put code below.

No further ad-hoc steps

## 5) Export the predictions in the format required to submit on Kaggle
Put code below.

In [267]:
# Predict log prices for the test listings from the selected predictors
y_preds = model.predict(X_test_poly_df.loc[:, selected_predictor_names])

In [268]:
# The model was fit on log_price, so exponentiate to get predictions on the price scale.
pred = np.exp(y_preds)

In [269]:
# Build the Kaggle submission: one row per test listing with its predicted price.
# The ids are held in `listing_ids` rather than `id`, so the built-in id() is not
# shadowed in the kernel.
listing_ids = test['id'].to_numpy()
predicted = pred
submission = pd.DataFrame({'id': listing_ids, 'predicted': predicted})
# A freshly built frame already has a 0..n-1 index, so no reset is needed.
submission.to_csv('regression_submission.csv', index=False)