<a href="https://colab.research.google.com/github/annaroney/Airbnb/blob/main/Ensemble_Regression_PredictionProblemCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prediction Problem Code


**Problem:** Regression

**Model:** Ensemble

**RMSE on Kaggle:** 113.35


In [None]:
# run this cell if using google colab
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/data science/stat303-3')

Mounted at /content/drive


### Libraries

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import Lasso, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, VotingRegressor, StackingRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RepeatedKFold, RandomizedSearchCV
from xgboost import XGBRegressor
# uncomment next line if needed
#!pip install catboost
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

### Data and Preprocessing

In [12]:
# Reading data
train = pd.read_csv('train_regression.csv')

# Clean price column and remove extreme outliers with price>10000
train.price = train.price.str.replace('$','').str.replace(',','').astype(float)
train = train.loc[train.price < 10000,:]


# Modifying the neighborhoods_cleansed column to only reflect if the neighborhood is in the top 3 most fancy
train.neighbourhood_cleansed.value_counts()[0:3]
fancy_neighborhoods = ['Near North Side','West Town','Lake View']
train.neighbourhood_cleansed = train.neighbourhood_cleansed.apply(
    lambda x: 1 if x in fancy_neighborhoods else 0)

# Since there are many property types, modifying the column so that only types with over 100 observations are kept, otherwise set to 'Other'
train.property_type.value_counts()[0:7]
top_property_types = ['Entire rental unit','Entire condo','Private room in rental unit','Entire home',
                     'Private room in home','Private room in home','Entire serviced apartment','Room in hotel']
train.property_type = train.property_type.apply(lambda x: x if x in top_property_types else 'Other')

# host_location data prep
train['host_location'].fillna('Other', inplace=True)
top_locations = train['host_location'].value_counts().index[:3]
train.loc[~train['host_location'].isin(top_locations), 'host_location'] = 'Other'

# host_response_time data prep
mode_value = train['host_response_time'].mode()[0]
train['host_response_time'].fillna(mode_value, inplace=True)

# host_response_rate data prep
train['host_response_rate'] = train['host_response_rate'].str.rstrip('%').astype(float)
mean_value = train['host_response_rate'].mean()
train['host_response_rate'].fillna(mean_value, inplace=True)

# host_acceptance_rate data prep
train['host_acceptance_rate'] = train['host_acceptance_rate'].str.rstrip('%').astype(float)
mean_value = train['host_acceptance_rate'].mean()
train['host_acceptance_rate'].fillna(mean_value, inplace=True)

# host_is_superhost data prep
train['host_is_superhost'] = train['host_is_superhost'].astype(str)
train['host_is_superhost'] = train['host_is_superhost'].replace({'t': 1, 'f': 0})
mode_value = train['host_is_superhost'].mode()[0]
train['host_is_superhost'].fillna(mode_value, inplace=True)

# host_neighbourhood
train.host_neighbourhood.value_counts()[0:3]
top_host_neighbourhoods = ['Cambridge','River North','Logan Square']
train.host_neighbourhood = train.host_neighbourhood.apply(
    lambda x: 1 if x in top_host_neighbourhoods else 0)
train['host_neighbourhood'].fillna('Other', inplace=True)

# host_has_profile_pic data prep
train['host_has_profile_pic'] = train['host_has_profile_pic'].astype(str)
train['host_has_profile_pic'] = train['host_has_profile_pic'].replace({'t': 1, 'f': 0})

# host_identity_verified data prep
train['host_identity_verified'] = train['host_identity_verified'].astype(str)
train['host_identity_verified'] = train['host_identity_verified'].replace({'t': 1, 'f': 0})

# bathrooms_text
top_bathroom_types = train['bathrooms_text'].value_counts().index[:5]
train['bathrooms_text'] = train['bathrooms_text'].apply(lambda x: x if x in top_bathroom_types else 'Other')

# has_availability data prep
train['has_availability'] = train['has_availability'].astype(str)
train['has_availability'] = train['has_availability'].replace({'t': 1, 'f': 0})

# Imputing missing numeric values using median
train = train.fillna(train.median(numeric_only=True))
# Imputing missing categorical values using ffill and then bfill (in case the first value was null)
train = train.fillna(method = 'ffill')
train = train.fillna(method = 'bfill')
# Need to clean feature names so that non-JSON chars don't appear in the dummies and interfere with LightGBM in submission 13
# SAME MODIFICATIONS FOR TEST DATA (except for those involving price since it's not present in the test data)

test = pd.read_csv('test_regression.csv')

# Neighborhood cleaning
test.neighbourhood_cleansed = test.neighbourhood_cleansed.apply(
    lambda x: 1 if x in fancy_neighborhoods else 0)

# Property type cleaning
test.property_type = test.property_type.apply(lambda x: x if x in top_property_types else 'Other')

# host_location data prep
test['host_location'].fillna('Other', inplace=True)
top_locations = test['host_location'].value_counts().index[:3]
test.loc[~test['host_location'].isin(top_locations), 'host_location'] = 'Other'

# host_response_time data prep
mode_value = test['host_response_time'].mode()[0]
test['host_response_time'].fillna(mode_value, inplace=True)

# host_response_rate data prep
test['host_response_rate'] = test['host_response_rate'].str.rstrip('%').astype(float)
mean_value = test['host_response_rate'].mean()
test['host_response_rate'].fillna(mean_value, inplace=True)

# host_acceptance_rate data prep
test['host_acceptance_rate'] = test['host_acceptance_rate'].str.rstrip('%').astype(float)
mean_value = test['host_acceptance_rate'].mean()
test['host_acceptance_rate'].fillna(mean_value, inplace=True)

# host_is_superhost data prep
test['host_is_superhost'] = test['host_is_superhost'].astype(str)
test['host_is_superhost'] = test['host_is_superhost'].replace({'t': 1, 'f': 0})
mode_value = test['host_is_superhost'].mode()[0]
test['host_is_superhost'].fillna(mode_value, inplace=True)

# host_neighbourhood
test.host_neighbourhood.value_counts()[0:3]
top_host_neighbourhoods = ['Cambridge','River North','Logan Square']
test.host_neighbourhood = test.host_neighbourhood.apply(
    lambda x: 1 if x in top_host_neighbourhoods else 0)
test['host_neighbourhood'].fillna('Other', inplace=True)

# host_has_profile_pic data prep
test['host_has_profile_pic'] = test['host_has_profile_pic'].astype(str)
test['host_has_profile_pic'] = test['host_has_profile_pic'].replace({'t': 1, 'f': 0})

# host_identity_verified data prep
test['host_identity_verified'] = test['host_identity_verified'].astype(str)
test['host_identity_verified'] = test['host_identity_verified'].replace({'t': 1, 'f': 0})

# bathrooms_text
top_bathroom_types = test['bathrooms_text'].value_counts().index[:5]
test['bathrooms_text'] = test['bathrooms_text'].apply(lambda x: x if x in top_bathroom_types else 'Other')

# has_availability data prep
test['has_availability'] = test['has_availability'].astype(str)
test['has_availability'] = test['has_availability'].replace({'t': 1, 'f': 0})

# Imputing
test = test.fillna(test.median(numeric_only=True))
test = test.fillna(method = 'ffill')
test = test.fillna(method = 'bfill')

  train = train.fillna(method = 'ffill')
  train = train.fillna(method = 'bfill')
  test = test.fillna(method = 'ffill')
  test = test.fillna(method = 'bfill')


### Predictor Selection

In [13]:
bad_predictors = ['host_neighbourhood',
 'host_has_profile_pic',
 'host_identity_verified',
 'maximum_maximum_nights',
 'maximum_nights_avg_ntm',
 'has_availability',
 'review_scores_checkin',
 'review_scores_communication',
 'calculated_host_listings_count_shared_rooms',
 'host_location_Chicago_IL',
 'host_location_New_York_NY',
 'host_location_Other',
 'host_response_time_a_few_days_or_more',
 'host_response_time_within_a_few_hours',
 'host_response_time_within_an_hour',
 'host_is_superhost_0',
 'host_is_superhost_1',
 'host_is_superhost_nan',
 'host_verifications_email_phone_work_email',
 'host_verifications_email_phone',
 'host_verifications_email',
 'host_verifications_phone_work_email',
 'host_verifications_phone',
 'property_type_Entire_condo',
 'property_type_Entire_home',
 'property_type_Entire_rental_unit',
 'property_type_Entire_serviced_apartment',
 'property_type_Other',
 'property_type_Private_room_in_home',
 'property_type_Private_room_in_rental_unit',
 'property_type_Room_in_hotel',
 'room_type_Entire_home/apt',
 'room_type_Hotel_room',
 'room_type_Private_room',
 'room_type_Shared_room',
 'bathrooms_text_1_bath',
 'bathrooms_text_1_private_bath',
 'bathrooms_text_1_shared_bath',
 'bathrooms_text_1.5_baths',
 'bathrooms_text_2_baths',
 'instant_bookable_f',
 'instant_bookable_t']

def clean_feature_name(name):
    translation_table = str.maketrans({
        ",": "",
        "[": "",
        "]": "",
        "'": "",
        '"': "",
        " ": "_"
    })
    cleaned_name = name.translate(translation_table)
    return cleaned_name

# ** RUN ONLY ONCE **

ids = test.id
train = train.drop(columns=['id', 'host_id', 'host_since', 'first_review', 'last_review'])
test = test.drop(columns=['id', 'host_id', 'host_since', 'first_review', 'last_review'])

x_train = train.drop(columns='price')
y_train = train.price

x_test = test


x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

x_train.columns = [clean_feature_name(col) for col in x_train.columns]
x_test.columns = [clean_feature_name(col) for col in x_test.columns]

x_train = x_train.drop(columns=bad_predictors)
x_test = x_test.drop(columns=bad_predictors)

# Clean all feature names in the DataFrame
x_train.columns = [clean_feature_name(col) for col in x_train.columns]
x_test.columns = [clean_feature_name(col) for col in x_test.columns]

In [14]:
# lasso
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Define and fit the Lasso model
lasso = Lasso(alpha=0.1)  # Adjust alpha for regularization strength
lasso.fit(x_train_scaled, y_train)

# Get the coefficients
lasso_coef = lasso.coef_

# Print the selected predictors (non-zero coefficients)
selected_predictors = np.where(lasso_coef != 0)[0]

x_train = x_train.iloc[:,selected_predictors]
x_test = x_test.iloc[:,selected_predictors]
y_train = train[['price']]

x_train = pd.get_dummies(x_train, drop_first=True)
x_test = pd.get_dummies(x_test, drop_first=True)

# step 5: final processing steps
missing_columns = [col for col in x_train.columns if col not in x_test.columns]
missing_columns_df = pd.DataFrame(0, index=x_test.index, columns=missing_columns)
x_test = pd.concat([x_test, missing_columns_df], axis=1)

# Reordering x_test columns to match x_train
x_test = x_test[x_train.columns]

x_train.columns = x_train.columns.str.replace(' ', '_')
x_test.columns = x_test.columns.str.replace(' ', '_')


### Model Tuning and Training

In [16]:
# ensemble of four boosting models, each already individually tuned
# didn't tune them as an ensemble because it would take too long to run and it would probably have minimal improvement
# fitting this model should take ~ 2 minutes

bm1 = XGBRegressor(random_state = 12,
                   objective = 'reg:squarederror',
                   colsample_bytree = 0.5,
                   gamma = 0.1,
                   learning_rate = 0.01,
                   max_depth = 8,
                   n_estimators = 1000,
                   reg_lambda = 1,
                   subsample = 0.75)

bm2 = CatBoostRegressor(random_state = 11,
                       n_estimators = 1100,
                       max_depth = 8,
                       learning_rate = .1,
                       subsample = .5,
                       reg_lambda = .1,
                       verbose=False
                       )
bm3 = LGBMRegressor(random_state = 1,
                    num_threads = 1,
                    n_estimators = 700,
                    max_depth = 6,
                    learning_rate = .01,
                    subsample = .5,
                    reg_lambda = .1,
                    verbose=-1)

bm4 = AdaBoostRegressor(n_estimators=115, learning_rate=0.001, random_state=12)

model = StackingRegressor(estimators = [('xgb',bm1),('catboost',bm2),('lgbm',bm3),('ada',bm4)], final_estimator = LinearRegression())

model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


### Prediction

In [17]:
predictions = model.predict(x_test)

# create csv file of predicted classifications
output = pd.DataFrame({'id': ids,'predicted':predictions})
output.to_csv('EnsembleRegression.csv', index=False)

In [18]:
# testing to get a sense of model performance
predictions = model.predict(x_train)
model_rmse = np.sqrt(np.mean((predictions - train.price)**2))
print(model_rmse)



18.809900282464298
