# Kaggle Seattle Airbnb

## Project Info

### Author Info

- Author: [Zacks Shen](https://www.linkedin.com/in/zacks-shen/)
- Contributor: [Kevin Chu](https://www.linkedin.com/in/yen-duo-chu/)

---

### GitHub

- [Kaggle-Seattle-Airbnb](https://github.com/ZacksAmber/Kaggle-Seattle-Airbnb)



---

# Dependencies

In [1]:
# Statistics
import pandas as pd
import numpy as np
import math as mt

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

'''Visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

px.defaults.width = 1200
px.defaults.height = 800
# plotly.io Settings for both plotly.graph_objects and plotly.express
pio.templates.default = "plotly_white" # "plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "none"
pio.kaleido.scope.default_format = 'svg'
pio.kaleido.scope.default_scale = 1
'''

# Data Preprocessing - Standardization, Encoding, Imputation
from sklearn.preprocessing import StandardScaler # Standardization
from sklearn.preprocessing import Normalizer # Normalization
from sklearn.preprocessing import OneHotEncoder # One-hot Encoding
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from category_encoders import MEstimateEncoder # Target Encoding
from sklearn.preprocessing import PolynomialFeatures # Create Polynomial Features
from sklearn.impute import SimpleImputer # Imputation

# Exploratory Data Analysis - Feature Engineering
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

# Modeling - ML Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Modeling - Algorithms
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# ML - Evaluation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# ML - Tuning
import optuna
#from sklearn.model_selection import GridSearchCV

# Settings
# Settings for Seaborn
sns.set_theme(context='notebook', style='ticks', palette="bwr_r", font_scale=0.7, rc={"figure.dpi":240, 'savefig.dpi':240})

In [2]:
import os

kaggle_project = 'seattle'

# Prefer the local copy under ./data; otherwise fall back to the Kaggle input mount
if os.path.exists('data'):
    data_dir = './data/201601'
else:
    data_dir = f'/kaggle/input/{kaggle_project}'

# List every file found below data_dir
for dirname, _, filenames in os.walk(data_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the three tables; listings come from the fold-annotated file when it exists
reviews = pd.read_csv(f'{data_dir}/reviews.csv')
calendar = pd.read_csv(f'{data_dir}/calendar.csv')
if os.path.exists(f'{data_dir}/listings_kfold.csv'):
    listings = pd.read_csv(f'{data_dir}/listings_kfold.csv')
else:
    listings = pd.read_csv(f'{data_dir}/listings.csv')

./data/201601/reviews.csv
./data/201601/listings_kfold.csv
./data/201601/listings.csv
./data/201601/calendar.csv
./data/201601/.ipynb_checkpoints/listings_kfold-checkpoint.csv
./data/201601/.ipynb_checkpoints/calendar-checkpoint.csv
./data/201601/.ipynb_checkpoints/listings-checkpoint.csv


---

## Cross-Validation KFold

In [3]:
def generate_listings_kfold():
    """Write ``listings_kfold.csv``: the listings table plus a ``kfold`` column.

    Reads ``listings.csv`` from ``data_dir``, assigns every row to one of
    five shuffled folds (fixed seed 42) and saves the result next to the
    source file, replacing any previous ``listings_kfold.csv``.
    """
    source_path = f'{data_dir}/listings.csv'
    target_path = f'{data_dir}/listings_kfold.csv'

    listings = pd.read_csv(source_path)

    # Start from a clean slate if an older fold file is present
    if os.path.exists(target_path):
        os.remove(target_path)

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    # Each row appears in exactly one validation split; tag it with that fold number
    for fold, (_, valid_idx) in enumerate(splitter.split(X=listings)):
        listings.loc[valid_idx, "kfold"] = fold

    listings.to_csv(target_path, index=False)
#generate_listings_kfold()

In [4]:
# Preview the listing id next to its assigned fold.
# If the 'kfold' column is missing this raises an error: run
# generate_listings_kfold() above, then reload listings_kfold.csv.
listings.loc[:, ['id', 'kfold']].head()

Unnamed: 0,id,kfold
0,241032,0.0
1,953595,4.0
2,3308979,2.0
3,7421966,3.0
4,278830,4.0


---

# ETL Pipeline

The ETL pipeline converts the raw columns into proper data types (numeric values, datetimes, and binary flags), so that later calculations and machine learning operate on correctly formatted data.

In [5]:
class ETL_pipeline:
    """Convert raw Airbnb CSV columns into numeric / datetime types.

    Wrap one raw table (listings, reviews or calendar) and call the matching
    ``parse_*`` method. The wrapped DataFrame is modified in place and is
    also returned, so ``df = ETL_pipeline(df).parse_listings()`` works.
    """

    # Money columns stored as text such as "$1,250.00"
    _DOLLAR_COLS = ['price', 'weekly_price', 'monthly_price', 'extra_people',
                    'security_deposit', 'cleaning_fee']
    # Rate columns stored as text such as "96%"
    _PERCENT_COLS = ['host_response_rate', 'host_acceptance_rate']
    # Boolean columns stored as 't' / 'f'
    _FLAG_COLS = ['host_is_superhost', 'instant_bookable',
                  'require_guest_profile_picture', 'require_guest_phone_verification']
    # Property types merged into "Unique space" due to small sample size
    _UNIQUE_SPACE_TYPES = [
        "Barn",
        "Boat",
        "Bus",
        "Camper/RV",
        "Treehouse",
        "Campsite",
        "Castle",
        "Cave",
        "Dome House",
        "Earth house",
        "Farm stay",
        "Holiday park",
        "Houseboat",
        "Hut",
        "Igloo",
        "Island",
        "Lighthouse",
        "Plane",
        "Ranch",
        "Religious building",
        "Shepherd’s hut",
        "Shipping container",
        "Tent",
        "Tiny house",
        "Tipi",
        "Tower",
        "Train",
        "Windmill",
        "Yurt",
        "Riad",
        "Pension",
        "Dorm",
        "Chalet",
    ]

    def __init__(self, data_frame):
        """Store the raw table to be parsed.

        Args:
            data_frame (Pandas DataFrame): listings, reviews or calendar.
        """
        self.df = data_frame

    @staticmethod
    def _strip_to_float(series, pattern):
        """Remove every match of regex ``pattern`` and cast the column to float."""
        return series.replace(pattern, '', regex=True).astype(float)

    @staticmethod
    def _encode_flags(series):
        """Map the exact values 't' -> 1 and 'f' -> 0.

        Any other value (missing values, already-converted 0/1) is passed
        through untouched, so parsing an already-parsed frame is harmless.
        """
        mapping = {'t': 1, 'f': 0}
        return series.map(lambda value: mapping.get(value, value))

    # Data type transformation
    def _transformation(self, data_frame):
        """Clean the listings table in place and return it.

        Args:
            data_frame (Pandas DataFrame): raw listings.

        Returns:
            Pandas DataFrame: the same object with dollar and percent columns
            as floats, rare property types merged, and t/f flags as 1/0.
        """
        df = data_frame

        # Convert dollar columns from object to float: remove '$' and ','
        for dollar_col in self._DOLLAR_COLS:
            df[dollar_col] = self._strip_to_float(df[dollar_col], r'[\$,]')

        # Convert percent columns from object to float: remove '%'
        for percent_col in self._PERCENT_COLS:
            df[percent_col] = self._strip_to_float(df[percent_col], '%')

        # Whole-value match only: a substring hit must not rewrite part of
        # an unrelated property type.
        is_unique_space = df['property_type'].isin(self._UNIQUE_SPACE_TYPES)
        df['property_type'] = df['property_type'].mask(is_unique_space, "Unique space")

        # Convert 't', 'f' to 1, 0
        for tf_col in self._FLAG_COLS:
            df[tf_col] = self._encode_flags(df[tf_col])

        return df

    # Parse listings
    def parse_listings(self):
        """Parse listings.

        Returns:
            Pandas DataFrame: the wrapped listings table after type cleaning.
        """
        return self._transformation(self.df)

    def parse_reviews(self):
        """Parse reviews.

        Returns:
            Pandas DataFrame: the wrapped reviews table with ``date`` as datetime.
        """
        df = self.df
        df['date'] = pd.to_datetime(df['date'])
        return df

    # Parse calendar
    def parse_calender(self):
        """Parse calendar.

        The misspelled name is kept for existing callers; ``parse_calendar``
        is an alias for the same method.

        Returns:
            Pandas DataFrame: the wrapped calendar table with ``date`` as
            datetime, ``price`` as float and ``available`` as 1/0.
        """
        df = self.df
        # Convert date from object to datetime
        df['date'] = pd.to_datetime(df['date'])
        # Convert price from object to float: remove '$' and ','
        df['price'] = self._strip_to_float(df['price'], r'[\$,]')
        # Convert 't', 'f' to 1, 0
        df['available'] = self._encode_flags(df['available'])

        return df

    # Correctly spelled alias
    parse_calendar = parse_calender

In [6]:
# Run each raw table through its matching ETL parser; the names are rebound
# to the parsed frames that every later cell reads.
listings = ETL_pipeline(listings).parse_listings()
reviews = ETL_pipeline(reviews).parse_reviews()
calendar = ETL_pipeline(calendar).parse_calender()

---

# ML Pipeline

In [7]:
class EDA_demand:
    """Estimate per-listing demand from the calendar and review tables.

    The methods read the module-level ``reviews``, ``calendar`` and
    ``listings`` DataFrames, which must already have been parsed by the ETL
    step (datetime ``date`` columns, ``available`` as 1/0).
    """

    def __init__(self):
        pass

    def reviews_rate_vs_unavailability(self, period=30):
        """Calculate the booked listing from file calendar.

        For every listing this computes the number of unavailable calendar
        days up to ``period`` days from the scrape date, and the number of
        reviews per day since the listing's first review. Listings that
        combine a low review rate with a high unavailability are treated as
        blocked rather than booked, and their demand is flipped to
        ``period - unavailable_days``.

        Side effects: stores ``period``, ``df_unavailability``, ``outliers``
        and ``df_unavailability_reviews`` on the instance.

        Args:
            period (int): Positive integer no greater than 365. Default is 30.

        Returns:
            Pandas DataFrame indexed by ``listing_id`` with the columns
            ``unavailability_{period}_unscaled``, ``unavailability_{period}``,
            ``first_day``, ``last_day``, ``datediff``, ``reviews_per_day``
            and ``demand``.

        Raises:
            AssertionError: if ``period`` is not an int in the range 1..365.
        """
        # Check the type first so a non-numeric period fails the assertion
        # instead of raising TypeError inside the comparison.
        assert isinstance(period, int) and 0 < period <= 365, \
            "period must be an integer in the range 1..365"
        self.period = period

        #
        # Calculate review rate & unavailability
        #

        # Reviews rate: reviews / days
        #
        #   SELECT
        #       listing_id,
        #       COUNT(listing_id) / DATEDIFF(20160104+1, MIN(date)) AS reviews_per_day
        #   FROM reviews
        #   GROUP BY listing_id
        reviews_by_listing = reviews.groupby('listing_id')
        # First review date for each listing
        df_reviews_per_day = reviews_by_listing['date'].min().to_frame('first_day')
        # Last scraped date: the first distinct value of listings.last_scraped
        last_scraped = pd.Timestamp(listings.last_scraped.unique()[0])
        df_reviews_per_day['last_day'] = last_scraped + pd.DateOffset(days=1)
        # Whole days between the first review and the day after the scrape
        elapsed = df_reviews_per_day['last_day'] - df_reviews_per_day['first_day']
        df_reviews_per_day['datediff'] = elapsed.dt.days
        df_reviews_per_day['reviews_per_day'] = (
            reviews_by_listing.size() / df_reviews_per_day['datediff']
        )

        # Unavailable days inside the window
        #
        #   SELECT listing_id, SUM(IF(available = 0, 1, 0))
        #   FROM calendar
        #   WHERE DATEDIFF(date, 20160104) <= period
        #   GROUP BY listing_id
        unscaled_col = f'unavailability_{period}_unscaled'
        scaled_col = f'unavailability_{period}'
        last_day = last_scraped + pd.DateOffset(days=period - 1)
        window = calendar[calendar.date <= last_day]
        df_unavailability = (
            window['available'].eq(0)
            .groupby(window['listing_id'])
            .sum()
            .to_frame(unscaled_col)
        )
        # Scale the day count to a fraction of the window
        df_unavailability[scaled_col] = df_unavailability[unscaled_col] / period
        self.df_unavailability = df_unavailability

        # Join two tables; listings without any review get a rate of 0.
        # Assign the filled column back: an in-place fillna on the attribute
        # accessor is not guaranteed to reach the parent frame.
        df_unavailability_reviews = df_unavailability.join(df_reviews_per_day, how='left')
        df_unavailability_reviews['reviews_per_day'] = (
            df_unavailability_reviews['reviews_per_day'].fillna(0)
        )

        # Find outliers (unavailable rather than booked)
        reviews_rate_25 = df_unavailability_reviews['reviews_per_day'].quantile(
            q=0.25, interpolation='higher')
        unavailability_75 = df_unavailability_reviews[scaled_col].quantile(
            q=0.75, interpolation='higher')
        low_review_rate = df_unavailability_reviews['reviews_per_day'] < reviews_rate_25
        high_unavailability = df_unavailability_reviews[scaled_col] > unavailability_75

        outliers = df_unavailability_reviews[low_review_rate & high_unavailability]
        # Demand defaults to the unavailable-day count ...
        df_unavailability_reviews['demand'] = df_unavailability_reviews[unscaled_col]
        # ... and is inverted for the outliers.
        df_unavailability_reviews.loc[outliers.index, 'demand'] = (
            period - df_unavailability_reviews.loc[outliers.index, 'demand']
        )

        self.outliers = outliers
        self.df_unavailability_reviews = df_unavailability_reviews

        return self.df_unavailability_reviews

In [8]:
class ML_pipeline:
    """ML Pipeline for listings.

    Joins the demand estimate onto the listings table, expands the
    ``amenities`` text into one boolean column per amenity, and serves
    imputed and encoded train/validation splits for one ``kfold`` value at
    a time through :meth:`getData`.
    """

    # Categorical columns handled by _target_encoding / _one_hot_encoding
    _CATEGORICAL_FEATURES = ['cancellation_policy', 'require_guest_profile_picture',
                             'require_guest_phone_verification', 'neighbourhood_group_cleansed',
                             'property_type', 'instant_bookable', 'room_type', 'bed_type']
    # Missing values replaced by 0
    _ZERO_FEATURES = ['reviews_per_month', 'host_response_rate', 'host_is_superhost',
                      'security_deposit', 'cleaning_fee']
    # Missing values replaced by the training-fold mean
    _MEAN_FEATURES = ['host_acceptance_rate', 'review_scores_accuracy', 'review_scores_checkin',
                      'review_scores_value', 'review_scores_location', 'review_scores_cleanliness',
                      'review_scores_communication', 'review_scores_rating']
    # Missing values replaced by the training-fold mode
    _MODE_FEATURES = ['bathrooms', 'bedrooms', 'beds', 'property_type']
    # Mode-imputed columns cast to int afterwards
    _MODE_INT_FEATURES = ['bathrooms', 'bedrooms', 'beds']

    def __init__(self, data_frame, features, target, days=365):
        """Build the modelling table.

        Args:
            data_frame (Pandas DataFrame): listings.
            features (list): The Machine Learning features. Must contain
                'amenities'. The list is not modified.
            target (str): price
            days (int): The days after 2016-01-04 for calculating demand.
        """
        import warnings
        warnings.filterwarnings("ignore") # ignore target encoding warnings

        self.target = target

        # Get demand
        demand = EDA_demand().reviews_rate_vs_unavailability(days)
        # The index will change to id
        data_frame = data_frame.set_index('id').join(demand['demand'], how='inner')

        # Work on a copy of the feature list so the caller's list is untouched
        # and the target is never selected twice.
        columns = list(features)
        if target not in columns:
            columns.append(target)
        data_frame = data_frame[columns]

        # Encode amenities, then drop the raw text column
        data_frame = self._encode_amentities(data_frame)
        data_frame.pop('amenities')

        self.data_frame = data_frame

    # encode amentities
    def _encode_amentities(self, data_frame):
        """Expand the ``amenities`` text column into boolean ``amenity_*`` columns.

        Each listing's text looks like ``{TV,"Cable TV",Internet}``. Braces
        and quotes are stripped, the text is split on ',', and every distinct
        token becomes a column that is True where the listing has exactly
        that token. Matching whole tokens keeps "Cable TV" from also setting
        ``amenity_TV``. Columns come out in sorted order.

        'Washer / Dryer' gets no column of its own: only two rows have it,
        and they both have washer and dryer.

        Args:
            data_frame (Pandas DataFrame): table with an ``amenities`` column.
                It is not modified.

        Returns:
            Pandas DataFrame: a new frame holding the input columns (with
            ``amenities`` cleaned of braces and quotes) plus the encoded columns.
        """
        # Replace {}" in amenities with ''
        cleaned = data_frame['amenities'].replace('[{}"]', '', regex=True)

        # One indicator column per distinct comma-separated token;
        # empty strings yield no column.
        amenities_enc = cleaned.str.get_dummies(sep=',').astype(bool)
        amenities_enc = amenities_enc.drop(columns='Washer / Dryer', errors='ignore')

        # Rename the columns with prefix amenity_
        amenities_enc = amenities_enc.add_prefix('amenity_')

        # Concat encoded amenities and data_frame
        return pd.concat([data_frame.assign(amenities=cleaned), amenities_enc], axis=1)

    @staticmethod
    def _fit_impute(imputer, columns, X_train, X_valid):
        """Fit ``imputer`` on the training columns and transform both splits.

        Returns:
            tuple: (train, valid) DataFrames restricted to ``columns`` and
            carrying the original row indexes.
        """
        train_imp = pd.DataFrame(imputer.fit_transform(X_train[columns]),
                                 columns=columns, index=X_train.index)
        valid_imp = pd.DataFrame(imputer.transform(X_valid[columns]),
                                 columns=columns, index=X_valid.index)
        return train_imp, valid_imp

    def _imputation(self, X_train, X_valid, y_train, y_valid):
        """Fill missing values; imputers are fitted on the training split only.

        Three disjoint column groups are imputed with 0, the mean and the
        most frequent value respectively (see the class constants).

        Returns:
            tuple: copies of (X_train, X_valid, y_train, y_valid).
        """
        X_train, X_valid, y_train, y_valid = X_train.copy(), X_valid.copy(), y_train.copy(), y_valid.copy()

        # Zero imputation
        zero_imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        zero_train, zero_valid = self._fit_impute(zero_imp, self._ZERO_FEATURES, X_train, X_valid)
        zero_train, zero_valid = zero_train.astype(float), zero_valid.astype(float)

        # Mean imputation
        mean_imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        mean_train, mean_valid = self._fit_impute(mean_imp, self._MEAN_FEATURES, X_train, X_valid)
        mean_train, mean_valid = mean_train.astype(float), mean_valid.astype(float)

        # Mode imputation
        mode_imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        mode_train, mode_valid = self._fit_impute(mode_imp, self._MODE_FEATURES, X_train, X_valid)
        # NOTE(review): astype(int) truncates fractional values (e.g. 1.5
        # bathrooms becomes 1) -- confirm this is intended.
        int_cols = self._MODE_INT_FEATURES
        mode_train[int_cols] = mode_train[int_cols].astype(int)
        mode_valid[int_cols] = mode_valid[int_cols].astype(int)

        # Replace the unimputed columns
        for imputed_train, imputed_valid in ((zero_train, zero_valid),
                                             (mean_train, mean_valid),
                                             (mode_train, mode_valid)):
            for feature in imputed_train.columns:
                X_train[feature] = imputed_train[feature]
                X_valid[feature] = imputed_valid[feature]

        return X_train, X_valid, y_train, y_valid

    def _one_hot_encoding(self, X_train, X_valid, y_train, y_valid):
        """Encode the categorical columns as ordinal integers.

        Despite its name this method applies ``OrdinalEncoder``, not one-hot
        encoding. The encoder is fitted on the training split only.

        Returns:
            tuple: copies of (X_train, X_valid, y_train, y_valid).
        """
        X_train, X_valid, y_train, y_valid = X_train.copy(), X_valid.copy(), y_train.copy(), y_valid.copy()

        oe_enc_features = self._CATEGORICAL_FEATURES

        oe = OrdinalEncoder()
        X_train[oe_enc_features] = oe.fit_transform(X_train[oe_enc_features])
        X_valid[oe_enc_features] = oe.transform(X_valid[oe_enc_features])

        return X_train, X_valid, y_train, y_valid

    def _target_encoding(self, X_train, X_valid, y_train, y_valid):
        """Target-encode the categorical columns with an M-estimate encoder.

        The encoder is fitted on the training split and its target, then
        applied to the validation split.

        Returns:
            tuple: copies of (X_train, X_valid, y_train, y_valid).
        """
        X_train, X_valid, y_train, y_valid = X_train.copy(), X_valid.copy(), y_train.copy(), y_valid.copy()

        # Create the encoder instance. Choose m to control noise.
        target_enc = MEstimateEncoder(cols=self._CATEGORICAL_FEATURES, m=5.0)
        X_train = target_enc.fit_transform(X_train, y_train)
        X_valid = target_enc.transform(X_valid)

        return X_train, X_valid, y_train, y_valid

    def getData(self, kfold, target_encoding=True):
        """Return the imputed and encoded split for one fold.

        Rows whose ``kfold`` equals ``kfold`` form the validation split; all
        other rows form the training split. The ``kfold`` column itself is
        left in the returned feature frames, so callers select the model
        columns they need.

        Args:
            kfold (int): fold number used for validation.
            target_encoding (bool): True for M-estimate target encoding,
                False for ordinal encoding.

        Returns:
            tuple: (X_train, X_valid, y_train, y_valid).
        """
        data_frame = self.data_frame.copy()

        # Split train and valid; copy so popping the target never touches
        # a view of the shared frame.
        in_valid_fold = data_frame.kfold == kfold
        X_train = data_frame[~in_valid_fold].copy()
        X_valid = data_frame[in_valid_fold].copy()
        y_train = X_train.pop(self.target)
        y_valid = X_valid.pop(self.target)

        # Imputation
        X_train, X_valid, y_train, y_valid = self._imputation(X_train, X_valid, y_train, y_valid)

        # Encoding
        if target_encoding:
            X_train, X_valid, y_train, y_valid = self._target_encoding(X_train, X_valid, y_train, y_valid)
        else:
            X_train, X_valid, y_train, y_valid = self._one_hot_encoding(X_train, X_valid, y_train, y_valid)

        return X_train, X_valid, y_train, y_valid

---

# Machine Learning

## Model Tuning

The Hyperparameter tuning platform I used is [Optuna](https://optuna.org/).<br>
I implemented a logger to write the tuning results in the local log file.<br>
After all tuning runs have finished, the program will send an email to my mailbox with the best hyperparameters.
- **To enable this feature**, go to [configure your gmail first](#gmail-configuration).

P.S.: If your computer does not support GPU acceleration, uncomment the code under `For CPU` and comment out the code under `For GPU`.

---

### Define Logger

In [9]:
import logging
import os

# Define logger
logger = logging.getLogger('ML')

# Set level for logger
logger.setLevel(logging.DEBUG)

# Define the handler and formatter for file logging
log_file = 'ML'

# logging.getLogger returns the same object on every call, so re-running this
# cell would stack one more FileHandler on it and write each record several
# times. Detach handlers from earlier runs that target the same file first.
for stale_handler in list(logger.handlers):
    if (isinstance(stale_handler, logging.FileHandler)
            and stale_handler.baseFilename == os.path.abspath(f'{log_file}.log')):
        logger.removeHandler(stale_handler)
        stale_handler.close()

fileHandler = logging.FileHandler(f'{log_file}.log') # Define FileHandler
fileHandler.setLevel(logging.INFO) # Set level
fileFormatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') # Define formatter
fileHandler.setFormatter(fileFormatter) # Set formatter
logger.addHandler(fileHandler) # Add handler to logger

---

### Define Features for ML

In [10]:
# Define sheet id and base url of the Google Sheet holding the column metadata;
# appending a sheet name to base_url gives that sheet's CSV export URL
sheet_id = "1M_qah-ym6O8vDcSmoKAP-lbZRPHUey83R_DJaW3LXfs"
base_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet="

# Load metadata for three datasets (requires network access)
listings_metadata = pd.read_csv(base_url+"listings")
calendar_metadata = pd.read_csv(base_url+"calendar")
reviews_metadata = pd.read_csv(base_url+"reviews")

# Encoded amenity columns used as model features; the names follow the
# "amenity_<name>" pattern produced by ML_pipeline._encode_amentities
amenities = ['amenity_Washer', 'amenity_Air Conditioning', 'amenity_TV',
             'amenity_Kitchen', 'amenity_Wheelchair Accessible',
             'amenity_Free Parking on Premises', 'amenity_Doorman',
             'amenity_Cable TV', 'amenity_Smoke Detector',
             'amenity_Pets live on this property', 'amenity_Internet',
             'amenity_Hangers', 'amenity_Family/Kid Friendly',
             'amenity_First Aid Kit', 'amenity_Indoor Fireplace', 'amenity_Gym',
             'amenity_Suitable for Events', 'amenity_Breakfast', 'amenity_Cat(s)',
             'amenity_Lock on Bedroom Door', 'amenity_Smoking Allowed',
             'amenity_Dog(s)', 'amenity_Shampoo', 'amenity_Hair Dryer',
             'amenity_Carbon Monoxide Detector', 'amenity_Wireless Internet',
             'amenity_Hot Tub', 'amenity_Safety Card',
             'amenity_Buzzer/Wireless Intercom', 'amenity_Pool',
             'amenity_Elevator in Building', 'amenity_Pets Allowed',
             'amenity_Fire Extinguisher', 'amenity_Other pet(s)',
             'amenity_Laptop Friendly Workspace', 'amenity_Essentials',
             'amenity_Iron', 'amenity_Dryer', 'amenity_24-Hour Check-in',
             'amenity_Heating']
    
# ML1 + ML2
# ml1: labels whose metadata row has ML == 1, minus the columns listed in
# useless_features (list.remove raises ValueError if a label is absent).
# NOTE(review): the meaning of the ML == 1 / ML == 2 groups is defined in the
# metadata sheet, not in this file -- confirm there.
ml1 = listings_metadata[listings_metadata.ML == 1].Label.to_list()
useless_features = ['availability_30', 'availability_60', 'availability_90', 'availability_365', 'first_review', 'last_review', 'amenities']
for useless_feature in useless_features:
    ml1.remove(useless_feature)
# ml2: labels with ML == 2 plus the engineered 'demand' column ...
ml2 = listings_metadata[listings_metadata.ML == 2].Label.to_list()
ml2.append('demand')
# ... then rebound to the full model column list: ml1 + ml2 + amenity columns
ml2 = ml1 + ml2 + amenities

---

### Tuning Configurations

In [11]:
# Silence Optuna: only messages at WARNING level or above are shown
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [12]:
# Define number of trials run by each Optuna study below
n_trials = 2

---

### Model Tuning: XGBoost

In [13]:
def objective(trial):
    """Modeling tuning with Target encoding.

    Optuna objective for XGBoost: samples one set of hyperparameters for the
    trial, fits one model per cross-validation fold and returns the mean
    validation RMSE over the five folds.

    NOTE(review): early stopping monitors the same fold that is scored
    afterwards, so the returned RMSE is optimistic.

    Args:
        trial (optuna.trial.Trial): the trial supplying hyperparameters.

    Returns:
        float: mean RMSE across the five validation folds.
    """
    features = ['host_acceptance_rate', 'neighbourhood_group_cleansed', 'property_type', 'room_type',
                'bathrooms', 'bedrooms', 'beds', 'bed_type', 'number_of_reviews', 'review_scores_rating',
                'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
                'review_scores_location', 'review_scores_value', 'reviews_per_month', 'host_response_rate', 'host_is_superhost',
                'accommodates', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights',
                'maximum_nights', 'instant_bookable', 'cancellation_policy', 'require_guest_profile_picture',
                'require_guest_phone_verification', 'amenities', 'demand', 'kfold']

    ml_pipeline = ML_pipeline(data_frame=listings, features=features, target='price')

    # Hyperparameters for XGBoost, sampled once per trial (a trial returns
    # the same value for a repeated name, so sampling inside the fold loop
    # added nothing).
    # 'lambda' / 'alpha' are aliases of 'reg_lambda' / 'reg_alpha'; only the
    # reg_* names are passed so the model never gets two conflicting values
    # for the same regularisation term.
    xgb_params = {
        'reg_lambda': trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
        'reg_alpha': trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
        'subsample': trial.suggest_float("subsample", 0.1, 1.0),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'max_depth': trial.suggest_int("max_depth", 1, 7),
        'random_state': trial.suggest_categorical('random_state', [0, 42, 2021]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300)
    }

    fold_rmse = []
    for kfold in range(5):
        X_train, X_valid, y_train, y_valid = ml_pipeline.getData(kfold=kfold, target_encoding=True)
        # Keep only the model columns (drops helper columns such as kfold)
        X_train, X_valid = X_train[ml2], X_valid[ml2]

        # For GPU
        model = XGBRegressor(
                tree_method='gpu_hist',
                gpu_id=0,
                predictor='gpu_predictor',
                **xgb_params)

        # For CPU
        # model = XGBRegressor(**xgb_params)

        # NOTE(review): newer XGBoost releases expect early_stopping_rounds on
        # the estimator rather than in fit() -- confirm against the installed
        # version.
        model.fit(
            X_train, y_train,
            early_stopping_rounds=300,
            eval_set=[(X_valid, y_valid)],
            verbose=5000
        )

        valid_preds = model.predict(X_valid)
        # sqrt(MSE) is used instead of the deprecated squared=False flag
        fold_rmse.append(np.sqrt(mean_squared_error(y_valid, valid_preds)))

    return np.mean(fold_rmse)

In [14]:
%%time
# Create a study that minimises the value returned by objective() and run it
study = optuna.create_study(direction='minimize', study_name=f'XGBoost {n_trials} trails')
study.optimize(objective, n_trials=n_trials, show_progress_bar=False) # set n_trials

# Write the study name, best value and best parameters to the ML logger
logger.info(f"Study name: {study.study_name}")
logger.info(f"Best value: {study.best_value}")
logger.info(f"Best paras: {study.best_params}")
logger.info("Mission Complete! --------------")

[0]	validation_0-rmse:146.89149
[1240]	validation_0-rmse:49.49187
[0]	validation_0-rmse:154.56607
[803]	validation_0-rmse:60.99924
[0]	validation_0-rmse:142.88559
[620]	validation_0-rmse:58.63733
[0]	validation_0-rmse:149.67696
[1708]	validation_0-rmse:56.09981
[0]	validation_0-rmse:148.25832
[601]	validation_0-rmse:50.44122
[0]	validation_0-rmse:144.33707
[1362]	validation_0-rmse:54.90929
[0]	validation_0-rmse:151.35757
[1870]	validation_0-rmse:64.23412
[0]	validation_0-rmse:139.70265
[1815]	validation_0-rmse:60.93744
[0]	validation_0-rmse:146.43550
[1836]	validation_0-rmse:61.31049
[0]	validation_0-rmse:145.27843
[1708]	validation_0-rmse:52.91108
CPU times: user 5min 58s, sys: 23.5 s, total: 6min 21s
Wall time: 34.3 s


---

### Model Tuning: LightGBM

In [15]:
def objective(trial):
    """Modeling tuning with Target encoding.

    Optuna objective for LightGBM: samples one set of hyperparameters for
    the trial, fits one model per cross-validation fold and returns the mean
    validation RMSE over the five folds.

    NOTE(review): early stopping monitors the same fold that is scored
    afterwards, so the returned RMSE is optimistic.

    Args:
        trial (optuna.trial.Trial): the trial supplying hyperparameters.

    Returns:
        float: mean RMSE across the five validation folds.
    """
    features = ['host_acceptance_rate', 'neighbourhood_group_cleansed', 'property_type', 'room_type',
                'bathrooms', 'bedrooms', 'beds', 'bed_type', 'number_of_reviews', 'review_scores_rating',
                'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
                'review_scores_location', 'review_scores_value', 'reviews_per_month', 'host_response_rate', 'host_is_superhost',
                'accommodates', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights',
                'maximum_nights', 'instant_bookable', 'cancellation_policy', 'require_guest_profile_picture',
                'require_guest_phone_verification', 'amenities', 'demand', 'kfold']

    ml_pipeline = ML_pipeline(data_frame=listings, features=features, target='price')

    # Hyperparameters for LightGBM, sampled once per trial (a trial returns
    # the same value for a repeated name, so sampling inside the fold loop
    # added nothing).
    # 'min_child_samples' is an alias of 'min_data_in_leaf'; only the latter
    # is passed so the model never gets two conflicting values for it.
    lgb_params = {
        'random_state': trial.suggest_categorical('random_state', [0, 42, 2021]),
        'num_iterations': trial.suggest_int('num_iterations', 100, 10000),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 7),
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 2000),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.01, 0.99),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.01, 0.99),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    fold_rmse = []
    for kfold in range(5):
        X_train, X_valid, y_train, y_valid = ml_pipeline.getData(kfold=kfold, target_encoding=True)
        # Keep only the model columns (drops helper columns such as kfold)
        X_train, X_valid = X_train[ml2], X_valid[ml2]

        # For GPU
        model = LGBMRegressor(
                    device='gpu',
                    gpu_platform_id=0,
                    gpu_device_id=0,
                    n_jobs=-1,
                    metric='rmse',
                    **lgb_params
        )

        # For CPU
        # model = LGBMRegressor(**lgb_params)

        # NOTE(review): newer LightGBM releases expect early stopping and
        # logging to be configured through callbacks instead of fit()
        # arguments -- confirm against the installed version.
        model.fit(
            X_train, y_train,
            early_stopping_rounds=300,
            eval_set=[(X_valid, y_valid)],
            verbose=5000
        )

        valid_preds = model.predict(X_valid)
        # sqrt(MSE) is used instead of the deprecated squared=False flag
        fold_rmse.append(np.sqrt(mean_squared_error(y_valid, valid_preds)))

    return np.mean(fold_rmse)

In [16]:
%%time
# Create a study that minimises the value returned by objective() and run it
study = optuna.create_study(direction='minimize', study_name=f'LGBoost {n_trials} trails')
study.optimize(objective, n_trials=n_trials, show_progress_bar=False) # set n_trials

# Write the study name, best value and best parameters to the ML logger
logger.info(f"Study name: {study.study_name}")
logger.info(f"Best value: {study.best_value}")
logger.info(f"Best paras: {study.best_params}")
logger.info("Mission Complete! --------------")

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[3159]	valid_0's l2: 5076.1
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[2873]	valid_0's l2: 5739.6
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1370]	valid_0's l2: 5162.45
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[3291]	valid_0's l2: 5341.94
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1020]	valid_0's l2: 4471.65
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1666]	valid_0's l2: 4427.31
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[2896]	valid_0's l2: 5117.76
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[291]	valid_0's l2: 481

---

### Gmail Configuration<a id='gmail-configuration'></a>

> [How to Send Emails with Gmail using Python](https://stackabuse.com/how-to-send-emails-with-gmail-using-python/)

In [17]:
def gmail(YOUR_GMAIL, YOUR_APP_PASSWORD, SEND_TO):
    """Send the ML tuning result to one or more email addresses.

    The message reports the module-level ``n_trials`` and carries the file
    ``ML.log`` from the current working directory as an attachment. It is
    sent through Gmail's SMTP-over-SSL endpoint.

    Args:
        YOUR_GMAIL (str): Your gmail address.
        YOUR_APP_PASSWORD (str): Your APP Password for gmail.
        SEND_TO (str or list): The target emails.

    Raises:
        FileNotFoundError: if ``ML.log`` does not exist.
        smtplib.SMTPException: if login or delivery fails.
    """
    import smtplib
    from email.message import EmailMessage

    # Several recipients are written as one comma-separated header value
    recipients = SEND_TO if isinstance(SEND_TO, str) else ", ".join(SEND_TO)

    msg = EmailMessage()
    msg["From"] = YOUR_GMAIL
    msg["Subject"] = "Seattle Airbnb ML Tuning"
    msg["To"] = recipients
    msg.set_content(f"""\
    {n_trials} Trials are done.
    Mission Complete!""")
    with open('ML.log', 'rb') as f:
        msg.add_attachment(f.read(), maintype='application', subtype='log', filename='ML.log')

    # The context manager closes the connection even when login or sending
    # raises, so a failed attempt does not leak the socket.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(YOUR_GMAIL, YOUR_APP_PASSWORD)
        server.send_message(msg)
#gmail(YOUR_GMAIL, YOUR_APP_PASSWORD, SEND_TO)