In [147]:
# Import Libraries
import datetime as dt
import time
# Load packages
import holidays as holiday_lib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb
from scipy.stats import uniform
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics, linear_model
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV
)

# set max display of rows to 100
pd.set_option('display.max_rows', 100)

# show all columns
pd.set_option('max_columns', None)

In [148]:
df = pd.read_csv('https://raw.githubusercontent.com/ras592/mba-datasets/main/hotel_bookings.csv')

In [149]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


### Utility Functions

In [150]:
def convert_month_to_numeric(month_name):
    return dt.datetime.strptime(month_name, "%B").month

def model_timer(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        ret_obj = func(*args, **kwargs)
        end = time.time()
        print("Total time in seconds ==> ", end - start)
        return ret_obj
    return wrapper

### Intiail Feature Engineering

#### NaN Problems
- agent - dropping (Follow Up)
- company - if na -> 0 else 1 (see to numeric)
- children - fillna with median
#### To Numeric
- arrival_date_month
- company
#### One-hot encode
- hotel
- meal
- market_segment
- distribution_channel
- reserved_room_type
- assigned_room_type
- deposit_type
- customer_type
#### Drop
- reservation_status_date
- reservation_status
- country - consider using as continent
- agent

In [157]:
def initial_features(df):
    """ initial_features
        :param df: Pandas DataFrame
        :returns: Pandas DataFrame
        Outlined in Markdown above. Formats input dataframe into one ready for
        use in models like Random Forests.
    """
    
    encode = ['reserved_room_type','market_segment','hotel','distribution_channel','assigned_room_type',
              'deposit_type', 'customer_type', 'meal']
    
    for var in encode:
        df1 = pd.get_dummies(df[var], prefix = var) 
        df = pd.concat([df, df1], axis = 1) 
        del df[var]
    
    # Mostly null value, but should indicate a corporate trip
    df['company_bool'] = df['company'].isna()
 
    """ Children have a few missing rows let's fill that with the median (0.0).
    0.0     110796
    1.0       4861
    2.0       3652
    3.0         76
    10.0         1

    df['children'].isna().sum() -> 4 to 0.0
    """
    df['children'] = df['children'].fillna(df['children'].median())
    
    df = df.drop(['reservation_status_date', 'country', 'agent', 'assigned_room_type_L',
                  'reservation_status', 'company','arrival_date_month'], axis=1)

    # Check for remaining nans and fail if needed
    nan_count = df.isna().sum().sum()
    assert nan_count == 0, f'Remaing nan values in df: {nan_count}'
    
    return df

### Features from Feature Engineering Steps

In [173]:
def new_feature_engineering_features(df):
    df['children'] = df['children'].fillna(0)
    df["meal"] = df.meal.map(lambda x: "SC" if x == 'Undefined' else x)
    df.dropna(subset=['country'], inplace=True)

    #creating new arrival_date column from individual date columns
    df['arrival_date'] = pd.to_datetime(df['arrival_date_year'].astype(str) + 
                                        df['arrival_date_month'].astype(str) + 
                                        df['arrival_date_day_of_month'].astype(str), format='%Y%B%d')
    
    df['season'] = df.arrival_date.dt.month.map(lambda x: 'Spring' if x in [3,4,5] else ('Summer' if x in [6,7,8] else ('Fall' if x in [9,10,11] else 'Winter')))
    
    df['stay_length'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

    df['last_minute'] = df.lead_time.map(lambda x: 1 if x < 5 else 0)

    #extracting what day of the week the visitor was scheduled to arrive at the hotel
    #df['day_of_week_of_arrival'] = (df['arrival_date']).dt.day_name()

    #changing Month name to numeric
    df['arrival_date_month'] = pd.to_datetime(df.arrival_date_month, format='%B').dt.month

    #assigning bookings that were reserved the room type they requested 1, if they did not receive the room type they
    #requested, they are assigned 0.
    df['room_type_requested_and_received'] = np.where(df['reserved_room_type'] == df['assigned_room_type'], 1, 0)

    #finding what the local season is at the hotel depending on month of booking
    df['season'] = df.arrival_date.dt.month.map(lambda x: 'Spring' if x in [3,4,5] else ('Summer' if x in [6,7,8] else ('Fall' if x in [9,10,11] else 'Winter')))

    #creating range of dates: 1 week before their arrival date, 1 week after their arrival date
    df['range_before'] = df.arrival_date.apply(lambda x: x - dt.timedelta(weeks=1))
    df['range_after'] = df.arrival_date.apply(lambda x: x + dt.timedelta(weeks=1))

    #using holiday package to find local holidays in Portugal from the years of data we have and appending their dates
    #to a list 
    holidays = holiday_lib.PT(years =[2015,2016,2017])
    holidates= []
    for date, name in holidays.items():
        holidates.append(date)

    #assigning True/False if any of the holiday dates are within the 2 weeks of their arrival_date range; as some 
    #guests may arrive for a holiday a few days before actual holiday, or used holiday time after holiday for travel
    hol=[any([(z>x)&(z<y) for z in holidates]) for x , y in zip(df.range_before, df.range_after)]
    df['is_holiday'] = hol
    df['is_holiday'] = df.is_holiday.map(lambda x: 1 if x == True else 0)

    encode = ['hotel','market_segment','deposit_type','customer_type',
              'distribution_channel', 'meal','season']

    for var in encode:
        book = pd.get_dummies(df[var], prefix = var) 
        df = pd.concat([df, book], axis = 1) 
        del df[var]
        
    #dropping unwanted variables, variables with high correlations, & variables that showed 0 feat importance.        
    df = df.drop(['reservation_status_date', 'agent', 'reservation_status', 'company',
                  'previous_bookings_not_canceled', 'reservation_status', 'country', 'arrival_date',
                  'range_before','reserved_room_type', 'assigned_room_type','range_after'], axis=1)
    
    return df

In [174]:
# Perform initial features transformations
featured_df = initial_features(df.copy())
# Perform feature engineering steps
fe_featured_df = new_feature_engineering_features(df.copy())

### Splitting and Labeling the Dataset

In [176]:
# split data into train / validation
train_data, val_data = train_test_split(featured_df, train_size = 0.7, random_state = 0)

# look at counts of the label
train_data.is_canceled.value_counts()

0    52795
1    30778
Name: is_canceled, dtype: int64

In [177]:
# split the label out from the train and validation sets
y_train = train_data.is_canceled
x_train = train_data.drop(columns = ["is_canceled"])

y_val = val_data.is_canceled
x_val = val_data.drop(columns = ["is_canceled"])

for field in x_train.columns:
    standard_dev = x_train[field].std()
    mean = x_train[field].mean()
    
    x_train[field] = (x_train[field] - mean) / standard_dev
    x_val[field] = (x_val[field] - mean) / standard_dev

In [178]:
# split data into train / validation
fe_train_data, fe_val_data = train_test_split(fe_featured_df, train_size = 0.7, random_state = 0)

# look at counts of the label
fe_train_data.is_canceled.value_counts()

0    52404
1    30827
Name: is_canceled, dtype: int64

In [179]:
# split the label out from the train and validation sets
fe_y_train = fe_train_data.is_canceled
fe_x_train = fe_train_data.drop(columns = ["is_canceled"])

fe_y_val = fe_val_data.is_canceled
fe_x_val = fe_val_data.drop(columns = ["is_canceled"])

for field in fe_x_train.columns:
    standard_dev = fe_x_train[field].std()
    mean = fe_x_train[field].mean()
    
    fe_x_train[field] = (fe_x_train[field] - mean) / standard_dev
    fe_x_val[field] = (fe_x_val[field] - mean) / standard_dev

### Model Functions
1. Logistic Regression
2. Random Forests
3. XGBoost

In [181]:
@model_timer
def run_logistic_regression_model(x_train, y_train):
    """ run_logistic_regression_model
        :param x_train: Pandas DataFrame
        :param y_train: Pandas DataFrame
        :returns: 
        Runs our Logistic Regression model with parameters selected for a RandomizedSearchCV.
    """
    # Create regularization penalty space
    penalty = ['l1', 'l2']

    # Create regularization hyperparameter distribution using uniform distribution
    C = uniform(loc=0, scale=4)

    # Create hyperparameter options
    hyperparameters = dict(C=C, penalty=penalty)

    logistic = linear_model.LogisticRegression(solver='saga', verbose=10)

    clf = RandomizedSearchCV(logistic, hyperparameters, random_state=1, n_iter=10, cv=5, n_jobs=4)

    clf.fit(x_train, y_train)

    return clf

In [182]:
@model_timer
def run_random_forest_model(x_train, y_train):
    """ run_random_forest_model
        :param x_train: Pandas DataFrame
        :param y_train: Pandas DataFrame
        :returns: 
        Runs our random forest model with parameters selected for a RandomizedSearchCV.
    """
    parameters = {"max_depth":range(2, 8), "min_samples_leaf": range(5, 55, 5), "min_samples_split": range(10, 110, 5),
                  "max_samples":[0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4], "max_features": [2, 3, 4, 5, 6],
                  "n_estimators": [150, 200, 250, 300, 350]}

    # Creating the RandomizedSearchCV object
    clf = RandomizedSearchCV(RandomForestClassifier(), parameters, n_jobs=4, scoring = "roc_auc", n_iter = 200,
                             random_state = 0)

    clf.fit(x_train, y_train)
    return clf

In [183]:
@model_timer
def run_xgboost_model(x_train, y_train):
    """ run_random_forest_model
        :param x_train: Pandas DataFrame
        :param y_train: Pandas DataFrame
        :returns: 
        Runs our XGBoost model with parameters selected for a RandomizedSearchCV.
    """
    parameters = {
        'max_depth': range(2, 10),
        'n_estimators': [50, 100, 150, 200, 250, 300],
        'subsample': [0.6, 0.7, 0.8],
        'colsample_bytree': [0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 1],
        'colsample_bynode': [0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 1],
        'gamma': [0, 5, 10, 15, 20],
        'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
        'lambda': [0.1, 0.25, 0.5, 0.75, 1]
    }

    clf = RandomizedSearchCV(xgb.XGBClassifier(use_label_encoder = False, eval_metric = "logloss"), 
                             parameters, n_jobs=4, scoring = "roc_auc", n_iter = 30, verbose=10,
                             random_state = 0)

    clf.fit(x_train, y_train)
    return clf

In [184]:
logistic_clf = run_logistic_regression_model(x_train, y_train)
logistic_clf_fe = run_logistic_regression_model(fe_x_train, fe_y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 8 seconds
Total time in seconds ==>  104.72907710075378


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 7 seconds
Total time in seconds ==>  83.06903910636902


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.4s finished


In [185]:
clf = run_random_forest_model(x_train, y_train)
clf_fe = run_random_forest_model(fe_x_train, fe_y_train)

Total time in seconds ==>  772.2318172454834
Total time in seconds ==>  839.1519718170166


In [197]:
#xg_clf = run_xgboost_model(x_train, y_train)
xg_clf_fe = run_xgboost_model(fe_x_train, fe_y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   55.8s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  3.2min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  4.1min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  6.2min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  7.6min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  9.2min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed: 11.2min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed: 12.5min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 13.9min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 15.4min
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed: 16.8min finished


Total time in seconds ==>  1070.4983959197998


### Metrics Output

In [188]:
def get_metrics(clf, x_train, x_val, train_data, val_data):
    # print the optimized parameters from those tested
    print(clf.best_params_)
    
    try:
        # Create a dataframe with the feature importances
        imp = pd.DataFrame()
        imp["var"] = x_train.columns
        imp["imp"] = clf.best_estimator_.feature_importances_
        imp = imp.sort_values("imp", ascending = False).reset_index(drop = True)
        print(imp)
    except AttributeError:
        print('Skipping Feature Importance Logistic Regression')
    
    # Get the predicted probabilities on the training set
    y_train_prob = clf.predict_proba(x_train)
    # Get the validation probabilities
    y_val_prob = clf.predict_proba(x_val)
    # calculate AUC on the training set using the optimized random forest model
    fpr, tpr, thresholds = metrics.roc_curve(train_data['is_canceled'], y_train_prob[:,1], pos_label = 1)
    print(f'Training AUC: {metrics.auc(fpr, tpr)}')
    # Get the AUC on the validation set
    fpr, tpr, thresholds = metrics.roc_curve(val_data['is_canceled'], y_val_prob[:,1], pos_label = 1)
    print(f'Validation AUC: {metrics.auc(fpr, tpr)}')

### Model Metrics

#### Logistic Regression

In [189]:
get_metrics(logistic_clf, x_train, x_val, train_data, val_data)

{'C': 1.2530940677291005, 'penalty': 'l1'}
Skipping Feature Importance Logistic Regression
Training AUC: 0.8604545899796907
Validation AUC: 0.8607625145766895


In [190]:
pd.DataFrame([(coef, col_name) for col_name, coef in zip(x_train.columns, logistic_clf.best_estimator_.coef_[0])], columns=['coef', 'column_name']).sort_values('coef')

Unnamed: 0,coef,column_name
15,-1.011166,required_car_parking_spaces
53,-0.846229,deposit_type_No Deposit
16,-0.577162,total_of_special_requests
11,-0.329193,previous_bookings_not_canceled
17,-0.324498,reserved_room_type_A
32,-0.296861,market_segment_Offline TA/TO
12,-0.238465,booking_changes
48,-0.236497,assigned_room_type_G
46,-0.211245,assigned_room_type_E
47,-0.20397,assigned_room_type_F


In [191]:
get_metrics(logistic_clf_fe, fe_x_train, fe_x_val, fe_train_data, fe_val_data)

{'C': 2.155266936013428, 'penalty': 'l1'}
Skipping Feature Importance Logistic Regression
Training AUC: 0.8690769581379946
Validation AUC: 0.8650690250782113


In [192]:
pd.DataFrame([(coef, col_name) for col_name, coef in zip(fe_x_train.columns, logistic_clf_fe.best_estimator_.coef_[0])], columns=['coef', 'column_name']).sort_values('coef')

Unnamed: 0,coef,column_name
15,-1.028425,required_car_parking_spaces
31,-0.85471,deposit_type_No Deposit
16,-0.588846,total_of_special_requests
18,-0.421091,last_minute
28,-0.310713,market_segment_Offline TA/TO
12,-0.243819,booking_changes
39,-0.108648,distribution_channel_Direct
10,-0.106344,is_repeated_guest
33,-0.103958,deposit_type_Refundable
37,-0.092955,customer_type_Transient-Party


#### Random Forests

In [193]:
get_metrics(clf, x_train, x_val, train_data, val_data)

{'n_estimators': 250, 'min_samples_split': 10, 'min_samples_leaf': 15, 'max_samples': 0.1, 'max_features': 6, 'max_depth': 7}
                               var       imp
0          deposit_type_Non Refund  0.225843
1          deposit_type_No Deposit  0.196388
2                        lead_time  0.091305
3        total_of_special_requests  0.069299
4           previous_cancellations  0.067749
5            market_segment_Groups  0.037425
6      required_car_parking_spaces  0.032118
7         market_segment_Online TA  0.028085
8                  booking_changes  0.027473
9          customer_type_Transient  0.025249
10            assigned_room_type_A  0.021503
11   customer_type_Transient-Party  0.020981
12      distribution_channel_TA/TO  0.019248
13                             adr  0.017728
14    market_segment_Offline TA/TO  0.013052
15           market_segment_Direct  0.012025
16     distribution_channel_Direct  0.011195
17            assigned_room_type_D  0.009442
18            reser

In [194]:
get_metrics(clf_fe, fe_x_train, fe_x_val, fe_train_data, fe_val_data)

{'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 25, 'max_samples': 0.4, 'max_features': 5, 'max_depth': 7}
                                 var       imp
0            deposit_type_No Deposit  0.213856
1            deposit_type_Non Refund  0.207092
2          total_of_special_requests  0.077061
3                          lead_time  0.069022
4             previous_cancellations  0.068664
5   room_type_requested_and_received  0.056978
6              market_segment_Groups  0.037522
7                        last_minute  0.036152
8        required_car_parking_spaces  0.028737
9                    booking_changes  0.025051
10          market_segment_Online TA  0.024579
11           customer_type_Transient  0.022486
12     customer_type_Transient-Party  0.021340
13        distribution_channel_TA/TO  0.015865
14      market_segment_Offline TA/TO  0.010680
15                               adr  0.010678
16             market_segment_Direct  0.008250
17       distribution_channe

#### XGBoost

In [195]:
get_metrics(xg_clf, x_train, x_val, train_data, val_data)

{'subsample': 0.6, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.2, 'lambda': 0.25, 'gamma': 0, 'colsample_bytree': 1, 'colsample_bynode': 0.7}
                               var       imp
0          deposit_type_Non Refund  0.730037
1      required_car_parking_spaces  0.060849
2           previous_cancellations  0.031259
3          deposit_type_No Deposit  0.012138
4         market_segment_Online TA  0.010278
5             reserved_room_type_P  0.006073
6             assigned_room_type_P  0.006051
7          customer_type_Transient  0.005479
8   previous_bookings_not_canceled  0.005342
9        total_of_special_requests  0.005048
10   customer_type_Transient-Party  0.005034
11  distribution_channel_Corporate  0.004535
12            assigned_room_type_I  0.004369
13            reserved_room_type_A  0.003918
14        distribution_channel_GDS  0.003515
15                  meal_Undefined  0.003476
16                         meal_FB  0.003463
17            assigned_room_type_A  

In [198]:
get_metrics(xg_clf_fe, fe_x_train, fe_x_val, fe_train_data, fe_val_data)

{'subsample': 0.6, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.2, 'lambda': 0.25, 'gamma': 0, 'colsample_bytree': 1, 'colsample_bynode': 0.7}
                                 var       imp
0            deposit_type_Non Refund  0.686535
1            deposit_type_No Deposit  0.093582
2        required_car_parking_spaces  0.046142
3             previous_cancellations  0.027696
4                        last_minute  0.020467
5   room_type_requested_and_received  0.018467
6           market_segment_Online TA  0.008948
7            customer_type_Transient  0.004967
8          total_of_special_requests  0.004871
9      customer_type_Transient-Party  0.004442
10                 is_repeated_guest  0.004160
11    distribution_channel_Corporate  0.003989
12                           meal_FB  0.003427
13             market_segment_Groups  0.003387
14                   booking_changes  0.003133
15              days_in_waiting_list  0.002767
16          distribution_channel_GDS  0.002755


In [199]:
def get_metrics_results(clf, x_train, y_train, x_val, y_val):
    # Get the predicted probabilities on the training set
    y_train_prob = clf.predict_proba(x_train)
    y_val_prob =  clf.predict_proba(x_val)
    thresh = np.arange(0.01, 1, .01) # create array 0.01, 0.02, 0.03, ..., 0.99

    # create empty lists for each metric
    precision_values = []
    recall_values = []
    acc_values = []
    f1_values = []

    # Loop through each threshold value - .01, .02, .03, ..., .99
    for val in thresh:

        # get 1 / 0 predictions based off probability threshold
        pred = [1 if prob >= val else 0 for prob in y_train_prob[:,1]]

        # calculate precision, recall, accuracy, and f1-score
        precision = metrics.precision_score(y_train, pred)
        recall = metrics.recall_score(y_train, pred)
        accuracy = metrics.accuracy_score(y_train, pred)
        f1_score = metrics.f1_score(y_train, pred)

        # add precision, recall, accuracy, and f1-score to their respective lists
        precision_values.append(precision)
        recall_values.append(recall)
        acc_values.append(accuracy)
        f1_values.append(f1_score)

    # put precision, recall, and accuracy values into a data frame
    result = pd.DataFrame()
    result["threshold"] = thresh
    result["precision"] = precision_values
    result["recall"] = recall_values
    result["accuracy"] = acc_values
    result["f1_score"] = f1_values

    # Pick threshold based off optimal F1-Score on training set
    _prob = result.iloc[result.f1_score.idxmax()]['threshold']
    ## Use above threshold to calculate accuracy, precision, recall, and F1-score on validation set

    # get 1 / 0 predictions based off probability threshold
    pred = [1 if prob >= _prob else 0 for prob in y_val_prob[:,1]]

    # calculate precision, recall, accuracy, and f1-score
    precision = metrics.precision_score(y_val, pred)
    recall = metrics.recall_score(y_val, pred)
    accuracy = metrics.accuracy_score(y_val, pred)
    f1_score = metrics.f1_score(y_val, pred)

    print("Threshold: ", _prob)
    print("Validation precision: ", precision)
    print("Validation recall: ", recall)
    print("Validation accuracy: ", accuracy)
    print("Validation F1-Score: ", f1_score)

#### Metrics - Logistic Regression

In [200]:
get_metrics_results(logistic_clf, x_train, y_train, x_val, y_val)

Threshold:  0.4
Validation precision:  0.7328115408225906
Validation recall:  0.7102484010114533
Validation accuracy:  0.7940084317502861
Validation F1-Score:  0.7213535765541204


In [201]:
get_metrics_results(logistic_clf_fe, fe_x_train, fe_y_train, fe_x_val, fe_y_val)

Threshold:  0.43
Validation precision:  0.7537680209698558
Validation recall:  0.6903225806451613
Validation accuracy:  0.8000056067954361
Validation F1-Score:  0.720651578040567


#### Metrics - Random Forests

In [202]:
get_metrics_results(clf, x_train, y_train, x_val, y_val)

  _warn_prf(average, modifier, msg_start, len(result))


Threshold:  0.36000000000000004
Validation precision:  0.7600737002323159
Validation recall:  0.7056373642719024
Validation accuracy:  0.8058743054973895
Validation F1-Score:  0.731844652705465


In [203]:
get_metrics_results(clf_fe, fe_x_train, fe_y_train, fe_x_val, fe_y_val)

Threshold:  0.36000000000000004
Validation precision:  0.7686468646864687
Validation recall:  0.6988747186796699
Validation accuracy:  0.8088643435844243
Validation F1-Score:  0.7321021611001964


  _warn_prf(average, modifier, msg_start, len(result))


#### Metrics - XGBoost

In [204]:
get_metrics_results(xg_clf, x_train, y_train, x_val, y_val)

Threshold:  0.41000000000000003
Validation precision:  0.8119307152456323
Validation recall:  0.8087907184292726
Validation accuracy:  0.8578887120641037
Validation F1-Score:  0.8103576751117735


In [205]:
get_metrics_results(xg_clf_fe, fe_x_train, fe_y_train, fe_x_val, fe_y_val)

Threshold:  0.4
Validation precision:  0.7997804610318332
Validation recall:  0.8198799699924981
Validation accuracy:  0.8559894592245801
Validation F1-Score:  0.8097055010187071
