In [17]:
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from scipy.stats import boxcox
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
from hyperopt.pyll import scope

In [2]:
robust_scaled = pd.read_parquet('combined_robust.parquet.gzip')
standard_scaled = pd.read_parquet('combined_standard.parquet.gzip')

In [5]:
X_robust = robust_scaled.drop(columns=['is_canceled', 'reservation_status'])
X_standard = standard_scaled.drop(columns=['is_canceled', 'reservation_status'])
y = robust_scaled.is_canceled

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_standard, y, test_size=0.33, random_state=42)

In [9]:
def hyperparameter_tuning(space):
    model = XGBClassifier(**space)

    #Define evaluation datasets.
    evaluation = [(X_train, y_train), (X_test, y_test)]
    evaluation_dart = [(X_test, y_test)]

    #Fit the model. Define evaluation sets, early_stopping_rounds, and eval_metric.
    model.fit(X_train, y_train, eval_set = evaluation, verbose = False)
    #Obtain prediction and log_loss score.
    pred = model.predict(X_test)
    loss = mean_squared_error(y_test, pred,squared=False)
    print ("SCORE:", loss)

    #Specify what the loss is for each model.
    return {'loss': loss, 'status': STATUS_OK, 'model': model}

In [10]:
space = {'max_depth': scope.int(hp.quniform("max_depth", 1, 15, 1)),
         'max_leaves': scope.int(hp.quniform("max_leaves", 1, 50, 1)),
         'gamma': hp.uniform ('gamma', 0,1),
         'reg_alpha' : hp.uniform('reg_alpha', 0,50),
         'reg_lambda' : hp.uniform('reg_lambda', 10,200),
         'colsample_bytree' : hp.uniform('colsample_bytree', 0,1),
         'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
         'n_estimators': 10000,
         'learning_rate': hp.uniform('learning_rate', 0, .1),
         'tree_method':'gpu_hist',
         'gpu_id': 0,
         'random_state': 7,
         'max_bin' : scope.int(hp.quniform('max_bin', 200, 650, 1)),
         'sampling_method': 'gradient_based',
         'eval_metric': 'rmse',
         'early_stopping_rounds': 100
         }

In [18]:
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=30,
            trials=trials)

print(best)

SCORE:                                                
0.41708968602445                                      
SCORE:                                                                        
0.30278284915510123                                                           
SCORE:                                                                           
0.33535769466367743                                                              
SCORE:                                                                           
0.3818916927699869                                                               
SCORE:                                                                           
0.3864175340510534                                                               
SCORE:                                                                           
0.2664425820589722                                                               
SCORE:                                                                      

In [19]:
#Create instace of best model.
best_model = trials.results[np.argmin([r['loss'] for r in
                                       trials.results])]['model']

#Examine model hyperparameters
print(best_model)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5564531451500485, early_stopping_rounds=100,
              enable_categorical=False, eval_metric='rmse', feature_types=None,
              gamma=0.6194834193249419, gpu_id=0, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.0575375642426279, max_bin=464,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=11, max_leaves=39,
              min_child_weight=2.259157290019759, missing=nan,
              monotone_constraints=None, n_estimators=10000, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=7, ...)


In [20]:
xgb_preds_best = best_model.predict(X_test)
xgb_score_best = mean_squared_error(y_test, xgb_preds_best, squared=False)
print('RMSE_Best_Model:', xgb_score_best)

RMSE_Best_Model: 0.2556501295834989


In [21]:
print(classification_report(y_true=y_test,y_pred=xgb_preds_best))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     24754
           1       0.94      0.88      0.91     14645

    accuracy                           0.93     39399
   macro avg       0.94      0.92      0.93     39399
weighted avg       0.93      0.93      0.93     39399



In [24]:
a = 1
b = 10
a = a.__add__(b.__add__(10))
print(a)

21


In [4]:
from hyperopt import hp, fmin, tpe
from hyperopt.pyll import scope
# Define the hyperparameter search space
space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 10, 1000, 10)),
    'max_depth': scope.int(hp.quniform('max_depth', 1, 30, 1)),
    'min_samples_split': hp.uniform('min_samples_split', 0.1, 1),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.1, 0.5),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])
}
# Define the objective function to minimize
def objective(params):
    clf = RandomForestClassifier(**params, random_state=42)
    score = cross_val_score(clf, X_standard, y, cv=5, n_jobs=-1).mean()
    return -score

# Run the hyperparameter search using Tree-structured Parzen Estimator (TPE) algorithm
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30, rstate=np.random.default_rng(42))

# Train a model using the best hyperparameters found
best_clf = RandomForestClassifier(**best_params, random_state=42)
best_clf.fit(X_standard, y)

  0%|          | 0/30 [00:18<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [18]:
best_params

{'max_depth': 23.0,
 'max_features': 2,
 'min_samples_leaf': 0.135540739226335,
 'min_samples_split': 0.5204337092518656,
 'n_estimators': 80.0}

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
best_clf = RandomForestClassifier(max_depth=23, max_features=2, n_estimators=80, min_samples_leaf=0.135540739226335,
                                  min_samples_split=0.5204337092518656, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_robust, y, test_size=0.33, random_state=42)
best_clf.fit(X_train, y_train)
print(classification_report(y_test, best_clf.predict(X_test), labels = [0,1]))

              precision    recall  f1-score   support

           0       0.63      1.00      0.77     24754
           1       0.00      0.00      0.00     14645

    accuracy                           0.63     39399
   macro avg       0.31      0.50      0.39     39399
weighted avg       0.39      0.63      0.48     39399



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
set(y_test) - set(best_clf.predict(X_test))

{1}

In [70]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
from sklearn.metrics import precision_score, r2_score, mean_squared_error
# Define hyperparameter space
space = {
    'n_estimators': hp.choice('n_estimators', range(100, 1000)),
    'max_depth': hp.choice('max_depth', range(1, 20)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10))
}

# Define objective function
def objective(params):
    clf = RandomForestClassifier(**params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    r2 = r2_score(y_true=y_test, y_pred=y_pred)
    rmse = mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)
    return {'loss': -rmse, 'status': STATUS_OK}

# Split data into training and testing sets
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X_robust, y, test_size=0.33, random_state=42)

# Define trials object and run hyperparameter search
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=30,
            trials=trials)

# Print best hyperparameters and accuracy
print("Best hyperparameters:", best)
best_params = space_eval(space, best)
clf = RandomForestClassifier(**best_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("R2:", r2_score(y_test, y_pred))

  0%|          | 0/30 [00:01<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [None]:
print(classification_report(y_test, y_pred))

In [62]:
X_standard = X_standard.drop('reservation_status', axis=1)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_standard, y, test_size= 0.35, random_state=1352)
clf = RandomForestClassifier(**best_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("R2:", r2_score(y_test, y_pred))

R2: 0.06772310072822618


In [64]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.98      0.85     26217
           1       0.94      0.44      0.60     15570

    accuracy                           0.78     41787
   macro avg       0.84      0.71      0.73     41787
weighted avg       0.82      0.78      0.76     41787



In [45]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
preds = lr.predict(X_test)
print(f"R2 Score: {r2_score(y_test, y_pred)}")


R2 Score: 0.9992834151427581


In [47]:
print(precision_score(y_test, y_pred))

1.0


In [48]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [49]:
y_test

99421     0
18436     0
27430     0
113582    0
111001    0
         ..
32450     0
52759     1
84560     0
27250     0
85486     0
Name: is_canceled, Length: 41787, dtype: int64

In [50]:
print(precision_score(y_train, clf.predict(X_train)))

1.0


In [51]:
data = pd.read_parquet('combined_data_basic.parquet.gzip')
X_train, X_test, y_train, y_test = train_test_split(data.drop('is_canceled', axis=1), data.is_canceled, test_size= 0.35, random_state=1352)
clf = RandomForestClassifier(**best_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("R2:", r2_score(y_test, y_pred))

R2: 0.9992834151427581


In [54]:
from tqdm import tqdm
from sklearn.model_selection import RepeatedStratifiedKFold
X = data.drop('is_canceled', axis=1)
y = data.is_canceled
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
clf = RandomForestClassifier(**best_params)
scores = tqdm(cross_val_score(clf, X, y, cv=rskf))

print("Cross-validation scores:", scores)
print("Mean score:", scores.mean())

KeyboardInterrupt: 

In [58]:
data_v2 = pd.read_parquet('encoded_data_v2.parquet.gzip')
X_train, X_test, y_train, y_test = train_test_split(data_v2.drop('is_canceled', axis=1), data_v2.is_canceled, test_size= 0.35, random_state=1352)
clf = RandomForestClassifier(**best_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("R2:", r2_score(y_test, y_pred))

R2: 0.0032304635764509593


In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.99      0.84     26217
           1       0.97      0.39      0.55     15570

    accuracy                           0.77     41787
   macro avg       0.85      0.69      0.70     41787
weighted avg       0.82      0.77      0.73     41787



In [52]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     26217
           1       1.00      1.00      1.00     15570

    accuracy                           1.00     41787
   macro avg       1.00      1.00      1.00     41787
weighted avg       1.00      1.00      1.00     41787

