# In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyreadr
import lightgbm as lgb
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from sklearn.model_selection import train_test_split
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import random

# Read the serialized R data frame through R's own `readRDS`, then convert
# the resulting R object into a pandas DataFrame.
readRDS = robjects.r['readRDS']
df = pandas2ri.rpy2py_dataframe(readRDS('do_class_ohe.Rds'))

def clean_dataset(df):
    """Return the rows of ``df`` free of NaN/inf values, cast to float64.

    Side effect: rows containing NaN are dropped from ``df`` itself
    (``dropna(inplace=True)``) before the infinity filter builds the result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean; the surviving values must be convertible to float64.

    Returns
    -------
    pd.DataFrame
        Rows without NaN, ``inf`` or ``-inf``, with every column as float64.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    # The axis must be given by keyword: pandas 2.0 made it keyword-only, so
    # the old positional form `.any(1)` raises a TypeError there.
    has_invalid = df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[~has_invalid].astype(np.float64)

# clean_dataset() also removes the NaN rows from `df` itself (in-place dropna).
df_clean = clean_dataset(df)

# Features: every column from the third up to, but excluding, the last one.
# Target: the last column.
X = df_clean.iloc[:, 2:-1].to_numpy()
y = df_clean.iloc[:, -1].to_numpy()

# 80/20 split, stratified on the target so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

def _decayed_learning_rate(current_iter, base_learning_rate, decay, floor=1e-3):
    """Return ``base_learning_rate * decay ** current_iter``, never below ``floor``."""
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > floor else floor


def learning_rate_010_decay_power_099(current_iter):
    """Learning rate for iteration ``current_iter``: 0.1 * 0.99**iter, floored at 1e-3."""
    return _decayed_learning_rate(current_iter, base_learning_rate=0.1, decay=.99)


def learning_rate_010_decay_power_0995(current_iter):
    """Learning rate for iteration ``current_iter``: 0.1 * 0.995**iter, floored at 1e-3."""
    return _decayed_learning_rate(current_iter, base_learning_rate=0.1, decay=.995)


def learning_rate_005_decay_power_099(current_iter):
    """Learning rate for iteration ``current_iter``: 0.05 * 0.99**iter, floored at 1e-3."""
    return _decayed_learning_rate(current_iter, base_learning_rate=0.05, decay=.99)

# Extra keyword arguments forwarded to LGBMClassifier.fit().
# NOTE(review): the test split doubles as the early-stopping validation set,
# so scores later computed on X_test are not from untouched data - confirm
# this is intended.
# NOTE(review): `early_stopping_rounds` and `verbose` as fit() arguments are
# reportedly gone in newer LightGBM releases (replaced by callbacks) - check
# against the installed version.
fit_params = {
    "early_stopping_rounds": 30,
    "eval_metric": 'auc',
    "eval_set": [(X_test, y_test)],
    'eval_names': ['valid'],
    # Learning-rate callbacks are passed directly at the fit() call instead:
    #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
    'verbose': 100,
    'categorical_feature': 'auto',
}

# Search space for the randomized hyper-parameter search: scipy distributions
# are sampled, plain lists are picked from uniformly.
param_test = {
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # uniform on [0.2, 1.0]; the original text had "8)" mangled into an emoji
    'subsample': sp_uniform(loc=0.2, scale=0.8),
    # uniform on [0.4, 1.0]
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
}

# Number of hyper-parameter combinations sampled by the randomized search.
n_HP_points_to_test = 100

# n_estimators is set to a "large value". The actual number of trees built
# depends on early stopping; 5000 only defines the absolute maximum.
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

# The search itself is currently disabled; uncomment the next line to rerun it.
#gs.fit(X_train, y_train, **fit_params)

# best_score_ / best_params_ only exist after fit() has run, so report them
# only then instead of failing with AttributeError on an unfitted search.
if hasattr(gs, 'best_score_'):
    print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

# Hard-coded hyper-parameters used for the final model in place of the
# (disabled) randomized search.
opt_parameters = {'colsample_bytree': 0.9522, 'min_child_samples': 111, 'min_child_weight': 0.01, 'num_leaves': 38, 'reg_alpha': 0, 'reg_lambda': 0.1, 'subsample': 0.30293}

# Build the final classifier from the base classifier's settings plus the
# tuned values above. A previous assignment from `gs.best_estimator_` was
# removed: it was overwritten right away and fails on an unfitted search.
clf_final = lgb.LGBMClassifier(**clf.get_params())
clf_final.set_params(**opt_parameters)

# NOTE(review): this seeds Python's `random` module only; the classifier
# carries its own `random_state` - confirm this call is still needed.
random.seed(997)
clf_final.fit(
    X_train,
    y_train,
    **fit_params,
    callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)],
)

# Score the test split with the predicted probability of the second class.
probs = clf_final.predict_proba(X_test)
preds = probs[:, 1]
# NOTE(review): Roc_Analysis is not defined in the visible part of this file.
Roc_Analysis(y_test, preds)

# Logistic-regression baseline. The model used to be built and fitted twice
# (random_state=997, then random_state=0), with the first fit thrown away;
# only the fit that was actually used is kept.
model = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, y_train)
# Bare expression: displays the coefficients in a notebook cell, no effect
# when the file runs as a script.
model.coef_

# Score the test split with the predicted probability of the second class.
probs = model.predict_proba(X_test)
preds = probs[:, 1]
# NOTE(review): Roc_Analysis is not defined in the visible part of this file.
Roc_Analysis(y_test, preds)

# Columns removed from `df`. The original repeated the deletion of
# 'BKG_SALES_DATE_MONTH.12' six times, which raises KeyError on the second
# attempt; each column is now listed exactly once.
columns_to_drop = [
    'BKG_SALES_DATE_MONTH.12',
    # NOTE(review): every other name starts with 'BKG_'/'TKT_'; this may be a
    # typo for 'BKG_SALES_DATE_WEEKDAY.7' - confirm against the data.
    'KG_SALES_DATE_WEEKDAY.7',
    'BKG_BOOKING_WINDOW_D',
    'BKG_STAY_LENGTH_GROUPED.Short',
    'BKG_FLIGHT_COUPONS',
    'BKG_SALES_CHANNEL_grouped.1',
    'TKT_CURRENCY_fin.USD',
]
# In-place like `del df[col]`; still raises KeyError if a column is missing.
df.drop(columns=columns_to_drop, inplace=True)