In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from joblib import load, dump

In [2]:
# Load the training design matrices (one per preprocessing variant: WoE and
# mean encoding) and the training target.
# The first CSV column of each X file is used as the row index; the target
# file is indexed by its 'ID' column.
# NOTE(review): presumably the first column of the X files is that same ID,
# so rows line up with y_train - confirm.
X_train_WOE = pd.read_csv('data/X_train_WOE.csv', index_col=0)
X_train_mean = pd.read_csv('data/X_train_mean.csv', index_col=0)
# squeeze("columns") turns the single-column frame into a Series without also
# collapsing the row axis (a bare squeeze() returns a scalar for a 1-row file).
y_train = pd.read_csv('data/y_train.csv', index_col=['ID']).squeeze("columns")

In [3]:
# Restore the feature-selection estimators persisted with joblib, one per
# preprocessing variant (WoE first, then mean encoding).
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts
# produced by this project.
fs_WOE, fs_MEAN = (
    load('models/fs_WOE.joblib'),
    load('models/fs_MEAN.joblib'),
)

In [4]:
# Train one logistic-regression pipeline per preprocessing variant.
#
# Each pipeline chains a SelectFromModel step (wrapping the already-fitted
# feature-selection estimator loaded above) with a class-balanced,
# L2-penalised LogisticRegression. For every variant, results_clf stores:
#   'coeff'    - DataFrame of the selected variables and their coefficients
#                (rounded to 3 decimals, sorted by variable name)
#   'pipeline' - the fitted Pipeline
y_train_ = y_train.astype(int)

# Variant name -> (training matrix, pre-fitted feature-selection estimator).
# Keeping each variant's inputs paired in one table prevents the data and the
# selector from drifting out of sync between iterations.
train_inputs = {
    'WOE': (X_train_WOE, fs_WOE),
    'MEAN': (X_train_mean, fs_MEAN),
}

results_clf = dict()
for data_preproc, (X_train_, fs_model) in train_inputs.items():
    # prefit=True reuses the loaded estimator as-is instead of refitting it;
    # the threshold is the importance cut-off SelectFromModel applies when
    # choosing which features to keep.
    fs_ = SelectFromModel(estimator=fs_model, prefit=True, threshold=1e-6)
    # Fixed random_state keeps the saga solver reproducible; the tight
    # tolerance is paired with a very large iteration cap.
    model_ = LogisticRegression(class_weight='balanced', solver='saga',
                                penalty='l2', C=1.0, random_state=123,
                                tol=1e-6, max_iter=int(1e6))
    pipeline_model = Pipeline(steps=[('feature_selection', fs_),
                                     ('model', model_)])
    pm = pipeline_model.fit(X_train_, y_train_)

    # Look the steps up by name rather than by position so the coefficient
    # table stays correct if a step is ever inserted into the pipeline.
    selector = pm.named_steps['feature_selection']
    classifier = pm.named_steps['model']
    df_lr_coef = pd.DataFrame({
        'variable': list(selector.get_feature_names_out()),
        # coef_ is 2-D; flatten it to one value per selected feature.
        'coefficient': classifier.coef_.ravel().round(3),
    }).sort_values(by=['variable'])

    results_clf[data_preproc] = {'coeff': df_lr_coef,
                                 'pipeline': pm}

In [7]:
# Persist the fitted pipeline of each preprocessing variant (WoE, then mean
# encoding) under models/ with joblib.
dump(value=results_clf['WOE']['pipeline'], filename='models/pipeline_WOE.joblib')
dump(value=results_clf['MEAN']['pipeline'], filename='models/pipeline_MEAN.joblib')

['models/pipeline_MEAN.joblib']