# 1. Import Modules & Data Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, precision_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from itertools import combinations

# Read the processed train / test inputs for the final (stacked) model.
train = pd.read_csv('../data/processed/final_model_train_input.csv')
test = pd.read_csv('../data/processed/final_model_test_input.csv')

# The label lives in the 'is_duplicate' column; every other column is kept
# as a feature (presumably one column per base model -- verify against the
# script that writes these CSVs).
y_train, y_test = train['is_duplicate'], test['is_duplicate']
X_train = train.drop(columns=['is_duplicate'])
X_test = test.drop(columns=['is_duplicate'])

# Enumerate every candidate stack: each combination of at least two feature
# columns, encoded as a comma-joined string (later cells split it on ",").
model_list = X_train.columns.tolist()
input_list = []
# The upper bound is len(model_list) + 1 so the stack that uses *all* columns
# is evaluated too. The previous bound stopped one size short, which also
# produced no candidates at all when only two columns were present.
for subset_size in range(2, len(model_list) + 1):
    input_list += [",".join(map(str, comb)) for comb in combinations(model_list, subset_size)]

# 2. Stepwise Tuning

In [2]:
# Fit one logistic-regression meta-model per candidate stack and collect its
# train / test log loss and precision into `result_df`.
#
# NOTE(review): the table below is ranked by *test* log loss and the next
# section picks row 0, so the test set takes part in model selection and the
# winner's test metrics are optimistic. Consider ranking on a separate
# validation split -- TODO confirm intent.
result_list = []
log_clf = LogisticRegression()
for combo in input_list:
    # Each candidate is a comma-joined string of column names.
    subset = combo.split(",")
    train_subset = X_train[subset]
    test_subset = X_test[subset]

    # The same estimator object is refit on every iteration.
    log_clf.fit(train_subset, y_train)

    # Hard labels feed precision; class probabilities feed log loss.
    preds_train = log_clf.predict(train_subset)
    preds_prob_train = log_clf.predict_proba(train_subset)
    preds_test = log_clf.predict(test_subset)
    preds_prob_test = log_clf.predict_proba(test_subset)

    result_list.append([
        combo,
        log_loss(y_train, preds_prob_train),
        precision_score(y_train, preds_train),
        log_loss(y_test, preds_prob_test),
        precision_score(y_test, preds_test),
    ])

# Passing `columns=` to the constructor keeps this valid even when no
# candidate was evaluated (assigning .columns on an empty frame would raise).
result_df = (
    pd.DataFrame(
        result_list,
        columns=['Stacked Model', 'Train Log Loss', 'Train Precision',
                 'Test Log Loss', 'Test Precision'],
    )
    .sort_values(by='Test Log Loss', ascending=True)
    .reset_index(drop=True)
)
result_df

Unnamed: 0,Stacked Model,Train Log Loss,Train Precision,Test Log Loss,Test Precision
0,"bert,lgbm",0.125318,0.935907,0.133022,0.9307
1,"bert,lgbm,mlp",0.125301,0.935968,0.133088,0.930564
2,"bert,mlp",0.13312,0.930157,0.135653,0.927189
3,"siam_lstm,bert",0.089873,0.952187,0.143896,0.928853
4,"siam_lstm,bert,lgbm",0.089812,0.952045,0.144577,0.92811
5,"lgbm,mlp",0.131216,0.936443,0.144879,0.927349
6,"siam_lstm,bert,lgbm,mlp",0.087988,0.954055,0.145437,0.928533
7,"siam_lstm,bert,mlp",0.088223,0.953479,0.146651,0.92793
8,"siam_lstm,bert,xgboost",0.081837,0.955377,0.157603,0.935726
9,"siam_lstm,lgbm,mlp",0.094769,0.953671,0.159301,0.922521


# 3. Best Subset Selection

In [3]:
# Row 0 of result_df is the candidate with the lowest test log loss (the
# table was sorted ascending on that column); recover its column names.
best_subset_feature = result_df['Stacked Model'].iloc[0].split(",")

# Narrow both feature tables to the winning columns. This rebinds X_train and
# X_test, so the wider tables are no longer reachable under those names.
X_train, X_test = X_train[best_subset_feature], X_test[best_subset_feature]

In [4]:
# Refit the meta-model on the selected columns only.
log_clf.fit(X_train, y_train)

# Hard class predictions (used for precision).
preds_train = log_clf.predict(X_train)
preds_test = log_clf.predict(X_test)

# Class-probability predictions (used for log loss).
preds_prob_train = log_clf.predict_proba(X_train)
preds_prob_test = log_clf.predict_proba(X_test)

In [5]:
# Report the final meta-model's metrics: train split first, then test split.
print("The train log loss is:", log_loss(y_true=y_train, y_pred=preds_prob_train))
print("The train precision is:", precision_score(y_true=y_train, y_pred=preds_train))
print("The test log loss is:", log_loss(y_true=y_test, y_pred=preds_prob_test))
print("The test precision is:", precision_score(y_true=y_test, y_pred=preds_test))

The train log loss is: 0.12531842132839877
The train precision is: 0.9359066709721222
The test log loss is: 0.13302161760440695
The test precision is: 0.9307002505604642
