In [1]:
from model_utils import *
from sklearn.utils import resample
import pandas as pd
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    validation_curve,
)
from sklearn import preprocessing

## Import Data
### Data used for preliminary results are downsampled and balanced

In [2]:
# Load the test and training CSVs, using the ID_code column as the row index.
test_data = "dataset/test.csv"
df_test = pd.read_csv(test_data, sep=",", index_col="ID_code")

train_data = "dataset/train.csv"
df_train = pd.read_csv(train_data, sep=",", index_col="ID_code")

# Separate majority (target == 0) and minority (target == 1) classes.
df2_majority = df_train[df_train["target"] == 0]
df2_minority = df_train[df_train["target"] == 1]
# Count the minority rows directly rather than summing their labels: the
# result is a plain Python int and does not depend on the positive label
# being encoded as exactly 1.
n_samples = len(df2_minority)

# Downsample the majority class (without replacement) to the minority-class
# size so that the two classes are balanced.
df2_majority_downsampled = resample(
    df2_majority, replace=False, n_samples=n_samples, random_state=99
)
df_downsampled = pd.concat([df2_majority_downsampled, df2_minority])
X_dn = df_downsampled.drop(["target"], axis=1)
y_dn = df_downsampled["target"]

# Calculate the z-score normalization using sklearn.
# The scaler is fitted on a bare ndarray (no feature names), so it is also
# applied to an ndarray; passing the DataFrame would trigger scikit-learn's
# feature-name mismatch warning while producing the same numbers.
# NOTE(review): the scaler is fitted on every row of df_train, which includes
# rows that later end up in X_test -- confirm this is acceptable for the
# evaluation, or fit on the training split only.
std_scale = preprocessing.StandardScaler().fit(df_train.drop(["target"], axis=1).values)
X_dn_norm = std_scale.transform(X_dn.values)

# Use all downsampled samples (about 40 000 according to the original note);
# hold out 20% of them for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X_dn_norm, y_dn, test_size=0.2, random_state=101
)

# Reduce the size of the training set for computationally intensive
# algorithms: test_size=0.8756096 discards ~87.6% of X_train, keeping ~12.4%.
X_trainsmall, _, y_trainsmall, _ = train_test_split(
    X_train, y_train, test_size=0.8756096, random_state=101
)

## Create Models with Preset Hyperparameters

In [4]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Explicit hyperparameters for the classifier: the C value and the SAGA solver.
logreg_params = dict(C=0.00010494583820459354, solver="saga")

# L2-penalised model with a fixed seed; the dict above supplies the rest.
clf_logreg = LogisticRegression(
    penalty="l2",
    random_state=111,
    **logreg_params,
)

In [5]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Explicit variance-smoothing value passed to the Gaussian Naive Bayes model.
NB_params = dict(var_smoothing=7.453943540948982e-11)

clf_NB = GaussianNB(**NB_params)

In [6]:
# XGBoost
import xgboost as xgb

# Extra keyword overrides for the classifier. Currently empty; it is unpacked
# into the constructor below in the same way as the other model cells do with
# their parameter dicts.
xgb_params = {}

# Gradient-boosted binary classifier evaluated with AUC.
# - `verbosity=0` replaces the removed `silent=1` flag (both request quiet
#   training output).
# - The original `early_stopping=200` keyword is not an XGBoost parameter (the
#   supported name is `early_stopping_rounds`, which additionally requires an
#   `eval_set` when fitting), so it had no effect and is not passed here.
#   TODO: if early stopping is wanted, set `early_stopping_rounds=200` and
#   supply a validation set to `fit`.
clf_xgb = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    learning_rate=0.1,
    verbosity=0,
    n_estimators=500,
    tree_method="approx",
    **xgb_params,
)
