Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.

Submission File
For each `id` in the test set, you must predict a probability for the `target` variable. The file should contain a header and have the following format: a header row `id,target`, followed by one `id,probability` row per test sample.

https://www.kaggle.com/c/tabular-playground-series-nov-2021/overview

In [1]:
!conda info


     active environment : kaggle-pgnov21
    active env location : C:\ProgramData\Anaconda3\envs\kaggle-pgnov21
            shell level : 2
       user config file : C:\Users\globetrekker\.condarc
 populated config files : C:\Users\globetrekker\.condarc
          conda version : 4.10.3
    conda-build version : 3.21.4
         python version : 3.8.8.final.0
       virtual packages : __win=0=0
                          __archspec=1=x86_64
       base environment : C:\ProgramData\Anaconda3  (writable)
      conda av data dir : C:\ProgramData\Anaconda3\etc\conda
  conda av metadata url : None
           channel URLs : https://repo.anaconda.com/pkgs/main/win-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/win-64
                          https://repo.anaconda.com/pkgs/r/noarch
                          https://repo.anaconda.com/pkgs/msys2/win-64
                          https://repo.anaconda.com/pkgs/msys2

In [2]:
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

In [3]:
import time, gc, copy
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import optuna

# Disable pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None  # default='warn'
# IPython magic (only valid inside a notebook/IPython session): do not use Jedi for tab completion.
%config Completer.use_jedi = False
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
def get_datasets(path: str, scale: bool, debug: bool) -> tuple:
    """Import datasets from path. Expect csvs called train.csv and test.csv

    Arguments:
    :path - directory containing the csvs (a trailing separator is optional)
    :scale - run standard scaler (fitted on train, applied to train and test)
    :debug - run in debug mode (only the first 1000 rows of each csv are read)

    Returns:
    :X - dataframe (train) minus target
    :y - series (target values for train)
    :df_test - dataframe (test)
    :ids - series (id column of test)
    """
    # nrows=None makes pandas read the whole file; debug mode caps it at 1000.
    nrows = 1000 if debug else None
    # os.path.join works with and without a trailing separator on `path`.
    df_train = pd.read_csv(os.path.join(path, 'train.csv'), nrows=nrows)
    df_test = pd.read_csv(os.path.join(path, 'test.csv'), nrows=nrows)

    # Keep the test ids for the submission file before dropping the column.
    ids = df_test['id']
    df_train = df_train.drop('id', axis=1)
    df_test = df_test.drop('id', axis=1)

    # Every remaining test column is a feature; train additionally has 'target'.
    original_features = df_test.columns
    X = df_train[original_features]
    y = df_train['target']

    if not scale:
        return X, y, df_test, ids

    # Fit the scaler on train only and reuse it for test; keep the original
    # column names and row index so the frames stay aligned with y and ids.
    std_scaler = StandardScaler()
    X_norm = pd.DataFrame(
        std_scaler.fit_transform(X), columns=original_features, index=X.index)
    df_test_norm = pd.DataFrame(
        std_scaler.transform(df_test), columns=original_features, index=df_test.index)

    return X_norm, y, df_test_norm, ids

In [5]:
def get_models():
    """Build the candidate models used for the initial analysis

    Returns:
    :models - list of dicts with keys 'name' (short label) and 'model' (unfitted estimator)
    """
    labels = ['lr', 'lsvc', 'lgbm', 'bayes']
    estimators = [
        LogisticRegression(random_state=5),
        LinearSVC(dual=False, random_state=5),
        LGBMClassifier(random_state=5),
        GaussianNB(),
    ]
    return [{'name': label, 'model': est} for label, est in zip(labels, estimators)]

In [6]:
def evaluate_model(
    model, 
    X: pd.DataFrame, 
    y: pd.Series) -> list:
    """Cross-validate a model and return its ROC AUC scores

    Arguments:
    :model - model to be evaluated
    :X - dataframe (train) minus target
    :y - series (target values for train)
    
    Returns:
    scores - ROC AUC score of every fold (10 stratified splits repeated 3 times)
    """
    # Fixed random_state so every model is scored on the same splits.
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='roc_auc',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )

In [7]:
def evaluate_model_val_set(
    model, 
    X_train: pd.DataFrame, 
    y_train: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.Series) -> float:
    """Return ROC AUC score for a model on validation set

    Arguments:
    :model - model to be evaluated (fitted in place unless it is a LinearSVC)
    :X_train - training dataframe minus target
    :y_train - training series (target values for training set)
    :X_val - validation dataframe minus target
    :y_val - validation series (target values for validation set)
    
    Returns:
    score - ROC AUC of the positive-class probabilities on the validation set
    """
    if model.__class__.__name__ == 'LinearSVC':
        # LinearSVC has no predict_proba, so wrap it in a probability calibrator.
        # The estimator is passed positionally because the keyword was renamed
        # from `base_estimator` to `estimator` across scikit-learn releases.
        clf = CalibratedClassifierCV(model, cv=5)
    else:
        clf = model

    clf.fit(X_train, y_train)
    # Column 1 holds the probability of the positive class.
    positive_proba = clf.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, positive_proba)

In [8]:
def get_feature_importances(
    X_in: pd.DataFrame, 
    y_in: pd.Series, 
    model_type: str, 
    k: int) -> pd.DataFrame:
    """Score every feature against the target with a univariate test and plot the result

    Arguments:
    :X_in - dataframe (train) minus target
    :y_in - series (target values for train)
    :model_type - 'classification' uses f_classif; any other value uses f_regression
    :k - number of top features passed to SelectKBest (scores are still reported for all features)
    
    Returns:
    :featureScores - dataframe with columns Specs (feature name), Score and
                     Abs_score, sorted by Score in ascending order
    """
    if model_type == 'classification':
        score_func = f_classif
    else:
        # f_regression is not part of the file-level imports, so import it here
        # to keep the regression path from failing with a NameError.
        from sklearn.feature_selection import f_regression
        score_func = f_regression

    selector = SelectKBest(score_func=score_func, k=k).fit(X_in, y_in)

    # One row per feature: name, raw score and absolute score.
    featureScores = pd.DataFrame({
        'Specs': list(X_in.columns),
        'Score': selector.scores_,
    })
    featureScores['Abs_score'] = featureScores['Score'].abs()
    featureScores = (
        featureScores
        .sort_values(by='Score', axis=0, ascending=True)
        .reset_index(drop=True)
    )

    plt.bar(featureScores['Specs'], featureScores['Abs_score'])
    plt.title('Feature Importances')
    plt.show()

    return featureScores

In [9]:
def get_kmeans_labels(
    X_in: pd.DataFrame, 
    features: list,
    n_clusters: int) -> pd.DataFrame:
    """Return a copy of the dataframe with a kmeans cluster label column added

    Arguments:
    :X_in - dataframe (train) minus target; it is not modified
    :features - list of important features used to fit the clustering
    :n_clusters - number of kmeans clusters
    
    Returns:
    X_temp - dataframe (train) minus target plus a 'cluster' column of kmeans labels
    """
    # Work on a deep copy so the caller's frame is left untouched.
    X_temp = X_in.copy()
    kmeans = KMeans(n_clusters=n_clusters, random_state=3)
    kmeans.fit(X_temp[features])
    X_temp['cluster'] = kmeans.predict(X_temp[features])

    return X_temp

In [10]:
def _add_distance_ratios(frame, kmeans, features, cluster_cols):
    """Return a copy of frame with pairwise centroid-distance ratio columns appended."""
    # Distance of every row to every fitted centroid.
    distances = pd.DataFrame(
        kmeans.transform(frame[features]), columns=cluster_cols, index=frame.index)
    # One column per ordered pair of distinct clusters, named '<num>_<den>'.
    # NOTE(review): a row lying exactly on a centroid has distance 0 and may
    # yield inf/NaN ratios - confirm whether downstream models need a guard.
    ratios = {
        num + '_' + den: distances[num] / distances[den]
        for num in cluster_cols
        for den in cluster_cols
        if num != den
    }
    # Build all ratio columns at once instead of inserting them one by one.
    return frame.join(pd.DataFrame(ratios, index=frame.index))


def get_kmeans_dist_ratios(
    X_in: pd.DataFrame, 
    X_val_in: pd.DataFrame,
    X_test_in: pd.DataFrame,
    features: list,
    n_clusters: int) -> tuple:
    """Add ratios of kmeans centroid distances to the train, val and test dataframes

    The clustering is fitted on the train frame only and then applied to all
    three frames. The input frames are not modified.

    Arguments:
    :X_in - dataframe (train) minus target
    :X_val_in - dataframe (val) minus target
    :X_test_in - dataframe (test) minus target
    :features - list of important features used for the clustering
    :n_clusters - number of kmeans clusters
    
    Returns:
    :X_temp - dataframe (train) minus target plus kmeans dist ratios
    :X_temp_val - dataframe (val) minus target plus kmeans dist ratios
    :X_temp_test - dataframe (test) minus target plus kmeans dist ratios
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=3)
    kmeans.fit(X_in[features])
    cluster_cols = [f"cluster{i+1}" for i in range(n_clusters)]

    X_temp = _add_distance_ratios(X_in, kmeans, features, cluster_cols)
    X_temp_val = _add_distance_ratios(X_val_in, kmeans, features, cluster_cols)
    X_temp_test = _add_distance_ratios(X_test_in, kmeans, features, cluster_cols)

    return X_temp, X_temp_val, X_temp_test

In [11]:
def generate_meta_features_model(model, X_in, y_in, cv):
    """Build out-of-fold class probabilities for one base classifier (stacking input)

    Arguments:
    :model - model to evaluate; a fresh deep copy is fitted for every fold
    :X_in - dataframe with features minus target
    :y_in - target series
    :cv - cross-validation iterator 

    Returns:
    :meta_features - array of shape (rows of X_in, number of classes in y_in)
                     filled with the hold-out predict_proba output of each fold
    """
    # The training data is assumed to contain every class.
    class_count = np.unique(y_in).size
    oof_proba = np.zeros((X_in.shape[0], class_count))
    fold_total = cv.get_n_splits(X_in, y_in)

    print("Starting hold out prediction with {} splits for {}.".format(fold_total, type(model).__name__))
    for fit_rows, held_rows in cv.split(X_in, y_in):
        # Train an independent copy on the K-1 training parts ...
        fold_model = copy.deepcopy(model)
        fold_model.fit(X_in.iloc[fit_rows], y_in.iloc[fit_rows])
        # ... and store its probabilities for the held-out part.
        oof_proba[held_rows] = fold_model.predict_proba(X_in.iloc[held_rows])

    return oof_proba

In [12]:
def get_stack_df(models, X_input, X_input_km, y_in):
    """Build the training frame for the stacking model from out-of-fold predictions

    Arguments:
    :models - non-empty list of dicts(name, model)
    :X_input - dataframe (train) minus target, used for 'lr' and 'lsvc'
    :X_input_km - dataframe (train) with kmeans features, used for all other models
    :y_in - target series (binary)

    Returns:
    :df_meta_train - dataframe with one positive-class probability column per
                     model followed by the feature columns of the last model's input
    """
    if not models:
        raise ValueError("models must contain at least one entry")

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    meta_train = []
    X_in = None
    for entry in models:
        # Linear models get the plain features, the others the kmeans-enhanced ones.
        X_in = X_input if entry['name'] in ('lr', 'lsvc') else X_input_km

        estimator = entry['model']
        if estimator.__class__.__name__ == 'LinearSVC':
            # LinearSVC has no predict_proba; calibrate it. Positional argument
            # because the keyword name differs between scikit-learn releases.
            clf = CalibratedClassifierCV(estimator, cv=5)
        else:
            clf = estimator

        # Hold-out predictions for this classifier.
        meta_train_model = generate_meta_features_model(clf, X_in, y_in, cv)

        # Remove redundant column - 0th column = 1-first column in a two class dataset
        meta_train_model = np.delete(meta_train_model, 0, axis=1).ravel()
        print(pd.DataFrame(meta_train_model).head())

        meta_train.append(meta_train_model)

    meta_train = np.array(meta_train).T

    # Optional (Add original features to meta).
    # NOTE(review): this appends the input frame of the LAST model in `models`
    # (kmeans-enhanced unless that model is 'lr'/'lsvc'), which is what the
    # previous code did implicitly - confirm that this is intended.
    df_meta_train = pd.DataFrame(np.concatenate((meta_train, X_in), axis=1))

    return df_meta_train

In [13]:
def get_stack_df_val(models, stack_model, X_input, X_input_km, y_in, X_test_input, X_test_km_input, features, ids):
    """Build the test frame for the stacking model from base-model predictions

    Every base model is fitted on the full training data and predicts the
    positive-class probability for the test data.

    Arguments:
    :models - non-empty list of dicts(name, model); the estimators are deep-copied before fitting
    :stack_model - unused, kept for interface compatibility
    :X_input - dataframe (train) minus target, used for 'lr' and 'lsvc'
    :X_input_km - dataframe (train) with kmeans features, used for all other models
    :y_in - target series (binary)
    :X_test_input - dataframe (test) matching X_input
    :X_test_km_input - dataframe (test) matching X_input_km
    :features - unused, kept for interface compatibility
    :ids - unused, kept for interface compatibility

    Returns:
    :df_meta_test - dataframe with one positive-class probability column per
                    model followed by the test feature columns of the last model
    """
    if not models:
        raise ValueError("models must contain at least one entry")

    meta_test = []
    X_test_in = None
    for entry in models:
        # Linear models get the plain features, the others the kmeans-enhanced ones.
        if entry['name'] in ('lr', 'lsvc'):
            X_in, X_test_in = X_input, X_test_input
        else:
            X_in, X_test_in = X_input_km, X_test_km_input

        # Fit a copy so the caller's estimator objects stay unfitted.
        estimator = copy.deepcopy(entry['model'])
        if estimator.__class__.__name__ == 'LinearSVC':
            # LinearSVC has no predict_proba; calibrate it. Positional argument
            # because the keyword name differs between scikit-learn releases.
            clf = CalibratedClassifierCV(estimator, cv=5)
        else:
            clf = estimator

        clf.fit(X_in, y_in)
        meta_test_model = clf.predict_proba(X_test_in)

        # Remove redundant column - 0th column = 1-first column in a two class dataset
        meta_test_model = np.delete(meta_test_model, 0, axis=1).ravel()

        meta_test.append(meta_test_model)

    meta_test = np.array(meta_test).T

    # Optional (Add original features to meta); uses the last model's test
    # frame so the layout mirrors what get_stack_df produces for training.
    df_meta_test = pd.DataFrame(np.concatenate((meta_test, X_test_in), axis=1))

    return df_meta_test

In [14]:
def objective_logreg(trial, X_in, y_in, X_val_in, y_val_in):
    """Optuna objective: validation ROC AUC of a logistic regression model

    Arguments:
    :trial - optuna trial used to sample solver, C and max_iter
    :X_in - training dataframe minus target
    :y_in - training target series
    :X_val_in - validation dataframe minus target
    :y_val_in - validation target series

    Returns:
    score - ROC AUC on the validation set (to be maximized)
    """
    # Each solver is listed once; a repeated choice would be over-sampled.
    solver = trial.suggest_categorical('solver', ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'])
    C = trial.suggest_float("C", 0.01, 2.0)
    max_iter = trial.suggest_int("max_iter", 100, 10000, step=100)

    # l2 is the only penalty used in this search.
    penalty = 'l2'

    model = LogisticRegression(C=C, max_iter=max_iter, solver=solver, penalty=penalty)
    model.fit(X_in, y_in)
    preds = model.predict_proba(X_val_in)[:, 1]

    return roc_auc_score(y_val_in, preds)

In [15]:
def objective_linearSVC(trial, X_in, y_in, X_val_in, y_val_in):
    """Optuna objective: validation ROC AUC of a calibrated linear SVC model

    Arguments:
    :trial - optuna trial used to sample C and max_iter
    :X_in - training dataframe minus target
    :y_in - training target series
    :X_val_in - validation dataframe minus target
    :y_val_in - validation target series

    Returns:
    score - ROC AUC on the validation set (to be maximized)
    """
    C = trial.suggest_float("C", 0.01, 2.0)
    max_iter = trial.suggest_int("max_iter", 1000, 10000, step=1000)

    model = LinearSVC(C=C, max_iter=max_iter, dual=False, random_state=5)
    # LinearSVC has no predict_proba, so calibrate it. The estimator is passed
    # positionally because the keyword was renamed from `base_estimator` to
    # `estimator` across scikit-learn releases.
    clf = CalibratedClassifierCV(model, cv=5)
    clf.fit(X_in, y_in)

    preds = clf.predict_proba(X_val_in)[:, 1]

    return roc_auc_score(y_val_in, preds)

In [None]:
def get_baseline_scores(models, X_in, y_in):
    """Cross-validate every model, record its mean score and draw a comparison boxplot

    Arguments:
    :models - list of dicts(name, model); each dict gets an 'init_scores' entry
    :X_in - dataframe (train) minus target
    :y_in - series (target values for train)
    """
    names = []
    results = []
    for entry in models:
        label = entry['name']
        fold_scores = evaluate_model(entry['model'], X_in, y_in)
        # Keep the mean on the model dict for later comparison.
        entry['init_scores'] = np.mean(fold_scores)
        results.append(fold_scores)
        names.append(label)
        print('>%s %.3f (%.3f)' % (label, np.mean(fold_scores), np.std(fold_scores)))

    # plot model performance for comparison
    plt.boxplot(results, labels=names, showmeans=True)
    plt.title('Baseline Scores')
    plt.show()

In [None]:
def get_baseline_scores_validation(models, X_in, y_in, X_val_in, y_val_in):
    """Score every model on the validation set and record the result

    Arguments:
    :models - list of dicts(name, model); each dict gets an 'init_scores_val' entry
    :X_in - training dataframe minus target
    :y_in - training target series
    :X_val_in - validation dataframe minus target
    :y_val_in - validation target series
    """
    for entry in models:
        val_score = evaluate_model_val_set(entry['model'], X_in, y_in, X_val_in, y_val_in)
        entry['init_scores_val'] = val_score
        print('>%s %.3f' % (entry['name'], val_score))

In [None]:
def get_kmeans_scores(featureScores, X_in, X_val_in, X_test_in, models, y_in=None):
    """Evaluate cross val scores for the models by adding kmeans cluster distance ratios

    Only models other than 'lr' and 'lsvc' are evaluated.

    Arguments:
    :featureScores - dataframe with 'Specs' and 'Abs_score' columns; the 15
                     features with the highest Abs_score drive the clustering
    :X_in - dataframe (train) minus target
    :X_val_in - dataframe (val) minus target
    :X_test_in - dataframe (test) minus target
    :models - list of dicts(name, model); evaluated dicts get a 'kmeans_dist_rat_scores' entry
    :y_in - series (target values for train); when omitted the module-level
            `y_train` is used, as the function did before this parameter existed
    """
    if y_in is None:
        # Backward-compatible fallback to the notebook's global training target.
        y_in = y_train

    results, names = list(), list()
    important_features = list(featureScores.sort_values(by='Abs_score', ascending=False).head(15)['Specs'])
    # Use the frames passed in by the caller rather than notebook globals.
    X_train_km, _, _ = get_kmeans_dist_ratios(X_in, X_val_in, X_test_in, important_features, 10)

    for model in models:
        name = model['name']
        if name in ('lr', 'lsvc'):
            # The linear models are not evaluated with the kmeans features.
            continue
        scores = evaluate_model(model['model'], X_train_km, y_in)
        model['kmeans_dist_rat_scores'] = np.mean(scores)
        results.append(scores)
        names.append(name)
        print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

    # plot model performance for comparison (nothing to draw without results)
    if results:
        plt.boxplot(results, labels=names, showmeans=True)
        plt.title('KMeans Distance Ratio Scores')
        plt.show()

In [None]:
def get_kmeans_scores_validation(featureScores, X_in, X_val_in, X_test_in, models, y_in=None, y_val_in=None):
    """Evaluate the models on the validation set after adding kmeans cluster distance ratios

    Only models other than 'lr' and 'lsvc' are evaluated.

    Arguments:
    :featureScores - dataframe with 'Specs' and 'Abs_score' columns; the 15
                     features with the highest Abs_score drive the clustering
    :X_in - dataframe (train) minus target
    :X_val_in - dataframe (val) minus target
    :X_test_in - dataframe (test) minus target
    :models - list of dicts(name, model); evaluated dicts get a 'kmeans_dist_rat_scores_val' entry
    :y_in - series (target values for train); defaults to the module-level `y_train`
    :y_val_in - series (target values for val); defaults to the module-level `y_val`
    """
    # Backward-compatible fallbacks to the notebook globals the function used
    # before the target parameters existed.
    if y_in is None:
        y_in = y_train
    if y_val_in is None:
        y_val_in = y_val

    important_features = list(featureScores.sort_values(by='Abs_score', ascending=False).head(15)['Specs'])
    X_train_km, X_val_km, _ = get_kmeans_dist_ratios(X_in, X_val_in, X_test_in, important_features, 10)

    for model in models:
        name = model['name']
        if name in ('lr', 'lsvc'):
            # The linear models are not evaluated with the kmeans features.
            continue
        score = evaluate_model_val_set(model['model'], X_train_km, y_in, X_val_km, y_val_in)
        model['kmeans_dist_rat_scores_val'] = score
        print('>%s %.3f' % (name, score))

In [None]:
def get_stacking_scores(featureScores, X_in, y_in, X_val_in, X_test_in, models, stack_model):
    """Evaluate cross val scores with stacking

    Arguments:
    :featureScores - dataframe with 'Specs' and 'Abs_score' columns; the 15
                     features with the highest Abs_score drive the clustering
    :X_in - dataframe (train) minus target
    :y_in - series (target values for train)
    :X_val_in - dataframe (val) minus target
    :X_test_in - dataframe (test) minus target
    :models - list of dicts(name, model) used as base models
    :stack_model - model trained on the base models' out-of-fold predictions
    """
    # get km enhanced df (only the train frame is needed here)
    important_features = list(featureScores.sort_values(by='Abs_score', ascending=False).head(15)['Specs'])
    X_train_km, _, _ = get_kmeans_dist_ratios(X_in, X_val_in, X_test_in, important_features, 10)

    # get meta df
    df_meta_train = get_stack_df(models, X_in, X_train_km, y_in)

    # cross val scores from stacking, labelled with the joined base model names
    name = '_'.join(model['name'] for model in models)
    scores = evaluate_model(stack_model, df_meta_train, y_in)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

In [None]:
def get_binning_scores(baseline_score, X_in, y_in, model_in, improvement):
    """Check cross val score for improvement in score by binning column

    For every column a quantile-binned copy is added as '<col>_bin' and the
    model is cross-validated on the extended frame.

    Arguments:
    :baseline_score - score to beat
    :X_in - dataframe (train) minus target
    :y_in - series (target values for train)
    :model_in - model to evaluate
    :improvement - minimum gain over baseline_score for a column to be reported

    Returns:
    :improved_cols - list of dicts(col, score) for the columns that reached
                     baseline_score + improvement
    """
    improved_cols = []
    for col in X_in.columns:
        print(col)

        X_new = X_in.copy()
        # duplicates='drop' keeps qcut from failing when quantile edges coincide.
        X_new[col + '_bin'] = pd.qcut(X_in[col], q=1000, labels=False, duplicates='drop')

        # evaluate_model scores with ROC AUC via cross_val_score, so the model
        # passed by the caller is evaluated directly.
        scores = evaluate_model(model_in, X_new, y_in)
        new_score = scores.mean()
        if new_score >= baseline_score + improvement:
            improved_cols.append({'col': col, 'score': new_score})
    return improved_cols

In [None]:
def get_binning_scores_val(baseline_score, X_in, y_in, X_val_in, y_val_in, model_in, improvement):
    """Check score in validation set for improvement in score by binning column

    For every column the quantile bins are learned on the training frame and
    applied to the validation frame as '<col>_bin'.

    Arguments:
    :baseline_score - score to beat
    :X_in - training dataframe minus target
    :y_in - training target series
    :X_val_in - validation dataframe minus target
    :y_val_in - validation target series
    :model_in - model to evaluate
    :improvement - minimum gain over baseline_score for a column to be reported

    Returns:
    :improved_cols - list of dicts(col, score) for the columns that reached
                     baseline_score + improvement
    """
    improved_cols = []

    for col in X_in.columns:
        print(col)

        X_new = X_in.copy()
        X_val_new = X_val_in.copy()
        bin_col = col + '_bin'

        # duplicates='drop' keeps qcut from failing when quantile edges coincide.
        X_new[bin_col], bins = pd.qcut(
            X_in[col], q=1000, retbins=True, labels=False, duplicates='drop')
        # Validation values outside the training range get no bin (NaN); they
        # are replaced by the most frequent validation bin.
        val_bins = pd.cut(X_val_new[col], bins=bins, labels=False, include_lowest=True)
        X_val_new[bin_col] = val_bins.fillna(val_bins.mode()[0])

        # evaluate_model_val_set takes care of calibrating a LinearSVC itself.
        score = evaluate_model_val_set(model_in, X_new, y_in, X_val_new, y_val_in)
        if score >= baseline_score + improvement:
            improved_cols.append({'col': col, 'score': score})
    return improved_cols

In [None]:
def make_final_pred_stack(models_in, stack_model_in, X_in, y_in, original_features, X_test_in, ids, feature_scores=None):
    """Generate predictions on test df using stacking and write submission.csv

    Arguments:
    :models_in - non-empty list of dicts(name, model) used as base models
    :stack_model_in - model trained on the base models' predictions (fitted in place)
    :X_in - dataframe (train) minus target
    :y_in - series (target values for train, binary)
    :original_features - feature columns to use from X_in and X_test_in
    :X_test_in - dataframe (test)
    :ids - test ids written to the submission
    :feature_scores - dataframe with 'Specs' and 'Abs_score' columns; when
                      omitted the module-level `featureScores` is used

    Returns:
    :output - dataframe with columns id and target (also saved as submission.csv)
    """
    if not models_in:
        raise ValueError("models_in must contain at least one entry")
    if feature_scores is None:
        # Backward-compatible fallback to the notebook's global feature scores.
        feature_scores = featureScores

    X_in = X_in[original_features]
    X_test_in = X_test_in[original_features]

    # get km enhanced df; there is no validation frame at this stage, so a
    # one-row slice of train is passed as a placeholder and its result ignored.
    important_features = list(feature_scores.sort_values(by='Abs_score', ascending=False).head(15)['Specs'])
    X_train_km, _, X_test_km = get_kmeans_dist_ratios(X_in, X_in.iloc[:1], X_test_in, important_features, 10)

    # get meta df (out-of-fold predictions plus the last model's features)
    X_meta_train = get_stack_df(models_in, X_in, X_train_km, y_in)

    meta_test = []
    X_test_last = None
    for entry in models_in:
        # Same feature routing as get_stack_df: plain features for the linear
        # models, kmeans-enhanced features for the others.
        if entry['name'] in ('lr', 'lsvc'):
            X_fit, X_test_last = X_in, X_test_in
        else:
            X_fit, X_test_last = X_train_km, X_test_km

        # Fit a copy so the caller's estimator objects stay unfitted.
        estimator = copy.deepcopy(entry['model'])
        if estimator.__class__.__name__ == 'LinearSVC':
            # LinearSVC has no predict_proba; calibrate it. Positional argument
            # because the keyword name differs between scikit-learn releases.
            clf = CalibratedClassifierCV(estimator, cv=5)
        else:
            clf = estimator

        clf.fit(X_fit, y_in)
        # Keep only the positive-class probability.
        meta_test.append(clf.predict_proba(X_test_last)[:, 1])

    # Optional (Add original features to meta): mirror get_stack_df, which
    # appends the feature frame of the last model.
    X_meta_test = pd.DataFrame(np.concatenate((np.array(meta_test).T, X_test_last), axis=1))

    stack_model_in.fit(X_meta_train, y_in)

    # Final output
    preds = stack_model_in.predict_proba(X_meta_test)[:, 1]
    output = pd.DataFrame({'id': ids, 'target': preds})
    output.to_csv('submission.csv', index=False)

    return output

In [None]:
def make_final_pred_single(model_in, X_in, y_in, X_test_in, ids):
    """Make final test predictions using a single model and write submission.csv

    Arguments:
    :model_in - model to fit (fitted in place unless it is a LinearSVC)
    :X_in - dataframe (train) minus target
    :y_in - series (target values for train)
    :X_test_in - dataframe (test)
    :ids - test ids written to the submission
    """
    if model_in.__class__.__name__ == 'LinearSVC':
        # LinearSVC has no predict_proba; calibrate it. Positional argument
        # because the keyword name differs between scikit-learn releases.
        clf = CalibratedClassifierCV(model_in, cv=5)
    else:
        clf = model_in

    clf.fit(X_in, y_in)
    # Column 1 holds the probability of the positive class.
    preds = clf.predict_proba(X_test_in)[:, 1]

    output = pd.DataFrame({'id': ids, 'target': preds})
    output.to_csv('submission.csv', index=False)

In [None]:
def objective_lightgbm(trial, X_in=None, y_in=None, X_val_in=None, y_val_in=None):
    """Optuna objective: validation ROC AUC of a lightgbm classifier

    Arguments:
    :trial - optuna trial used to sample num_leaves, max_depth and learning_rate
    :X_in - training dataframe minus target (default: module-level X_train_norm)
    :y_in - training target series (default: module-level y_train_norm)
    :X_val_in - validation dataframe minus target (default: module-level X_val_norm)
    :y_val_in - validation target series (default: module-level y_val_norm)

    Returns:
    score - ROC AUC on the validation set (to be maximized)
    """
    # Backward-compatible fallbacks: the function used to read these
    # module-level names unconditionally.
    # NOTE(review): they are not defined in the visible part of the file -
    # prefer passing the data explicitly.
    if X_in is None:
        X_in = X_train_norm
    if y_in is None:
        y_in = y_train_norm
    if X_val_in is None:
        X_val_in = X_val_norm
    if y_val_in is None:
        y_val_in = y_val_norm

    num_leaves = trial.suggest_int("num_leaves", 11, 101, step=10)
    max_depth = trial.suggest_int("max_depth", 2, 10, step=1)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 2.0)

    model = LGBMClassifier(
        num_leaves=num_leaves,
        max_depth=max_depth,
        learning_rate=learning_rate,
        objective='binary',
        random_state=5,
    )
    model.fit(X_in, y_in)
    preds = model.predict_proba(X_val_in)[:, 1]

    return roc_auc_score(y_val_in, preds)

In [None]:
def optimize_linear_svc(X_in, y_in, X_val_in, y_val_in, n_trials):
    """Tune the linear SVC hyper-parameters with an optuna study

    Arguments:
    :X_in - training dataframe minus target
    :y_in - training target series
    :X_val_in - validation dataframe minus target
    :y_val_in - validation target series
    :n_trials - number of optuna trials to run

    Returns:
    :params - parameters of the best trial
    """
    def _objective(trial):
        # Bind the data so optuna only has to supply the trial.
        return objective_linearSVC(trial, X_in, y_in, X_val_in, y_val_in)

    study = optuna.create_study(direction='maximize', study_name="LinearSVC")
    study.optimize(_objective, n_trials=n_trials)

    best_params = study.best_trial.params
    print('Number of finished trials:', len(study.trials))
    print('Best trial:', best_params)
    print('Best score:', study.best_value)

    return best_params

In [None]:
def optimize_logreg(X_in, y_in, X_val_in, y_val_in, n_trials):
    """Optimize logistic regression using optuna

    Arguments:
    :X_in - training dataframe minus target
    :y_in - training target series
    :X_val_in - validation dataframe minus target
    :y_val_in - validation target series
    :n_trials - number of optuna trials to run

    Returns:
    :params - parameters of the best trial
    """
    study = optuna.create_study(direction='maximize', study_name="Logistic regression")
    # Use the data passed to this function rather than notebook globals.
    func = lambda trial: objective_logreg(trial, X_in, y_in, X_val_in, y_val_in)
    study.optimize(func, n_trials=n_trials)

    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)
    print('Best score:', study.best_value)

    return study.best_trial.params

In [None]:
def optimize_lightgbm(X_in, y_in, X_val_in, y_val_in, n_trials):
    """Optimize lightgbm using optuna

    Arguments:
    :X_in - training dataframe minus target
    :y_in - training target series
    :X_val_in - validation dataframe minus target
    :y_val_in - validation target series
    :n_trials - number of optuna trials to run

    Returns:
    :params - parameters of the best trial
    """
    def _objective(trial):
        # Same search space as objective_lightgbm, but trained and scored on
        # the data passed to this function instead of module-level variables.
        model = LGBMClassifier(
            num_leaves=trial.suggest_int("num_leaves", 11, 101, step=10),
            max_depth=trial.suggest_int("max_depth", 2, 10, step=1),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 2.0),
            objective='binary',
            random_state=5,
        )
        model.fit(X_in, y_in)
        preds = model.predict_proba(X_val_in)[:, 1]
        return roc_auc_score(y_val_in, preds)

    study = optuna.create_study(direction='maximize', study_name="LightGBM")
    study.optimize(_objective, n_trials=n_trials)

    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)
    print('Best score:', study.best_value)

    return study.best_trial.params

In [18]:
# get data (scaled, full dataset)
X, y, df_test, ids = get_datasets('../input/tabular-playground-series-nov-2021/', True, False)
original_features = list(df_test.columns)
# Seeded so the train/validation split is reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=5)

# get models
models = get_models()

# get baseline scores
get_baseline_scores(models, X_train, y_train)

# get baseline scores on validation set
get_baseline_scores_validation(models, X_train, y_train, X_val, y_val)

# get feature importances
featureScores = get_feature_importances(X_train, y_train, 'classification', 5)

# get kmeans scores
get_kmeans_scores(featureScores, X_train, X_val, df_test, models)

# get kmeans scores on validation set
get_kmeans_scores_validation(featureScores, X_train, X_val, df_test, models)

# get stacking scores
stack_model = LogisticRegression(solver='sag', C=1.6213309780417264, max_iter=1800, random_state=10)
get_stacking_scores(featureScores, X_train, y_train, X_val, df_test, models, stack_model)

# get stacking scores - pop 0 (same stack without the first base model)
models_stack = copy.deepcopy(models)
models_stack.pop(0)
stack_model = LogisticRegression(solver='sag', C=1.6213309780417264, max_iter=1800, random_state=10)
get_stacking_scores(featureScores, X_train, y_train, X_val, df_test, models_stack, stack_model)

# get binning scores
baseline_score = 0.749
# Minimum gain over the baseline; shared by both binning checks.
improvement = 0.01
model = LinearSVC(dual=False, C=0.012249147757314706, max_iter=10000)
improved_cols = get_binning_scores(baseline_score, X_train, y_train, model, improvement)

# get binning scores on validation set
model = LinearSVC(dual=False, C=0.012249147757314706, max_iter=10000)
improved_cols = get_binning_scores_val(baseline_score, X_train, y_train, X_val, y_val, model, improvement)

# optimize model
# NOTE(review): the original call omitted the required n_trials argument;
# 50 is a placeholder budget - adjust as needed.
n_trials = 50
optimize_linear_svc(X_train, y_train, X_val, y_val, n_trials)

# final prediction - single model - logistic regression
clf = LogisticRegression(solver='newton-cg', C=0.023672809391721117, max_iter=200)
make_final_pred_single(clf, X, y, df_test, ids)