In [157]:
import os
import re
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import zscore, skew

In [None]:
def preprocessing(X, y):
    # clear missing value
    missing_value = X.isnull().any(axis=1) | y.isnull().any(axis=1)
    X, y = X[~missing_value], y[~missing_value]

    # clear duplicate value
    duplicate_value = pd.concat([X, y], axis=1).duplicated()
    X, y = X[~duplicate_value], y[~duplicate_value]

    # decide use minmax or zscore or yeojohnson
    # it will pass the discrete feature
    skewness = skew(X, axis=0)
    transformers = []
    for i, col in enumerate(X.columns):
        if X[col].dtype == 'int64':
            continue
        elif np.abs(skewness[i]) > 1:
            transformers.append((f'yeojohnson_{col}', PowerTransformer(method='yeo-johnson', standardize=False), [col]))
        elif np.abs(skewness[i]) < 0.5:
            transformers.append((f'standard_{col}', StandardScaler(), [col]))
        else:
            # clear outliers before minmax
            z_score = np.abs(zscore(X[col]))
            outliers = z_score > 3
            X, y = X[~outliers], y[~outliers]
            transformers.append((f'minmax_{col}', MinMaxScaler(), [col]))

    preprocessor = ColumnTransformer(transformers, remainder='passthrough') 
    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    X = pipeline.fit_transform(X)
    y = y.to_numpy().ravel()
    return X, y, pipeline

In [None]:
knn_params = {
    'n_neighbors': 3,          # avoid overlapping
    'weights': 'uniform',
    'algorithm': 'auto',
    'p': 2,
}
rf_params = {
    'n_estimators': 100,      # fewer trees
    'max_depth': 5,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'random_state': 42,
}
svc_params = {
    'kernel': 'linear',
    'C': 0.1,                  # avoid overlapping
    'gamma': 'scale',          # avoid overlapping
    'probability': True,
    'random_state': 42,
}
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,                 # L1 regularization
    'reg_lambda': 0.1,                # L2 regularization
    'scale_pos_weight': 1,            # Used for class imbalance problems
    'random_state': 42,
}
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': 4,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,                # L1 regularization
    'lambda_l2': 0.1,                # L2 regularization
    'max_bin': 255,
    'scale_pos_weight': 1,           # Used for class imbalance problems
    'boosting_type': 'gbdt',
    'verbose': -1,
    'random_state': 42,
}
meta_model_params = {
    'solver': 'liblinear',
    'C': 1.0,
    'random_state': 42,
}


In [154]:
def processing(X, y):
    X, y, pipeline = preprocessing(X, y)

    base_models = [
        ('knn', KNeighborsClassifier(**knn_params)),
        ('rf', RandomForestClassifier(**rf_params)),
        ('svc', SVC(**svc_params)),
        ('xgb', xgb.XGBClassifier(**xgb_params)),
        ('lgb', lgb.LGBMClassifier(**lgb_params)),
    ]

    meta_model = LogisticRegression(**meta_model_params)

    stacking_model = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    )

    stacking_model.fit(X, y)

    return stacking_model, pipeline

In [None]:
path = './Competition_data'
dataset_names = os.listdir(path)
dataset_names.sort(key=lambda x: int(re.search(r'\d+', x).group()))

for dataset_name in dataset_names:
    X_train = pd.read_csv(f'./Competition_data/{dataset_name}/X_train.csv')
    y_train = pd.read_csv(f'./Competition_data/{dataset_name}/y_train.csv')
    
    model, pipeline = processing(X_train, y_train)

    X_test = pd.read_csv(f'./Competition_data/{dataset_name}/X_test.csv')
    X_test = pipeline.transform(X_test)

    y_pred = model.predict_proba(X_test)[:, 1]
    y_pred = pd.DataFrame(y_pred)

    y_pred.columns = ["y_predict_proba"]

    y_pred.to_csv(f'./Competition_data/{dataset_name}/y_predict.csv', index=False)

    print(f"{dataset_name} done")

Dataset_1 done
Dataset_2 done
Dataset_3 done
Dataset_4 done
Dataset_5 done
Dataset_6 done
Dataset_7 done
Dataset_8 done
Dataset_9 done
Dataset_10 done
Dataset_11 done
Dataset_12 done
Dataset_13 done
Dataset_14 done
Dataset_15 done
Dataset_16 done
Dataset_17 done
Dataset_18 done
Dataset_19 done
Dataset_20 done
Dataset_21 done
Dataset_22 done
Dataset_23 done
Dataset_24 done
Dataset_25 done
Dataset_26 done
Dataset_27 done
Dataset_28 done
Dataset_29 done
Dataset_30 done
Dataset_31 done
Dataset_32 done
Dataset_33 done
Dataset_34 done
Dataset_35 done
Dataset_36 done
Dataset_37 done
Dataset_38 done
Dataset_39 done
Dataset_40 done
Dataset_41 done
Dataset_42 done
Dataset_43 done
Dataset_44 done
Dataset_45 done


  skewness = skew(X, axis=0)


Dataset_46 done
Dataset_47 done
Dataset_48 done


  skewness = skew(X, axis=0)


Dataset_49 done
