In [105]:
import contextlib
import os
import re
import statistics
import sys
import warnings
from functools import partial

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from openfe import OpenFE, transform, tree_to_formula
from sklearn import model_selection
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer

warnings.filterwarnings('ignore')


In [106]:
# Directory added to the import path so the project-local modules imported in
# the next cell (train_tabular_utils, cv_split_utils, enums, data_utils) can be
# found.  The machine-specific default can be overridden by setting the
# ML_UTILS_PATH environment variable before starting the kernel.
ML_UTILS_DIR = os.path.abspath(
    os.environ.get("ML_UTILS_PATH", "/home/bk_anupam/code/ML/ML_UTILS/")
)
# Guarded append: re-running this cell must not stack duplicate sys.path entries.
if ML_UTILS_DIR not in sys.path:
    sys.path.append(ML_UTILS_DIR)

In [107]:
import train_tabular_utils as tt
import cv_split_utils
import enums
import data_utils

In [108]:
class Config:
    """Run settings for this notebook, read by the cells below as ``Config.<NAME>``."""
    # "KAGGLE" switches the data paths to the Kaggle directories; any other
    # value keeps the local ./data/ paths.
    RUN_MODE = "LOCAL"
    # Seed passed to DataFrame.sample and to the stratified k-fold helper.
    RANDOM_SEED = 42
    # Fold count passed to the k-fold helper and to tt.train_and_validate.
    NUM_FOLDS = 5
    # Name of the label column in the training frame.
    TARGET_COL_NAME = "Target"
    # NOTE(review): not referenced by any cell visible in this notebook.
    SKEW_THRESHOLD = 0.5
    # Passed as `model_name` to tt.train_and_validate.
    MODEL_TYPE = enums.ModelName.LogisticRegression
    # Passed as `metric` to tt.train_and_validate.
    METRIC = enums.Metrics.ACCURACY
    # NOTE(review): not referenced by the visible cells; OpenFE feature
    # generation below runs unconditionally.
    GENERATE_AUTO_FEATURES = True
    # How many of OpenFE's candidate features are materialised as columns.
    NUM_NEW_FEATURES = 30
    # Passed as `single_fold` to tt.train_and_validate (the recorded runs
    # report fold 0 only).
    TRAIN_SINGLE_FOLD = True

# Columns excluded from every feature list: the label (must match
# Config.TARGET_COL_NAME) and "kfold" (presumably the fold-assignment column
# added by the CV split helper - confirm in cv_split_utils).
COLS_TO_LEAVE = ["Target", "kfold"]
# Worker count handed to OpenFE as n_jobs.
CPU_COUNT = os.cpu_count()

# Input / output directories depend on where the notebook is running.
if Config.RUN_MODE == "KAGGLE":
    DATA_READPATH = "/kaggle/input/playground-series-s4e6/"
    DATA_WRITEPATH = "/kaggle/working/"
else:
    DATA_READPATH = "./data/"
    DATA_WRITEPATH = "./data/"

In [109]:
# Read the train and test CSVs from DATA_READPATH and discard the "id" column,
# which is not used as a feature.
df_train = pd.read_csv(DATA_READPATH + "train.csv").drop("id", axis=1)
df_test = pd.read_csv(DATA_READPATH + "test.csv").drop("id", axis=1)
# Untouched copies of the full frames, kept so later cells can go back to the
# data as loaded.
df_train_orig, df_test_orig = df_train.copy(), df_test.copy()

In [110]:
# Work on a reproducible subsample to keep OpenFE and training fast.
# Sampling from the untouched *_orig copies (instead of from df_train/df_test
# themselves) makes this cell idempotent: re-running it yields the same rows
# rather than a sample of the previous sample.
SAMPLE_FRAC = 0.1
df_train = df_train_orig.sample(frac=SAMPLE_FRAC, random_state=Config.RANDOM_SEED)
df_test = df_test_orig.sample(frac=SAMPLE_FRAC, random_state=Config.RANDOM_SEED)

In [111]:
# Show the raw column names before they are cleaned by process_col_name below.
df_train.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Admission grade',
       'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd s

In [112]:
def process_col_name(cols_list):
    """Return cleaned copies of the given column names.

    For every name, round brackets and single quotes are deleted (the text
    inside the brackets is kept) and each space becomes an underscore. All
    other characters, e.g. "/", are left as they are.

    Args:
        cols_list: iterable of column-name strings.

    Returns:
        list[str]: the cleaned names, in the same order as the input.
    """
    return [
        re.sub(r'[\(\)]', '', raw_name).replace("'", "").replace(' ', '_')
        for raw_name in cols_list
    ]

In [113]:
# Replace the raw column names of both frames with their cleaned versions
# (no brackets, no quotes, underscores instead of spaces).
train_cols_list = df_train.columns.to_list()
train_processed_cols_list = process_col_name(train_cols_list)
df_train.columns = train_processed_cols_list

test_cols_list = df_test.columns.to_list()
test_processed_cols_list = process_col_name(test_cols_list)
df_test.columns = test_processed_cols_list

In [114]:
# Split the columns by dtype family.  select_dtypes with the NumPy abstract
# types matches every float / integer width, whereas comparing against the
# strings "float" / "int" only matches the platform's default width (the
# default integer is 32-bit on some platforms, so int64 columns were missed).
cont_features = df_train.select_dtypes(include=np.floating).columns.values
# NOTE(review): every integer column is treated as categorical here; confirm
# that is intended for count-like columns such as Age_at_enrollment.
cat_features = df_train.select_dtypes(include=np.integer).columns.values
# All columns except the label / fold columns are offered to OpenFE and the model.
feature_cols_for_fe = [col for col in df_train.columns if col not in COLS_TO_LEAVE]

In [115]:
# Initial value handed to tt.train_and_validate as `preprocessor`; what None
# means there is defined in train_tabular_utils (not shown here).
# The name is rebound to that function's third return value by the training cells.
preprocessor = None

In [116]:
# Passed unchanged as `model_params` to both tt.train_and_validate calls;
# presumably None selects the model's default hyper-parameters - confirm in
# train_tabular_utils.
model_params = None

In [117]:
def generate_new_features(df_train, df_test, feature_cols, NUM_NEW_FEATURES=10):
    """Fit OpenFE on the training data and add its leading candidate features.

    Args:
        df_train: training frame; must contain `feature_cols` and the column
            named by Config.TARGET_COL_NAME.
        df_test: test frame; must contain `feature_cols`.
        feature_cols: names of the columns OpenFE may combine.
        NUM_NEW_FEATURES: how many entries from the front of
            `ofe.new_features_list` are turned into columns (default 10).

    Returns:
        tuple: (train features incl. new columns, test features incl. new
        columns, the value returned by `ofe.fit`, the fitted OpenFE object).
        The label column is not part of the returned training frame.
    """
    train_X = df_train[feature_cols]
    test_X = df_test[feature_cols]
    train_y = df_train[Config.TARGET_COL_NAME]

    ofe = OpenFE()
    # stdout is redirected to silence whatever OpenFE prints while fitting.
    with contextlib.redirect_stdout(None):
        features = ofe.fit(
            data=train_X,
            label=train_y,
            n_jobs=CPU_COUNT,
            verbose=False,
        )

    # Only the first NUM_NEW_FEATURES candidates are materialised as columns.
    selected_candidates = ofe.new_features_list[:NUM_NEW_FEATURES]
    train_X, test_X = transform(train_X, test_X, selected_candidates, n_jobs=CPU_COUNT)
    return train_X, test_X, features, ofe

In [118]:
%%time
# Expensive step (minutes of wall time in the recorded run): fit OpenFE on the
# sampled training data and materialise the top Config.NUM_NEW_FEATURES features.
df_train_fe, df_test_fe, new_features, ofe = generate_new_features(df_train, df_test, feature_cols_for_fe, Config.NUM_NEW_FEATURES)
# generate_new_features returns feature columns only, so the label is re-attached here.
df_train_labels = df_train[[Config.TARGET_COL_NAME]]
# Add the label data to the dataframe.
# NOTE(review): concat(axis=1) aligns rows on the index, so this relies on
# transform() having kept df_train's index - confirm.
df_train_fe = pd.concat([df_train_fe, df_train_labels], axis=1)
# Save the new train and test data with OpenFE features to CSV files for later use.
df_train_fe.to_csv(DATA_WRITEPATH + "train_openfe.csv", index=False)
df_test_fe.to_csv(DATA_WRITEPATH + "test_openfe.csv", index=False)

100%|██████████| 32/32 [01:06<00:00,  2.07s/it]
100%|██████████| 32/32 [00:43<00:00,  1.35s/it]
100%|██████████| 32/32 [01:05<00:00,  2.04s/it]
100%|██████████| 32/32 [00:14<00:00,  2.18it/s]


CPU times: user 2min 7s, sys: 3.36 s, total: 2min 10s
Wall time: 3min 28s


In [119]:
# `new_features` is the value returned by ofe.fit; only the first
# Config.NUM_NEW_FEATURES of OpenFE's candidates were added as columns.
print(f"Number of new features = {len(new_features)}")

Number of new features = 1999


In [120]:
# Column names of the training frame after the OpenFE features were added.
df_train_fe.columns

Index(['Marital_status', 'Application_mode', 'Application_order', 'Course',
       'Daytime/evening_attendance', 'Previous_qualification',
       'Previous_qualification_grade', 'Nacionality', 'Mothers_qualification',
       'Fathers_qualification', 'Mothers_occupation', 'Fathers_occupation',
       'Admission_grade', 'Displaced', 'Educational_special_needs', 'Debtor',
       'Tuition_fees_up_to_date', 'Gender', 'Scholarship_holder',
       'Age_at_enrollment', 'International',
       'Curricular_units_1st_sem_credited',
       'Curricular_units_1st_sem_enrolled',
       'Curricular_units_1st_sem_evaluations',
       'Curricular_units_1st_sem_approved', 'Curricular_units_1st_sem_grade',
       'Curricular_units_1st_sem_without_evaluations',
       'Curricular_units_2nd_sem_credited',
       'Curricular_units_2nd_sem_enrolled',
       'Curricular_units_2nd_sem_evaluations',
       'Curricular_units_2nd_sem_approved', 'Curricular_units_2nd_sem_grade',
       'Curricular_units_2nd_sem_wit

In [121]:
# Attach cross-validation fold assignments to the baseline (pre-OpenFE) frame.
# NOTE(review): presumably this adds the "kfold" column listed in COLS_TO_LEAVE
# - confirm in cv_split_utils.
# NOTE(review): df_train is rebound to the helper's result, so re-running this
# cell feeds an already-processed frame back in; confirm the helper tolerates that.
df_train = cv_split_utils.strat_kfold_dataframe(df_train,
                                                target_col_name=Config.TARGET_COL_NAME,
                                                random_state=Config.RANDOM_SEED,
                                                num_folds=Config.NUM_FOLDS)

In [122]:
# Row count of the sampled training frame after the fold assignment step.
len(df_train)

7652

In [123]:
# Baseline run: validate the model on the original (pre-OpenFE) feature columns.
# NOTE(review): the third return value rebinds the global `preprocessor`, which
# is later passed into the post-OpenFE run - confirm that carry-over is intended.
print("Before feature engineering")
fold_metrics_model, df_oof_preds, preprocessor = tt.train_and_validate(
    model_name=Config.MODEL_TYPE,
    model_params=model_params,
    preprocessor=preprocessor,
    df=df_train,
    feature_cols=feature_cols_for_fe,
    target_col_name=Config.TARGET_COL_NAME,
    metric=Config.METRIC,
    single_fold=Config.TRAIN_SINGLE_FOLD,
    num_folds=Config.NUM_FOLDS,
    suppress_print=False,
)

Before feature engineering
Fold 0 - LogisticRegression - ACCURACY : 0.7348138471587198
LogisticRegression metric=ACCURACY CV score = 0.7348138471587198
LogisticRegression Mean ACCURACY = 0.7348138471587198, std = 0.0


In [124]:
# Attach fold assignments to the OpenFE-augmented frame, using the same seed
# and fold count as for the baseline frame.
# NOTE(review): the before/after comparison is only like-for-like if this
# yields the same row-to-fold mapping as the baseline split - confirm in
# cv_split_utils.
df_train_fe = cv_split_utils.strat_kfold_dataframe(df_train_fe,
                                                target_col_name=Config.TARGET_COL_NAME,
                                                random_state=Config.RANDOM_SEED,
                                                num_folds=Config.NUM_FOLDS)

In [125]:
# Sanity check: the augmented frame should have as many rows as df_train.
len(df_train_fe)

7652

In [134]:
# Missing-value count for every column of the OpenFE-augmented training frame.
na_val = df_train_fe.isna().sum()
# Names of the columns that contain at least one missing value.
null_features = na_val[na_val > 0].index.tolist()
null_features

['autoFE_f_1',
 'autoFE_f_5',
 'autoFE_f_6',
 'autoFE_f_7',
 'autoFE_f_8',
 'autoFE_f_9',
 'autoFE_f_10',
 'autoFE_f_15',
 'autoFE_f_17',
 'autoFE_f_18',
 'autoFE_f_24',
 'autoFE_f_29']

In [153]:
# Missing-value counts for the affected columns only.
na_val.loc[na_val > 0]

autoFE_f_1     1827
autoFE_f_5      845
autoFE_f_6     1827
autoFE_f_7     1827
autoFE_f_8      252
autoFE_f_9     1590
autoFE_f_10    1827
autoFE_f_15    1827
autoFE_f_17    1590
autoFE_f_18     839
autoFE_f_24     839
autoFE_f_29    1592
dtype: int64

In [137]:
# Model inputs after feature engineering: every column except the label / fold
# columns and the generated features that contain missing values.
_excluded_cols = set(COLS_TO_LEAVE) | set(null_features)
feature_cols_after_fe = [col for col in df_train_fe.columns if col not in _excluded_cols]

In [138]:
# Show the final feature list used for the post-OpenFE run.
print(feature_cols_after_fe)

['Marital_status', 'Application_mode', 'Application_order', 'Course', 'Daytime/evening_attendance', 'Previous_qualification', 'Previous_qualification_grade', 'Nacionality', 'Mothers_qualification', 'Fathers_qualification', 'Mothers_occupation', 'Fathers_occupation', 'Admission_grade', 'Displaced', 'Educational_special_needs', 'Debtor', 'Tuition_fees_up_to_date', 'Gender', 'Scholarship_holder', 'Age_at_enrollment', 'International', 'Curricular_units_1st_sem_credited', 'Curricular_units_1st_sem_enrolled', 'Curricular_units_1st_sem_evaluations', 'Curricular_units_1st_sem_approved', 'Curricular_units_1st_sem_grade', 'Curricular_units_1st_sem_without_evaluations', 'Curricular_units_2nd_sem_credited', 'Curricular_units_2nd_sem_enrolled', 'Curricular_units_2nd_sem_evaluations', 'Curricular_units_2nd_sem_approved', 'Curricular_units_2nd_sem_grade', 'Curricular_units_2nd_sem_without_evaluations', 'Unemployment_rate', 'Inflation_rate', 'GDP', 'autoFE_f_0', 'autoFE_f_2', 'autoFE_f_3', 'autoFE_f_4

In [136]:
# Post-OpenFE run: same model and settings as the baseline, but on the
# augmented frame restricted to the features without missing values.
# NOTE(review): this cell's execution count (136) is lower than that of the
# cell defining feature_cols_after_fe (137), so the recorded score may stem
# from an earlier definition - re-run top to bottom to confirm.
# NOTE(review): `preprocessor` here is whatever the baseline run returned;
# confirm it is valid for the wider feature set.
print("After feature engineering")
fold_metrics_model, df_oof_preds, preprocessor = tt.train_and_validate(
    model_name=Config.MODEL_TYPE,
    model_params=model_params,
    preprocessor=preprocessor,
    df=df_train_fe,
    feature_cols=feature_cols_after_fe,
    target_col_name=Config.TARGET_COL_NAME,
    metric=Config.METRIC,
    single_fold=Config.TRAIN_SINGLE_FOLD,
    num_folds=Config.NUM_FOLDS,
    suppress_print=False,
)

After feature engineering
Fold 0 - LogisticRegression - ACCURACY : 0.7798824297844547
LogisticRegression metric=ACCURACY CV score = 0.7798824297844547
LogisticRegression Mean ACCURACY = 0.7798824297844547, std = 0.0


In [156]:
# Translate the materialised OpenFE features into readable formulas.
# `tree_to_formula` is imported with the other openfe names in the notebook's
# top import cell rather than here, so a fresh-kernel run has all dependencies
# declared up front.
print(f'The top {Config.NUM_NEW_FEATURES} generated features are:')
feature_formula = [
    tree_to_formula(feature_node)
    for feature_node in ofe.new_features_list[:Config.NUM_NEW_FEATURES]
]
for formula in feature_formula:
    print(formula)

The top 30 generated features are:
(Tuition_fees_up_to_date*Curricular_units_2nd_sem_approved)
(Curricular_units_2nd_sem_evaluations/Curricular_units_2nd_sem_approved)
(Curricular_units_1st_sem_approved+Curricular_units_2nd_sem_approved)
(Curricular_units_2nd_sem_grade*Curricular_units_2nd_sem_approved)
(Curricular_units_2nd_sem_grade+Curricular_units_2nd_sem_approved)
(Curricular_units_1st_sem_approved/Curricular_units_2nd_sem_evaluations)
(Curricular_units_1st_sem_evaluations/Curricular_units_2nd_sem_approved)
(Curricular_units_1st_sem_enrolled/Curricular_units_2nd_sem_approved)
(Curricular_units_1st_sem_approved/Curricular_units_2nd_sem_enrolled)
(Tuition_fees_up_to_date/Curricular_units_1st_sem_approved)
(Course/Curricular_units_2nd_sem_approved)
GroupByThenRank(Curricular_units_2nd_sem_approved,Curricular_units_1st_sem_enrolled)
(Curricular_units_2nd_sem_grade+Scholarship_holder)
(Curricular_units_1st_sem_grade+Scholarship_holder)
(Curricular_units_2nd_sem_enrolled-Curricular_unit

In [147]:
# Pair each generated column name with its formula.
# NOTE(review): the pairing is positional, i.e. it assumes the autoFE_f_*
# columns appear in the same order as ofe.new_features_list - confirm.
auto_feature_names = [col for col in df_train_fe.columns if col.startswith('autoFE_f_')]
df_auto_features = pd.DataFrame(
    {
        "feature_name": auto_feature_names,
        "feature_formula": feature_formula,
    }
)

Unnamed: 0,feature_name,feature_formula
0,autoFE_f_0,(Tuition_fees_up_to_date*Curricular_units_2nd_...
1,autoFE_f_1,(Curricular_units_2nd_sem_evaluations/Curricul...
2,autoFE_f_2,(Curricular_units_1st_sem_approved+Curricular_...
3,autoFE_f_3,(Curricular_units_2nd_sem_grade*Curricular_uni...
4,autoFE_f_4,(Curricular_units_2nd_sem_grade+Curricular_uni...
5,autoFE_f_5,(Curricular_units_1st_sem_approved/Curricular_...
6,autoFE_f_6,(Curricular_units_1st_sem_evaluations/Curricul...
7,autoFE_f_7,(Curricular_units_1st_sem_enrolled/Curricular_...
8,autoFE_f_8,(Curricular_units_1st_sem_approved/Curricular_...
9,autoFE_f_9,(Tuition_fees_up_to_date/Curricular_units_1st_...


In [159]:
# Split the feature catalogue by whether the generated column contains missing
# values, and write both halves to CSV.
OUTPUT_DIR = "./output"
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Compute the membership mask once and reuse it for both halves.
is_null_feature = df_auto_features["feature_name"].isin(null_features)
auto_features_not_null = df_auto_features.loc[~is_null_feature]
auto_features_null = df_auto_features.loc[is_null_feature]

auto_features_not_null.to_csv(os.path.join(OUTPUT_DIR, "auto_features_not_null.csv"), index=False)
auto_features_null.to_csv(os.path.join(OUTPUT_DIR, "auto_features_null.csv"), index=False)