In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
train = pd.read_csv('/kaggle/input/ml-olympiad-ai-ml-malaysia/train.csv')
test = pd.read_csv('/kaggle/input/ml-olympiad-ai-ml-malaysia/test.csv')
submit = pd.read_csv('/kaggle/input/ml-olympiad-ai-ml-malaysia/sample_submission.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55760 entries, 0 to 55759
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      55760 non-null  int64  
 1   02      55760 non-null  int64  
 2   03      55752 non-null  object 
 3   04      55760 non-null  object 
 4   05      55760 non-null  int64  
 5   06      55102 non-null  float64
 6   07      55760 non-null  int64  
 7   08      52567 non-null  float64
 8   09      52336 non-null  float64
 9   10      55760 non-null  float64
 10  11      55760 non-null  int64  
 11  12      48289 non-null  float64
 12  13      55760 non-null  int64  
 13  14      55760 non-null  int64  
 14  15      55760 non-null  int64  
 15  16      55719 non-null  float64
 16  17      33629 non-null  float64
 17  18      33629 non-null  float64
 18  19      17819 non-null  float64
 19  20      55719 non-null  float64
 20  class   55760 non-null  int64  
dtypes: float64(10), int64(9), object(2)

In [4]:
train['class'].unique()

array([0, 1])

In [5]:
train['class'].value_counts()

class
0    54916
1      844
Name: count, dtype: int64

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13940 entries, 0 to 13939
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      13940 non-null  int64  
 1   02      13940 non-null  int64  
 2   03      13933 non-null  object 
 3   04      13940 non-null  object 
 4   05      13940 non-null  int64  
 5   06      13784 non-null  float64
 6   07      13940 non-null  int64  
 7   08      13115 non-null  float64
 8   09      13066 non-null  float64
 9   10      13940 non-null  float64
 10  11      13940 non-null  int64  
 11  12      12023 non-null  float64
 12  13      13940 non-null  int64  
 13  14      13940 non-null  int64  
 14  15      13940 non-null  int64  
 15  16      13930 non-null  float64
 16  17      8366 non-null   float64
 17  18      8366 non-null   float64
 18  19      4452 non-null   float64
 19  20      13930 non-null  float64
dtypes: float64(10), int64(8), object(2)
memory usage: 2.1+ MB


## FEATURE ENGINEERING

In [7]:
cols_to_impute = ['06', '08', '09', '12', '16', '17', '18', '19', '20']

def impute_dataset(df):
    for column in cols_to_impute:
        median_value = int(df[column].median())
        df[column].fillna(median_value, inplace=True)    
    #for the date
    mode_value = df[column].mode()[0]
    df['03'].fillna(mode_value, inplace=True)

impute_dataset(train)
impute_dataset(test)

In [8]:
def date_features(df):
    df['03'] = pd.to_datetime(df['03'], errors='coerce')
    df['04'] = pd.to_datetime(df['04'], errors='coerce')

    df['dob_year'] = df['03'].dt.year
    df['dob_month'] = df['03'].dt.month
    df['dob_day'] = df['03'].dt.day
    df['dob_dayofweek'] = df['03'].dt.dayofweek
    df['dob_quarter'] = df['03'].dt.quarter

    df['lead_year'] = df['04'].dt.year
    df['lead_month'] = df['04'].dt.month
    df['lead_day'] = df['04'].dt.day
    df['lead_dayofweek'] = df['04'].dt.dayofweek
    df['lead_quarter'] = df['04'].dt.quarter

    # Calculate the age of the applicant at the time of lead creation based on the date of birth
    df['age_at_lead_creation'] = df['04'].dt.year - df['03'].dt.year
    
date_features(train)    
date_features(test)

In [9]:
def numerical_features(df):
    numcols = ['10', '16', '17', '18', '19', '20']
    # Normalize numerical features to bring them to a similar scale
    for col in numcols:
        df[col + '_normalized'] = (df[col] - df[col].mean()) / df[col].std()

    # Create additional numerical features(didn't work out as planned)
    #df['loan_amt_to_monthly_income_ratio'] = df['17'] / df['10']
    #df['loan_installment_to_monthly_income_ratio'] = df['20'] / df['10']
    #df['loan_installment_to_installment_ratio'] = df['20'] / df['16']

numerical_features(train)
numerical_features(test)

In [10]:
def feature_interaction(df):
    df['employer_interaction'] = df['08'].astype(str) + '_' + df['09'].astype(str)

feature_interaction(train)
feature_interaction(test)

In [11]:
def feature_derivation(df):
    df['total_loan_amount'] = df['17'] * df['18']
    df['total_interest_paid'] = df['19'] * df['17'] * df['18']

feature_derivation(train)    
feature_derivation(test)

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55760 entries, 0 to 55759
Data columns (total 41 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    55760 non-null  int64         
 1   02                    55760 non-null  int64         
 2   03                    55752 non-null  datetime64[ns]
 3   04                    55760 non-null  datetime64[ns]
 4   05                    55760 non-null  int64         
 5   06                    55760 non-null  float64       
 6   07                    55760 non-null  int64         
 7   08                    55760 non-null  float64       
 8   09                    55760 non-null  float64       
 9   10                    55760 non-null  float64       
 10  11                    55760 non-null  int64         
 11  12                    55760 non-null  float64       
 12  13                    55760 non-null  int64         
 13  14              

In [13]:
train.drop(columns=['id', '03', '04', '11'], inplace=True)
test.drop(columns=['id', '03', '04', '11'], inplace=True)

In [14]:
def onehot_encode(df):
    one_hot_encoded = pd.get_dummies(df['employer_interaction'], prefix='employer_interaction')
    df = pd.concat([df, one_hot_encoded], axis=1)
    df.drop('employer_interaction', axis=1, inplace=True)

onehot_encode(train)
onehot_encode(test)

## AUTO ML

In [15]:
!pip install flaml

Collecting flaml
  Downloading FLAML-2.1.2-py3-none-any.whl.metadata (15 kB)
Downloading FLAML-2.1.2-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: flaml
Successfully installed flaml-2.1.2


In [16]:
from flaml import AutoML

automl = AutoML()
automl_settings = {
    "time_budget": 5,
    "metric": 'accuracy',
    "task": 'classification',
}

2024-03-31 18:17:11,289	INFO util.py:124 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-03-31 18:17:12,452	INFO util.py:124 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [17]:
selected_cols = train.select_dtypes(exclude=['object']).columns
selected_cols = selected_cols.drop('class', errors='ignore')
test_selected_cols = test.select_dtypes(exclude=['object']).columns
X_train = train[selected_cols]
y_train = train['class']
X_test = test[test_selected_cols]

In [18]:
from sklearn.model_selection import train_test_split

X_trainsub, X_valid, y_trainsub, y_valid = train_test_split(X_train, y_train, test_size=0.3, train_size=0.7, random_state=True)

In [19]:
automl.fit(X_trainsub, y_trainsub, **automl_settings)

[flaml.automl.logger: 03-31 18:17:17] {1680} INFO - task = classification
[flaml.automl.logger: 03-31 18:17:17] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 03-31 18:17:17] {1789} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 03-31 18:17:17] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 03-31 18:17:17] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 03-31 18:17:17] {2345} INFO - Estimated sufficient time budget=2392s. Estimated necessary time budget=59s.
[flaml.automl.logger: 03-31 18:17:17] {2392} INFO -  at 0.9s,	estimator lgbm's best error=0.0156,	best estimator lgbm's best error=0.0156
[flaml.automl.logger: 03-31 18:17:17] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 03-31 18:17:18] {2392} INFO -  at 1.2s,	estimator lgbm's best error=0.0156,	best estimator lgbm's best error=0.0156
[flaml.automl.log

In [20]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best log_loss on validation data: {0:.4g}'.format(automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0}
Best log_loss on validation data: 0.01562
Training duration of best run: 0.3319 s


In [21]:
y_pred1 = automl.predict(X_valid)
rounded_predictions1 = np.round(y_pred1)
integer_predictions1 = rounded_predictions1.astype(int)

In [22]:
from sklearn.metrics import accuracy_score

accuracy1 = accuracy_score(y_valid, integer_predictions1)
print("Accuracy:", accuracy1)

Accuracy: 0.9851745576279293


## KNN Neighbors

In [23]:
X_trainsub = X_trainsub.fillna(X_trainsub.median()) 
X_valid = X_valid.fillna(X_valid.median()) 

In [24]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_trainsub, y_trainsub)

In [25]:
y_pred2 = knn.predict(X_valid)
rounded_predictions2 = np.round(y_pred2)
integer_predictions2 = rounded_predictions2.astype(int)

In [26]:
accuracy2 = accuracy_score(y_valid, integer_predictions2)
print("Accuracy:", accuracy2)

Accuracy: 0.983739837398374


## TFDF Gradient Boosted

In [27]:
x,valid = train_test_split(train, test_size=0.3, train_size=0.7, random_state=True)

In [28]:
import tensorflow_decision_forests as tfdf

df_train = tfdf.keras.pd_dataframe_to_tf_dataset(train, label="class")
df_valid = tfdf.keras.pd_dataframe_to_tf_dataset(valid, label="class")
df_test = tfdf.keras.pd_dataframe_to_tf_dataset(test)

2024-03-31 18:17:31.227550: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 18:17:31.227730: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 18:17:31.430108: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [29]:
model = tfdf.keras.GradientBoostedTreesModel(hyperparameter_template="better_default@v1")
model.fit(df_train)

Resolve hyper-parameter template "better_default@v1" to "better_default@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL'}.
Use /tmp/tmprhn5bpk0 as temporary training directory
Reading training dataset...




Training dataset read in 0:00:09.182321. Found 55760 examples.
Training model...
Model trained in 0:00:34.351287
Compiling model...


[INFO 24-03-31 18:18:29.7167 UTC kernel.cc:1233] Loading model from path /tmp/tmprhn5bpk0/model/ with prefix ee28ef562de24179
[INFO 24-03-31 18:18:29.7294 UTC quick_scorer_extended.cc:903] The binary was compiled without AVX2 support, but your CPU supports it. Enable it for faster model inference.
[INFO 24-03-31 18:18:29.7304 UTC abstract_model.cc:1344] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 24-03-31 18:18:29.7305 UTC kernel.cc:1061] Use fast generic engine


Model compiled.


<tf_keras.src.callbacks.History at 0x7a19494f9c60>

In [30]:
model.make_inspector().evaluation()

Evaluation(num_examples=None, accuracy=0.9864840507507324, loss=0.11983446031808853, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)

## Conclusion
Since TFDF GradientBoosted displayed a higher accuracy. It will be used.

In [31]:
y_pred = model.predict(df_test)



In [32]:
rounded_predictions = np.round(y_pred)
integer_predictions = rounded_predictions.astype(int)
submit['class'] = integer_predictions

In [33]:
submit['class'].unique()

array([0, 1])

In [34]:
submit['class'].value_counts()

class
0    13766
1      174
Name: count, dtype: int64

In [35]:
submit.to_csv('submission.csv',index=False)