In [90]:
import pandas as pd


train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [91]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [92]:

#Feature engineering
import numpy as np

train_data['age_category'] = pd.cut(train_data['age'],
                                    bins = [0 , 30 , 60 , 100],
                                    labels=['Young' , 'Prime_Investor' , 'Senior'])

conditions = [
    train_data['balance'] < 0,
    (train_data['balance'] >= 0) & (train_data['balance'] <= 6000),
    train_data['balance'] > 6000
]
choices = ["Indebt" , "Low_Balance" , "High_Balance"]
train_data["balance_category"] = np.select(conditions , choices)

train_data["education"] = train_data["education"].replace('unknown' , 'primary')

train_data["education_category"] = train_data["education"].map({
    "secondary" : "Secondary",
    "tertiary" : "Tertiary",
    "primary" : "Primary"
})

train_data["age_education_category"] = train_data["age_category"].astype(str) + '_' + train_data["education_category"].astype(str)
train_data["age_balance_segment"] = train_data["age_category"].astype(str) + "_" + train_data["balance_category"]


train_data['marital_obligation'] = train_data['marital'].map({
    'married': 'High_Obligations',
    'divorced': 'Medium_Obligations',
    'single': 'Low_Obligations'
})

train_data["marital_balance_segment"] = train_data["marital_obligation"] + "_" + train_data["balance_category"]
train_data.drop(["age_category", "balance_category", "education_category"], axis=1, inplace=True)

In [93]:
#test_data 
test_data['age_category'] = pd.cut(test_data['age'],
                                    bins=[0, 30, 60, 100],
                                    labels=['Young', 'Prime_Investor', 'Senior'])

conditions = [
    test_data['balance'] < 0,
    (test_data['balance'] >= 0) & (test_data['balance'] <= 6000),
    test_data['balance'] > 6000
]
choices = ["Indebt", "Low_Balance", "High_Balance"]
test_data["balance_category"] = np.select(conditions, choices)

test_data["education"] = test_data["education"].replace('unknown', 'primary')

test_data["education_category"] = test_data["education"].map({
    "secondary": "Secondary",
    "tertiary": "Tertiary",
    "primary": "Primary"
})

test_data["age_education_category"] = test_data["age_category"].astype(str) + '_' + test_data["education_category"].astype(str)
test_data["age_balance_segment"] = test_data["age_category"].astype(str) + "_" + test_data["balance_category"]

test_data['marital_obligation'] = test_data['marital'].map({
    'married': 'High_Obligations',
    'divorced': 'Medium_Obligations',
    'single': 'Low_Obligations'
})

test_data["marital_balance_segment"] = test_data["marital_obligation"] + "_" + test_data["balance_category"]
test_data.drop(["age_category", "balance_category", "education_category"], axis=1, inplace=True)





In [94]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 
                   'contact', 'month', 'poutcome', 'age_education_category', 
                   'age_balance_segment','marital_balance_segment' , "marital_obligation"]
for categ in categorical_cols:
    label = LabelEncoder()
    combined = pd.concat([train_data[categ] , test_data[categ]] , axis= 0).astype(str)
    label.fit(combined)
    train_data[categ] = label.transform(train_data[categ].astype(str))
    test_data[categ] = label.transform(test_data[categ].astype(str))

In [95]:
x_train = train_data.drop(["y"] , axis= 1)
y_train = train_data["y"]

In [96]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   id                       750000 non-null  int64
 1   age                      750000 non-null  int64
 2   job                      750000 non-null  int32
 3   marital                  750000 non-null  int32
 4   education                750000 non-null  int32
 5   default                  750000 non-null  int32
 6   balance                  750000 non-null  int64
 7   housing                  750000 non-null  int32
 8   loan                     750000 non-null  int32
 9   contact                  750000 non-null  int32
 10  day                      750000 non-null  int64
 11  month                    750000 non-null  int32
 12  duration                 750000 non-null  int64
 13  campaign                 750000 non-null  int64
 14  pdays                    750000 non-

In [100]:
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(x_train , y_train , random_state= 42)
mi_df = pd.DataFrame({
    'Feature': x_train.columns,
    'MI_Score': mi_scores
}).sort_values('MI_Score', ascending=False)

print(mi_df)

                    Feature  MI_Score
12                 duration  0.154533
7                   housing  0.121085
16                 poutcome  0.119448
6                   balance  0.069438
18      age_balance_segment  0.066048
9                   contact  0.058683
3                   marital  0.057603
4                 education  0.057308
11                    month  0.053703
20  marital_balance_segment  0.046553
17   age_education_category  0.035926
2                       job  0.034012
14                    pdays  0.030030
13                 campaign  0.024049
19       marital_obligation  0.020024
1                       age  0.018968
10                      day  0.014176
8                      loan  0.013088
15                 previous  0.012053
5                   default  0.000381
0                        id  0.000245


In [104]:
colums = ['id', 'default', 'marital_balance_segment', 'day', 'loan', 'previous']


train_data.drop(columns= colums , axis=1)
test_data.drop(columns= colums , axis=1)

Unnamed: 0,age,job,marital,education,balance,housing,contact,month,duration,campaign,pdays,poutcome,age_education_category,age_balance_segment,marital_obligation
0,32,1,1,1,1397,1,2,8,224,1,-1,3,1,2,0
1,44,4,1,2,23,1,0,0,586,2,-1,3,2,2,0
2,36,6,1,0,46,1,0,8,111,2,-1,3,0,2,0
3,58,1,1,1,-1380,1,2,8,125,1,-1,3,1,1,0
4,28,9,2,1,1950,1,0,5,181,1,-1,3,7,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,43,4,1,2,0,1,0,9,65,2,-1,3,2,2,0
249996,40,7,1,0,522,1,0,9,531,1,189,0,0,2,0
249997,63,5,1,0,33,0,0,5,178,1,92,2,3,5,0
249998,50,1,1,0,2629,1,2,8,163,2,-1,3,0,2,0


In [None]:
from xgboost import XGBClassifier

model_baseline = XGBClassifier(
    n_estiamtors = 100,
    max_depth = 3,
    min_child_weight = 5,
    scale_pos_weight = 7,
    colsample_bytree = 0.7,
    random_state = 42,
    n_jo0bs = -1
)

model_baseline.fit(x_train , y_train)

In [111]:
from sklearn.metrics import confusion_matrix , roc_auc_score , classification_report


y_pred = model_baseline.predict(x_train)
y_pred_proba = model_baseline.predict_proba(x_train)[: , 1]
print(confusion_matrix(y_train , y_pred))
print(classification_report(y_train , y_pred))
print(roc_auc_score(y_train , y_pred_proba))

[[572626  86886]
 [  6067  84421]]
              precision    recall  f1-score   support

           0       0.99      0.87      0.92    659512
           1       0.49      0.93      0.64     90488

    accuracy                           0.88    750000
   macro avg       0.74      0.90      0.78    750000
weighted avg       0.93      0.88      0.89    750000

0.9614017097552733


In [113]:
from sklearn.model_selection import RandomizedSearchCV

params_distribution = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [6, 8, 10],  # Added 10
    "colsample_bytree": [0.3, 0.5, 0.7, 0.9],  # Removed 0.1, added 0.9
    "scale_pos_weight": [6, 7, 8, 9],  # Focused around 7
    "subsample": [0.7, 0.8, 0.9, 1.0],  # Optional: add row sampling
    "min_child_weight": [1, 3, 5]  # Optional: add regularization
}

model_tuned = RandomizedSearchCV(
    estimator= XGBClassifier(random_state = 42 , eval_metrics ='logloss'),
    param_distributions=params_distribution,
    scoring="roc_auc",
    n_iter=50,
    cv= 3,
    n_jobs= -1,
    random_state= 42,
    verbose= 2
)

In [114]:
model_tuned.fit(x_train , y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "eval_metrics" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [116]:
best_model = model_tuned.best_estimator_

y_pred = best_model.predict(x_train)
y_pred_proba = best_model.predict_proba(x_train)[: , 1]
print("Confusion Matrix:")
print(confusion_matrix(y_train , y_pred))
print(classification_report(y_train , y_pred))
print(roc_auc_score(y_train , y_pred_proba))

Confusion Matrix:
[[590186  69326]
 [  5518  84970]]
              precision    recall  f1-score   support

           0       0.99      0.89      0.94    659512
           1       0.55      0.94      0.69     90488

    accuracy                           0.90    750000
   macro avg       0.77      0.92      0.82    750000
weighted avg       0.94      0.90      0.91    750000

0.9719893992700764


In [119]:
y_prediction_test = best_model.predict(test_data)

In [121]:
y_test = pd.read_csv("test.csv")

submisson = pd.DataFrame({
    "id" : y_test["id"] ,
    "y" : y_prediction_test
})

submisson.to_csv("submisson.csv" , index=False)