### Importing Libraries

In [171]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

### Loading data

In [172]:
data = pd.read_csv('application_record.csv')
data.sample(20)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
99148,5408340,M,N,N,0,157500.0,Pensioner,Higher education,Married,House / apartment,-21555,365243,1,0,0,0,,2
332131,6353712,M,Y,Y,2,360000.0,Working,Secondary / secondary special,Married,House / apartment,-16895,-679,1,0,0,0,,4
191657,5877316,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-19197,-4810,1,0,0,0,Drivers,2
87365,5346023,F,N,Y,0,157500.0,Working,Higher education,Married,House / apartment,-19466,-4414,1,0,0,0,Accountants,2
431805,7466774,F,N,Y,0,121500.0,Working,Secondary / secondary special,Married,House / apartment,-9913,-1250,1,1,1,0,Laborers,2
236206,6604285,F,N,Y,0,405000.0,State servant,Secondary / secondary special,Married,House / apartment,-13824,-4760,1,0,1,1,Core staff,2
255345,6063972,M,N,Y,1,135000.0,Commercial associate,Higher education,Married,House / apartment,-14207,-854,1,0,0,0,Security staff,3
45019,5135666,F,N,Y,0,216000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-17644,-7840,1,0,0,0,Laborers,2
367978,6539334,F,N,Y,0,175500.0,Working,Higher education,Single / not married,House / apartment,-12615,-4763,1,0,1,0,Core staff,1
155132,5731914,M,Y,Y,0,310500.0,Working,Incomplete higher,Married,House / apartment,-21001,-981,1,1,1,0,Sales staff,2


### Check the correlation between CNT_FAM_MEMBERS and CNT_CHILDREN

In [173]:
# Create two arrays of data
child_count = data['CNT_CHILDREN']
fam_count = data['CNT_FAM_MEMBERS']

# Calculate the Pearson correlation coefficient
corr, pval = stats.pearsonr(child_count, fam_count)

# Print the result
print(f"Pearson correlation coefficient: {corr}")

Pearson correlation coefficient: 0.8847821164512261


### Dropping CNT_FAM_MEMBERS as its too correlated with CNT_CHILDREN our traget (y)

In [174]:
data.drop('CNT_FAM_MEMBERS', axis=1, inplace=True)

### Getting data info to see columns type, number of rows and missing values

In [175]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438552 entries, 0 to 438551
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438552 non-null  int64  
 1   CODE_GENDER          438552 non-null  object 
 2   FLAG_OWN_CAR         438552 non-null  object 
 3   FLAG_OWN_REALTY      438552 non-null  object 
 4   CNT_CHILDREN         438552 non-null  int64  
 5   AMT_INCOME_TOTAL     438552 non-null  float64
 6   NAME_INCOME_TYPE     438552 non-null  object 
 7   NAME_EDUCATION_TYPE  438552 non-null  object 
 8   NAME_FAMILY_STATUS   438552 non-null  object 
 9   NAME_HOUSING_TYPE    438552 non-null  object 
 10  DAYS_BIRTH           438552 non-null  int64  
 11  DAYS_EMPLOYED        438552 non-null  int64  
 12  FLAG_MOBIL           438552 non-null  int64  
 13  FLAG_WORK_PHONE      438552 non-null  int64  
 14  FLAG_PHONE           438552 non-null  int64  
 15  FLAG_EMAIL       

### Dropping rows that OCCUPATION_TYPE is missing as its categorial and we cannot fill in this value

In [176]:
data.dropna(inplace=True)

### Creating an age column instead of DAYS_BIRTH and creating an emplyee status column instead of DAYS_EMPLOYED -- because we have the total income it doesnt matter for us how much time he works, we just care if he is employed or not

In [177]:
data['AGE']=-(data['DAYS_BIRTH'])//365
data['EMPLOYMENT_STATUS'] = data['DAYS_EMPLOYED'].apply(lambda x: 1 if x < 0 else 0)
data.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,AGE,EMPLOYMENT_STATUS
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,58,1
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,52,1
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,52,1
5,5008810,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,52,1
6,5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,52,1


### Removing DAYS_BIRTH and DAYS_EMPLOYED and ID

In [178]:
data = data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED', 'ID'], axis=1)
data

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,AGE,EMPLOYMENT_STATUS
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,58,1
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
5,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
6,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438536,M,N,Y,0,202500.0,Working,Higher education,Civil marriage,House / apartment,1,1,0,0,Laborers,37,1
438543,M,Y,Y,1,135000.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Laborers,34,1
438548,F,N,N,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,1,0,0,0,Laborers,43,1
438549,F,N,N,0,54000.0,Commercial associate,Higher education,Single / not married,With parents,1,1,0,0,Sales staff,22,1


##### Removing unique rows in CNT_CHILDREN

In [179]:
unique = data['CNT_CHILDREN'].value_counts()[data['CNT_CHILDREN'].value_counts() == 1].index.tolist()
data = data[~data['CNT_CHILDREN'].isin(unique)]
data

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,AGE,EMPLOYMENT_STATUS
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,58,1
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
5,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
6,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438536,M,N,Y,0,202500.0,Working,Higher education,Civil marriage,House / apartment,1,1,0,0,Laborers,37,1
438543,M,Y,Y,1,135000.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Laborers,34,1
438548,F,N,N,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,1,0,0,0,Laborers,43,1
438549,F,N,N,0,54000.0,Commercial associate,Higher education,Single / not married,With parents,1,1,0,0,Sales staff,22,1


### Dividing into x and y and dropping ID column

In [180]:
X = data.drop('CNT_CHILDREN', axis=1)
y = data['CNT_CHILDREN']
data.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,AGE,EMPLOYMENT_STATUS
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,58,1
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
5,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
6,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1


In [181]:
X

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,AGE,EMPLOYMENT_STATUS
2,M,Y,Y,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,58,1
3,F,N,Y,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
4,F,N,Y,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
5,F,N,Y,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
6,F,N,Y,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,52,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438536,M,N,Y,202500.0,Working,Higher education,Civil marriage,House / apartment,1,1,0,0,Laborers,37,1
438543,M,Y,Y,135000.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Laborers,34,1
438548,F,N,N,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,1,0,0,0,Laborers,43,1
438549,F,N,N,54000.0,Commercial associate,Higher education,Single / not married,With parents,1,1,0,0,Sales staff,22,1


In [182]:
y

2         0
3         0
4         0
5         0
6         0
         ..
438536    0
438543    1
438548    0
438549    0
438551    0
Name: CNT_CHILDREN, Length: 304349, dtype: int64

## Prepering data for Machine Learning Models

### First, dividing into categorial columns and numerical columns

In [183]:
data.columns

Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'AGE',
       'EMPLOYMENT_STATUS'],
      dtype='object')

In [184]:
data.head(1)

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,AGE,EMPLOYMENT_STATUS
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,58,1


In [185]:
categorical_cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
                    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_WORK_PHONE',
                  'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'EMPLOYMENT_STATUS']
numerical_cols = ['AMT_INCOME_TOTAL', 'AGE']

### Onehotencoder on the categorial columns and standardscaler

In [186]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols),
                                               ('cat', categorical_transformer, categorical_cols)])

### Dividing the data to train and test

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Setting modules

In [188]:
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier()
}

In [189]:
# Initialize dictionary to store evaluation metrics for each model
model_scores = {}

# Initialize k-fold cross-validation
kf = StratifiedKFold(n_splits=10)

# Train and evaluate each model using k-fold cross-validation
for model_name, model in models.items():
    print(f"Training {model_name}...")
    accuracy_scores = []
    f1_scores = []
    roc_auc_scores = []
    precision_scores = []
    recall_scores = []
    
    for train_idx, val_idx in kf.split(X_train_preprocessed, y_train):
        # Split training data into training and validation sets
        X_train_kf, X_val_kf = X_train_preprocessed[train_idx], X_train_preprocessed[val_idx]
        y_train_kf, y_val_kf = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # Fit the model on training data
        model.fit(X_train_kf, y_train_kf)
            # Split the data into training and validation sets based on the indices generated by the k-fold splitter
    X_train, y_train = X_train_preprocessed.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train_preprocessed.iloc[val_idx], y_train.iloc[val_idx]

    # Train and evaluate decision tree classifier
    dtc = DecisionTreeClassifier(random_state=42)
    dtc.fit(X_train, y_train)
    y_val_pred = dtc.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    roc_auc = roc_auc_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    dtc_results.append((accuracy, f1, roc_auc, precision, recall))

    # Train and evaluate random forest classifier
    rfc = RandomForestClassifier(random_state=42)
    rfc.fit(X_train, y_train)
    y_val_pred = rfc.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    roc_auc = roc_auc_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    rfc_results.append((accuracy, f1, roc_auc, precision, recall))

    # Train and evaluate support vector machine classifier
    svc = SVC(random_state=42, probability=True)
    svc.fit(X_train, y_train)
    y_val_pred = svc.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    roc_auc = roc_auc_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    svc_results.append((accuracy, f1, roc_auc, precision, recall))

    # Train and evaluate k-nearest neighbors classifier
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    y_val_pred = knn.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    roc_auc = roc_auc_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    knn_results.append((accuracy, f1, roc_auc, precision, recall))

# Calculate average of metrics across all folds
dtc_avg_results = np.mean(dtc_results, axis=0)
rfc_avg_results = np.mean(rfc_results, axis=0)
svc_avg_results = np.mean(svc_results, axis=0)
knn_avg_results = np.mean(knn_results, axis=0)


Training Decision Tree...


ValueError: Found input variables with inconsistent numbers of samples: [2434792, 243479]