## Classification on credit card approval

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv("credit_card.csv")
labels = pd.read_csv("credit_card_label.csv")
df.head()

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2


In [3]:
labels.head()

Unnamed: 0,Ind_ID,label
0,5008827,1
1,5009744,1
2,5009746,1
3,5009749,1
4,5009752,1


### Label information
0 is application approved and 1 is application rejected.

In [4]:
# merge label to dataframe
df = pd.merge(df, labels[["Ind_ID", "label"]], on="Ind_ID", how="left")

### Dropping unrelevant variables

In [5]:
df = df.drop(["Mobile_phone", "Work_Phone", "Phone", "EMAIL_ID"], axis=1)
df.head()

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Type_Occupation,Family_Members,label
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,,2,1
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,,2,1
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,,2,1
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,,2,1
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,,2,1


In [6]:
df.shape

(1548, 15)

### Checking for missing values

In [7]:
df.isnull().sum()

Ind_ID               0
GENDER               7
Car_Owner            0
Propert_Owner        0
CHILDREN             0
Annual_income       23
Type_Income          0
EDUCATION            0
Marital_status       0
Housing_type         0
Birthday_count      22
Employed_days        0
Type_Occupation    488
Family_Members       0
label                0
dtype: int64

In [8]:
# filling in "GENDER" and "Type_Occupation" with mode
df['GENDER'].fillna(df['GENDER'].mode()[0])
df['Type_Occupation'].fillna(df['Type_Occupation'].mode()[0], inplace=True)

# filling in "Annual_income" and "Birthday_count" with mean
df['Annual_income'].fillna(df['Annual_income'].mean(), inplace=True)
df['Birthday_count'].fillna(df['Birthday_count'].mean(), inplace=True)


In [9]:
df["Birthday_count"] = (df["Birthday_count"] * -1) / 365
df["Employed_days"] = (df["Employed_days"] * -1)
df = df.rename(columns={"Birthday_count": "Age"})
df.head()

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Age,Employed_days,Type_Occupation,Family_Members,label
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,51.430137,-365243,Laborers,2,1
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,37.142466,586,Laborers,2,1
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,43.946143,586,Laborers,2,1
3,5009749,F,Y,N,0,191399.32623,Commercial associate,Higher education,Married,House / apartment,37.142466,586,Laborers,2,1
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,37.142466,586,Laborers,2,1


### Unbalanced label 

In [10]:
df["label"].value_counts()

label
0    1373
1     175
Name: count, dtype: int64

In [11]:
# balancing label
under_sample_0 = df.query('label == 0').sample(n=175)
under_sample_1 = df.query('label == 1').sample(n=175)
balanced_df = pd.concat([under_sample_1, under_sample_0], ignore_index = True)
balanced_df = shuffle(balanced_df)
balanced_df["label"].value_counts()

label
0    175
1    175
Name: count, dtype: int64

### Random Forest Classifier

In [12]:
X = balanced_df[["Annual_income", "Age", "Employed_days", "Family_Members"]]
y = balanced_df[["label"]]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8,
                                                    test_size = 0.2, random_state = 100)

rf = RandomForestClassifier(100)
rf.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [14]:
prediction = rf.predict(X_test)
f1 = f1_score(y_test, prediction)
f1

0.7272727272727272