# Credit Card Approval Project Using Logistic Regression

In [107]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [108]:
df = pd.read_csv("credit_card_approval.csv")

In [109]:
len(df)

537667

In [110]:
df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,JOB,BEGIN_MONTHS,STATUS,TARGET
0,5065438,F,Y,N,2+ children,270000.0,Secondary / secondary special,Married,With parents,-13258,-2300,1,0,0,0,Managers,-6,C,0
1,5142753,F,N,N,No children,81000.0,Secondary / secondary special,Single / not married,House / apartment,-17876,-377,1,1,1,0,Private service staff,-4,0,0
2,5111146,M,Y,Y,No children,270000.0,Higher education,Married,House / apartment,-19579,-1028,1,0,1,0,Laborers,0,C,0
3,5010310,F,Y,Y,1 children,112500.0,Secondary / secondary special,Married,House / apartment,-15109,-1956,1,0,0,0,Core staff,-3,0,0
4,5010835,M,Y,Y,2+ children,139500.0,Secondary / secondary special,Married,House / apartment,-17281,-5578,1,1,0,0,Drivers,-29,0,0


## Step 1: Business Understanding

We want to check if a customer will get approved for a card (TARGET == 0), or denied (TARGET == 1)

## Step 2: Data Understanding

In [111]:
df.TARGET.value_counts()

TARGET
0    535705
1      1962
Name: count, dtype: int64

There are over 500k data points, and all of them have a value for TARGET. It is unlikely that we'll need more data.

Let's transpose the head of the table so we can more easily see the columns and their values.

In [112]:
df.head().T

Unnamed: 0,0,1,2,3,4
ID,5065438,5142753,5111146,5010310,5010835
CODE_GENDER,F,F,M,F,M
FLAG_OWN_CAR,Y,N,Y,Y,Y
FLAG_OWN_REALTY,N,N,Y,Y,Y
CNT_CHILDREN,2+ children,No children,No children,1 children,2+ children
AMT_INCOME_TOTAL,270000.0,81000.0,270000.0,112500.0,139500.0
NAME_EDUCATION_TYPE,Secondary / secondary special,Secondary / secondary special,Higher education,Secondary / secondary special,Secondary / secondary special
NAME_FAMILY_STATUS,Married,Single / not married,Married,Married,Married
NAME_HOUSING_TYPE,With parents,House / apartment,House / apartment,House / apartment,House / apartment
DAYS_BIRTH,-13258,-17876,-19579,-15109,-17281


In [113]:
df.dtypes

ID                       int64
CODE_GENDER             object
FLAG_OWN_CAR            object
FLAG_OWN_REALTY         object
CNT_CHILDREN            object
AMT_INCOME_TOTAL       float64
NAME_EDUCATION_TYPE     object
NAME_FAMILY_STATUS      object
NAME_HOUSING_TYPE       object
DAYS_BIRTH               int64
DAYS_EMPLOYED            int64
FLAG_MOBIL               int64
FLAG_WORK_PHONE          int64
FLAG_PHONE               int64
FLAG_EMAIL               int64
JOB                     object
BEGIN_MONTHS             int64
STATUS                  object
TARGET                   int64
dtype: object

In [114]:
df.columns = df.columns.str.lower().str.replace(" ", "_")
string_columns = list(df.dtypes[df.dtypes == "object"].index)
for column in string_columns:
    df[column] = df[column].str.lower().str.replace(" ", "_")

In [115]:
df.head().T

Unnamed: 0,0,1,2,3,4
id,5065438,5142753,5111146,5010310,5010835
code_gender,f,f,m,f,m
flag_own_car,y,n,y,y,y
flag_own_realty,n,n,y,y,y
cnt_children,2+_children,no_children,no_children,1_children,2+_children
amt_income_total,270000.0,81000.0,270000.0,112500.0,139500.0
name_education_type,secondary_/_secondary_special,secondary_/_secondary_special,higher_education,secondary_/_secondary_special,secondary_/_secondary_special
name_family_status,married,single_/_not_married,married,married,married
name_housing_type,with_parents,house_/_apartment,house_/_apartment,house_/_apartment,house_/_apartment
days_birth,-13258,-17876,-19579,-15109,-17281


Look at the target variable: target. It's currently numerical, with two values, 1 and 0. For binary classification, all models typically expect a number, so we don't need to do any conversions here.

## Step 3: Data Preparation

In [116]:
from sklearn.model_selection import train_test_split

In [117]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [118]:
df_train_full.head().T

Unnamed: 0,350066,394166,402624,105907,317369
id,5048639,5117739,5029654,5095468,5132411
code_gender,m,f,f,m,f
flag_own_car,n,y,n,y,y
flag_own_realty,n,n,y,y,y
cnt_children,2+_children,no_children,no_children,2+_children,no_children
amt_income_total,135000.0,225000.0,630000.0,225000.0,292500.0
name_education_type,secondary_/_secondary_special,higher_education,higher_education,higher_education,higher_education
name_family_status,married,single_/_not_married,separated,married,married
name_housing_type,house_/_apartment,house_/_apartment,house_/_apartment,house_/_apartment,house_/_apartment
days_birth,-13639,-13941,-22001,-13170,-18409


In [119]:
df_train_full.shape

(430133, 19)

In [120]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=11)

In [121]:
len(df_train), len(df_val), len(df_test)

(322599, 107534, 107534)

In [122]:
len(df) - (len(df_train) + len(df_val) + len(df_test))

0

In [123]:
len(df_train) / 537667, len(df_val) / 537667, len(df_test) / 537667

(0.5999977681352956, 0.20000111593235217, 0.20000111593235217)

It looks like we're using every piece of the original dataset, so we can move on to the next step, getting the target values.

In [124]:
y_train = df_train.target.values
y_val = df_val.target.values

In [125]:
# del df_train["target"]
# del df_val["target"]

In [126]:
df_train_full.isnull().sum()

id                     0
code_gender            0
flag_own_car           0
flag_own_realty        0
cnt_children           0
amt_income_total       0
name_education_type    0
name_family_status     0
name_housing_type      0
days_birth             0
days_employed          0
flag_mobil             0
flag_work_phone        0
flag_phone             0
flag_email             0
job                    0
begin_months           0
status                 0
target                 0
dtype: int64

It prints all zeros, so we have no missing values in the dataset and don't need to do anything extra null handling.

In [127]:
df_train_full["target"].value_counts()

target
0    428559
1      1574
Name: count, dtype: int64

Most of the customers got approved for credit cards.

In [128]:
global_mean = df_train_full["target"].mean()
round(global_mean, 5)

np.float64(0.00366)

This credit card approval dataset is an example of a so-called imbalanced dataset. There were 273 times as many people who did get approved in our dataset as those who didn't get approved, and we say that the approved class dominates the non-approved class. The approval rate in our data is ~99.6% (1 - 0.00366), which is a strong indicator of class imbalance. The opposite of imbalanced is the balanced case, when positive and negative classes are equally distributed among all observations.

Both the categorical and numerical variables in our dataset are important, but they are also different and need different treatment. For that, we want to look at them separately.

In [129]:
df.head().T

Unnamed: 0,0,1,2,3,4
id,5065438,5142753,5111146,5010310,5010835
code_gender,f,f,m,f,m
flag_own_car,y,n,y,y,y
flag_own_realty,n,n,y,y,y
cnt_children,2+_children,no_children,no_children,1_children,2+_children
amt_income_total,270000.0,81000.0,270000.0,112500.0,139500.0
name_education_type,secondary_/_secondary_special,secondary_/_secondary_special,higher_education,secondary_/_secondary_special,secondary_/_secondary_special
name_family_status,married,single_/_not_married,married,married,married
name_housing_type,with_parents,house_/_apartment,house_/_apartment,house_/_apartment,house_/_apartment
days_birth,-13258,-17876,-19579,-15109,-17281


In [146]:
categorical = ["code_gender", "flag_own_car", "flag_own_realty", "cnt_children",
               "name_education_type", "name_family_status", "name_housing_type",
               "job", "status"]
numerical = ["amt_income_total", "days_birth", "days_employed", "flag_mobil",
            "flag_work_phone", "flag_phone", "flag_email", "begin_months"]

In [147]:
df_train_full[categorical].nunique()

code_gender             2
flag_own_car            2
flag_own_realty         2
cnt_children            3
name_education_type     5
name_family_status      5
name_housing_type       6
job                    18
status                  8
dtype: int64

In [148]:
df_train_full["job"].value_counts()

job
laborers                 105312
core_staff                61728
sales_staff               56151
managers                  54218
drivers                   38100
high_skill_tech_staff     25422
accountants               21742
medicine_staff            21392
cooking_staff             10766
security_staff             9922
cleaning_staff             9091
private_service_staff      5384
low-skill_laborers         2919
secretaries                2532
waiters/barmen_staff       2051
hr_staff                   1353
it_staff                   1056
realty_agents               994
Name: count, dtype: int64

job has many possible values, so if we do one-hot encoding it'll result in a very large table. I don't know how to manage such large possible values for a categorical field yet. I could group them, but what metric would I use? For now, I'll just use my laptop and create a large table, it can handle it.

In [150]:
female_mean = df_train_full[df_train_full["code_gender"] == "f"].target.mean()
print("gender == f:", round(female_mean, 5))

male_mean = df_train_full[df_train_full["code_gender"] == "m"].target.mean()
print("gender == m:", round(male_mean, 5))

gender == f: 0.0029
gender == m: 0.00491


Males are much more likely to be declined than approved. Let's calculate the declinal rate for the code_gender variable.

In [151]:
global_mean = df_train_full.target.mean()

df_group = df_train_full.groupby(by="code_gender").target.agg(["mean"])
df_group["diff"] = df_group["mean"] - global_mean
df_group["risk"] = df_group["mean"] / global_mean

df_group

Unnamed: 0_level_0,mean,diff,risk
code_gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f,0.002896,-0.000764,0.791309
m,0.004909,0.001249,1.3414


In [152]:
from IPython.display import display


for col in categorical:
    df_group = df_train_full.groupby(by=col).target.agg(["mean"])
    df_group["diff"] = df_group["mean"] - global_mean
    df_group["risk"] = df_group["mean"] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
code_gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f,0.002896,-0.000764,0.791309
m,0.004909,0.001249,1.3414


Unnamed: 0_level_0,mean,diff,risk
flag_own_car,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
n,0.003509,-0.00015,0.958934
y,0.003858,0.000199,1.054387


Unnamed: 0_level_0,mean,diff,risk
flag_own_realty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
n,0.003149,-0.00051,0.860543
y,0.003943,0.000284,1.077539


Unnamed: 0_level_0,mean,diff,risk
cnt_children,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1_children,0.004654,0.000995,1.271888
2+_children,0.002735,-0.000924,0.747377
no_children,0.003469,-0.00019,0.94808


Unnamed: 0_level_0,mean,diff,risk
name_education_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
academic_degree,0.0,-0.003659,0.0
higher_education,0.004425,0.000766,1.209255
incomplete_higher,0.003761,0.000102,1.027906
lower_secondary,0.010168,0.006508,2.778547
secondary_/_secondary_special,0.003246,-0.000413,0.887042


Unnamed: 0_level_0,mean,diff,risk
name_family_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
civil_marriage,0.001307,-0.002353,0.357068
married,0.003543,-0.000116,0.968185
separated,0.002788,-0.000872,0.761754
single_/_not_married,0.005843,0.002184,1.596733
widow,0.006199,0.00254,1.694076


Unnamed: 0_level_0,mean,diff,risk
name_housing_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
co-op_apartment,0.009591,0.005931,2.620848
house_/_apartment,0.003604,-5.5e-05,0.984943
municipal_apartment,0.005135,0.001475,1.403155
office_apartment,0.005675,0.002016,1.550837
rented_apartment,0.002919,-0.00074,0.797763
with_parents,0.002889,-0.00077,0.789574


Unnamed: 0_level_0,mean,diff,risk
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accountants,0.00299,-0.00067,0.816981
cleaning_staff,0.00209,-0.001569,0.571137
cooking_staff,0.003994,0.000335,1.091471
core_staff,0.003823,0.000164,1.044787
drivers,0.004042,0.000383,1.104571
high_skill_tech_staff,0.004524,0.000864,1.236193
hr_staff,0.000739,-0.00292,0.201976
it_staff,0.015152,0.011492,4.140512
laborers,0.003181,-0.000478,0.869291
low-skill_laborers,0.016787,0.013127,4.58733


Unnamed: 0_level_0,mean,diff,risk
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,-0.003659,0.0
1,0.0,-0.003659,0.0
2,1.0,0.996341,273.273825
3,1.0,0.996341,273.273825
4,1.0,0.996341,273.273825
5,1.0,0.996341,273.273825
c,0.0,-0.003659,0.0
x,0.0,-0.003659,0.0


It seems like the status variable will help us a lot to see if a customer will be approved or declined.

Higher values of mutual information mean a higher degree of dependence: if the mutual information between a categorical variable and the target is high, this categorical variable will be quite useful for predicting the target. On the other hand, if the mutual information is low, the categorical variable and the target are independent, and thus the variable will not be useful for predicting the target.

In [153]:
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    return mutual_info_score(series, df_train_full.target)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name="MI")
df_mi

Unnamed: 0,MI
status,0.024183
job,0.000474
name_family_status,0.000175
code_gender,0.000127
name_education_type,7.4e-05
cnt_children,4.9e-05
name_housing_type,4e-05
flag_own_realty,2e-05
flag_own_car,4e-06


status seems to be the most important feature by a wide margin.

Mutual information is a way to quantify the degree of dependency between two categorical variables, but it doesn’t work when one of the features is numerical, so we cannot apply it to the 10 numerical variables that we have, but we can use the correlation coefficient.

In [154]:
# df_train_full[numerical].corrwith(df_train_full.target)

The line above gives us an error, this is because "flag_mobil" only has 1 value (1). We can remove it from the numerical table and try again.

In [155]:
df["flag_mobil"].value_counts()

flag_mobil
1    537667
Name: count, dtype: int64

In [156]:
numerical.remove("flag_mobil")

In [157]:
df_train_full[numerical].corrwith(df_train_full.target).to_frame("correlation")

Unnamed: 0,correlation
amt_income_total,0.002681
days_birth,-0.005737
days_employed,0.019841
flag_work_phone,0.004211
flag_phone,0.009162
flag_email,-0.0021
begin_months,-0.001742


Correlation between numerical variables and target. days_birth has a high negative correlation: as days_birth grows, target (decline) rate goes down. days_employed has positive correlation: the more days_employed they are, the more likely they are to target (decline).

When the number of features grows, the one-hot encoding process becomes tedious. Luckily, Scikit-learn can perform one-hot encoding in multiple ways, here we will use DictVectorizer.

DictVectorizer takes in a dictionary and vectorizes it. It creates vectors from the dictionary. Then the vectors are put together as rows of one matrix. This matrix is used as input to a machine learning algorithm.

To use this method, we need to convert our dataframe to a list of dictionaries, which is simple to do in Pandas using the to_dict method with the orient="records" parameter:

In [163]:
train_dict = df_train[categorical + numerical].to_dict(orient="records")

In [164]:
train_dict[0]

{'code_gender': 'm',
 'flag_own_car': 'y',
 'flag_own_realty': 'n',
 'cnt_children': '1_children',
 'name_education_type': 'secondary_/_secondary_special',
 'name_family_status': 'married',
 'name_housing_type': 'house_/_apartment',
 'job': 'drivers',
 'status': '0',
 'amt_income_total': 360000.0,
 'days_birth': -11294,
 'days_employed': -3536,
 'flag_work_phone': 0,
 'flag_phone': 1,
 'flag_email': 0,
 'begin_months': -37}

Now we can use DictVectorizer. We create it and then fit it to the list of dictionaries we created previously:

In [166]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In this code we create a DictVectorizer instance, which we call dv and "train" it by invoking the fit method. The fit method looks at the content of these dictionaries and figures out the possible values for each variable and how to map them to the columns in the output matrix. If a feature is categorical, it applies the one-hot encoding scheme, but if a feature is numerical, it's left intact.

The DictVectorizer class can take in a set of parameters. We specify one of them: sparese=False. This parameter means that the created matrix will not be sparse and instead will create a simple NumPy array.

After we fit the vectorizer, we can use it for converting the dictionaries to a matrix by using the transform method:

In [167]:
X_train = dv.transform(train_dict)

In [168]:
X_train

array([[ 3.600e+05, -3.700e+01,  1.000e+00, ...,  0.000e+00,  0.000e+00,
         0.000e+00],
       [ 1.575e+05, -3.900e+01,  1.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 1.125e+05, -2.400e+01,  1.000e+00, ...,  0.000e+00,  0.000e+00,
         0.000e+00],
       ...,
       [ 1.260e+05, -2.000e+00,  0.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 4.950e+04, -2.000e+00,  0.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 1.395e+05, -6.000e+00,  0.000e+00, ...,  0.000e+00,  0.000e+00,
         0.000e+00]], shape=(322599, 58))

In [169]:
X_train.shape

(322599, 58)

In [170]:
X_train[0]

array([ 3.6000e+05, -3.7000e+01,  1.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  1.0000e+00, -1.1294e+04, -3.5360e+03,  0.0000e+00,
        0.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
        1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00])

The elements are one-hot encoded categorical variables. Not all of them are ones and zeros, however. We see that three of them are other numbers. These are our numeric variables: monthlycharges, tenure, and totalcharges.

In [171]:
dv.get_feature_names_out()

array(['amt_income_total', 'begin_months', 'cnt_children=1_children',
       'cnt_children=2+_children', 'cnt_children=no_children',
       'code_gender=f', 'code_gender=m', 'days_birth', 'days_employed',
       'flag_email', 'flag_own_car=n', 'flag_own_car=y',
       'flag_own_realty=n', 'flag_own_realty=y', 'flag_phone',
       'flag_work_phone', 'job=accountants', 'job=cleaning_staff',
       'job=cooking_staff', 'job=core_staff', 'job=drivers',
       'job=high_skill_tech_staff', 'job=hr_staff', 'job=it_staff',
       'job=laborers', 'job=low-skill_laborers', 'job=managers',
       'job=medicine_staff', 'job=private_service_staff',
       'job=realty_agents', 'job=sales_staff', 'job=secretaries',
       'job=security_staff', 'job=waiters/barmen_staff',
       'name_education_type=academic_degree',
       'name_education_type=higher_education',
       'name_education_type=incomplete_higher',
       'name_education_type=lower_secondary',
       'name_education_type=secondary_/_second

For each categorical feature it creates multiple columns for each of its distinct values. Numerical features such as days_birth and days_employed keep the original names because they are numerical; therefore, DictVectorizer doesn't change them.

## Step 4: Modeling

In [172]:
import math

def sigmoid(score):
    return 1 / (1 + math.exp(-score))

In [173]:
from sklearn.linear_model import LogisticRegression

In [174]:
model = LogisticRegression(solver="liblinear", random_state=1)
model.fit(X_train, y_train)

Convert the dataframe to a list of dictionaries and then feed it to the DictVectorizer we fit previously:

In [175]:
val_dict = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dict)

As a result, we get X_val, a matrix with features from the validation dataset. Now we are ready to put this matrix to the model. To get the probabilities, we use the predict_proba method of the model:

In [176]:
y_pred = model.predict_proba(X_val)

The result of predict_proba is a two-dimensional NumPy array, or a two-column matrix. The first column of the array contains the probability that the target is negative (approval), and the second column contains the probability that the targt is positive (decline).

These columns convey the same information. We know the probability of declinal—it’s p, and the probability of approval is always 1 – p, so we don’t need both columns.

In [177]:
y_pred = model.predict_proba(X_val)[:, 1]

In [179]:
y_pred[:20]

array([0.00240627, 0.00376124, 0.00431213, 0.00404527, 0.00546687,
       0.00305061, 0.0011262 , 0.00321469, 0.00749929, 0.0101175 ,
       0.00031611, 0.0004437 , 0.0050112 , 0.00351286, 0.00508127,
       0.01885304, 0.0004528 , 0.00687637, 0.00175445, 0.00153127])

To get the binary predictions, we take the probabilities and cut them above a certain threshold. If the probability for a customer is higher than this threshold, we predict a decline, otherwise, an approval. This threshold can change depending on the current market conditions for credit card companies.

In [221]:
pd.Series(y_pred >= 0.01).value_counts()

False    91911
True     15623
Name: count, dtype: int64

In [200]:
df_val["status"].value_counts()

status
c    45191
0    40259
x    20387
1     1301
5      202
2      124
3       38
4       32
Name: count, dtype: int64

In [222]:
decline = y_pred >= 0.01

This quality measure is called accuracy. It's very easy to calculate accuracy with NumPy:

In [225]:
(y_val == decline).mean()

np.float64(0.8522234828054382)

y_val contains only zeroes and ones, it is our target variable because it gives us one if the customer got declined and zero otherwise. decline contains Boolean predictions. In this case, True means we predict the customer will be declined, and False means the customer will be approved.

If the true value in y_pred matches our prediction in decline, the label is True, and if it doesn’t, the label is False. In other words, we have True if our prediction is correct and False if it’s not.

We see 0.85 as the output. This means that the model predictions matched the actual value 85% of the time, or the model makes correct predictions in 85% of cases. This is what we call the accuracy of the model.

## Step 5: Evaluation

We know that the logistic regression model has two parameters that it learns from data:

- w0 is the bias term.
- w = (w1, w2, ..., wn) is the weights vector.

We can get the bias term from model.intercept_[0]. When we train our model on all features, the bias term is –0.12.

The rest of the weights are stored in model.coef_[0]. If we look inside, it’s just an array of numbers, which is hard to understand on its own.

To see which feature is associated with each weight, we can use the get_feature_names_out method of the DictVectorizer. We can zip the feature names together with the coefficients before looking at them:

In [234]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(10)))

{'amt_income_total': np.float64(-5.8332e-06),
 'begin_months': np.float64(4.0714e-06),
 'cnt_children=1_children': np.float64(-8.01e-08),
 'cnt_children=2+_children': np.float64(-1.061e-07),
 'cnt_children=no_children': np.float64(-1.049e-07),
 'code_gender=f': np.float64(-2.904e-07),
 'code_gender=m': np.float64(-7e-10),
 'days_birth': np.float64(0.0002885935),
 'days_employed': np.float64(0.0002481958),
 'flag_email': np.float64(-5.68e-08),
 'flag_own_car=n': np.float64(-2.041e-07),
 'flag_own_car=y': np.float64(-8.71e-08),
 'flag_own_realty=n': np.float64(-2.088e-07),
 'flag_own_realty=y': np.float64(-8.24e-08),
 'flag_phone': np.float64(-5.7e-09),
 'flag_work_phone': np.float64(-1.051e-07),
 'job=accountants': np.float64(-2.19e-08),
 'job=cleaning_staff': np.float64(-1.5e-09),
 'job=cooking_staff': np.float64(-1.31e-08),
 'job=core_staff': np.float64(-7.35e-08),
 'job=drivers': np.float64(-7.9e-09),
 'job=high_skill_tech_staff': np.float64(6.4e-09),
 'job=hr_staff': np.float64(-3.3

To understand how the model works, let’s consider what happens when we apply this model. To build the intuition, let’s train a simpler and smaller model that uses only three variables: contract, tenure, and totalcharges.

The variables tenure and totalcharges are numeric so we don’t need to do any additional preprocessing; we can take them as is. On the other hand, contract is a categorical variable, so to be able to use it, we need to apply one-hot encoding.

Let’s redo the same steps we did for training, this time using a smaller set of features:

In [235]:
subset = ["status"]
train_dict_small = df_train[subset].to_dict(orient="records")
dv_small = DictVectorizer(sparse=False)
dv_small.fit(train_dict_small)

X_small_train = dv_small.transform(train_dict_small)

dv_small.get_feature_names_out()

array(['status=0', 'status=1', 'status=2', 'status=3', 'status=4',
       'status=5', 'status=c', 'status=x'], dtype=object)

In [236]:
model_small = LogisticRegression(solver="liblinear", random_state=1)
model_small.fit(X_small_train, y_train)

In [237]:
model_small.intercept_[0]

np.float64(-2.356431834775256)

In [238]:
dict(zip(dv_small.get_feature_names_out(), model_small.coef_[0].round(3)))

{'status=0': np.float64(-7.349),
 'status=1': np.float64(-4.403),
 'status=2': np.float64(6.246),
 'status=3': np.float64(5.366),
 'status=4': np.float64(5.074),
 'status=5': np.float64(6.921),
 'status=c': np.float64(-7.455),
 'status=x': np.float64(-6.757)}

These weights are essentially w1, w2, w3, w4, and w5 for the weights vector.

The results line up because every customer with a status of 2, 3, 4, or 5 got declined, where the other customers got approved.

## Step 6: Deployment

In [240]:
df.columns

Index(['id', 'code_gender', 'flag_own_car', 'flag_own_realty', 'cnt_children',
       'amt_income_total', 'name_education_type', 'name_family_status',
       'name_housing_type', 'days_birth', 'days_employed', 'flag_mobil',
       'flag_work_phone', 'flag_phone', 'flag_email', 'job', 'begin_months',
       'status', 'target'],
      dtype='object')

In [242]:
df.head()

Unnamed: 0,id,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,name_education_type,name_family_status,name_housing_type,days_birth,days_employed,flag_mobil,flag_work_phone,flag_phone,flag_email,job,begin_months,status,target
0,5065438,f,y,n,2+_children,270000.0,secondary_/_secondary_special,married,with_parents,-13258,-2300,1,0,0,0,managers,-6,c,0
1,5142753,f,n,n,no_children,81000.0,secondary_/_secondary_special,single_/_not_married,house_/_apartment,-17876,-377,1,1,1,0,private_service_staff,-4,0,0
2,5111146,m,y,y,no_children,270000.0,higher_education,married,house_/_apartment,-19579,-1028,1,0,1,0,laborers,0,c,0
3,5010310,f,y,y,1_children,112500.0,secondary_/_secondary_special,married,house_/_apartment,-15109,-1956,1,0,0,0,core_staff,-3,0,0
4,5010835,m,y,y,2+_children,139500.0,secondary_/_secondary_special,married,house_/_apartment,-17281,-5578,1,1,0,0,drivers,-29,0,0


In [244]:
customer = {
    "id": 44234232,
    "code_gender": "f",
    "flag_own_car": "y",
    "flag_own_realty": "n",
    "cnt_children": "no_children",
    "amt_income_total": 100000.0,
    "name_education_type": "higher_education",
    "name_family_status": "single_/_not_married",
    "name_housing_type": "with_parents",
    "days_birth": -18423,
    "days_employed": -75,
    "flag_mobil": 1,
    "flag_work_phone": 0,
    "flag_phone": 1,
    "flag_email": 1,
    "job": "core_staff",
    "begin_months": -3,
    "status": "0",
}

In [249]:
X_test = dv.transform([customer])
X_test

array([[ 1.0000e+05, -3.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
         1.0000e+00,  0.0000e+00, -1.8423e+04, -7.5000e+01,  1.0000e+00,
         0.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
         1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00]])

In [248]:
model.predict_proba(X_test)

array([[0.99731854, 0.00268146]])

The output is a matrix with predictions. For each customer, it outputs two numbers, which are the probability of being approved and the probability of being declined. Because there’s only one customer, we get a tiny NumPy array with one row and two columns.

All we need from the matrix is the number at the first row and second column: the probability of churning for this customer. To select this number from the array, we use the brackets operator:

In [253]:
model.predict_proba(X_test)[0, 1]

np.float64(0.002681457513745784)

We used this operator to select the second column from the array. However, this time there’s only one row, so we can explicitly ask NumPy to return the value from that row. Because indexes start from 0 in NumPy, [0, 1] means first row, second column.

When we execute this line, we see that the output is 0.0027, so that the probability that this customer will get declined is only 0.27%. It’s less than 1% (the threshold), so we will give this customer an approval.

We can try to score another client:

In [280]:
customer = {
    "id": 442343252,
    "code_gender": "f",
    "flag_own_car": "y",
    "flag_own_realty": "n",
    "cnt_children": "no_children",
    "amt_income_total": 1000.0,
    "name_education_type": "higher_education",
    "name_family_status": "single_/_not_married",
    "name_housing_type": "with_parents",
    "days_birth": -12423,
    "days_employed": -75,
    "flag_mobil": 1,
    "flag_work_phone": 0,
    "flag_phone": 1,
    "flag_email": 1,
    "job": "core_staff",
    "begin_months": -3,
    "status": "3",
}

X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

np.float64(0.026348078192034795)

This customer has a lower days_birth and a lower amt_income_total, and it results in them getting a decline.

In [283]:
y_pred = model.predict_proba(X_test)[0, 1]
approval = y_pred >= 0.5
(approval == y_val).mean()

np.float64(0.9963174437852215)

The accuracy of the model is 99.6%, so it's a great model!