In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import pygal

pd.set_option("max_columns", 500)
pd.set_option('display.float_format', lambda x: "{:,.2f}".format(x))

In [2]:
df = pd.read_csv("UCI_Credit_Card.csv")

In [3]:
ls_disc = ["SEX", "EDUCATION", "MARRIAGE", "PAY_0", "PAY_2", "PAY_3",
           "PAY_4", "PAY_5", "PAY_6"]
ls_cont = ["LIMIT_BAL", "AGE", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3",
           "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", 
           "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]
tgt = "default.payment.next.month"

In [4]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [5]:
df[tgt].value_counts(True)

0   0.78
1   0.22
Name: default.payment.next.month, dtype: float64

In [6]:
for i in range(1, 7):
#     df[f"PERC_PAID_{i}"] = df[f"PAY_AMT{i}"] / df[f"BILL_AMT{i}"]
    df[f"USE_{i}"] = df[f"BILL_AMT{i}"] / df["LIMIT_BAL"]
    ls_cont.append(f"USE_{i}")

In [7]:
# for i in range(7):
#     if i == 1:
#         continue
#     df[f"PAY_{i}"] = df[f"PAY_{i}"].astype(int).map({-1: "pay duly", 
#                                          1: "payment delay for 1 months", 
#                                          2: "payment delay for 2 months", 
#                                          3: "payment delay for 3 months", 
#                                          4: "payment delay for 4 months", 
#                                          5: "payment delay for 5 months",
#                                          6: "payment delay for 6 months", 
#                                          7: "payment delay for 7 months", 
#                                          8: "payment delay for 8 months", 
#                                          9: "payment delay for 9+ months", })

In [8]:
df["SEX"] = df["SEX"].map({1: "male", 2: "female"})
df["MARRIAGE"] = df["MARRIAGE"].map({1: "married", 2: "single", 3: "others"})
df["EDUCATION"] = df["EDUCATION"].map({1: "graduate school", 2: "university",
                     3: "high school", 4: "others", 5: "unknown",
                     6: "unknown"})

In [9]:
for var in ls_cont:
    df[f"C_{var}"] = pd.cut(df[var], bins=5).astype(str)

In [10]:
def IV(df, var, tgt):
    aux = df[[var, tgt]].groupby(var).agg(["count", "sum"])
    aux["evento"] = aux[tgt, "sum"]
    aux["no_evento"] = aux[tgt, "count"] - aux[tgt, "sum"]
    aux["%evento"] = aux["evento"] / aux["evento"].sum()
    aux["%no_evento"] = aux["no_evento"] / aux["no_evento"].sum()
    aux["WOE"] = np.log(aux["%no_evento"] / aux["%evento"])
    aux["IV"] = (aux["%no_evento"] - aux["%evento"])*aux["WOE"]
    return aux["IV"].sum()

In [11]:
new_disc = [x for x in df.columns if x.startswith("C_")] + ls_disc
for col in new_disc:
    df[col] = df[col].fillna("Missing")

In [12]:
df_iv = pd.DataFrame(columns=["iv"])
for var in new_disc:
    df_iv.loc[var, "iv"] = IV(df = df, var = var, tgt = tgt)

  import sys


In [13]:
df_iv.sort_values(by = "iv", ascending=False)

Unnamed: 0,iv
C_LIMIT_BAL,inf
C_PAY_AMT5,inf
PAY_5,inf
PAY_2,inf
EDUCATION,inf
C_USE_6,inf
C_USE_5,inf
C_USE_4,inf
C_USE_3,inf
C_USE_2,inf


In [14]:
ls_best = df_iv[df_iv["iv"] != np.inf].index.tolist()

In [15]:
ls_best

['C_AGE', 'SEX', 'MARRIAGE', 'PAY_0', 'PAY_3', 'PAY_4']

In [16]:
def WOE(df, var, tgt):
    aux = df[[var, tgt]].groupby(var).agg(["count", "sum"])
    aux["evento"] = aux[tgt, "sum"]
    aux["no_evento"] = aux[tgt, "count"] - aux[tgt, "sum"]
    aux["%evento"] = aux["evento"] / aux["evento"].sum()
    aux["%no_evento"] = aux["no_evento"] / aux["no_evento"].sum()
    aux["WOE"] = np.log(aux["%no_evento"] / aux["%evento"])
    aux.columns = aux.columns.droplevel(1)
    aux = aux[["WOE"]].reset_index().rename(columns={"WOE": f"W_{var}"})
    df = df.merge(aux, on = var, how = "left")
    display(aux)
    return df

In [17]:
for col in ls_best:
    df = WOE(df = df, var = col, tgt = tgt)

Unnamed: 0,C_AGE,W_C_AGE
0,"(20.942, 32.6]",0.02
1,"(32.6, 44.2]",0.05
2,"(44.2, 55.8]",-0.11
3,"(55.8, 67.4]",-0.25
4,"(67.4, 79.0]",-0.13


Unnamed: 0,SEX,W_SEX
0,female,0.08
1,male,-0.12


Unnamed: 0,MARRIAGE,W_MARRIAGE
0,Missing,1.02
1,married,-0.08
2,others,-0.21
3,single,0.07


Unnamed: 0,PAY_0,W_PAY_0
0,-2,0.62
1,-1,0.34
2,0,0.66
3,1,-0.59
4,2,-2.07
5,3,-2.4
6,4,-2.03
7,5,-1.26
8,6,-1.44
9,7,-2.51


Unnamed: 0,PAY_3,W_PAY_3
0,-2,0.22
1,-1,0.43
2,0,0.3
3,1,-0.16
4,2,-1.32
5,3,-1.56
6,4,-1.58
7,5,-1.55
8,6,-1.7
9,7,-2.74


Unnamed: 0,PAY_4,W_PAY_4
0,-2,0.18
1,-1,0.41
2,0,0.24
3,1,-1.26
4,2,-1.35
5,3,-1.71
6,4,-1.95
7,5,-1.32
8,6,-0.85
9,7,-2.83


In [18]:
ls_woe = [x for x in df.columns if x.startswith("W")]

In [None]:
lr = LogisticRegression(n_jobs=-1)

In [None]:
lr.fit(X=df[ls_woe], y=df[tgt])

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
lr.predict(df[ls_woe]).sum()

3737

In [None]:
lr.score(X=df[ls_woe], y=df[tgt])

0.8198333333333333

In [None]:
roc_auc_score(y_true=df[tgt], y_score=lr.predict(X=df[ls_woe]))

0.6603461825785361

In [None]:
pdo = 10
base_score = 10
base_odds = 100
factor = pdo/np.log(2)
offset = base_score - (factor - np.log(base_odds))
m = len(ls_woe)

In [None]:
alpha = lr.intercept_
betas = lr.coef_[0]

In [None]:
for feat, beta in zip(ls_woe, betas):
    df["P_" + feat[2:]] = df[feat].apply(lambda x:((beta*x + (alpha/m))*factor) + (offset/m)).astype(int)

In [None]:
df

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month,USE_1,USE_2,USE_3,USE_4,USE_5,USE_6,C_LIMIT_BAL,C_AGE,C_BILL_AMT1,C_BILL_AMT2,C_BILL_AMT3,C_BILL_AMT4,C_BILL_AMT5,C_BILL_AMT6,C_PAY_AMT1,C_PAY_AMT2,C_PAY_AMT3,C_PAY_AMT4,C_PAY_AMT5,C_PAY_AMT6,C_USE_1,C_USE_2,C_USE_3,C_USE_4,C_USE_5,C_USE_6,W_C_AGE,W_SEX,W_MARRIAGE,W_PAY_0,W_PAY_3,W_PAY_4,P_C_AGE,P_SEX,P_MARRIAGE,P_PAY_0,P_PAY_3,P_PAY_4
0,1,20000.00,female,university,married,24,2,2,-1,-1,-2,-2,3913.00,3102.00,689.00,0.00,0.00,0.00,0.00,689.00,0.00,0.00,0.00,0.00,1,0.20,0.16,0.03,0.00,0.00,0.00,"(9010.0, 208000.0]","(20.942, 32.6]","(-166710.091, 60438.2]","(-70830.708, 140964.6]","(-159085.353, 207006.6]","(-171061.586, 42317.2]","(-82342.505, 120367.0]","(-79349.6, 180903.8]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(-0.627, 0.795]","(-1.403, 0.16]","(-1.037, 1.318]","(-0.0702, 1.234]","(-0.883, 0.286]","(-0.431, 0.649]",0.02,0.08,-0.08,-2.07,0.43,0.41,-3,-3,-1,21,-4,-4
1,2,120000.00,female,university,single,26,-1,2,0,0,0,2,2682.00,1725.00,2682.00,3272.00,3455.00,3261.00,0.00,1000.00,1000.00,1000.00,0.00,2000.00,1,0.02,0.01,0.02,0.03,0.03,0.03,"(9010.0, 208000.0]","(20.942, 32.6]","(-166710.091, 60438.2]","(-70830.708, 140964.6]","(-159085.353, 207006.6]","(-171061.586, 42317.2]","(-82342.505, 120367.0]","(-79349.6, 180903.8]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(-0.627, 0.795]","(-1.403, 0.16]","(-1.037, 1.318]","(-0.0702, 1.234]","(-0.883, 0.286]","(-0.431, 0.649]",0.02,0.08,0.07,0.34,0.30,0.24,-3,-3,-3,-7,-4,-4
2,3,90000.00,female,university,single,34,0,0,0,0,0,0,29239.00,14027.00,13559.00,14331.00,14948.00,15549.00,1518.00,1500.00,1000.00,1000.00,1000.00,5000.00,0,0.32,0.16,0.15,0.16,0.17,0.17,"(9010.0, 208000.0]","(32.6, 44.2]","(-166710.091, 60438.2]","(-70830.708, 140964.6]","(-159085.353, 207006.6]","(-171061.586, 42317.2]","(-82342.505, 120367.0]","(-79349.6, 180903.8]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(-0.627, 0.795]","(-1.403, 0.16]","(-1.037, 1.318]","(-0.0702, 1.234]","(-0.883, 0.286]","(-0.431, 0.649]",0.05,0.08,0.07,0.66,0.30,0.24,-3,-3,-3,-10,-4,-4
3,4,50000.00,female,university,married,37,0,0,0,0,0,0,46990.00,48233.00,49291.00,28314.00,28959.00,29547.00,2000.00,2019.00,1200.00,1100.00,1069.00,1000.00,0,0.94,0.96,0.99,0.57,0.58,0.59,"(9010.0, 208000.0]","(32.6, 44.2]","(-166710.091, 60438.2]","(-70830.708, 140964.6]","(-159085.353, 207006.6]","(-171061.586, 42317.2]","(-82342.505, 120367.0]","(-79349.6, 180903.8]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(0.795, 2.21]","(0.16, 1.715]","(-1.037, 1.318]","(-0.0702, 1.234]","(0.286, 1.448]","(-0.431, 0.649]",0.05,0.08,-0.08,0.66,0.30,0.24,-3,-3,-1,-10,-4,-4
4,5,50000.00,male,university,married,57,-1,0,-1,0,0,0,8617.00,5670.00,35835.00,20940.00,19146.00,19131.00,2000.00,36681.00,10000.00,9000.00,689.00,679.00,0,0.17,0.11,0.72,0.42,0.38,0.38,"(9010.0, 208000.0]","(55.8, 67.4]","(-166710.091, 60438.2]","(-70830.708, 140964.6]","(-159085.353, 207006.6]","(-171061.586, 42317.2]","(-82342.505, 120367.0]","(-79349.6, 180903.8]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(-0.627, 0.795]","(-1.403, 0.16]","(-1.037, 1.318]","(-0.0702, 1.234]","(0.286, 1.448]","(-0.431, 0.649]",-0.25,-0.12,-0.08,0.34,0.43,0.24,-1,-1,-1,-7,-4,-4
5,6,50000.00,male,graduate school,single,37,0,0,0,0,0,0,64400.00,57069.00,57608.00,19394.00,19619.00,20024.00,2500.00,1815.00,657.00,1000.00,1000.00,800.00,0,1.29,1.14,1.15,0.39,0.39,0.40,"(9010.0, 208000.0]","(32.6, 44.2]","(60438.2, 286456.4]","(-70830.708, 140964.6]","(-159085.353, 207006.6]","(-171061.586, 42317.2]","(-82342.505, 120367.0]","(-79349.6, 180903.8]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(0.795, 2.21]","(0.16, 1.715]","(-1.037, 1.318]","(-0.0702, 1.234]","(0.286, 1.448]","(-0.431, 0.649]",0.05,-0.12,0.07,0.66,0.30,0.24,-3,-1,-3,-10,-4,-4
6,7,500000.00,male,graduate school,single,29,0,0,0,0,0,0,367965.00,412023.00,445007.00,542653.00,483003.00,473944.00,55000.00,40000.00,38000.00,20239.00,13750.00,13770.00,0,0.74,0.82,0.89,1.09,0.97,0.95,"(406000.0, 604000.0]","(20.942, 32.6]","(286456.4, 512474.6]","(351706.2, 562447.8]","(207006.6, 571277.2]","(466951.6, 679268.8]","(322068.0, 523769.0]","(441157.2, 701410.6]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(-0.627, 0.795]","(0.16, 1.715]","(-1.037, 1.318]","(-0.0702, 1.234]","(0.286, 1.448]","(0.649, 1.728]",0.02,-0.12,0.07,0.66,0.30,0.24,-3,-1,-3,-10,-4,-4
7,8,100000.00,female,university,single,23,0,-1,-1,0,0,-1,11876.00,380.00,601.00,221.00,-159.00,567.00,380.00,601.00,0.00,581.00,1687.00,1542.00,0,0.12,0.00,0.01,0.00,-0.00,0.01,"(9010.0, 208000.0]","(20.942, 32.6]","(-166710.091, 60438.2]","(-70830.708, 140964.6]","(-159085.353, 207006.6]","(-171061.586, 42317.2]","(-82342.505, 120367.0]","(-79349.6, 180903.8]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(-0.627, 0.795]","(-1.403, 0.16]","(-1.037, 1.318]","(-0.0702, 1.234]","(-0.883, 0.286]","(-0.431, 0.649]",0.02,0.08,0.07,0.66,0.43,0.24,-3,-3,-3,-10,-4,-4
8,9,140000.00,female,high school,married,28,0,0,2,0,0,0,11285.00,14096.00,12108.00,12211.00,11793.00,3719.00,3329.00,0.00,432.00,1000.00,1000.00,1000.00,0,0.08,0.10,0.09,0.09,0.08,0.03,"(9010.0, 208000.0]","(20.942, 32.6]","(-166710.091, 60438.2]","(-70830.708, 140964.6]","(-159085.353, 207006.6]","(-171061.586, 42317.2]","(-82342.505, 120367.0]","(-79349.6, 180903.8]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(-0.627, 0.795]","(-1.403, 0.16]","(-1.037, 1.318]","(-0.0702, 1.234]","(-0.883, 0.286]","(-0.431, 0.649]",0.02,0.08,-0.08,0.66,-1.32,0.24,-3,-3,-1,-10,2,-4
9,10,20000.00,male,high school,single,35,-2,-2,-2,-2,-1,-1,0.00,0.00,0.00,0.00,13007.00,13912.00,0.00,0.00,0.00,13007.00,1122.00,0.00,0,0.00,0.00,0.00,0.00,0.65,0.70,"(9010.0, 208000.0]","(32.6, 44.2]","(-166710.091, 60438.2]","(-70830.708, 140964.6]","(-159085.353, 207006.6]","(-171061.586, 42317.2]","(-82342.505, 120367.0]","(-79349.6, 180903.8]","(-873.552, 174710.4]","(-1684.259, 336851.8]","(-896.04, 179208.0]","(-621.0, 124200.0]","(-426.529, 85305.8]","(-528.666, 105733.2]","(-0.627, 0.795]","(-1.403, 0.16]","(-1.037, 1.318]","(-0.0702, 1.234]","(0.286, 1.448]","(0.649, 1.728]",0.05,-0.12,0.07,0.62,0.22,0.18,-3,-1,-3,-10,-3,-3


In [None]:
df["score"] = df[[x for x in df.columns if x.startswith("P_")]].sum(axis = 1)

In [None]:
def plot_histogram(data, n_bins = 10):
    freq, bins = np.histogram(data, bins=n_bins)
    hist = pygal.Histogram()
    hist.add('Wide bars', list(zip(freq, bins, bins[1:])))
    return hist

In [None]:
plot_histogram(data = df["score"])

In [None]:
df["score"].describe()

In [None]:
ls_sc = [x[2:] for x in ls_woe] + [x for x in df.columns if x.startswith("P_")]

In [None]:
ls_sc = [y[::-1] for y in sorted([x[::-1] for x in ls_sc])]

In [None]:
df[["MARRIAGE", "P_MARRIAGE"]].drop_duplicates()

In [None]:
# Regresión logística sin transformaciones
lr_orig = LogisticRegression(n_jobs=-1)
lr_orig.fit(X=df[ls_cont], y=df[tgt])
lr_orig.score(X=df[ls_cont], y=df[tgt])
lr_orig.predict(df[ls_cont]).sum()
roc_auc_score(y_true=df[tgt], y_score=lr_orig.predict(X=df[ls_cont]))