## Libraries

In [47]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder
from sklearn.compose import ColumnTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

## Data review

In [48]:
card_data = pd.read_csv("/Users/a123456/Desktop/Projekty/Predicting_Credit_Card_Approvals/credit+approval/crx.data",header=None) 
cc_apps=card_data.copy()
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [49]:
cc_apps.describe()

Unnamed: 0,2,7,10,14
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [50]:
cc_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      690 non-null    object 
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


## Cleaning data

Deletion of non-relevant data

In [51]:
cc_apps=cc_apps.drop(cc_apps.columns[[1,11,13]],axis=1)
cc_apps = cc_apps.reset_index(drop=True)

Columns names updating

In [52]:
cc_apps.columns = [str(i) for i in range(cc_apps.shape[1])] 
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,b,0.0,u,g,w,v,1.25,t,t,1,g,0,+
1,a,4.46,u,g,q,h,3.04,t,t,6,g,560,+
2,a,0.5,u,g,q,h,1.5,t,f,0,g,824,+
3,b,1.54,u,g,w,v,3.75,t,t,5,g,3,+
4,b,5.625,u,g,w,v,1.71,t,f,0,s,0,+


### Missing values

In [53]:
cc_apps = cc_apps.replace('?', pd.NA)

In [54]:
cc_apps.isna().sum()

0     12
1      0
2      6
3      6
4      9
5      9
6      0
7      0
8      0
9      0
10     0
11     0
12     0
dtype: int64

Maximum percentage of deleted data per column 

In [55]:
12*100/690

1.7391304347826086

Percentage of deleted data

In [56]:
(cc_apps.isna().sum().sum())*100/690

6.086956521739131

In [57]:
cc_apps.dropna(axis=0, inplace=True)

In [58]:
cc_apps.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
dtype: int64

## Checking assumptions

In [59]:
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,b,0.0,u,g,w,v,1.25,t,t,1,g,0,+
1,a,4.46,u,g,q,h,3.04,t,t,6,g,560,+
2,a,0.5,u,g,q,h,1.5,t,f,0,g,824,+
3,b,1.54,u,g,w,v,3.75,t,t,5,g,3,+
4,b,5.625,u,g,w,v,1.71,t,f,0,s,0,+


Checking: Lack of multicollinearity

In [60]:
def ch_2bonf(df):
    zmienne = list(df.columns)

    # Liczba wszystkich możliwych par testów
    liczba_testow = len(zmienne) * (len(zmienne) - 1) / 2
    # Nowy poziom istotności po korekcie Bonferroniego
    alfa_bonferroni = 0.05 / liczba_testow
    print(f"Skorygowany poziom istotności: {alfa_bonferroni}")

    for i in range(len(zmienne)):
        for j in range(i+1, len(zmienne)):
            tabela_przestawna = pd.crosstab(df[zmienne[i]], df[zmienne[j]])
            stat, p, dof, expected = chi2_contingency(tabela_przestawna)
            if p >= alfa_bonferroni:
                print(f"Nie znaleziono statystycznie istotnej zależności.")
                print(f"Test chi-kwadrat dla {zmienne[i]} vs {zmienne[j]}:")
                print(f"Statystyka={stat}, p={p}\n")

In [61]:
ch2result=ch_2bonf(cc_apps)
ch2result

Skorygowany poziom istotności: 0.000641025641025641
Nie znaleziono statystycznie istotnej zależności.
Test chi-kwadrat dla 0 vs 1:
Statystyka=203.8320078047908, p=0.6797982234230798

Nie znaleziono statystycznie istotnej zależności.
Test chi-kwadrat dla 0 vs 2:
Statystyka=3.6013420014079203, p=0.16518800975515074

Nie znaleziono statystycznie istotnej zależności.
Test chi-kwadrat dla 0 vs 3:
Statystyka=3.6013420014079203, p=0.16518800975515074

Nie znaleziono statystycznie istotnej zależności.
Test chi-kwadrat dla 0 vs 6:
Statystyka=128.2607863155258, p=0.5513802701281089

Nie znaleziono statystycznie istotnej zależności.
Test chi-kwadrat dla 0 vs 7:
Statystyka=0.17853936110796703, p=0.6726315069582038

Nie znaleziono statystycznie istotnej zależności.
Test chi-kwadrat dla 0 vs 8:
Statystyka=3.4201884325996894, p=0.06440366163285269

Nie znaleziono statystycznie istotnej zależności.
Test chi-kwadrat dla 0 vs 9:
Statystyka=22.4624209132364, p=0.4325950904193057

Nie znaleziono statystyc

- We assume there is high multicollinearity in variables 6,9,11 and medicore in 4,5,7,10. There is no statistically significant relationship between variables 10, 11 and variable 12, which we are going to predict. This phenomenon is undesirable. 
  
- The lack of multicollinearity is observed in variables 0,1,2,3. However, between variables 0,1 and 12 there is no statistically significant relationship what may negatively affect the prediction

In [62]:
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,b,0.0,u,g,w,v,1.25,t,t,1,g,0,+
1,a,4.46,u,g,q,h,3.04,t,t,6,g,560,+
2,a,0.5,u,g,q,h,1.5,t,f,0,g,824,+
3,b,1.54,u,g,w,v,3.75,t,t,5,g,3,+
4,b,5.625,u,g,w,v,1.71,t,f,0,s,0,+


In [63]:
cleaned_data= cc_apps.drop(cc_apps.iloc[:,[6,9,11]],axis=1)
cleaned_data.head()

Unnamed: 0,0,1,2,3,4,5,7,8,10,12
0,b,0.0,u,g,w,v,t,t,g,+
1,a,4.46,u,g,q,h,t,t,g,+
2,a,0.5,u,g,q,h,t,f,g,+
3,b,1.54,u,g,w,v,t,t,g,+
4,b,5.625,u,g,w,v,t,f,s,+


## Encoding categorical features

The remove_rare_values function processes the data frame and modifies its columns, removing values that are rare - they occur with a frequency of less than 5%.

In [64]:
def remove_rare_values(data):
    data = data.copy()
    for col in data.columns.drop(data.columns[1]):
        if data[col].dtype == 'object':
            value_counts = data[col].value_counts(normalize=True) * 100
            rare_values = value_counts[value_counts < 5].index
            data.loc[data[col].isin(rare_values), col] = None
    return data

In [65]:
def unique_val(data):
    for i in range(len(data.columns)):
        if data.dtypes[i] == 'object':
            column_name = data.columns[i] 
            unikalne_wartosci = data.iloc[:, i].unique()
            num_unik = len(unikalne_wartosci)
            zliczenie = data.iloc[:, i].value_counts()
            print(f"Unikalne wartości {num_unik} z kolumny {column_name}: {unikalne_wartosci},\nZliczone wartosci z kolumny {column_name}:\n{zliczenie}")


In [66]:
cc_apps=remove_rare_values(cc_apps)
unique_val(cc_apps)

Unikalne wartości 2 z kolumny 0: ['b' 'a'],
Zliczone wartosci z kolumny 0:
0
b    463
a    208
Name: count, dtype: int64
Unikalne wartości 3 z kolumny 2: ['u' 'y' None],
Zliczone wartosci z kolumny 2:
2
u    510
y    159
Name: count, dtype: int64
Unikalne wartości 3 z kolumny 3: ['g' 'p' None],
Zliczone wartosci z kolumny 3:
3
g    510
p    159
Name: count, dtype: int64
Unikalne wartości 11 z kolumny 4: ['w' 'q' 'm' None 'cc' 'k' 'c' 'x' 'i' 'aa' 'ff'],
Zliczone wartosci z kolumny 4:
4
c     135
q      76
w      63
i      58
aa     53
ff     53
k      50
cc     40
m      38
x      38
Name: count, dtype: int64
Unikalne wartości 5 z kolumny 5: ['v' 'h' 'bb' 'ff' None],
Zliczone wartosci z kolumny 5:
5
v     392
h     137
bb     57
ff     57
Name: count, dtype: int64
Unikalne wartości 2 z kolumny 7: ['t' 'f'],
Zliczone wartosci z kolumny 7:
7
t    358
f    313
Name: count, dtype: int64
Unikalne wartości 2 z kolumny 8: ['t' 'f'],
Zliczone wartosci z kolumny 8:
8
f    379
t    292
Name: cou

In [67]:
cleaned_data=remove_rare_values(cleaned_data)
unique_val(cleaned_data)

Unikalne wartości 2 z kolumny 0: ['b' 'a'],
Zliczone wartosci z kolumny 0:
0
b    463
a    208
Name: count, dtype: int64
Unikalne wartości 3 z kolumny 2: ['u' 'y' None],
Zliczone wartosci z kolumny 2:
2
u    510
y    159
Name: count, dtype: int64
Unikalne wartości 3 z kolumny 3: ['g' 'p' None],
Zliczone wartosci z kolumny 3:
3
g    510
p    159
Name: count, dtype: int64
Unikalne wartości 11 z kolumny 4: ['w' 'q' 'm' None 'cc' 'k' 'c' 'x' 'i' 'aa' 'ff'],
Zliczone wartosci z kolumny 4:
4
c     135
q      76
w      63
i      58
aa     53
ff     53
k      50
cc     40
m      38
x      38
Name: count, dtype: int64
Unikalne wartości 5 z kolumny 5: ['v' 'h' 'bb' 'ff' None],
Zliczone wartosci z kolumny 5:
5
v     392
h     137
bb     57
ff     57
Name: count, dtype: int64
Unikalne wartości 2 z kolumny 7: ['t' 'f'],
Zliczone wartosci z kolumny 7:
7
t    358
f    313
Name: count, dtype: int64
Unikalne wartości 2 z kolumny 8: ['t' 'f'],
Zliczone wartosci z kolumny 8:
8
f    379
t    292
Name: cou

In [68]:
def transfor_cat(data,one_hot_columns,binary_encoding):
    transformers =[('onehot',OneHotEncoder(),one_hot_columns),
                ('binaryencoding',BinaryEncoder(),binary_encoding)] 

    column_transformer=ColumnTransformer(transformers, remainder='drop')
    transformed_data = column_transformer.fit_transform(data)
    transformed_df = pd.DataFrame(transformed_data)
    return transformed_df

In [69]:
def wspol_VIF(data):
    data1=data.copy()
    data1 = add_constant(data1)
    VIF_data = pd.DataFrame()
    VIF_data['feature'] = data1.columns
    VIF_data['VIF'] = [variance_inflation_factor(data1.values, i) for i in range(data1.shape[1])]
    return VIF_data

Raw data

In [70]:
raw_data=transfor_cat(cc_apps,[0,2,3,7,8,10],[4,5])

In [71]:
wspol_VIF(raw_data)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


Unnamed: 0,feature,VIF
0,const,0.0
1,0,inf
2,1,inf
3,2,inf
4,3,43292010.0
5,4,476017300000.0
6,5,inf
7,6,2830047.0
8,7,233456000000.0
9,8,3578991.0


Cleaned data

In [72]:
cleaned_data.head()

Unnamed: 0,0,1,2,3,4,5,7,8,10,12
0,b,0.0,u,g,w,v,t,t,g,+
1,a,4.46,u,g,q,h,t,t,g,+
2,a,0.5,u,g,q,h,t,f,g,+
3,b,1.54,u,g,w,v,t,t,g,+
4,b,5.625,u,g,w,v,t,f,s,+


In [73]:
cleaned_data=transfor_cat(cleaned_data,[0,2,3,7,8],[4,5])
wspol_VIF(cleaned_data)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


Unnamed: 0,feature,VIF
0,const,0.0
1,0,inf
2,1,inf
3,2,inf
4,3,194638200.0
5,4,216425600000.0
6,5,117842.1
7,6,59940880.0
8,7,32117070000.0
9,8,951190.2


## Model

Model with raw data

In [74]:
y_dt=cc_apps.iloc[:,12]
X_trainr, X_testr, y_trainr, y_testr = train_test_split(raw_data,y_dt,test_size=0.3,random_state=123)
default_lr_params = {
        'C': 1.0,
        'max_iter': 100,
        'solver': 'lbfgs',
        'penalty': 'l2',
        'tol': 1e-4}

In [75]:
def init_model(X_train, X_test, y_train, y_test,lr_params=None):
       if lr_params is None:
              lr_params = {}
       steps=[("minmaxscaler",MinMaxScaler()),
              ("logisticregression",LogisticRegression(**lr_params))]
       pipeline=Pipeline(steps)
       pipeline.fit(X_train,y_train)
       score_tr=round(pipeline.score(X_train,y_train),3)
       pred_y=pipeline.predict(X_test)
       prawd=pipeline.predict_proba(X_test)
       score_ts=round(pipeline.score(X_test,y_test),3)
       conf=confusion_matrix(y_test,pred_y)
       print(f"Training Score is equal: {score_tr}\nTesting Score is equal: {score_ts}\n Confusion matrix\n {conf}")
       return pred_y, prawd, pipeline

In [76]:
predictionr, probabr, pipeliner = init_model(X_trainr, X_testr, y_trainr, y_testr,default_lr_params)

Training Score is equal: 0.881
Testing Score is equal: 0.856
 Confusion matrix
 [[89 14]
 [15 84]]


In [77]:
def tunning(X_train,y_train,pipeline,penalty,C,tol,iter,solv):
    kf=KFold(n_splits=10,random_state=123,shuffle=True)
    param_grid = {
        "logisticregression__penalty": penalty,
        "logisticregression__C": C,
        "logisticregression__tol": tol,
        "logisticregression__max_iter": iter,
        "logisticregression__solver":solv
    }

    model=GridSearchCV(pipeline,param_grid,cv=kf)
    model.fit(X_train,y_train)
    print(f"Best cross-validation score: {round(model.best_score_, 3)}")
    return (model.best_params_)

In [78]:
def dict_for_init_model(after_tunning):
    cleaned_params = {}
    for key, value in after_tunning.items():
        if '__' in key:
            new_key = key.split('__', 1)[1]
            cleaned_params[new_key] = value
        else:
            cleaned_params[key] = value
    return cleaned_params

Checking solvers:"lbfgs","newton-cg" for raw model

In [41]:
tunning(X_trainr,y_trainr,pipeliner,["l2"],[0.001, 0.01, 0.1, 1, 10],[0.01, 0.001, 0.0001],np.arange(100,200,50),["lbfgs","newton-cg"])

Best cross-validation score: 0.864


{'logisticregression__C': 0.1,
 'logisticregression__max_iter': 100,
 'logisticregression__penalty': 'l2',
 'logisticregression__solver': 'lbfgs',
 'logisticregression__tol': 0.01}

In [42]:
tun_lbfgs=tunning(X_trainr,y_trainr,pipeliner,["l2"],np.arange(0.01,1,0.01),np.arange(0,0.1,0.01),[100],['lbfgs'])
tun_lbfgs

Best cross-validation score: 0.874


{'logisticregression__C': 0.04,
 'logisticregression__max_iter': 100,
 'logisticregression__penalty': 'l2',
 'logisticregression__solver': 'lbfgs',
 'logisticregression__tol': 0.0}

In [43]:
tun_lbfgs=dict_for_init_model(tun_lbfgs)
predictionlb, probablb, pipelinelb = init_model(X_trainr, X_testr, y_trainr, y_testr,lr_params=tun_lbfgs)

Training Score is equal: 0.876
Testing Score is equal: 0.851
 Confusion matrix
 [[86 17]
 [13 86]]


Checking solvers: "liblinear","saga" for raw model

In [44]:
tun_saga=tunning(X_trainr,y_trainr,pipeliner,["l1","l2"],np.arange(0.1,1.2,0.1),np.arange(0.01,0.1,0.01),np.arange(400,700,20),["liblinear","saga"])
tun_saga

Best cross-validation score: 0.881


{'logisticregression__C': 0.30000000000000004,
 'logisticregression__max_iter': 640,
 'logisticregression__penalty': 'l1',
 'logisticregression__solver': 'saga',
 'logisticregression__tol': 0.08}

In [45]:
tun_saga=dict_for_init_model(tun_saga)
predictionlb, probablb, pipelinelb = init_model(X_trainr, X_testr, y_trainr, y_testr,lr_params=tun_saga)

Training Score is equal: 0.881
Testing Score is equal: 0.851
 Confusion matrix
 [[87 16]
 [14 85]]


Model with cleaned data

In [79]:
X_trainc, X_testc, y_trainc, y_testc = train_test_split(cleaned_data,y_dt,test_size=0.3,random_state=123)
predictionc, probabc, pipelinec = init_model(X_trainc, X_testc, y_trainc, y_testc,default_lr_params)

Training Score is equal: 0.761
Testing Score is equal: 0.713
 Confusion matrix
 [[67 36]
 [22 77]]
