In [3]:
#Basic Import
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

#from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')
## Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns',None)

In [5]:
df = pd.read_csv("data/UCI_Credit_Card_Data.csv")

In [6]:
df.shape

(30000, 25)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         30000 non-null  int64  
 1   Limit_Bal  30000 non-null  float64
 2   Sex        30000 non-null  int64  
 3   Education  30000 non-null  int64  
 4   Marriage   30000 non-null  int64  
 5   Age        30000 non-null  int64  
 6   Pay_1      30000 non-null  int64  
 7   Pay_2      30000 non-null  int64  
 8   Pay_3      30000 non-null  int64  
 9   Pay_4      30000 non-null  int64  
 10  Pay_5      30000 non-null  int64  
 11  Pay_6      30000 non-null  int64  
 12  Bill_Amt1  30000 non-null  float64
 13  Bill_Amt2  30000 non-null  float64
 14  Bill_Amt3  30000 non-null  float64
 15  Bill_Amt4  30000 non-null  float64
 16  Bill_Amt5  30000 non-null  float64
 17  Bill_Amt6  30000 non-null  float64
 18  Pay_Amt1   30000 non-null  float64
 19  Pay_Amt2   30000 non-null  float64
 20  Pay_Am

In [8]:
df.dropna(how="any",inplace=True)

In [9]:
df.drop(labels=["id"],axis=1,inplace=True)

In [10]:
df.isnull().sum()

Limit_Bal    0
Sex          0
Education    0
Marriage     0
Age          0
Pay_1        0
Pay_2        0
Pay_3        0
Pay_4        0
Pay_5        0
Pay_6        0
Bill_Amt1    0
Bill_Amt2    0
Bill_Amt3    0
Bill_Amt4    0
Bill_Amt5    0
Bill_Amt6    0
Pay_Amt1     0
Pay_Amt2     0
Pay_Amt3     0
Pay_Amt4     0
Pay_Amt5     0
Pay_Amt6     0
Default      0
dtype: int64

In [11]:
df["Sex"]=df["Sex"].map({1: "Male",2 : "Female"})
df["Education"]=df["Education"].map({0 :"Others",1:'Graduate School',2 :"University",3:'High School',4:'Others',5:"Others",6:"Others"})
df["Marriage"]=df["Marriage"].map({0:"Others",1:"Married",2 : "Bachelors"})


In [12]:
df.head()

Unnamed: 0,Limit_Bal,Sex,Education,Marriage,Age,Pay_1,Pay_2,Pay_3,Pay_4,Pay_5,Pay_6,Bill_Amt1,Bill_Amt2,Bill_Amt3,Bill_Amt4,Bill_Amt5,Bill_Amt6,Pay_Amt1,Pay_Amt2,Pay_Amt3,Pay_Amt4,Pay_Amt5,Pay_Amt6,Default
0,20000.0,Female,University,Married,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,Female,University,Bachelors,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,Female,University,Bachelors,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,Female,University,Married,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,Male,University,Married,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [13]:
df["Education"].value_counts()

Education
University         14030
Graduate School    10585
High School         4917
Others               468
Name: count, dtype: int64

In [14]:
df["Marriage"].value_counts()

Marriage
Bachelors    15964
Married      13659
Others          54
Name: count, dtype: int64

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Limit_Bal  30000 non-null  float64
 1   Sex        30000 non-null  object 
 2   Education  30000 non-null  object 
 3   Marriage   29677 non-null  object 
 4   Age        30000 non-null  int64  
 5   Pay_1      30000 non-null  int64  
 6   Pay_2      30000 non-null  int64  
 7   Pay_3      30000 non-null  int64  
 8   Pay_4      30000 non-null  int64  
 9   Pay_5      30000 non-null  int64  
 10  Pay_6      30000 non-null  int64  
 11  Bill_Amt1  30000 non-null  float64
 12  Bill_Amt2  30000 non-null  float64
 13  Bill_Amt3  30000 non-null  float64
 14  Bill_Amt4  30000 non-null  float64
 15  Bill_Amt5  30000 non-null  float64
 16  Bill_Amt6  30000 non-null  float64
 17  Pay_Amt1   30000 non-null  float64
 18  Pay_Amt2   30000 non-null  float64
 19  Pay_Amt3   30000 non-null  float64
 20  Pay_Am

In [16]:
# Seggregating independent and dependent Features
X = df.drop(labels=['Default'],axis=1)
y = df["Default"]

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Limit_Bal  30000 non-null  float64
 1   Sex        30000 non-null  object 
 2   Education  30000 non-null  object 
 3   Marriage   29677 non-null  object 
 4   Age        30000 non-null  int64  
 5   Pay_1      30000 non-null  int64  
 6   Pay_2      30000 non-null  int64  
 7   Pay_3      30000 non-null  int64  
 8   Pay_4      30000 non-null  int64  
 9   Pay_5      30000 non-null  int64  
 10  Pay_6      30000 non-null  int64  
 11  Bill_Amt1  30000 non-null  float64
 12  Bill_Amt2  30000 non-null  float64
 13  Bill_Amt3  30000 non-null  float64
 14  Bill_Amt4  30000 non-null  float64
 15  Bill_Amt5  30000 non-null  float64
 16  Bill_Amt6  30000 non-null  float64
 17  Pay_Amt1   30000 non-null  float64
 18  Pay_Amt2   30000 non-null  float64
 19  Pay_Amt3   30000 non-null  float64
 20  Pay_Am

In [17]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include="object").columns
numerical_cols = X.select_dtypes(exclude="object").columns


In [18]:
print(f"Categorical Columns : {categorical_cols}")
print(f"Numerical Columns : {numerical_cols}")

Categorical Columns : Index(['Sex', 'Education', 'Marriage'], dtype='object')
Numerical Columns : Index(['Limit_Bal', 'Age', 'Pay_1', 'Pay_2', 'Pay_3', 'Pay_4', 'Pay_5',
       'Pay_6', 'Bill_Amt1', 'Bill_Amt2', 'Bill_Amt3', 'Bill_Amt4',
       'Bill_Amt5', 'Bill_Amt6', 'Pay_Amt1', 'Pay_Amt2', 'Pay_Amt3',
       'Pay_Amt4', 'Pay_Amt5', 'Pay_Amt6'],
      dtype='object')


In [19]:
#Numerical Pipeline
num_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
        
    ]
)

#Categorical Pipeline
cat_pipeline = Pipeline (
    steps = [
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("onehotencoder",OneHotEncoder()),
        ("scaler",StandardScaler(with_mean=False))
    ]
)

#Combining numerical and categorical pipeline

preprocessor = ColumnTransformer(
    [
        ("num_pipeline",num_pipeline,numerical_cols),
        ("cat_pipeline",cat_pipeline,categorical_cols)
        
    ]
)

In [20]:
##Train Test split
from sklearn.model_selection import train_test_split    
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [21]:
X_train.head()

Unnamed: 0,Limit_Bal,Sex,Education,Marriage,Age,Pay_1,Pay_2,Pay_3,Pay_4,Pay_5,Pay_6,Bill_Amt1,Bill_Amt2,Bill_Amt3,Bill_Amt4,Bill_Amt5,Bill_Amt6,Pay_Amt1,Pay_Amt2,Pay_Amt3,Pay_Amt4,Pay_Amt5,Pay_Amt6
28465,240000.0,Female,Graduate School,Married,40,-2,-2,-2,-2,-2,-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27622,50000.0,Female,Graduate School,Bachelors,23,-1,-1,-1,-1,-1,-1,3430.0,2478.0,2299.0,4800.0,9810.0,660.0,2548.0,2321.0,4800.0,9810.0,660.0,2980.0
28376,50000.0,Female,University,Married,36,2,2,2,2,0,0,46203.0,45159.0,49125.0,47956.0,43578.0,35126.0,0.0,4700.0,0.0,2004.0,3500.0,0.0
10917,200000.0,Female,High School,Married,54,6,5,4,3,2,2,110185.0,107665.0,104686.0,102549.0,101400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27234,240000.0,Male,Graduate School,Married,35,-1,-1,-1,0,-1,-1,2024.0,2007.0,21790.0,17102.0,13367.0,22659.0,2017.0,21817.0,1120.0,13434.0,22772.0,22820.0


In [22]:
preprocessor.fit_transform(X_train)

array([[ 0.56391448,  0.48768399, -1.75953854, ...,  0.        ,
         2.00730537,  0.        ],
       [-0.90610925, -1.35477524, -0.87292972, ...,  2.00674233,
         0.        ,  0.        ],
       [-0.90610925,  0.05416417,  1.78689675, ...,  0.        ,
         2.00730537,  0.        ],
       ...,
       [-0.90610925, -1.02963538, -1.75953854, ...,  2.00674233,
         0.        ,  0.        ],
       [-0.75136991, -1.13801533,  0.01367911, ...,  2.00674233,
         0.        ,  0.        ],
       [-0.05504288,  0.05416417, -1.75953854, ...,  0.        ,
         2.00730537,  0.        ]])

In [23]:
#Here we will fit and transform the train data and transform the test data
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [24]:
X_train.head()

Unnamed: 0,num_pipeline__Limit_Bal,num_pipeline__Age,num_pipeline__Pay_1,num_pipeline__Pay_2,num_pipeline__Pay_3,num_pipeline__Pay_4,num_pipeline__Pay_5,num_pipeline__Pay_6,num_pipeline__Bill_Amt1,num_pipeline__Bill_Amt2,num_pipeline__Bill_Amt3,num_pipeline__Bill_Amt4,num_pipeline__Bill_Amt5,num_pipeline__Bill_Amt6,num_pipeline__Pay_Amt1,num_pipeline__Pay_Amt2,num_pipeline__Pay_Amt3,num_pipeline__Pay_Amt4,num_pipeline__Pay_Amt5,num_pipeline__Pay_Amt6,cat_pipeline__Sex_Female,cat_pipeline__Sex_Male,cat_pipeline__Education_Graduate School,cat_pipeline__Education_High School,cat_pipeline__Education_Others,cat_pipeline__Education_University,cat_pipeline__Marriage_Bachelors,cat_pipeline__Marriage_Married,cat_pipeline__Marriage_Others
0,0.563914,0.487684,-1.759539,-1.558129,-1.534656,-1.522407,-1.528914,-1.479949,-0.694979,-0.690691,-0.676308,-0.674219,-0.665072,-0.657278,-0.334158,-0.23782,-0.286287,-0.303704,-0.306697,-0.295508,2.046432,0.0,2.091199,0.0,0.0,0.0,0.0,2.007305,0.0
1,-0.906109,-1.354775,-0.87293,-0.724841,-0.697092,-0.66758,-0.646969,-0.614771,-0.648178,-0.655728,-0.643113,-0.599093,-0.502942,-0.646122,-0.183068,-0.145394,-0.022842,0.305068,-0.265029,-0.127272,2.046432,0.0,2.091199,0.0,0.0,0.0,2.006742,0.0,0.0
2,-0.906109,0.054164,1.786897,1.775022,1.815598,1.896903,0.234975,0.250407,-0.064556,-0.053538,0.032999,0.076353,0.055141,-0.063506,-0.334158,-0.050658,-0.286287,-0.179343,-0.08573,-0.295508,2.046432,0.0,0.0,0.0,0.0,2.004511,0.0,2.007305,0.0
3,0.254436,2.005003,5.333332,4.274884,3.490726,2.751731,1.998864,1.980764,0.808454,0.828364,0.835234,0.930802,1.010764,-0.657278,-0.334158,-0.23782,-0.286287,-0.303704,-0.306697,-0.295508,2.046432,0.0,0.0,2.700691,0.0,0.0,0.0,2.007305,0.0
4,0.563914,-0.054216,-0.87293,-0.724841,-0.697092,0.187248,-0.646969,-0.614771,-0.667362,-0.662374,-0.361686,-0.406551,-0.444155,-0.274249,-0.214555,0.630972,-0.224816,0.52996,1.130978,0.992796,0.0,2.046432,2.091199,0.0,0.0,0.0,0.0,2.007305,0.0


In [25]:
import numpy as np
def evaluate_model(true, predicted):
    matrix = confusion_matrix(true,predicted)
    accuracy = accuracy_score(true, predicted)
    precision =precision_score(true,predicted)
    recall = recall_score(true,predicted)
    f1 = f1_score(true,predicted)
   
    
    return matrix,accuracy,precision,recall,f1
    

In [26]:
#Train multiple Models
models={
    'LogisticRegression':LogisticRegression(class_weight='balanced'),
    'DecisionTreeClassifier':DecisionTreeClassifier(class_weight='balanced'),
    #'AdaBoostClassifier':AdaBoostClassifier(),
    #'KNeighborsClassifier':KNeighborsClassifier(),
    'SVC':SVC(class_weight='balanced'),
    #'GradientBoostingClassifier' : GradientBoostingClassifier(),
    'RandomForestClassifier' : RandomForestClassifier(class_weight='balanced'),
    #'Smote':Smote()
    #'GridSearchCV': GridSearchCV()
    
    
    
}

model_list=[]
acc=[]
fscore = []

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_test_pred=model.predict(X_test)

    matrix,accuracy,precision,recall,f1=evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("Confusion Matrix :",matrix)
    print("Accuracy:",accuracy)
    print("Precision:",precision)
    print("Recall :",recall)
    print('F1-Score :',f1)

    acc.append(accuracy_score)
    fscore.append(f1_score)
    
    print('='*35)
    print('\n')

LogisticRegression
Model Training Performance
Confusion Matrix : [[4931 2109]
 [ 705 1255]]
Accuracy: 0.6873333333333334
Precision: 0.3730677764565993
Recall : 0.6403061224489796
F1-Score : 0.47145003756573994


DecisionTreeClassifier
Model Training Performance
Confusion Matrix : [[5824 1216]
 [1166  794]]
Accuracy: 0.7353333333333333
Precision: 0.39502487562189054
Recall : 0.4051020408163265
F1-Score : 0.4


SVC
Model Training Performance
Confusion Matrix : [[5842 1198]
 [ 841 1119]]
Accuracy: 0.7734444444444445
Precision: 0.48295209322399657
Recall : 0.5709183673469388
F1-Score : 0.5232639700724807


RandomForestClassifier
Model Training Performance
Confusion Matrix : [[6662  378]
 [1289  671]]
Accuracy: 0.8147777777777778
Precision: 0.6396568160152526
Recall : 0.3423469387755102
F1-Score : 0.44599534729145895


