In [27]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [28]:
# Data Ingestion
data = pd.read_csv("credit_card.csv")
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [29]:
# Checking the dtype of Columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  float64
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  float64
 13  BILL_AMT2                   300

In [30]:
# Renaming Columns for better understanding
new_col = {"PAY_0":"PAY_1","default.payment.next.month":"DEFAULTER"}
data.rename(columns=new_col,inplace= True)

In [31]:
# Creating a list of Columns having similar names.
pay_col = ["PAY_1","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6"]
bill_amt_col = ["BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6"]
pay_amt_col = ["PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6"]

In [32]:
# Replacing the numbers in each column with their corresponding values
sex = {1:"male",2:"female"}
edu = {0:"unknown",1:"graduate school",2:"university",3:"high school",4:"others",5:"unknown",6:"unknown"}
marriage = {0:"others",1:"married",2:"single",3:"others"}
pay = {-2:"payment delay for 2 months",-1:"pay duly",0:"pay duly",1:"payment delay for 1 month",2:"payment delay for 2 months",
       3:"payment delay for 3 months",4:"payment delay for 4 months",5:"payment delay for 5 months",6:"payment delay for 6 months",
       7:"payment delay for 7 months",8:"payment delay for 8 months"}
default = {1:"yes",0:"no"}

data["SEX"] = data["SEX"].replace(sex)
data["EDUCATION"] = data["EDUCATION"].replace(edu)
data["MARRIAGE"] = data["MARRIAGE"].replace(marriage)
data["DEFAULTER"]= data["DEFAULTER"].replace(default)
for i in pay_col:
    data[i] = data[i].replace(pay)

In [33]:
data.to_csv("credit_card_defaulter.csv",index= False)

In [34]:
# Dropping id column
data.drop(labels="ID",axis = 1, inplace = True)

In [35]:
# Separate features and target variable
X = data.drop(labels=['DEFAULTER'], axis=1)  # Features
y = data['DEFAULTER'] #Target

In [36]:
# Separating numerical and categorical features
cat_col = X.select_dtypes(include='O').columns
num_col = X.select_dtypes(exclude='O').columns

In [37]:
cat_col

Index(['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4',
       'PAY_5', 'PAY_6'],
      dtype='object')

In [38]:
num_col

Index(['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')

In [39]:
# Creating Pipeline

# Numerical pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler())
])

# Categorical pipeline
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])

# Combining categorical and numerical pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_col),
    ('cat_pipeline', cat_pipeline, cat_col)
])

In [40]:
# Preprocess the data
X_preprocessed = pd.DataFrame(preprocessor.fit_transform(X))

In [41]:
# Splitting the Data 
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=38)

In [42]:
# Model Training and Evaluation
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Decission Tree': DecisionTreeClassifier(),
    "KNN":KNeighborsClassifier(),
    "Naive Bayes":GaussianNB()
}

In [43]:
results = {}
for model_name, model in models.items():
    # Model training
    model.fit(X_train, y_train)

    # Model evaluation
    y_pred = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred)*100,2)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    results[model_name] = {'model': model, 'accuracy': accuracy, 'classification_report': report, 'confusion_matrix': cm}

In [44]:
# Display accuracy score for all models
print("="*50)    
for model_name, result in results.items():
    print(f'{model_name} Accuracy: {result["accuracy"]:.2f}%')
print("="*50)

# Model Selection
best_model_name = max(results, key=lambda k: results[k]['accuracy'])
best_model = results[best_model_name]['model']
print(f'Best Model: {best_model_name}')
print(f'Accuracy: {results[best_model_name]["accuracy"]:.2f}%')
print(f'Classification Report:\n{"="*65}\n{results[best_model_name]["classification_report"]}\n{"="*65}\n')
print(f'Confusion Matrix:\n{results[best_model_name]["confusion_matrix"]}\n')

Logistic Regression Accuracy: 80.14%
Random Forest Accuracy: 81.16%
SVM Accuracy: 81.14%
Decission Tree Accuracy: 71.50%
KNN Accuracy: 78.66%
Naive Bayes Accuracy: 78.60%
Best Model: Random Forest
Accuracy: 81.16%
Classification Report:
              precision    recall  f1-score   support

          no       0.83      0.95      0.89      6982
         yes       0.66      0.34      0.45      2018

    accuracy                           0.81      9000
   macro avg       0.74      0.64      0.67      9000
weighted avg       0.79      0.81      0.79      9000


Confusion Matrix:
[[6624  358]
 [1338  680]]

