## Importing Necessary Libraries

In [1]:
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.ensemble import IsolationForest
import numpy as np

## Loading the Dataset

In [2]:
data = pd.read_csv('UCI_Credit_Card.csv')

In [3]:
data

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000.0,1,3,1,39,0,0,0,0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,29997,150000.0,1,3,2,43,-1,-1,-1,-1,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,29998,30000.0,1,2,2,37,4,3,2,-1,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,29999,80000.0,1,3,1,41,1,-1,0,0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


## Removing ID Column

In [4]:
data = data.drop('ID', axis=1)

In [5]:
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


## EDA

### Data Information

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   LIMIT_BAL                   30000 non-null  float64
 1   SEX                         30000 non-null  int64  
 2   EDUCATION                   30000 non-null  int64  
 3   MARRIAGE                    30000 non-null  int64  
 4   AGE                         30000 non-null  int64  
 5   PAY_0                       30000 non-null  int64  
 6   PAY_2                       30000 non-null  int64  
 7   PAY_3                       30000 non-null  int64  
 8   PAY_4                       30000 non-null  int64  
 9   PAY_5                       30000 non-null  int64  
 10  PAY_6                       30000 non-null  int64  
 11  BILL_AMT1                   30000 non-null  float64
 12  BILL_AMT2                   30000 non-null  float64
 13  BILL_AMT3                   300

### Checking for Missing Values

In [7]:
data.isnull().sum()

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

### Distribution of the Target Variable

In [8]:
fig = px.histogram(data, x='default.payment.next.month', nbins=2)
fig.update_layout(title='Distribution of Default Payment Next Month',
                  xaxis_title='Default Payment Next Month',
                  yaxis_title='Count',
                  bargap=0.2)
fig.show()

### Correlation matrix

In [9]:
correlation_matrix = data.corr()

In [10]:
fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=list(correlation_matrix.columns),
    y=list(correlation_matrix.index),
    annotation_text=correlation_matrix.round(2).values,
    colorscale='RdBu',
    reversescale=True
)

In [11]:
fig.update_layout(
    title='Correlation Matrix',
    xaxis_nticks=36,
    width=1200,
    height=1000
)

fig.show()

## Data Cleaning

### Checking for Outliers

#### Using Data Description (Statistical Summary)
Outliers can often be identified by looking at the summary statistics of the data

In [12]:
data.describe()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


#### Using Visualization with Scatterplots
Scatterplots can help visualize the relationships between variables and identify outliers

In [13]:
# Assuming 'data' is your DataFrame containing numeric data
numeric_data = data._get_numeric_data()

# Train the Isolation Forest model
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(numeric_data)

# Predict outliers
outliers = iso_forest.predict(numeric_data)

# Get the indices of the outliers
outlier_indices = np.where(outliers == -1)[0]

# View outliers
print(data.iloc[outlier_indices])

# Create a scatter plot of the data with outliers highlighted
fig = px.scatter(data, x=numeric_data.columns[0], y=numeric_data.columns[1], title='Outliers Detected by Isolation Forest')

# Highlight outliers
fig.add_scatter(x=data.iloc[outlier_indices][numeric_data.columns[0]],
                y=data.iloc[outlier_indices][numeric_data.columns[1]],
                mode='markers',
                name='Outliers',
                marker=dict(color='red', size=10, symbol='cross'))

fig.show()


X does not have valid feature names, but IsolationForest was fitted with feature names



       LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
6       500000.0    1          1         2   29      0      0      0      0   
17      320000.0    1          1         1   49      0      0      0     -1   
33      500000.0    2          2         1   54     -2     -2     -2     -2   
34      500000.0    1          1         1   58     -2     -2     -2     -2   
40      360000.0    1          1         2   33      0      0      0      0   
...          ...  ...        ...       ...  ...    ...    ...    ...    ...   
29970   360000.0    1          1         1   34     -1     -1     -1      0   
29972   190000.0    1          1         1   37      0      0      0      0   
29978   310000.0    1          2         1   39      0      0      0      0   
29988   250000.0    1          1         1   34      0      0      0      0   
29998    80000.0    1          3         1   41      1     -1      0      0   

       PAY_5  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6 

#### Using the Interquartile Range (IQR)
The IQR method identifies outliers as points that are below Q1 - 1.5IQR or above Q3 + 1.5IQR

In [14]:
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

# Identify outliers
outliers = (data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))

# View outliers
print(data[outliers.any(axis=1)])

       LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0        20000.0    2          2         1   24      2      2     -1     -1   
1       120000.0    2          2         2   26     -1      2      0      0   
4        50000.0    1          2         1   57     -1      0     -1      0   
6       500000.0    1          1         2   29      0      0      0      0   
8       140000.0    2          3         1   28      0      0      2      0   
...          ...  ...        ...       ...  ...    ...    ...    ...    ...   
29994    80000.0    1          2         2   34      2      2      2      2   
29995   220000.0    1          3         1   39      0      0      0      0   
29997    30000.0    1          2         2   37      4      3      2     -1   
29998    80000.0    1          3         1   41      1     -1      0      0   
29999    50000.0    1          2         1   46      0      0      0      0   

       PAY_5  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6 

### Function for Removing Outliers

In [15]:
def remove_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

### Removing Outliers for 'LIMIT_BAL'

In [16]:
data = remove_outliers(data, 'LIMIT_BAL')

### Data Description after Data Cleaning

In [17]:
data.describe()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0,...,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0,29833.0
mean,164963.284953,1.604029,1.854423,1.552107,35.463983,-0.015017,-0.132236,-0.164717,-0.219254,-0.265109,...,42631.006805,39715.641806,38279.789696,5605.118996,5862.359,5113.647035,4756.921664,4690.585694,5121.51862,0.221801
std,125529.099652,0.489066,0.789758,0.522081,9.218344,1.124272,1.198032,1.197798,1.170399,1.13429,...,62444.527519,58902.828515,57803.698109,16382.410837,22929.33,16388.582348,15258.488005,14680.485954,17405.221856,0.415465
min,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2310.0,1749.0,1240.0,1000.0,820.0,390.0,291.0,244.0,102.0,0.0
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,19002.0,18070.0,16990.0,2100.0,2005.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,...,53952.0,49939.0,48914.0,5001.0,5000.0,4500.0,4000.0,4000.0,4000.0,0.0
max,520000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,...,706864.0,587067.0,699944.0,873552.0,1684259.0,889043.0,621000.0,426529.0,528666.0,1.0


## Feature Engineering
Creating new features or transforming existing ones to improve model performance

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
# Feature scaling
scaler = StandardScaler()

# Scale numerical features
numerical_features = ['LIMIT_BAL', 'AGE'] + [f'BILL_AMT{i}' for i in range(1, 7)] + [f'PAY_AMT{i}' for i in range(1, 7)]
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Encoding categorical features
data = pd.get_dummies(data, columns=['SEX', 'EDUCATION', 'MARRIAGE'], drop_first=True)

In [20]:
data.head()

Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,SEX_2,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,-1.154838,-1.243626,2,2,-1,-1,-2,-2,-0.650323,-0.655496,...,True,False,True,False,False,False,False,True,False,False
1,-0.358196,-1.026664,-1,2,0,0,0,2,-0.667504,-0.675375,...,True,False,True,False,False,False,False,False,True,False
2,-0.597189,-0.158815,0,0,0,0,0,0,-0.296853,-0.497773,...,True,False,True,False,False,False,False,False,True,False
3,-0.915845,0.166629,0,0,0,0,0,0,-0.049106,-0.003947,...,True,False,True,False,False,False,False,True,False,False
4,-0.915845,2.336252,-1,0,-1,0,0,0,-0.58467,-0.618422,...,False,False,True,False,False,False,False,True,False,False


## Models' Evaluation

In [21]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

# Split the data into training and testing sets
X = data.drop('default.payment.next.month', axis=1)
y = data['default.payment.next.month']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Naive Bayes': GaussianNB(),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'XGBoost': xgb.XGBClassifier(),
    'Support Vector Machine': SVC(probability=True)
}

# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'AUC-ROC: {roc_auc_score(y_test, y_proba)}')
    print(classification_report(y_test, y_pred))

# Evaluate each model
for model_name, model in models.items():
    print(f'Evaluating {model_name}...')
    evaluate_model(model, X_train, y_train, X_test, y_test)
    print('-' * 50)


Evaluating Logistic Regression...



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Accuracy: 0.8096089385474861
AUC-ROC: 0.701753061818629
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      6965
           1       0.70      0.25      0.37      1985

    accuracy                           0.81      8950
   macro avg       0.76      0.61      0.63      8950
weighted avg       0.79      0.81      0.77      8950

--------------------------------------------------
Evaluating K-Nearest Neighbors...
Accuracy: 0.7886033519553073
AUC-ROC: 0.6973087821258144
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      6965
           1       0.54      0.35      0.43      1985

    accuracy                           0.79      8950
   macro avg       0.68      0.63      0.65      8950
weighted avg       0.77      0.79      0.77      8950

--------------------------------------------------
Evaluating Decision Tree...
Accuracy: 0.7270391061452514
AUC-ROC: 0.611418951540719
             

### Ensemble Method

In [23]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix

# Assuming you have X_train, X_test, y_train, y_test

# Define the base classifiers
clf1 = LogisticRegression(max_iter=1000, random_state=42)
clf2 = SVC(probability=True, random_state=42)
clf3 = KNeighborsClassifier()
clf4 = DecisionTreeClassifier(random_state=42)
clf5 = RandomForestClassifier(random_state=42)
clf6 = GradientBoostingClassifier(random_state=42)
clf7 = GaussianNB()
clf8 = LinearDiscriminantAnalysis()
clf9 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Initialize VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('lr', clf1),
    ('svc', clf2),
    ('knn', clf3),
    ('dt', clf4),
    ('rf', clf5),
    ('gb', clf6),
    ('nb', clf7),
    ('lda', clf8),
    ('xgb', clf9)
], voting='soft')

# Train the ensemble model
voting_clf.fit(X_train, y_train)

# Predictions
y_pred = voting_clf.predict(X_test)

# Evaluate ensemble model
print("Ensemble Model Performance:")
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Ensemble Model Performance:
Accuracy Score: 0.8109497206703911

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      6965
           1       0.63      0.36      0.46      1985

    accuracy                           0.81      8950
   macro avg       0.73      0.65      0.67      8950
weighted avg       0.79      0.81      0.79      8950


Confusion Matrix:
[[6539  426]
 [1266  719]]


### Feature Importance
For feature importance, we'll use the Random Forest model since it provides a straightforward way to access feature importance scores

In [24]:
# Fit Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Feature importances
feature_importances = rf_model.feature_importances_
sorted_idx = np.argsort(feature_importances)[::-1]

# Create a DataFrame for better handling
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns[sorted_idx],
    'Importance': feature_importances[sorted_idx]
})

# Plotly bar plot
fig = px.bar(feature_importance_df, x='Feature', y='Importance',
             title='Random Forest Feature Importance',
             labels={'Importance': 'Importance', 'Feature': 'Feature'},
             template='plotly_white')

fig.show()

### Error Analysis
Identifying and analyzing instances where the model predictions are incorrect

In [30]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Incorrect predictions
incorrect_predictions = X_test[y_test != y_pred]

# Convert to DataFrame for easy handling
X_test_df = pd.DataFrame(X_test, columns=X_train.columns)
incorrect_predictions_df = X_test_df[y_test != y_pred]

# Display some of the misclassified instances
print("Examples of Incorrect Predictions:")
print(incorrect_predictions_df.head())

# Determine the number of rows needed to fit 5 plots per row
num_features = len(X_test_df.columns)
num_cols = 5
num_rows = (num_features + num_cols - 1) // num_cols

# Create subplots
fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=X_test_df.columns)

# Add histograms to subplots
for i, feature in enumerate(X_test_df.columns):
    row = i // num_cols + 1
    col = i % num_cols + 1

    fig.add_trace(go.Histogram(
        x=X_test_df[feature],
        name='All Data',
        opacity=0.75,
        marker=dict(color='blue'),
        showlegend=False
    ), row=row, col=col)

    fig.add_trace(go.Histogram(
        x=incorrect_predictions_df[feature],
        name='Misclassified Data',
        opacity=0.75,
        marker=dict(color='red'),
        showlegend=False
    ), row=row, col=col)

# Update layout
fig.update_layout(
    title_text='Distribution of Features for All Data vs. Misclassified Data',
    height=300 * num_rows,
    showlegend=True,
    barmode='overlay',
    template='plotly_white'
)

# Add legend manually
fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers',
                         marker=dict(size=10, color='blue'), name='All Data'), row=1, col=1)
fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers',
                         marker=dict(size=10, color='red'), name='Misclassified Data'), row=1, col=1)

fig.show()


Examples of Incorrect Predictions:
       LIMIT_BAL       AGE  PAY_0  PAY_2  PAY_3  PAY_4  PAY_5  PAY_6  \
6015   -0.836181  1.468403      1     -2     -2     -2     -2     -2   
4558   -0.995509 -0.918183      0      0      2      0      0      0   
27957  -0.915845 -1.026664      2      2      2      2      2      0   
22547   1.553743 -0.267296     -2     -2     -2     -2     -2     -2   
9389   -0.995509 -1.026664      1      2      2      2      2      2   

       BILL_AMT1  BILL_AMT2  ...  SEX_2  EDUCATION_1  EDUCATION_2  \
6015   -0.704936  -0.700279  ...   True        False         True   
4558   -0.332806  -0.244234  ...  False        False         True   
27957  -0.297900  -0.290330  ...   True         True        False   
22547  -0.704936  -0.700279  ...   True        False         True   
9389   -0.149092  -0.112844  ...  False        False         True   

       EDUCATION_3  EDUCATION_4  EDUCATION_5  EDUCATION_6  MARRIAGE_1  \
6015         False        False        False