In [None]:
import tensorflow as tf 
from tensorflow import keras
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn import metrics
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.metrics import (RocCurveDisplay, PrecisionRecallDisplay,
                             ConfusionMatrixDisplay,
                             mean_squared_error, r2_score)

In [None]:
# Load the credit-card default dataset from the notebook's working directory.
DATA_PATH = r"uci_credit_card.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows (head() defaults to n=5).
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
data.describe()

In [None]:
# Column dtypes and non-null counts; a quick check for missing values.
data.info()

📊 EDA

In [None]:
# Class balance of the target column.
# plt.figure (lowercase) registers the figure with pyplot so seaborn draws on
# it; plt.Figure only builds a detached object. figsize is in inches.
plt.figure(figsize=(10, 5))
# discrete=True centres one bar on each integer class value.
sns.histplot(data=data, x="default.payment.next.month", discrete=True)
plt.title("Distribution of payment")
plt.show()


In [None]:
# Display-only, human-readable view of SEX. The result is intentionally not
# assigned back: later cells (correlation, scaling) need the numeric codes.
# Keys are integers because the column holds numeric codes (it is fed to
# data.corr() later); the previous string keys "1"/"2" never matched anything.
data["SEX"].replace({
    1: "Male",
    2: "Female"
})

In [None]:
# Display-only, human-readable view of the target. The result is not assigned
# back so the column stays numeric for modelling.
# Integer keys are required: the string keys "1"/"0" never matched the
# numeric codes, making the replace a silent no-op.
data["default.payment.next.month"].replace({
    1: "Yes",
    0: "No"
})

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# default.payment.next.month = D.P.N.M
# value_counts() orders by frequency, not by code, so pairing it with a fixed
# label list can put a label on the wrong slice. Sort by code and derive each
# label from the code it actually belongs to.
gender_counts = data['SEX'].value_counts().sort_index()
default_counts = data['default.payment.next.month'].value_counts().sort_index()

gender_names = {1: 'Male', 2: 'Female'}
default_names = {0: 'No', 1: 'Yes'}
# Unknown codes fall back to their raw value instead of being mislabelled.
g_labels = [gender_names.get(code, str(code)) for code in gender_counts.index]
c_labels = [default_names.get(code, str(code)) for code in default_counts.index]

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=g_labels, values=gender_counts.values, name="Gender"),
              1, 1)
fig.add_trace(go.Pie(labels=c_labels, values=default_counts.values, name="D.P.N.M"),
              1, 2)

# Donut style with percentage hover text.
fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)

fig.update_layout(
    title_text="Gender and D.P.N.M Distributions",
    # Captions placed in the centre of each donut.
    annotations=[dict(text='Gender', x=0.19, y=0.5, font_size=20, showarrow=False),
                 dict(text='D.P.N.M', x=0.82, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [None]:
# Pairwise correlations of all columns; `corr` is reused by a later cell.
corr = data.corr()
# Hide the upper triangle (including the diagonal) because the matrix is symmetric.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(16, 12))
sns.heatmap(
    corr,
    mask=upper_triangle,
    annot=True,
    fmt=".2f",
    cmap="vlag",
    linewidths=.5,
    cbar_kws={"shrink": .6},
)
plt.title("Correlation matrix — data data")
plt.tight_layout()
plt.show()


In [None]:
# Repayment-status columns (the dataset has no PAY_1).
pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

for status_col in pay_cols:
    # Row count and default rate for every value of this status column.
    rate_table = (
        data.groupby(status_col)['default.payment.next.month']
        .agg(['count', 'mean'])
        .rename(columns={'mean': 'default_rate'})
    )
    # Show the most frequent status values first.
    display(rate_table.sort_values('count', ascending=False).head(12))

    plt.figure(figsize=(6, 4))
    sns.barplot(x=rate_table.index, y=rate_table['default_rate'])
    plt.title(f'Default rate by {status_col}')
    plt.xlabel(status_col)
    plt.ylabel('Default rate')
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


In [None]:
# Monthly bill and payment amount columns (months 1 to 6).
bill_cols = ['BILL_AMT%d' % month for month in range(1, 7)]
pay_amt_cols = ['PAY_AMT%d' % month for month in range(1, 7)]
amount_cols = bill_cols + pay_amt_cols

# Mean and median of every amount column, split by default outcome.
summary = (
    data.groupby('default.payment.next.month')[amount_cols]
    .agg(['mean', 'median'])
    .T
)
display(summary.head(20))

# One boxplot per amount column, grouped by default outcome.
for amount_col in amount_cols:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=data, x='default.payment.next.month', y=amount_col)
    plt.title(f'{amount_col} by Default (0=no, 1=yes)')
    plt.tight_layout()
    plt.show()


In [None]:
# Histogram of customer age (20 bins) with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=data, x='AGE', bins=20, kde=True, color="skyblue", ax=ax)
ax.set_title("Age Distribution")
plt.show()

In [None]:
# Gender counts split by default outcome (grouped bars).
# The title previously read "ASD Class", a leftover from an unrelated dataset;
# the colour dimension here is the default flag.
fig = px.histogram(data, x="SEX", color="default.payment.next.month", barmode="group",
                   title="Gender Distribution by Default Status")
fig.show()

In [None]:
# Default distribution
# A dedicated figure replaces plt.subplot(1, 2, 2): with no left-hand panel
# ever drawn, that call left half of the canvas empty.
plt.figure(figsize=(8, 5))
sns.countplot(data=data, x="default.payment.next.month", palette="Set1")
plt.title("Default Payment Next Month", fontsize=14)
plt.xlabel("Default (0=No, 1=Yes)")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Credit Limit distribution
# A dedicated figure replaces plt.subplot(1, 2, 2), which drew the plot in the
# right half of an otherwise empty two-panel layout.
plt.figure(figsize=(8, 5))
sns.histplot(data["LIMIT_BAL"], bins=30, kde=True, color="salmon")
plt.title("Credit Limit Distribution", fontsize=14)
plt.xlabel("Credit Limit (LIMIT_BAL)")
plt.ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Age spread for non-defaulters versus defaulters.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data, x="default.payment.next.month", y="AGE", palette="Set3", ax=ax)
ax.set_title("Age vs Default", fontsize=14)
ax.set_xlabel("Default (0=No, 1=Yes)")
ax.set_ylabel("Age")
plt.show()


In [None]:
# Customer counts per education level, split by default outcome.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x="EDUCATION", hue="default.payment.next.month", palette="Set2", ax=ax)
ax.set_title("Education vs Default", fontsize=14)
ax.set_xlabel("Education (1=Graduate School, 2=University, 3=High School, 4=Others)")
ax.set_ylabel("Count")
ax.legend(title="Default (0=No, 1=Yes)")
plt.show()


In [None]:
# Customer counts per marital status, split by default outcome.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x="MARRIAGE", hue="default.payment.next.month", palette="coolwarm", ax=ax)
ax.set_title("Marriage vs Default", fontsize=14)
ax.set_xlabel("Marriage (1=Married, 2=Single, 3=Others)")
ax.set_ylabel("Count")
ax.legend(title="Default (0=No, 1=Yes)")
plt.show()


In [None]:
# Pairwise scatter/KDE plots on a fixed 1000-row sample to keep rendering fast.
pairplot_cols = ["LIMIT_BAL", "AGE", "SEX", "EDUCATION", "default.payment.next.month"]
sample = data.sample(n=1000, random_state=42)
sns.pairplot(
    sample[pairplot_cols],
    hue="default.payment.next.month",
    diag_kind="kde",
    palette="husl",
)
plt.show()


These visualizations provide:

General distributions (Age, Credit Limit, Gender, Default).

Relationships between categorical features (Gender, Education, Marriage) and default.

Correlation overview with heatmap.

Pairwise relationships with pairplot.

Data Preprocessing

In [None]:
# Absolute and relative class frequencies of the target.
target = data['default.payment.next.month']
absolute_counts = target.value_counts()
relative_counts = target.value_counts(normalize=True)

print("Counts:")
print(absolute_counts)
print("\nRelative frequencies:")
print(relative_counts)


In [None]:
# Correlation of the repayment-status, bill and payment-amount columns with
# the target, sorted from most positive to most negative.
history_cols = pay_cols + bill_cols + pay_amt_cols
pay_corr = corr.loc[history_cols, 'default.payment.next.month']
pay_corr = pay_corr.sort_values(ascending=False)

print("Top correlated features with default:")
display(pay_corr.head(20))


In [None]:
# Remove duplicate rows if any.
# ID is presumably a unique row identifier, so comparing whole rows while it is
# still present can never find a duplicate; compare on every column except ID.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
data = data.drop_duplicates(subset=[name for name in data.columns if name != "ID"])

In [None]:
# ID carries no predictive signal. errors="ignore" makes the cell idempotent:
# re-running it after ID is already gone no longer raises KeyError.
data = data.drop(columns=["ID"], errors="ignore")

In [None]:
# Scaler instance used by the train/test split cell.
scaler = StandardScaler()
# Continuous columns (credit limit, age, bill and payment amounts).
num_cols = ["LIMIT_BAL", "AGE"] + [col for col in data.columns if "BILL_AMT" in col or "PAY_AMT" in col]

# No scaling is done here on purpose: fitting the scaler on the full frame
# before the train/test split leaks test-set statistics, and the split cell
# standardizes every feature column anyway, so scaling here was redundant.

# Encode the target as consecutive integer class labels.
data["default.payment.next.month"] = LabelEncoder().fit_transform(data["default.payment.next.month"])

# One-hot encode the nominal columns; drop_first avoids a redundant dummy column.
categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"]
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

In [None]:

# Separate the target from the feature matrix.
y = data["default.payment.next.month"]
x = data.drop(columns=["default.payment.next.month"])
features = x.columns

# Split first, stratified on the target so both parts keep the class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to the test rows.
# Fitting on all rows before the split leaks test-set statistics into training.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# Keep `x` as a scaled ndarray, as before, for any cell that inspects it.
x = scaler.transform(x)

x_train.shape, y_train.shape


In [None]:
# (rows, columns) of the full feature matrix.
x.shape

✅Data Preprocessing:

Remove duplicates

Encode categorical variables

Scale numerical values

Train-test split

Handle class imbalance

In [None]:
from sklearn.metrics import accuracy_score, classification_report, RocCurveDisplay, PrecisionRecallDisplay, ConfusionMatrixDisplay
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.combine import SMOTETomek
import matplotlib.pyplot as plt
import numpy as np

# -------------------------
# Define models
# -------------------------
# Keys double as display names in the evaluation output. The random forest and
# logistic regression use balanced class weights; all models share seed 42.
models = dict(
    RandomForestClassifier=RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    GradientBoostingClassifier=GradientBoostingClassifier(
        n_estimators=100, random_state=42
    ),
    LogisticRegression=LogisticRegression(
        solver="liblinear", max_iter=500, class_weight="balanced", random_state=42
    ),
)

# -------------------------
# Helper function to train & evaluate
# -------------------------
def evaluate_models(models, X_train, y_train, X_test, y_test, phase="Before SMOTE"):
    """Fit every model and report its test-set performance.

    For each entry of ``models`` this fits the estimator in place on the
    training data, prints accuracy and a classification report for the test
    data, shows ROC, precision-recall and confusion-matrix plots, and finally
    plots a 5-fold F1 learning curve computed on the training data.

    Args:
        models: Mapping of display name to an unfitted or refittable estimator.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        phase: Label added to the printed headers and plot titles.

    Returns:
        None. All results are printed or plotted.
    """
    print(f"\n========== {phase.upper()} ==========")

    # (display class, title prefix) for the three test-set diagnostics.
    diagnostics = (
        (RocCurveDisplay, "ROC Curve"),
        (PrecisionRecallDisplay, "Precision-Recall"),
        (ConfusionMatrixDisplay, "Confusion Matrix"),
    )
    # Fractions of the training set used for the learning curve.
    size_fractions = np.linspace(0.1, 1.0, 5)

    for name in models:
        estimator = models[name]
        print(f"\n{name} ({phase})")
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        # Metrics
        print(f"{name} - Accuracy: {accuracy_score(y_test, predictions):.4f}")
        print(classification_report(y_test, predictions, digits=3))

        # Visualizations
        for display_cls, label in diagnostics:
            display_cls.from_estimator(estimator, X_test, y_test)
            plt.title(f"{label} ({phase}): {name}")
            plt.show()

        # Learning curve
        sizes, fit_scores, cv_scores = learning_curve(
            estimator, X_train, y_train, cv=5, scoring='f1',
            train_sizes=size_fractions, n_jobs=-1
        )
        for fold_scores, curve_label in ((fit_scores, 'Train'), (cv_scores, 'Validation')):
            plt.plot(sizes, fold_scores.mean(axis=1), marker='o', label=curve_label)
        plt.xlabel('Training examples')
        plt.ylabel('F1 Score')
        plt.title(f'Learning Curve ({phase}): {name}')
        plt.legend()
        plt.show()


# -------------------------
# 1) Before SMOTE
# -------------------------
evaluate_models(models, x_train, y_train, x_test, y_test, phase="Before SMOTE")

# -------------------------
# 2) After SMOTE
# -------------------------
# Resampling is placed inside an imblearn Pipeline instead of being applied to
# the training set up front. Test-set results are the same (the pipeline
# resamples x_train with the same seed before fitting), but the learning
# curve's cross-validation now resamples only each fold's training part.
# Resampling first put synthetic points derived from validation rows into the
# training folds and inflated the validation F1.
# The pre-resampled arrays x_train_sm / y_train_sm are no longer created.
smote_models = {
    name: Pipeline([
        ("resample", SMOTETomek(random_state=42)),
        ("model", model),
    ])
    for name, model in models.items()
}

evaluate_models(smote_models, x_train, y_train, x_test, y_test, phase="After SMOTE")



In [None]:

# Seed TensorFlow so weight initialisation and dropout are repeatable.
tf.random.set_seed(42)

# Take the input width from the data instead of hard-coding 30, so the network
# keeps working if the encoded feature count changes.
n_features = x_train.shape[1]

# Small fully connected binary classifier: two hidden blocks
# (Dense -> BatchNorm -> Dropout) followed by a sigmoid output unit.
model1 = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_features,)),

    tf.keras.layers.Dense(64, activation=tf.nn.leaky_relu),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),

    tf.keras.layers.Dense(32, activation=tf.nn.leaky_relu),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),

    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
model1.summary()





In [None]:
# Binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric.
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks
# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)
# Halve the learning rate after 3 epochs without val_loss improvement.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3
)
training_callbacks = [early_stop, lr_schedule]

# 20% of the training data is held out for validation by Keras.
hist = model1.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=50,
    callbacks=training_callbacks,
)

In [None]:

# Side-by-side training curves: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 5))

# (history key, display name) for each panel, in left-to-right order.
panels = (("loss", "Loss"), ("accuracy", "Accuracy"))

for position, (metric_key, metric_name) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(hist.history[metric_key], label=f'Train {metric_name}')
    # Validation series exist only when fit() was given validation data.
    validation_key = f'val_{metric_key}'
    if validation_key in hist.history:
        plt.plot(hist.history[validation_key], label=f'Validation {metric_name}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} Curve (Sequential)')
    plt.legend()

plt.show()

In [None]:
# Evaluate on the held-out test set; evaluate() returns [loss, accuracy]
# in the order given at compile time.
test_metrics = model1.evaluate(x_test, y_test, verbose=0)
test_loss, test_acc = test_metrics
print(f"📊 Test Accuracy: {test_acc:.4f}")
print(f"📉 Test Loss: {test_loss:.4f}")

In [None]:
import joblib

# Refit the gradient-boosting model with the same settings as the evaluated
# one. Without random_state the saved model is not reproducible and can differ
# from the model whose metrics are reported above.
best_model = GradientBoostingClassifier(random_state=42, n_estimators=100)
best_model.fit(x_train, y_train)
# NOTE(review): the model is trained on encoded and standardized features, so
# whoever loads credit.pkl must apply the same preprocessing before predict().
joblib.dump(best_model, "credit.pkl")


