<a href="https://colab.research.google.com/github/armaanranjan/credit-card-fraud-detection/blob/main/CCFD_ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1. Install compatible packages ---
!pip install sdv==1.27.0 pandas numpy scikit-learn xgboost matplotlib seaborn -q

# --- 2. Import libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    roc_curve, precision_recall_curve
)
from xgboost import XGBClassifier
from google.colab import files



In [None]:
# --- 3. Load Dataset (edit this path) ---
# Upload creditcard.csv to your Colab environment first
# (folder icon in the left sidebar -> upload button).
DATASET_PATH = "/content/creditcard.csv"

try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"Error: The file '{DATASET_PATH}' was not found.")
    print("Please upload the 'creditcard.csv' file to your Colab environment.")
    # Re-raise so that "Run all" stops here instead of continuing into cells
    # that would fail later with a confusing NameError (X_train, y_train, ...).
    raise

print("Dataset loaded successfully!")
print(df.head())
print("\nClass Distribution:\n", df['Class'].value_counts())

# --- 4. Clean the real data and hold out a test set BEFORE augmentation ---
# The test set must contain only real transactions. If synthetic rows are added
# before the split, they leak into the test set and the reported metrics partly
# measure how well the model recognises CTGAN output rather than real fraud.
real_df = df.dropna().reset_index(drop=True)
real_df['Class'] = real_df['Class'].astype(int)

train_real, test_real = train_test_split(
    real_df,
    test_size=0.2,
    stratify=real_df['Class'],
    random_state=42
)

# --- 5. Train CTGAN on the fraud samples of the training split only ---
fraud = train_real[train_real['Class'] == 1].reset_index(drop=True)

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=fraud)
ctgan = CTGANSynthesizer(metadata)
ctgan.fit(fraud)

# --- 6. Generate Synthetic Fraud Samples ---
target_multiplier = 10  # create 10× as many synthetic fraud rows as real training fraud rows
n_generate = len(fraud) * target_multiplier
synthetic_fraud = ctgan.sample(num_rows=n_generate)
synthetic_fraud['Class'] = 1

print(f"\nGenerated {len(synthetic_fraud)} synthetic fraud samples ")

# --- 7. Create Augmented Training Set ---
augmented_df = (
    pd.concat([train_real, synthetic_fraud], ignore_index=True)
    .sample(frac=1, random_state=42)   # shuffle real and synthetic rows together
    .dropna()                          # drop any incomplete rows produced by sampling
    .reset_index(drop=True)
)
# The target can come back as float after concatenation with CTGAN output.
augmented_df['Class'] = augmented_df['Class'].astype(int)

print(f" Cleaned dataset — no NaN values remain. Final shape: {augmented_df.shape}")
print("Augmented class counts:\n", augmented_df['Class'].value_counts())

# --- 8. Visualize Class Distribution ---
plt.figure(figsize=(6,4))
sns.countplot(x='Class', data=augmented_df)
plt.title("Class Distribution after GAN Augmentation")
plt.savefig("class_distribution.png")
plt.show()

# --- 9. Build Feature Matrices and Scale ---
# X_train / y_train: real training rows + synthetic fraud.
# X_test  / y_test : held-out real rows only.
X_train = augmented_df.drop(columns=['Class'])
y_train = augmented_df['Class']
X_test = test_real[X_train.columns]   # same column order as the training features
y_test = test_real['Class']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)   # fit the scaler on training data only
X_test = scaler.transform(X_test)

# --- 10. Train XGBoost Classifier ---
# `use_label_encoder` is deprecated in XGBoost and no longer needed for 0/1 integer labels.
model = XGBClassifier(eval_metric='logloss', n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

# --- 11. Evaluate Model Performance (on real, unseen transactions) ---
print("\n Classification Report:\n")
print(classification_report(y_test, y_pred, digits=4))
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", roc_auc)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig("confusion_matrix.png")
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot([0,1],[0,1],'k--')   # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.savefig("roc_curve.png")
plt.show()

# Precision–Recall Curve
prec, rec, _ = precision_recall_curve(y_test, y_prob)
plt.figure(figsize=(5,4))
plt.plot(rec, prec, color='darkorange')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall Curve")
plt.savefig("precision_recall_curve.png")
plt.show()

print("\n All visualizations saved: "
      "class_distribution.png, confusion_matrix.png, roc_curve.png, precision_recall_curve.png")

Error: The file '/content/creditcard.csv' was not found.
Please upload the 'creditcard.csv' file to your Colab environment.


In [None]:
# --- 11. Compare Multiple Classifiers ---
# Fits four classifiers on the X_train / y_train produced by the pipeline cell above
# and scores them on X_test / y_test.

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, accuracy_score,
    precision_recall_fscore_support
)

# Initialize models
# (`use_label_encoder` is deprecated in XGBoost and not needed for 0/1 integer labels.)
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=150, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

# Store results: one dict of metrics per model name
results = {}

for name, clf in models.items():
    clf.fit(X_train, y_train)
    # Use a dedicated name so the y_pred of the XGBoost pipeline cell is not overwritten.
    clf_pred = clf.predict(X_test)

    # On a 0/1 target, MAE equals the error rate and RMSE is its square root;
    # they are kept for continuity with the comparison chart below.
    rmse = np.sqrt(mean_squared_error(y_test, clf_pred))
    mae = mean_absolute_error(y_test, clf_pred)
    acc = accuracy_score(y_test, clf_pred)

    # Fraud-class precision / recall / F1 are the informative metrics under class imbalance.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, clf_pred, average='binary', zero_division=0
    )

    results[name] = {
        "RMSE": rmse, "MAE": mae, "Accuracy": acc,
        "Precision": precision, "Recall": recall, "F1": f1,
    }

# --- Print Results  ---
print("Target column used: Class\n")
# `model_name`, not `model`: the previous cell's trained estimator is called `model`.
for model_name, metrics in results.items():
    print(
        f"{model_name} -> RMSE: {metrics['RMSE']:.4f}, MAE: {metrics['MAE']:.4f}, "
        f"Accuracy: {metrics['Accuracy']*100:.2f}%, Precision: {metrics['Precision']:.4f}, "
        f"Recall: {metrics['Recall']:.4f}, F1: {metrics['F1']:.4f}"
    )

# --- Convert to DataFrame (rows = models, columns = metrics) ---
results_df = pd.DataFrame(results).T

# --- 12. Visual Comparison Graph  ---

plt.figure(figsize=(9,5))
plt.plot(results_df.index, results_df["RMSE"], color='orange', marker='o', label='RMSE')
plt.plot(results_df.index, results_df["MAE"], color='blue', marker='o', label='MAE')
plt.title("Classifier Performance Comparison (CTGAN-Augmented Fraud Dataset)")
plt.xlabel("Model")
plt.ylabel("Error Value")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.7)
plt.tight_layout()
plt.savefig("model_comparison.png")
plt.show()

print("\nComparison chart saved as: model_comparison.png")


NameError: name 'X_train' is not defined

In [None]:
!pip install streamlit==1.39.0 pyngrok==7.2.0 xgboost scikit-learn seaborn matplotlib pandas -q


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never hard-code the ngrok authtoken in the notebook: it is a credential and ends up
# in version control together with the file. The token that used to be written on this
# line was published with the notebook and should be revoked in the ngrok dashboard.
#
# The token is read from the NGROK_AUTHTOKEN environment variable when set; otherwise
# it is requested interactively (getpass does not echo the value into the cell output).
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")

# Registers the token with the ngrok agent managed by pyngrok.
ngrok.set_auth_token(NGROK_AUTHTOKEN)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
print("üåê Starting Streamlit app ‚Äî please wait a few seconds for the link...")
!streamlit run app.py &>/content/log.txt &
public_url = ngrok.connect(8501)
print("‚úÖ Access your GUI here üëá")
print(public_url)


üåê Starting Streamlit app ‚Äî please wait a few seconds for the link...
‚úÖ Access your GUI here üëá
NgrokTunnel: "https://kimbery-fightable-clotilde.ngrok-free.dev" -> "http://localhost:8501"


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, mean_squared_error, mean_absolute_error, accuracy_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# =========================
# Streamlit Config
# =========================
st.set_page_config(page_title="CTGAN + XGBoost Fraud Detection", layout="wide")
st.title("üí≥ Credit Card Fraud Detection using CTGAN and XGBoost")
st.write("""
This system detects fraudulent credit card transactions using **CTGAN for synthetic data generation**
and compares multiple classifiers including **XGBoost, Random Forest, Logistic Regression, and KNN.**
""")

# =========================
# Step 1: Upload Dataset
# =========================
uploaded_file = st.file_uploader("üìÇ Upload credit card dataset (CSV with 'Class' column)", type=["csv"])

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.subheader("üìä Dataset Preview")
    st.dataframe(df.head())

    if 'Class' not in df.columns:
        st.error("‚ùå The dataset must contain a 'Class' column (0 = normal, 1 = fraud).")
    else:
        st.write("‚úÖ Original Class Distribution:")
        st.bar_chart(df['Class'].value_counts())

        # Variables to store generated data
        if "augmented_df" not in st.session_state:
            st.session_state.augmented_df = None
        if "results_df" not in st.session_state:
            st.session_state.results_df = None
        if "roc_data" not in st.session_state:
            st.session_state.roc_data = None

        # =========================
        # Step 2: Train CTGAN Button
        # =========================
        if st.button("üß† Train CTGAN and Generate Synthetic Fraud Data"):
            fraud = df[df['Class'] == 1].reset_index(drop=True)
            metadata = SingleTableMetadata()
            metadata.detect_from_dataframe(data=fraud)

            ctgan = CTGANSynthesizer(metadata)
            with st.spinner("Training CTGAN model... please wait (may take a few minutes)"):
                ctgan.fit(fraud)

            target_multiplier = 10
            n_generate = len(fraud) * target_multiplier
            synthetic_fraud = ctgan.sample(num_rows=n_generate)
            synthetic_fraud['Class'] = 1

            augmented_df = pd.concat([df, synthetic_fraud], ignore_index=True).sample(frac=1, random_state=42)
            st.session_state.augmented_df = augmented_df

            st.success(f"‚úÖ CTGAN trained! Generated {len(synthetic_fraud)} synthetic fraud samples.")
            st.write("### üìà Class Distribution After Augmentation")
            st.bar_chart(augmented_df['Class'].value_counts())

        # =========================
        # Step 3: Train Models Button
        # =========================
        if st.session_state.augmented_df is not None:
            if st.button("‚öôÔ∏è Train Classifiers (XGBoost, Random Forest, Logistic Regression, KNN)"):
                augmented_df = st.session_state.augmented_df

                X = augmented_df.drop(columns=['Class'])
                y = augmented_df['Class']
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

                models = {
                    "Logistic Regression": LogisticRegression(max_iter=1000),
                    "Random Forest": RandomForestClassifier(n_estimators=150, random_state=42),
                    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
                    "KNN": KNeighborsClassifier(n_neighbors=5)
                }

                results = {}
                roc_data = {}

                with st.spinner("Training models..."):
                    for name, clf in models.items():
                        clf.fit(X_train, y_train)
                        y_pred = clf.predict(X_test)
                        y_prob = clf.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else y_pred
                        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                        mae = mean_absolute_error(y_test, y_pred)
                        acc = accuracy_score(y_test, y_pred)
                        auc = roc_auc_score(y_test, y_prob)
                        results[name] = {"RMSE": rmse, "MAE": mae, "Accuracy": acc, "AUC": auc}
                        fpr, tpr, _ = roc_curve(y_test, y_prob)
                        roc_data[name] = (fpr, tpr)

                st.session_state.results_df = pd.DataFrame(results).T
                st.session_state.roc_data = roc_data
                st.success("‚úÖ Models trained successfully! You can now view comparisons.")

        # =========================
        # Step 4: Show Results Button
        # =========================
        if st.session_state.results_df is not None and st.button("üìä Show Model Comparison Results"):
            results_df = st.session_state.results_df
            roc_data = st.session_state.roc_data

            st.write("### üìã Model Performance Table")
            st.dataframe(results_df.style.highlight_max(color='lightgreen', axis=0))

            # Error metrics plot
            fig, ax = plt.subplots(figsize=(8, 5))
            ax.plot(results_df.index, results_df["RMSE"], marker='o', label='RMSE')
            ax.plot(results_df.index, results_df["MAE"], marker='o', label='MAE')
            ax.set_title("Error Metrics Comparison (CTGAN-Augmented Data)")
            ax.set_xlabel("Model"); ax.set_ylabel("Error Value")
            ax.legend(); ax.grid(True)
            st.pyplot(fig)

            # ROC curves plot
            st.write("### üìà ROC Curve Comparison")
            fig2, ax2 = plt.subplots(figsize=(7, 5))
            for name, (fpr, tpr) in roc_data.items():
                ax2.plot(fpr, tpr, label=f"{name} (AUC={results_df.loc[name, 'AUC']:.3f})")
            ax2.plot([0, 1], [0, 1], 'k--')
            ax2.set_xlabel("False Positive Rate")
            ax2.set_ylabel("True Positive Rate")
            ax2.set_title("ROC Curve Comparison")
            ax2.legend()
            st.pyplot(fig2)

            st.success("‚úÖ Model comparison visualized successfully.")

else:
    st.info("‚¨ÜÔ∏è Upload your anonymized dataset to get started.")


Overwriting app.py


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from xgboost import XGBClassifier
import numpy as np

st.set_page_config(page_title="Credit Card Fraud Detection", layout="centered")
st.title("üí≥ Credit Card Fraud Detection using CTGAN + XGBoost")
st.write("""
This demo uses **synthetic, PCA-transformed data** ‚Äî no personal or real card details are used.
You can upload an anonymized dataset with a **'Class'** column (0 = Legit, 1 = Fraud).
""")

uploaded_file = st.file_uploader("üìÇ Upload CSV dataset", type=["csv"])

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.subheader("üìä Dataset Preview")
    st.dataframe(df.head())

    if 'Class' not in df.columns:
        st.error("‚ùå 'Class' column not found ‚Äî please upload a dataset with 'Class' as target.")
    else:
        st.write("‚úÖ Target column detected:", df['Class'].value_counts())

        if st.button("üöÄ Train XGBoost Model"):
            X = df.drop(columns=['Class'])
            y = df['Class']

            X = X.dropna()
            y = y.loc[X.index]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y, random_state=42
            )
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)[:, 1]

            st.success("‚úÖ Model trained successfully!")
            st.write("### üìà Classification Report")
            st.text(classification_report(y_test, y_pred, digits=4))
            st.write(f"ROC-AUC Score = {roc_auc_score(y_test, y_prob):.4f}")

            cm = confusion_matrix(y_test, y_pred)
            fig, ax = plt.subplots()
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_xlabel("Predicted"); ax.set_ylabel("Actual")
            ax.set_title("Confusion Matrix")
            st.pyplot(fig)

            fpr, tpr, _ = roc_curve(y_test, y_prob)
            fig2, ax2 = plt.subplots()
            ax2.plot(fpr, tpr, label=f"AUC = {roc_auc_score(y_test, y_prob):.3f}")
            ax2.plot([0,1],[0,1],'k--')
            ax2.set_xlabel("False Positive Rate")
            ax2.set_ylabel("True Positive Rate")
            ax2.set_title("ROC Curve")
            ax2.legend()
            st.pyplot(fig2)

            st.info("‚úÖ Visualizations completed. Model trained on uploaded data successfully.")
else:
    st.info("‚¨ÜÔ∏è Upload your anonymized credit card dataset to begin.")


Overwriting app.py
