In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#RUN ONLY ONCE
# %pip install kaggle
# !mkdir -p ~/.kaggle
#before running the lines below, download your Kaggle API token (kaggle.json) and replace the path with the location where it was saved
# !mv /path/to/downloaded/kaggle.json ~/.kaggle/

# import kaggle
# !kaggle datasets download -d nelgiriyewithana/credit-card-fraud-detection-dataset-2023
# !kaggle datasets download -d praveengovi/credit-risk-classification-dataset

# import zipfile
# with zipfile.ZipFile('credit-card-fraud-detection-dataset-2023.zip', 'r') as zip_ref:
#     zip_ref.extractall('credit-card-fraud-detection-dataset')
# with zipfile.ZipFile('credit-risk-classification-dataset.zip', 'r') as zip_ref:
#     zip_ref.extractall('credit-risk-classification-dataset')
# !del credit-card-fraud-detection-dataset-2023.zip
# !del credit-risk-classification-dataset.zip

In [3]:
#data loader methods
def _balanced_subsample(data, n_per_class, random_state=None):
    """Draw the same number of label-0 and label-1 rows, then shuffle them.

    Parameters
    ----------
    data : np.ndarray
        2-D array whose last column holds the binary label (0 or 1).
    n_per_class : int
        Number of rows drawn, without replacement, for each label.
    random_state : int or None
        Seed for a private ``np.random.RandomState``. ``None`` falls back to
        NumPy's global random state, so ``np.random.seed`` still applies.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        The feature columns (everything but the last column) and the labels
        cast to ``int``.

    Raises
    ------
    ValueError
        If either label has fewer than ``n_per_class`` rows.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    labels = data[:, -1]

    subsets = []
    for label in (0, 1):
        rows = data[labels == label]
        if rows.shape[0] < n_per_class:
            raise ValueError(
                f'Requested {n_per_class} rows with label {label} '
                f'but only {rows.shape[0]} are available'
            )
        chosen = rng.choice(rows.shape[0], size=n_per_class, replace=False)
        subsets.append(rows[chosen])

    # Stack label 0 on top of label 1, then shuffle so the labels are interleaved.
    balanced = np.vstack(subsets)
    rng.shuffle(balanced)
    return balanced[:, :-1], balanced[:, -1].astype(int)


def load_credit_risk(n_per_class=1000, random_state=None):
    """Load a class-balanced sample of the credit-risk dataset.

    Payment records are joined to the customer labels on ``id``; the id column
    and the two date columns are then removed by position.

    Parameters
    ----------
    n_per_class : int, default 1000
        Rows to sample for each label.
    random_state : int or None, default None
        Seed for reproducible sampling; ``None`` uses NumPy's global state.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Features and integer labels, ``2 * n_per_class`` rows each.
    """
    payment = pd.read_csv('credit-risk-classification-dataset/payment_data.csv')
    # Only the first two customer columns are read (label and id, judging by the merge key).
    customer = pd.read_csv('credit-risk-classification-dataset/customer_data.csv', usecols=range(0,2))
    merged_data = payment.merge(customer, on='id').values

    cols_to_drop = [0,8,11]#dropping the id and dates; not helpful
    data = np.delete(merged_data, cols_to_drop, axis=1)

    return _balanced_subsample(data, n_per_class, random_state)


def load_credit_fraud(n_per_class=5000, random_state=None):
    """Load a class-balanced sample of the credit-card fraud dataset.

    Column 0 of the CSV (the id) is skipped; columns 1-29 are the features and
    column 30 is the label.

    Parameters
    ----------
    n_per_class : int, default 5000
        Rows to sample for each label.
    random_state : int or None, default None
        Seed for reproducible sampling; ``None`` uses NumPy's global state.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Features and integer labels, ``2 * n_per_class`` rows each.
    """
    data = pd.read_csv('credit-card-fraud-detection-dataset/creditcard_2023.csv', usecols=range(1,31)).values
    return _balanced_subsample(data, n_per_class, random_state)

In [4]:
# Preprocessing helpers used only by this cell
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Draw the balanced samples for both datasets
X_risk, y_risk = load_credit_risk()
X_fraud, y_fraud = load_credit_fraud()

# Standardize each dataset with its own scaler (zero mean, unit variance per feature)
risk_scaler = StandardScaler()
X_risk = risk_scaler.fit_transform(X_risk)
fraud_scaler = StandardScaler()
X_fraud = fraud_scaler.fit_transform(X_fraud)

# Replace the NaN entries of the credit-risk features with the column mean
risk_imputer = SimpleImputer(strategy='mean')
X_risk = risk_imputer.fit_transform(X_risk)

In [5]:
#Checking if data is valid
def data_is_valid(X,y, examples, features):
    """Check that a feature matrix and label vector look as expected.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix; must have shape ``(examples, features)`` and no NaN.
    y : np.ndarray
        Label vector; must have shape ``(examples,)`` and contain only 0 and 1.
    examples : int
        Expected number of rows.
    features : int
        Expected number of feature columns.

    Returns
    -------
    bool
        ``True`` only when every check passes.
    """
    # Parentheses, not braces: braces would build a one-element set, which is
    # truthy even when the check inside it is False.
    return bool(
        X.shape == (examples, features)
        and y.shape == (examples,)
        and not np.any(np.isnan(X))
        and np.all((y==1) | (y==0))
    )
# Run both checks first, then report: 2000 x 9 for credit risk, 10000 x 29 for credit fraud
risk_is_valid = data_is_valid(X_risk, y_risk, 2000, 9)
fraud_is_valid = data_is_valid(X_fraud, y_fraud, 10000, 29)
print(f'Validity for Credit Risk Dataset: {risk_is_valid}')
print(f'Validity for Credit Fraud Dataset: {fraud_is_valid}')

Validity for Credit Risk Dataset: {True}
Validity for Credit Fraud Dataset: {True}


In [6]:
#imports
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Hyper-parameter search spaces, one per classifier
svc_param_grid = dict(C=[0.1, 1, 10, 100], kernel=['rbf'])
kNN_param_grid = dict(n_neighbors=[3, 5, 7, 9])
forest_param_grid = dict(n_estimators=[50, 100, 150])

# Shared 5-fold stratified splitter; the fixed seed keeps the folds identical between runs
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [8]:
# Baseline accuracies on the credit risk dataset.
# Each classifier is wrapped in a grid search, fitted on the full data, and then
# scored with the shared stratified 5-fold splitter; the mean fold accuracy is kept.

# Support vector classifier
classifier_1 = GridSearchCV(svm.SVC(), svc_param_grid)
classifier_1.fit(X_risk, y_risk)
risk_svc_scores = cross_val_score(classifier_1, X_risk, y_risk, cv=cv, scoring='accuracy')
risk_svc_accuracy = np.mean(risk_svc_scores)

# k-nearest neighbours
classifier_2 = GridSearchCV(KNeighborsClassifier(), kNN_param_grid)
classifier_2.fit(X_risk, y_risk)
risk_kNN_scores = cross_val_score(classifier_2, X_risk, y_risk, cv=cv, scoring='accuracy')
risk_kNN_accuracy = np.mean(risk_kNN_scores)

# Random forest
classifier_3 = GridSearchCV(RandomForestClassifier(), forest_param_grid)
classifier_3.fit(X_risk, y_risk)
risk_forest_scores = cross_val_score(classifier_3, X_risk, y_risk, cv=cv, scoring='accuracy')
risk_forest_accuracy = np.mean(risk_forest_scores)

In [9]:
#Cell for calculating base accuracies of the credit fraud dataset
# Each grid search is fitted on the fraud data (not the credit risk data), so
# the classifier_* objects left in the namespace are tuned for this dataset.
# The reported accuracy is the mean over the shared stratified 5-fold splitter.

#svc classifier
classifier_1 = GridSearchCV(svm.SVC(), svc_param_grid)
classifier_1.fit(X_fraud, y_fraud)
fraud_svc_accuracy = np.mean(cross_val_score(classifier_1, X_fraud, y_fraud, cv=cv, scoring='accuracy'))

#kNN classifier
classifier_2 = GridSearchCV(KNeighborsClassifier(), kNN_param_grid)
classifier_2.fit(X_fraud, y_fraud)
fraud_kNN_accuracy = np.mean(cross_val_score(classifier_2, X_fraud, y_fraud, cv=cv, scoring='accuracy'))

#random forest classifier
classifier_3 = GridSearchCV(RandomForestClassifier(), forest_param_grid)
classifier_3.fit(X_fraud, y_fraud)
fraud_forest_accuracy = np.mean(cross_val_score(classifier_3, X_fraud, y_fraud, cv=cv, scoring='accuracy'))

In [10]:
# One row per dataset, one column per classifier
classifier_names = ['SVC', 'kNN', 'Random Forest']
dataset_names = ['Credit Risk Dataset', 'Credit Fraud Dataset']
base_accuracies = [
    [risk_svc_accuracy, risk_kNN_accuracy, risk_forest_accuracy],
    [fraud_svc_accuracy, fraud_kNN_accuracy, fraud_forest_accuracy],
]
print('Base accuracies using three classifiers for both datasets:')
pd.DataFrame(data=base_accuracies, index=dataset_names, columns=classifier_names)

Base accuracies using three classifiers for both datasets:


Unnamed: 0,SVC,kNN,Random Forest
Credit Risk Dataset,0.558,0.5395,0.577
Credit Fraud Dataset,0.9913,0.9815,0.9848


In [11]:
#import libraries needed for GAN
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
#generator and discriminator functions
def build_generator(latent_dim, output_dim):
    """Build the generator network.

    A 128-unit ReLU hidden layer fed by a ``latent_dim``-sized input, followed
    by an ``output_dim``-unit sigmoid layer.

    Returns the (uncompiled) ``tf.keras.Sequential`` model.
    """
    hidden_layer = tf.keras.layers.Dense(128, input_dim=latent_dim, activation='relu')
    output_layer = tf.keras.layers.Dense(output_dim, activation='sigmoid')
    return tf.keras.Sequential([hidden_layer, output_layer])

def build_discriminator(input_dim):
    """Build the discriminator network.

    A 128-unit ReLU hidden layer fed by an ``input_dim``-sized input, followed
    by a single sigmoid unit.

    Returns the (uncompiled) ``tf.keras.Sequential`` model.
    """
    hidden_layer = tf.keras.layers.Dense(128, input_dim=input_dim, activation='relu')
    score_layer = tf.keras.layers.Dense(1, activation='sigmoid')
    return tf.keras.Sequential([hidden_layer, score_layer])
def build_gan(generator, discriminator):
    """Chain generator and discriminator into one compiled model.

    The discriminator's ``trainable`` flag is set to False before the stacked
    model is created and compiled with binary cross-entropy and Adam.
    Note that the flag is changed on the discriminator object that was passed in.

    Returns the compiled stacked ``tf.keras.Sequential`` model.
    """
    discriminator.trainable = False
    stacked = tf.keras.Sequential([generator, discriminator])
    stacked.compile(optimizer='adam', loss='binary_crossentropy')
    return stacked
def train_gan(generator, discriminator, gan, data, latent_dim, epochs=10000, batch_size=32):
    """Alternate discriminator and generator updates for ``epochs`` iterations.

    Each iteration trains the discriminator on one random batch of real rows
    (label 1) and one batch of generated rows (label 0), then trains the
    generator through ``gan`` with label 1 as its target.

    Parameters
    ----------
    generator, discriminator, gan
        Models exposing ``compile``, ``train_on_batch`` and (generator only)
        ``predict``; ``gan`` is expected to be already compiled.
    data : np.ndarray
        Real samples, one row per example.
    latent_dim : int
        Length of the noise vector fed to the generator.
    epochs : int, default 10000
        Number of training iterations (one batch each, not passes over ``data``).
    batch_size : int, default 32
        Rows per batch.

    Returns
    -------
    dict
        Per-iteration values returned by ``train_on_batch`` under the keys
        ``'d_loss_real'``, ``'d_loss_fake'`` and ``'g_loss'``.
    """
    generator.compile(optimizer='adam', loss='binary_crossentropy')

    # The discriminator arrives with trainable=False when it has been through
    # build_gan. Compiling it in that state leaves its own training step with
    # nothing to update, so re-enable training before compiling and freeze it
    # again afterwards for the stacked model.
    discriminator.trainable = True
    discriminator.compile(optimizer='adam', loss='binary_crossentropy')
    discriminator.trainable = False

    real_labels = np.ones((batch_size, 1))
    fake_labels = np.zeros((batch_size, 1))
    history = {'d_loss_real': [], 'd_loss_fake': [], 'g_loss': []}

    for _ in range(epochs):
        # Discriminator phase. The flag is set explicitly so the intent also
        # holds if the installed Keras reads `trainable` when the step runs
        # rather than at compile time.
        discriminator.trainable = True

        # Train discriminator on real data
        real_data = data[np.random.randint(0, data.shape[0], batch_size)]
        d_loss_real = discriminator.train_on_batch(real_data, real_labels)

        # Train discriminator on generated data
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        # verbose=0 avoids one progress bar per iteration in the notebook output
        generated_data = generator.predict(noise, verbose=0)
        d_loss_fake = discriminator.train_on_batch(generated_data, fake_labels)

        # Generator phase: the discriminator is frozen and the generator is
        # trained to have its samples classified as real (label 1).
        discriminator.trainable = False
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        g_loss = gan.train_on_batch(noise, real_labels)

        history['d_loss_real'].append(d_loss_real)
        history['d_loss_fake'].append(d_loss_fake)
        history['g_loss'].append(g_loss)

    return history
def generate_synthetic_data(generator, latent_dim, n_samples):
    """Sample ``n_samples`` standard-normal noise vectors and run them through the generator.

    Returns whatever ``generator.predict`` produces for the
    ``(n_samples, latent_dim)`` noise matrix.
    """
    latent_points = np.random.normal(0, 1, (n_samples, latent_dim))
    return generator.predict(latent_points)

In [14]:
# Real samples the GAN learns to imitate.
# NOTE(review): this cell used to read a variable `X` that no visible cell
# defines, so it failed on a fresh kernel. X_risk is assumed here - confirm
# which dataset was intended.
gan_source = X_risk

# Scale features to [0, 1] so they match the generator's sigmoid output range
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(gan_source)

latent_dim = 10  # Size of the random noise vector
output_dim = scaled_data.shape[1]  # Number of features, taken from the data

generator = build_generator(latent_dim, output_dim)
discriminator = build_discriminator(output_dim)
gan = build_gan(generator, discriminator)

epochs = 5000
batch_size = 32

train_gan(generator, discriminator, gan, scaled_data, latent_dim, epochs, batch_size)

n_samples = 100
generated_data = generate_synthetic_data(generator, latent_dim, n_samples)

# Map the generated rows back from [0, 1] to the scale of gan_source
generated_data = scaler.inverse_transform(generated_data)

print("Generated Data:")
print(generated_data[:5])