In [2]:
!pip install tensorflow



In [96]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

# Load Dataset
df = pd.read_csv('credit_card_data.csv')

# Sample 1000 rows
df_random = df.sample(n=1000, random_state=42).dropna()

# Identify numerical and categorical columns
numeric_columns = df_random.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df_random.select_dtypes(include=['object']).columns.tolist()

# Standardize numerical columns
scaler = StandardScaler()
df_random[numeric_columns] = scaler.fit_transform(df_random[numeric_columns])

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df_random, columns=categorical_columns)

# Prepare data for GAN
data = df_encoded.values.astype(np.float32)  # Final GAN input

In [97]:
# Generator
def build_generator(latent_dim, output_dim):
    model = Sequential([
        Dense(128, input_dim=latent_dim),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(512),
        LeakyReLU(alpha=0.2),
        Dense(output_dim, activation='tanh')  # Output shape matches encoded data
    ])
    return model

# Discriminator
def build_discriminator(input_dim):
    model = Sequential([
        Dense(512, input_dim=input_dim),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(128),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid')  # Output real/fake
    ])
    return model

### Training GAN model

In [98]:
# Build Models
latent_dim = 100
generator = build_generator(latent_dim, data.shape[1])
discriminator = build_discriminator(data.shape[1])
discriminator.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5), metrics=['accuracy'])

# Freeze discriminator for GAN
discriminator.trainable = False

# GAN Model
gan_input = tf.keras.Input(shape=(latent_dim,))
x = generator(gan_input)
gan_output = discriminator(x)
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5))

# Training Function
def train_gan(epochs, batch_size):
    half_batch = batch_size // 2

    for epoch in range(epochs):
        # --- Train Discriminator ---
        idx = np.random.randint(0, data.shape[0], half_batch)
        real_data = data[idx]
        noise = np.random.randn(half_batch, latent_dim).astype(np.float32)
        fake_data = generator.predict(noise, verbose=0)

        real_labels = np.ones((half_batch, 1), dtype=np.float32)
        fake_labels = np.zeros((half_batch, 1), dtype=np.float32)

        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # --- Train Generator ---
        noise = np.random.randn(batch_size, latent_dim).astype(np.float32)
        valid_labels = np.ones((batch_size, 1), dtype=np.float32)
        g_loss = gan.train_on_batch(noise, valid_labels)

        # Print Progress
        if epoch % 100 == 0:
            print(f"Epoch {epoch}: [D loss: {d_loss[0]:.4f}, acc: {d_loss[1]*100:.2f}%] [G loss: {g_loss:.4f}]")

# Start Training
train_gan(epochs=1500, batch_size=64)

Epoch 0: [D loss: 0.6990, acc: 37.50%] [G loss: 0.7140]
Epoch 100: [D loss: 3.7823, acc: 15.71%] [G loss: 0.0254]
Epoch 200: [D loss: 4.4842, acc: 15.47%] [G loss: 0.0128]
Epoch 300: [D loss: 4.8955, acc: 15.40%] [G loss: 0.0086]
Epoch 400: [D loss: 5.1926, acc: 15.32%] [G loss: 0.0065]
Epoch 500: [D loss: 5.4292, acc: 15.48%] [G loss: 0.0052]
Epoch 600: [D loss: 5.6166, acc: 15.59%] [G loss: 0.0043]
Epoch 700: [D loss: 5.7772, acc: 15.58%] [G loss: 0.0037]
Epoch 800: [D loss: 5.9168, acc: 15.55%] [G loss: 0.0032]
Epoch 900: [D loss: 6.0399, acc: 15.53%] [G loss: 0.0029]
Epoch 1000: [D loss: 6.1492, acc: 15.58%] [G loss: 0.0026]
Epoch 1100: [D loss: 6.2490, acc: 15.66%] [G loss: 0.0024]
Epoch 1200: [D loss: 6.3424, acc: 15.62%] [G loss: 0.0022]
Epoch 1300: [D loss: 6.4273, acc: 15.67%] [G loss: 0.0020]
Epoch 1400: [D loss: 6.5075, acc: 15.62%] [G loss: 0.0019]


In [6]:
# High Discriminator Loss: A very high discriminator loss (~6.59) indicates that the
# discriminator is struggling to classify real vs. fake data.

# Very Low Accuracy (15.62%): This shows that the discriminator is wrong most of the time,
# meaning it can't distinguish real from fake data.

In [7]:
# Very Low Generator Loss: The generator loss being close to zero could indicate that
# it is successfully fooling the discriminator.

### Generate New Synthetic Data

In [99]:
# Generate New Synthetic Data
num_samples = 100
noise = np.random.randn(num_samples, latent_dim).astype(np.float32)
generated_data = generator.predict(noise)

# Create DataFrame
generated_df = pd.DataFrame(generated_data, columns=df_encoded.columns)

# --- Important: Reverse Preprocessing ---

# Step 1: Reverse Standardization for numeric columns
generated_df[numeric_columns] = scaler.inverse_transform(generated_df[numeric_columns])

# Step 2: For one-hot encoded categorical columns, take argmax (most probable category)
for original_col in categorical_columns:
    ohe_cols = [col for col in df_encoded.columns if col.startswith(original_col + '_')]
    generated_df[original_col] = generated_df[ohe_cols].idxmax(axis=1).str.replace(f"{original_col}_", "")

    # Drop one-hot columns
    generated_df.drop(columns=ohe_cols, inplace=True)

# Final Synthetic Dataset
print(generated_df.head())

# Save to CSV
generated_df.to_csv('synthetic_credit_card_data.csv', index=False)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
         cc_num         amt           zip        lat       long     unix_time  \
0  4.218340e+17  183.487717  54904.105469  40.893261 -90.047729  1.347058e+09   
1  6.508854e+17  191.444672  54794.917969  40.806644 -90.041702  1.348344e+09   
2  6.821859e+17  139.809769  58521.035156  40.828789 -89.954643  1.347335e+09   
3  5.856104e+17  187.888596  56959.332031  40.879326 -92.137901  1.347582e+09   
4  3.100787e+17  159.408905  58801.785156  40.094479 -91.063728  1.348049e+09   

   merch_lat  merch_long  is_fraud  merch_zipcode  ...        category  \
0  39.607880  -84.955086 -0.014815   62068.437500  ...        misc_net   
1  39.442257  -83.135559 -0.074608   62444.894531  ...  health_fitness   
2  39.309429  -83.933144 -0.004470   57120.476562  ...  health_fitness   
3  39.884277  -87.494484 -0.090105   63843.625000  ...  health_fitness   
4  39.581501  -83.487480 -0.073647   62838.175781  ...  health_fitness 

In [100]:
data_set = pd.read_csv('synthetic_credit_card_data.csv')

In [101]:
# Original Dataset
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
0,31/03/2019 20:37,213107169859697,"fraud_Witting, Beer and Ernser",home,167.96,Megan,Fernandez,F,77127 Paul Station Apt. 495,Woods Cross,...,-111.9027,13983.0,Insurance broker,14/08/1979,49bc93df3f485e1ef5921beb06217514,1333226000.0,41.69658,-112.170373,0.0,84309.0
1,25/11/2019 20:37,340103199302564,fraud_Botsford and Sons,home,181.49,Sophia,Dickson,F,9047 Rodriguez Turnpike,Gibsonville,...,-79.5685,12486.0,Arboriculturist,09/02/1934,f708aa922eb3a3d0a522aae95e96ecca,1353876000.0,36.57033,-79.708278,0.0,24069.0
2,03/06/2019 03:05,3553629419254918,fraud_Luettgen PLC,gas_transport,9.61,Sharon,Johnson,F,7202 Jeffrey Mills,Conway,...,-122.3456,85.0,"Research officer, political party",01/09/1984,f88d5496cd3758a2479e9b2eb8358c1f,1338693000.0,48.161534,-122.80632,1.0,98239.0
3,01/09/2019 20:11,4671727014157745,fraud_Harris Group,food_dining,10.73,Kenneth,Edwards,M,3653 Ryan Crossroad,Andrews,...,-85.6067,2304.0,Retail banker,25/07/1955,ec1db3cbea5b090f3b9ebbd23e18a40e,1346530000.0,41.178952,-86.123005,0.0,46570.0
4,09/12/2019 14:59,4005676619255478,fraud_Brown-Greenholt,entertainment,9.32,William,Perry,M,458 Phillips Island Apt. 768,Denham Springs,...,-90.9027,71335.0,Herbalist,31/05/1994,635abd8e0c00da7758b91e015959004c,1355065000.0,31.219294,-90.940509,0.0,39638.0


In [102]:
# Synthetic dataset
data_set.head()

Unnamed: 0,cc_num,amt,zip,lat,long,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode,...,category,first,last,gender,street,city,state,job,dob,trans_num
0,4.21834e+17,183.48772,54904.105,40.89326,-90.04773,1347058000.0,39.60788,-84.955086,-0.014815,62068.438,...,misc_net,Helen,Pugh,M,901 Ariel Points,pop,NY,"Therapist, occupational",10/04/1996,66bd6f3354d2ea9ad134c93b87cd5606
1,6.508854e+17,191.44467,54794.918,40.806644,-90.0417,1348344000.0,39.442257,-83.13556,-0.074608,62444.895,...,health_fitness,Connie,Butler,F,8925 Nicholas Points,Fulton,IA,"Therapist, occupational",10/04/1996,66bd6f3354d2ea9ad134c93b87cd5606
2,6.821859e+17,139.80977,58521.035,40.82879,-89.95464,1347335000.0,39.30943,-83.93314,-0.00447,57120.477,...,health_fitness,Ashley,Butler,M,265 Mullen Park Apt. 307,Nelson,NY,Lawyer,04/06/1962,84fe9a1cc36499849cc1eb905fcaa201
3,5.856104e+17,187.8886,56959.332,40.879326,-92.1379,1347582000.0,39.884277,-87.494484,-0.090105,63843.625,...,health_fitness,Candice,Jones,M,872 Justin Views Suite 746,pop,IA,"Engineer, biomedical",07/03/1941,66bd6f3354d2ea9ad134c93b87cd5606
4,3.100787e+17,159.4089,58801.785,40.09448,-91.06373,1348049000.0,39.5815,-83.48748,-0.073647,62838.176,...,health_fitness,Connie,Tucker,M,8925 Nicholas Points,pop,RI,Warden/ranger,25/02/1949,3c448d316c44bfd92bf1f3f6d4d31698


In [72]:
df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'merch_zipcode'],
      dtype='object')

In [73]:
data_set.columns

Index(['cc_num', 'amt', 'zip', 'lat', 'long', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'merch_zipcode', 'trans_date_trans_time',
       'merchant', 'category', 'first', 'last', 'gender', 'street', 'city',
       'state', 'job', 'dob', 'trans_num'],
      dtype='object')

### Comparing both the datasets

In [74]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [75]:
real = pd.read_csv("credit_card_data.csv")
fake = pd.read_csv("synthetic_credit_card_data.csv")

In [76]:
columns_to_use = [
    'amt', 'lat', 'long', 'unix_time', 'merch_lat', 'merch_long',
    'is_fraud', 'gender', 'category', 'job', 'state'
]
real = real[columns_to_use].copy()
fake = fake[columns_to_use].copy()

In [77]:
# Encode categoricals
categorical_cols = ['gender', 'category', 'job', 'state']
for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([real[col], fake[col]], axis=0)
    le.fit(combined.astype(str))
    real[col] = le.transform(real[col].astype(str))
    fake[col] = le.transform(fake[col].astype(str))

In [78]:
# Add labels
real['label'] = 1
fake['label'] = 0

In [79]:
# Combine datasets
data = pd.concat([real, fake], ignore_index=True)
X = data.drop(columns=['label'])
y = data['label']

In [80]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [81]:
# Train classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
acc = accuracy_score(y_test, clf.predict(X_test))

In [104]:
from scipy.stats import ks_2samp

def similarity_score_ks(real_df, fake_df, columns):
    similarities = []
    for col in columns:
        if real_df[col].dtype in ['float64', 'int64']:
            stat, _ = ks_2samp(real_df[col], fake_df[col])
            similarity = (1 - stat) * 100  # Convert distance to % similarity
            print(f"🔹 {col}: {similarity:.2f}% similar")
            similarities.append(similarity)
    return sum(similarities) / len(similarities)

columns_to_use = [
    'amt', 'lat', 'long', 'unix_time', 'merch_lat', 'merch_long',
    'is_fraud', 'gender', 'category', 'job', 'state'
]

overall_similarity = similarity_score_ks(real, fake, columns_to_use)
print(f"\n✅ Overall Dataset Similarity: {overall_similarity:.2f}%")

🔹 amt: 24.37% similar
🔹 lat: 40.29% similar
🔹 long: 57.07% similar
🔹 unix_time: 45.21% similar
🔹 merch_lat: 47.77% similar
🔹 merch_long: 44.07% similar
🔹 is_fraud: 20.00% similar
🔹 gender: 46.39% similar
🔹 category: 20.37% similar
🔹 job: 64.37% similar
🔹 state: 60.35% similar

✅ Overall Dataset Similarity: 42.75%


### Machine Learning Models

In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assuming your DataFrame is already loaded as df

# 1. Preprocessing
df = df.copy()

# Convert date to datetime object and extract useful features
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['hour'] = df['trans_date_trans_time'].dt.hour
df['day'] = df['trans_date_trans_time'].dt.day
df['month'] = df['trans_date_trans_time'].dt.month
df['weekday'] = df['trans_date_trans_time'].dt.weekday

# Convert date of birth to age
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days // 365

# Drop unnecessary or high-cardinality columns
df = df.drop(columns=[
    'trans_date_trans_time', 'dob', 'trans_num', 'cc_num', 'first', 'last',
    'street', 'unix_time', 'merchant', 'merch_zipcode'
])

# Label encode categorical variables
categorical_cols = ['gender', 'city', 'state', 'job', 'category']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# 2. Split features and target
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

# 3. Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 5. Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy on real data: {accuracy:.4f}")

Model Accuracy on real data: 0.9685


In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Step 1: Copy the dataset ---
fraud_data_v2 = data.copy()

# --- Step 2: Drop high-cardinality or irrelevant ID columns if present ---
drop_cols = ['cc_num', 'trans_num', 'first', 'last', 'street', 'dob', 'merchant', 'merch_zipcode']
fraud_data_v2.drop(columns=[col for col in drop_cols if col in fraud_data_v2.columns], inplace=True)

# --- Step 3: Parse datetime (day-first format) ---
if 'trans_date_trans_time' in fraud_data_v2.columns:
    fraud_data_v2['trans_date_trans_time'] = pd.to_datetime(
        fraud_data_v2['trans_date_trans_time'],
        format='%d/%m/%Y %H:%M',
        dayfirst=True,
        errors='coerce'
    )
    fraud_data_v2['hour'] = fraud_data_v2['trans_date_trans_time'].dt.hour
    fraud_data_v2['day'] = fraud_data_v2['trans_date_trans_time'].dt.day
    fraud_data_v2['month'] = fraud_data_v2['trans_date_trans_time'].dt.month
    fraud_data_v2['weekday'] = fraud_data_v2['trans_date_trans_time'].dt.weekday
    fraud_data_v2.drop(columns=['trans_date_trans_time'], inplace=True)

# --- Step 4: Encode categorical features ---
categorical_cols = fraud_data_v2.select_dtypes(include='object').columns.tolist()
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    fraud_data_v2[col] = le.fit_transform(fraud_data_v2[col].astype(str))
    label_encoders[col] = le

# --- Step 5: Fill missing values ---
fraud_data_v2.fillna(0, inplace=True)

# --- Step 6: Validate and clean 'is_fraud' column ---
if 'is_fraud' not in fraud_data_v2.columns:
    raise ValueError("❌ Target column 'is_fraud' not found in the dataset.")

# Convert to binary if float values detected
if fraud_data_v2['is_fraud'].dtype in ['float64', 'float32']:
    fraud_data_v2['is_fraud'] = (fraud_data_v2['is_fraud'] >= 0.5).astype(int)

# Final check
unique_targets = fraud_data_v2['is_fraud'].unique()
if len(unique_targets) != 2:
    raise ValueError(f"🚨 Target 'is_fraud' is not binary even after thresholding: {unique_targets}")

# --- Step 7: Prepare features and labels ---
X = fraud_data_v2.drop(columns=['is_fraud'])
y = fraud_data_v2['is_fraud']

# --- Step 8: Feature Scaling ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Step 9: Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# --- Step 10: Train Random Forest Classifier ---
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- Step 11: Evaluate Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ Random Forest Model Accuracy on fake data: {accuracy:.4f}")

✅ Random Forest Model Accuracy on fake data: 0.9713
