In [41]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# Read the two source tables into memory.
original = pd.read_csv(filepath_or_buffer="data_FraudDetection_JAR2020.csv")  # original records
gan = pd.read_csv(filepath_or_buffer="data_FraudDetection_balanced.csv")  # CTGAN-generated records


In [56]:
import pandas as pd
import numpy as np

def create_hybrid_dataset(original_path, gan_path, smote_path, label_col="misstate"):
    """
    Build a hybrid dataset from three CSV files.

    The returned frame contains, in this order:
    - every row of the original file whose label equals 0 (non-fraud),
    - every row of the GAN file whose label equals 1 (fraud),
    - every row of the oversampled file (``smote_path``) whose label equals 1.

    Parameters
    ----------
    original_path, gan_path, smote_path : str or path-like
        Locations of the three CSV files; each is passed to ``pd.read_csv``.
    label_col : str, default "misstate"
        Name of the binary label column (0 = non-fraud, 1 = fraud) that must
        be present in all three files.

    Returns
    -------
    pandas.DataFrame
        Concatenated rows with a fresh 0..n-1 index. Only the columns of the
        original file are kept, in the original file's order: extra columns
        in the synthetic files are dropped, and columns they lack are filled
        with NaN by the concatenation.

    Raises
    ------
    KeyError
        If ``label_col`` is missing from any of the three files.
    """
    frames = {
        "original": pd.read_csv(original_path),
        "gan": pd.read_csv(gan_path),
        "smote": pd.read_csv(smote_path),
    }

    # Fail early with a message that names the offending dataset instead of
    # surfacing a bare pandas KeyError from the boolean filters below.
    for source, frame in frames.items():
        if label_col not in frame.columns:
            raise KeyError(
                f"label column {label_col!r} not found in the {source} dataset"
            )

    original = frames["original"]
    gan = frames["gan"]
    smote = frames["smote"]

    # Real data supplies the negatives; the synthetic files supply positives only.
    non_fraud = original[original[label_col] == 0]
    gan_fraud = gan[gan[label_col] == 1]
    smote_fraud = smote[smote[label_col] == 1]

    # Boolean-mask selections are already new objects and concat allocates the
    # result, so no explicit copies are needed.
    hybrid = pd.concat([non_fraud, gan_fraud, smote_fraud], ignore_index=True)

    # Restrict to, and order by, the original file's columns.
    return hybrid[original.columns]

# Input CSVs for the hybrid build.
original_file = "data_FraudDetection_JAR2020.csv"
gan_file = "data_FraudDetection_balanced.csv"
smote_file = "data_FraudDetection_adasyn.csv"  # file name suggests ADASYN output; passed as the SMOTE input

# Build the hybrid frame, then write it to disk without the index column.
hybrid_data = create_hybrid_dataset(
    original_path=original_file,
    gan_path=gan_file,
    smote_path=smote_file,
)
hybrid_data.to_csv(path_or_buf="hybrid_dataset_adasyn.csv", index=False)

In [57]:
# Report how many rows carry each label value, fraud (1) first.
for label in (1, 0):
    print(f"Count of misstate = {label}:", hybrid_data['misstate'].eq(label).sum())

Count of misstate = 1: 179637
Count of misstate = 0: 145081


In [58]:
# Print a structural summary of the hybrid frame: row count, columns,
# per-column non-null counts and dtypes.
hybrid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324718 entries, 0 to 324717
Data columns (total 46 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   fyear        324718 non-null  int64  
 1   gvkey        324718 non-null  int64  
 2   p_aaer       2202 non-null    float64
 3   misstate     324718 non-null  int64  
 4   act          324718 non-null  float64
 5   ap           324718 non-null  float64
 6   at           324718 non-null  float64
 7   ceq          324718 non-null  float64
 8   che          324718 non-null  float64
 9   cogs         324718 non-null  float64
 10  csho         324718 non-null  float64
 11  dlc          324718 non-null  float64
 12  dltis        324718 non-null  float64
 13  dltt         324718 non-null  float64
 14  dp           324718 non-null  float64
 15  ib           324718 non-null  float64
 16  invt         324718 non-null  float64
 17  ivao         324718 non-null  float64
 18  ivst         324718 non-