In [None]:
"""
Date: 12/30/2024
Author: Asserewou Etekpo
Topic: Downsample a large dataset (Credit Card Transactions Fraud Detection Dataset)
 from Kaggle to a manageable size for testing purposes (10,000 rows)

"""

In [2]:
import pandas as pd

# Load the raw transactions CSV into a pandas DataFrame.
# The path is relative: 'data' is expected to be a subdirectory of the project.
file_path = 'data/fraudTest1.csv'
df = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Profile the dataset: display a summary of the loaded DataFrame.

df.info() # Shows the number of rows and columns in the dataset

In [6]:

# Split the dataset by the is_fraud label:
#   df_fraud     -> rows where is_fraud equals 1
#   df_non_fraud -> rows where is_fraud equals 0
df_fraud = df.loc[df['is_fraud'].eq(1)]
df_non_fraud = df.loc[df['is_fraud'].eq(0)]


In [7]:
# Target composition of the downsampled dataset:
# 10,000 rows in total, 70% fraud transactions and 30% non-fraud transactions.
total_rows = 10000
fraud_fraction = 0.7  # share of the rows drawn from the fraud subset

# round() rather than int(): int() truncates, so a float product that lands
# just below a whole number (e.g. 90 * 0.7 == 62.99999999999999) would
# silently lose a row. For total_rows = 10000 the result is 7000 either way.
fraud_rows = round(total_rows * fraud_fraction)
non_fraud_rows = total_rows - fraud_rows  # the remainder goes to non-fraud

In [8]:
# Split the dataset by the is_fraud label (1 = fraud, 0 = non-fraud).
# NOTE: this repeats the split already performed in an earlier cell; it
# rebuilds the same two subsets from df.
df_fraud = df.loc[df['is_fraud'].eq(1)]
df_non_fraud = df.loc[df['is_fraud'].eq(0)]


In [9]:
# Draw the requested number of rows from each subset.
# Sample WITHOUT replacement whenever a subset holds enough rows, so the
# result contains no duplicated transactions. Fall back to sampling with
# replacement (i.e. oversampling) only when the subset is smaller than the
# requested size, where sampling without replacement would raise an error.
# random_state=42 keeps the selection reproducible between runs.
df_fraud_sampled = df_fraud.sample(
    n=fraud_rows,
    random_state=42,
    replace=len(df_fraud) < fraud_rows,
)

df_non_fraud_sampled = df_non_fraud.sample(
    n=non_fraud_rows,
    random_state=42,
    replace=len(df_non_fraud) < non_fraud_rows,
)


In [10]:
# Stack the two sampled subsets (fraud rows first, then non-fraud rows)
# into one DataFrame with a fresh 0..n-1 index.
df_downsampled = pd.concat(
    objs=[df_fraud_sampled, df_non_fraud_sampled],
    ignore_index=True,
)


In [None]:
# Check the dimensions (rows, columns) of the combined dataset;
# the row count should equal total_rows (fraud_rows + non_fraud_rows)

df_downsampled.shape

In [None]:
# Shuffle the combined DataFrame so fraud and non-fraud rows are interleaved:
# sampling 100% of the rows returns them in random order (seeded with 42),
# and the index is then rebuilt so it runs 0..n-1 again.
df_downsampled = (
    df_downsampled
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)



In [None]:
# Preview 5 randomly selected rows of the downsampled dataset
# (sample() picks rows at random rather than the first 5; random_state=42 fixes the selection)

df_downsampled.sample(5, random_state=42)

In [8]:
# Save the downsampled dataset to a new CSV file (index column omitted).
# The path is relative to the project directory, like the input path used
# to load the data, so the notebook does not depend on one user's absolute
# Windows path and can be run on another machine.
output_file_path = 'data/subfraudtest.csv'

df_downsampled.to_csv(output_file_path, index=False)
