In [1]:
import pandas as pd
import numpy as np

# Define the special Hugging Face URL for the CSV file
# This tells pandas to use fsspec and huggingface_hub to find and read the file
hf_csv_url = "hf://datasets/MatrixIA/FraudData/FraudData.csv"

print(f"Attempting to load full dataset from: {hf_csv_url}")
try:
    # Only the download/parse step is guarded here. The error text below talks
    # about the connection and the URL, so it must not fire for unrelated
    # failures (e.g. a problem while building the sample further down).
    # This loads the entire dataset into memory. May take a minute or two.
    df_full = pd.read_csv(hf_csv_url)
except Exception as e:
    # Report the load failure with a hint, then re-raise so the notebook stops here
    print(f"ERROR: Failed to load dataset using pd.read_csv('hf://...'). Error: {e}")
    print("Check your internet connection, proxy settings (if any), and the URL.")
    raise # Stop execution

print("Successfully loaded full dataset into Pandas DataFrame.")
print(f"Full DataFrame shape: {df_full.shape}")

# --- IMPORTANT: Create a Sample for Development ---
# Upper bound on the number of rows kept for development (1 million)
sample_size = 1000000
print(f"Creating a sample of {sample_size} rows for initial development...")

# Take the first N rows (simplest). .copy() makes 'df' independent of 'df_full'.
# NOTE(review): the first rows all show step == 1, so the file looks ordered by
# 'step' -- confirm that a head() sample is representative enough. A random
# alternative is: df_full.sample(n=sample_size, random_state=42).copy()
df = df_full.head(sample_size).copy()

print(f"Sample DataFrame shape: {df.shape}")

# 'df_full' is intentionally left in memory. If memory becomes a problem it can
# be released with `del df_full` followed by `gc.collect()`.

# --- Basic Inspection (Run on the SAMPLE 'df') ---
# Every section emits a blank line, a heading, and then one view of the sample 'df'.
print("\nFirst 5 rows (from sample):", df.head(), sep="\n")

# DataFrame.info() writes its report directly to stdout, so it is called bare
# instead of being passed to print().
print("\nDataFrame Info (from sample):")
df.info()

print("\nSummary Statistics (from sample):", df.describe(), sep="\n")

# Per-column count of missing entries
print("\nMissing Values Count per Column (from sample):", df.isna().sum(), sep="\n")

# --- Target Variable Check (Run on the SAMPLE 'df') ---
# Label column for this dataset (PaySim)
target_column = 'isFraud'

if target_column not in df.columns:
    # Report the problem and list what is available; execution continues
    print(f"\nERROR: Target column '{target_column}' not found in the DataFrame!")
    print(f"Available columns are: {list(df.columns)}")
else:
    labels = df[target_column]

    # Relative class frequencies first, then the absolute counts
    print(f"\nClass Distribution for Target Column ('{target_column}') in Sample:")
    print(labels.value_counts(normalize=True))
    print("\nRaw Counts in Sample:")
    print(labels.value_counts())

Attempting to load full dataset from: hf://datasets/MatrixIA/FraudData/FraudData.csv
Successfully loaded full dataset into Pandas DataFrame.
Full DataFrame shape: (6362620, 11)
Creating a sample of 1000000 rows for initial development...
Sample DataFrame shape: (1000000, 11)

First 5 rows (from sample):
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065        

In [2]:
# Resolve ProfileReport: try ydata_profiling first, then fall back to
# pandas_profiling if that import fails.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    try:
        from pandas_profiling import ProfileReport
    except ImportError:
        print(
            "ERROR: Neither ydata-profiling nor pandas-profiling seem to be installed.",
            "Please run: pip install ydata-profiling",
            sep="\n",
        )
        raise # Nothing below can run without the library

print("\nGenerating data profiling report on the SAMPLE... (This might take a minute or two)")

# Destination file for the rendered HTML report
report_filename = "fraud_data_profiling_report_sample.html"

# Build the report from the SAMPLE DataFrame 'df', then write it to disk
profile = ProfileReport(
    df,
    title="Fraud Data Profiling Report (Sample)",
    explorative=True,
)
profile.to_file(report_filename)

print(f"Profiling report saved to {report_filename}")


Generating data profiling report on the SAMPLE... (This might take a minute or two)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                         | 0/11 [00:00<?, ?it/s][A
  9%|█████▉                                                           | 1/11 [00:03<00:29,  2.98s/it][A
 18%|███████████▊                                                     | 2/11 [00:03<00:15,  1.70s/it][A
 27%|█████████████████▋                                               | 3/11 [00:10<00:31,  3.97s/it][A
100%|████████████████████████████████████████████████████████████████| 11/11 [00:10<00:00,  1.03it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling report saved to fraud_data_profiling_report_sample.html


In [3]:
#Step 2 Level 1: Preprocessing-and-Baseline
# --- Step 2: Basic Preprocessing ---
print("--- Starting Step 2: Basic Preprocessing ---")

# --- Step 2.1: Feature Selection ---
# Requires 'df' (the sample DataFrame built in Step 1) to exist in the kernel.
print(f"Original sample shape: {df.shape}")

# Input columns to retain, followed by the name of the label column
features_to_keep = [
    'step',
    'type',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
target = 'isFraud'

print(f"Keeping features: {features_to_keep}", f"Target variable: {target}", sep="\n")

try:
    # Select the chosen inputs plus the label. The .copy() detaches the result
    # so later edits to df_processed do not touch 'df'.
    df_processed = df[[*features_to_keep, target]].copy()

    print("\nCreated df_processed with selected columns.")
    print(f"Shape after selecting features: {df_processed.shape}")
    print("First 5 rows of df_processed:", df_processed.head(), sep="\n")
except KeyError as e:
    # A requested column name is absent from 'df'
    print(
        f"ERROR: A specified column was not found in the DataFrame: {e}",
        "Please check the 'features_to_keep' list and the 'target' variable against the columns from df.info().",
        sep="\n",
    )
    raise # Stop execution if columns are incorrect
except Exception as e:
    print(f"An unexpected error occurred during feature selection: {e}")
    raise

--- Starting Step 2: Basic Preprocessing ---
Original sample shape: (1000000, 11)
Keeping features: ['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
Target variable: isFraud

Created df_processed with selected columns.
Shape after selecting features: (1000000, 8)
First 5 rows of df_processed:
   step      type    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0     1   PAYMENT   9839.64       170136.0       160296.36             0.0   
1     1   PAYMENT   1864.28        21249.0        19384.72             0.0   
2     1  TRANSFER    181.00          181.0            0.00             0.0   
3     1  CASH_OUT    181.00          181.0            0.00         21182.0   
4     1   PAYMENT  11668.14        41554.0        29885.86             0.0   

   newbalanceDest  isFraud  
0             0.0        0  
1             0.0        0  
2             0.0        1  
3             0.0        1  
4             0.0        0  


In [4]:
# --- Step 2.2: Encode Categorical Features ('type') ---

# Identify the categorical column(s) remaining in df_processed
categorical_cols = ['type']

# Re-run safety: this cell replaces df_processed with its encoded version, so on
# a second run the source column is gone and pd.get_dummies would raise KeyError.
# A column counts as "already encoded" when it is absent but columns carrying its
# dummy prefix (e.g. 'type_TRANSFER') exist. A column that is absent WITHOUT such
# dummies is still passed to get_dummies so the KeyError surfaces as before.
existing_cols = [str(c) for c in df_processed.columns]
already_encoded = [
    col for col in categorical_cols
    if col not in existing_cols
    and any(name.startswith(f"{col}_") for name in existing_cols)
]
cols_to_encode = [col for col in categorical_cols if col not in already_encoded]

# Check current data types and unique values in 'type' before encoding
print("\nData types before encoding:")
# info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df_processed.info()
if 'type' in df_processed.columns:
    print(f"\nUnique values in 'type' column:\n{df_processed['type'].unique()}")
    print(f"\nValue counts for 'type':\n{df_processed['type'].value_counts()}")
else:
    print("\n'type' column not found in df_processed (already dropped or renamed?).")


try:
    if already_encoded:
        print(f"\nSkipping {already_encoded}: dummy columns already present (cell re-run?).")

    if cols_to_encode:
        print(f"\nApplying one-hot encoding to: {cols_to_encode} using pd.get_dummies...")
        # Use pd.get_dummies to convert the categorical column(s)
        # drop_first=True removes redundancy (prevents multicollinearity)
        df_processed = pd.get_dummies(df_processed, columns=cols_to_encode, drop_first=True) # This line reassigns df_processed

    print("\nDataFrame after one-hot encoding:")
    print(df_processed.head()) # The original 'type' column is gone and new
                               # 'type_TRANSFER', 'type_PAYMENT', etc. columns appear
    print(f"\nShape after encoding: {df_processed.shape}")

    # Inspect the resulting dtypes (the dummy columns are shown as True/False
    # in this notebook's output)
    print("\nData types after encoding:")
    df_processed.info()

except KeyError as e:
    # Raised by get_dummies when a listed column does not exist
    print(f"ERROR: Column specified for encoding not found: {e}")
    raise
except Exception as e:
    print(f"An unexpected error occurred during one-hot encoding: {e}")
    raise


Data types before encoding:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   step            1000000 non-null  int64  
 1   type            1000000 non-null  object 
 2   amount          1000000 non-null  float64
 3   oldbalanceOrg   1000000 non-null  float64
 4   newbalanceOrig  1000000 non-null  float64
 5   oldbalanceDest  1000000 non-null  float64
 6   newbalanceDest  1000000 non-null  float64
 7   isFraud         1000000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 61.0+ MB
None

Unique values in 'type' column:
['PAYMENT' 'TRANSFER' 'CASH_OUT' 'DEBIT' 'CASH_IN']

Value counts for 'type':
type
CASH_OUT    362676
PAYMENT     329753
CASH_IN     218673
TRANSFER     82424
DEBIT         6474
Name: count, dtype: int64

Applying one-hot encoding to: ['type'] using pd.get_dummies...

DataFrame after one-

In [6]:
# --- Step 2.3: Separate Features (X) and Target (y) ---

# Name of the label column to split off
target = 'isFraud'

print(f"\nSeparating features (X) and target ('{target}')...")

try:
    # Expects 'df_processed' to be the frame produced by the encoding step,
    # i.e. with the one-hot encoded 'type' columns.

    # Features: every column except the label
    X = df_processed.drop(columns=target)

    # Label: the single target column, selected as a Series
    y = df_processed[target]

    # --- Verification ---
    print("Separation complete.")
    print("\nFeatures (X) verification:")
    print(f"  Shape: {X.shape}") # Expect (1000000, 10)
    removal_status = (
        f"  Target column '{target}' successfully removed from X."
        if target not in X.columns
        else f"  ERROR: Target column '{target}' still present in X!"
    )
    print(removal_status)
    print("  First 5 rows of X:", X.head(), sep="\n")
    print("\nTarget (y) verification:")
    print(f"  Shape: {y.shape}") # Expect (1000000,)
    print(f"  Data type: {y.dtype}") # Expect int64
    print("  First 5 values of y:", y.head(), sep="\n")

except KeyError:
    # The label column is missing from df_processed
    print(f"ERROR: Could not find target column '{target}' in df_processed.")
    print(f"Available columns are: {list(df_processed.columns)}")
    raise
except Exception as e:
    print(f"An unexpected error occurred during X/y separation: {e}")
    raise


Separating features (X) and target ('isFraud')...
Separation complete.

Features (X) verification:
  Shape: (1000000, 10)
  Target column 'isFraud' successfully removed from X.
  First 5 rows of X:
   step    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0     1   9839.64       170136.0       160296.36             0.0   
1     1   1864.28        21249.0        19384.72             0.0   
2     1    181.00          181.0            0.00             0.0   
3     1    181.00          181.0            0.00         21182.0   
4     1  11668.14        41554.0        29885.86             0.0   

   newbalanceDest  type_CASH_OUT  type_DEBIT  type_PAYMENT  type_TRANSFER  
0             0.0          False       False          True          False  
1             0.0          False       False          True          False  
2             0.0          False       False         False           True  
3             0.0           True       False         False          False  
4           

Excellent! That output is perfect and confirms you have successfully completed Step 2.3: Separate Features (X) and Target (y).

X now has the correct shape (1M rows, 10 feature columns).

y now has the correct shape (a one-dimensional Series of 1M values, i.e. `(1000000,)`) and data type (int64).

The verification checks confirm the target was correctly removed from X.