In [3]:
import pandas as pd
import numpy as np

# Define the special Hugging Face URL for the CSV file
# This tells pandas to use fsspec and huggingface_hub to find and read the file
hf_csv_url = "hf://datasets/MatrixIA/FraudData/FraudData.csv"

print(f"Attempting to load full dataset from: {hf_csv_url}")
try:
    # Use pandas read_csv directly with the hf:// URL
    # This loads the entire dataset into memory. May take a minute or two.
    df_full = pd.read_csv(hf_csv_url)
    print("Successfully loaded full dataset into Pandas DataFrame.")
    print(f"Full DataFrame shape: {df_full.shape}")

    # --- IMPORTANT: Create a Sample for Development ---
    # Define sample size (e.g., 1 million rows)
    sample_size = 1000000
    print(f"Creating a sample of {sample_size} rows for initial development...")

    # Option 1: Take the first N rows (simplest)
    df = df_full.head(sample_size).copy()

    # Option 2: Take a random sample (better representation, might be slightly slower)
    # df = df_full.sample(n=sample_size, random_state=42).copy()

    print(f"Sample DataFrame shape: {df.shape}")

    # Optional: Delete the full dataframe to free memory if you notice slowdowns
    # Although with 24GB RAM, it might not be necessary yet.
    # del df_full
    # import gc # Garbage collector
    # gc.collect() # Force memory cleanup

except Exception as e:
    # Catch potential errors during download or reading
    print(f"ERROR: Failed to load dataset using pd.read_csv('hf://...'). Error: {e}")
    print("Check your internet connection, proxy settings (if any), and the URL.")
    raise # Stop execution

# --- Basic Inspection (Run on the SAMPLE 'df') ---
# This part remains the same as before, using the 'df' variable which now holds the sample
print("\nFirst 5 rows (from sample):")
print(df.head())

print("\nDataFrame Info (from sample):")
df.info()

print("\nSummary Statistics (from sample):")
print(df.describe())

print("\nMissing Values Count per Column (from sample):")
print(df.isnull().sum())

# --- Target Variable Check (Run on the SAMPLE 'df') ---
# Target column is 'isFraud' for this dataset (PaySim)
target_column = 'isFraud'

if target_column in df.columns:
    print(f"\nClass Distribution for Target Column ('{target_column}') in Sample:")
    print(df[target_column].value_counts(normalize=True))
    print("\nRaw Counts in Sample:")
    print(df[target_column].value_counts(normalize=False))
else:
    print(f"\nERROR: Target column '{target_column}' not found in the DataFrame!")
    print(f"Available columns are: {list(df.columns)}")

Attempting to load full dataset from: hf://datasets/MatrixIA/FraudData/FraudData.csv
Successfully loaded full dataset into Pandas DataFrame.
Full DataFrame shape: (6362620, 11)
Creating a sample of 1000000 rows for initial development...
Sample DataFrame shape: (1000000, 11)

First 5 rows (from sample):
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065        

In [4]:
# Import the profiling tool
try:
    from ydata_profiling import ProfileReport
except ImportError:
    try:
        from pandas_profiling import ProfileReport
    except ImportError:
        print("ERROR: Neither ydata-profiling nor pandas-profiling seem to be installed.")
        print("Please run: pip install ydata-profiling")
        raise # Stop if library isn't installed

print("\nGenerating data profiling report on the SAMPLE... (This might take a minute or two)")

# Create the report object using the SAMPLE DataFrame 'df'
profile = ProfileReport(df, title="Fraud Data Profiling Report (Sample)", explorative=True)

# Define the filename for the HTML report
report_filename = "fraud_data_profiling_report_sample.html"

# Save the report to an HTML file
profile.to_file(report_filename)

print(f"Profiling report saved to {report_filename}")


Generating data profiling report on the SAMPLE... (This might take a minute or two)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                           | 0/11 [00:00<?, ?it/s][A
  9%|███████▌                                                                           | 1/11 [00:03<00:30,  3.03s/it][A
 18%|███████████████                                                                    | 2/11 [00:04<00:17,  1.94s/it][A
 27%|██████████████████████▋                                                            | 3/11 [00:09<00:27,  3.42s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:11<00:00,  1.04s/it][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling report saved to fraud_data_profiling_report_sample.html


In [5]:
#Step 2 Level 1: Preprocessing-and-Baseline
# --- Step 2: Basic Preprocessing ---
print("--- Starting Step 2: Basic Preprocessing ---")

# --- Step 2.1: Feature Selection ---
# First, ensure 'df' holds the sample DataFrame from Step 1
# (If in a new notebook, you might need to re-load/re-sample or load from a saved file)
print(f"Original sample shape: {df.shape}")

# Define the features we decided to keep
features_to_keep = ['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
target = 'isFraud' # Define the target column name

print(f"Keeping features: {features_to_keep}")
print(f"Target variable: {target}")

try:
    # Create a new DataFrame containing only the selected features and the target
    # Using .copy() prevents accidental changes to the original 'df'
    df_processed = df[features_to_keep + [target]].copy()

    print("\nCreated df_processed with selected columns.")
    print(f"Shape after selecting features: {df_processed.shape}")
    print("First 5 rows of df_processed:")
    print(df_processed.head())
except KeyError as e:
    print(f"ERROR: A specified column was not found in the DataFrame: {e}")
    print("Please check the 'features_to_keep' list and the 'target' variable against the columns from df.info().")
    raise # Stop execution if columns are incorrect
except Exception as e:
    print(f"An unexpected error occurred during feature selection: {e}")
    raise

--- Starting Step 2: Basic Preprocessing ---
Original sample shape: (1000000, 11)
Keeping features: ['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
Target variable: isFraud

Created df_processed with selected columns.
Shape after selecting features: (1000000, 8)
First 5 rows of df_processed:
   step      type    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0     1   PAYMENT   9839.64       170136.0       160296.36             0.0   
1     1   PAYMENT   1864.28        21249.0        19384.72             0.0   
2     1  TRANSFER    181.00          181.0            0.00             0.0   
3     1  CASH_OUT    181.00          181.0            0.00         21182.0   
4     1   PAYMENT  11668.14        41554.0        29885.86             0.0   

   newbalanceDest  isFraud  
0             0.0        0  
1             0.0        0  
2             0.0        1  
3             0.0        1  
4             0.0        0  


In [6]:
# --- Step 2.2: Encode Categorical Features ('type') ---

# Identify the categorical column(s) remaining in df_processed
categorical_cols = ['type']

# Check current data types and unique values in 'type' before encoding
print("\nData types before encoding:")
print(df_processed.info()) # Check df_processed specifically
if 'type' in df_processed.columns:
    print(f"\nUnique values in 'type' column:\n{df_processed['type'].unique()}")
    print(f"\nValue counts for 'type':\n{df_processed['type'].value_counts()}")
else:
    print("\n'type' column not found in df_processed (already dropped or renamed?).")


print(f"\nApplying one-hot encoding to: {categorical_cols} using pd.get_dummies...")

try:
    # Use pd.get_dummies to convert the 'type' column
    # drop_first=True removes redundancy (prevents multicollinearity)
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True) # This line reassigns df_processed

    print("\nDataFrame after one-hot encoding:")
    print(df_processed.head()) # Notice the original 'type' column is gone
                               # and new 'type_TRANSFER', 'type_PAYMENT', etc. columns appear
    print(f"\nShape after encoding: {df_processed.shape}")

    # Verify that all columns (except potentially target) are now numerical
    print("\nData types after encoding:")
    df_processed.info()

except KeyError as e:
    print(f"ERROR: Column specified for encoding not found: {e}")
    raise
except Exception as e:
    print(f"An unexpected error occurred during one-hot encoding: {e}")
    raise


Data types before encoding:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   step            1000000 non-null  int64  
 1   type            1000000 non-null  object 
 2   amount          1000000 non-null  float64
 3   oldbalanceOrg   1000000 non-null  float64
 4   newbalanceOrig  1000000 non-null  float64
 5   oldbalanceDest  1000000 non-null  float64
 6   newbalanceDest  1000000 non-null  float64
 7   isFraud         1000000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 61.0+ MB
None

Unique values in 'type' column:
['PAYMENT' 'TRANSFER' 'CASH_OUT' 'DEBIT' 'CASH_IN']

Value counts for 'type':
type
CASH_OUT    362676
PAYMENT     329753
CASH_IN     218673
TRANSFER     82424
DEBIT         6474
Name: count, dtype: int64

Applying one-hot encoding to: ['type'] using pd.get_dummies...

DataFrame after one-

In [10]:
# --- Step 1.2.3: Separate Features (X) and Target (y) ---

# Ensure the target variable name is correctly defined
target = 'isFraud'

print("\n--- Step 1.2.3: Separate Features (X) and Target (y) ---")
print(f"\nSeparating features (X) and target ('{target}')...")

try:
    # IMPORTANT: Make sure 'df_processed' is the DataFrame from the previous step
    # containing the one-hot encoded 'type' columns.

    # Create the features DataFrame 'X' by dropping the target column
    # axis=1 specifies we are dropping a column
    X = df_processed.drop(target, axis=1) # X should have 10 columns

    # Create the target Series 'y' by selecting only the target column
    y = df_processed[target] # y should be a Series

    # --- Verification ---
    print("Separation complete.")
    print("\nFeatures (X) verification:")
    print(f"  Shape: {X.shape}") # Expect (1000000, 10)
    if target in X.columns:
         print(f"  ERROR: Target column '{target}' still present in X!")
    else:
         print(f"  Target column '{target}' successfully removed from X.")
    print("  First 5 rows of X:")
    print(X.head())
    print("\nTarget (y) verification:")
    print(f"  Shape: {y.shape}") # Expect (1000000,)
    print(f"  Data type: {y.dtype}") # Expect int64
    print("  First 5 values of y:")
    print(y.head())

except KeyError:
    print(f"ERROR: Could not find target column '{target}' in df_processed.")
    print(f"Available columns are: {list(df_processed.columns)}")
    raise
except Exception as e:
    print(f"An unexpected error occurred during X/y separation: {e}")
    raise


--- Step 1.2.3: Separate Features (X) and Target (y) ---

Separating features (X) and target ('isFraud')...
Separation complete.

Features (X) verification:
  Shape: (1000000, 10)
  Target column 'isFraud' successfully removed from X.
  First 5 rows of X:
   step    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0     1   9839.64       170136.0       160296.36             0.0   
1     1   1864.28        21249.0        19384.72             0.0   
2     1    181.00          181.0            0.00             0.0   
3     1    181.00          181.0            0.00         21182.0   
4     1  11668.14        41554.0        29885.86             0.0   

   newbalanceDest  type_CASH_OUT  type_DEBIT  type_PAYMENT  type_TRANSFER  
0             0.0          False       False          True          False  
1             0.0          False       False          True          False  
2             0.0          False       False         False           True  
3             0.0           Tr

Excellent! That output is perfect and confirms you have successfully completed Step 2.3: Separate Features (X) and Target (y).

X now has the correct shape (1M rows, 10 feature columns).

y now has the correct shape (1M rows, 1 target column as a Series) and data type (int64).

The verification checks confirm the target was correctly removed from X.

In [11]:
#Sub-step 1.3 - Train/Test Split
from sklearn.model_selection import train_test_split
print("\n--- Step 1.3: Train/Test Split ---")


# Ensure X and y exist and are not empty
if not 'X' in locals() or X.empty or not 'y' in locals() or y.empty:
     print("ERROR: Features (X) or target (y) not defined or empty.")
     raise NameError("X or y not defined/empty.")


try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=50,
        stratify=y # Preserve class distribution
    )
    print("Data split into training and testing sets.")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    print("\nTraining set 'isFraud' distribution:")
    print(y_train.value_counts(normalize=True))
    print("\nTest set 'isFraud' distribution:")
    print(y_test.value_counts(normalize=True))
except Exception as e:
    print(f"ERROR during train/test split: {e}")
    raise


--- Step 1.3: Train/Test Split ---
Data split into training and testing sets.
X_train shape: (800000, 10), y_train shape: (800000,)
X_test shape: (200000, 10), y_test shape: (200000,)

Training set 'isFraud' distribution:
isFraud
0    0.999465
1    0.000535
Name: proportion, dtype: float64

Test set 'isFraud' distribution:
isFraud
0    0.999465
1    0.000535
Name: proportion, dtype: float64


In [13]:
!pip install imblearn




[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
#Sub-Step 1.4: Handle Class Imbalance (SMOTE)
from imblearn.over_sampling import SMOTE # Make sure imblearn is installed
print("\n--- Step 1.4: Apply SMOTE to Training Data ---")


# Ensure X_train and y_train exist
if not 'X_train' in locals() or not 'y_train' in locals():
     print("ERROR: X_train or y_train not defined.")
     raise NameError("X_train/y_train missing.")


try:
    # Instantiate SMOTE - random_state for reproducibility, n_jobs to speed up if possible
    smote = SMOTE(random_state=50)
    print(f"Original training shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Original training class distribution:\n{y_train.value_counts(normalize=True)}")


    print("\nApplying SMOTE (this might take a moment)...")
    # Fit SMOTE and resample ONLY the training data
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


    print("SMOTE application complete.")
    print(f"Resampled training shape: X={X_train_resampled.shape}, y={y_train_resampled.shape}")
    print(f"\nResampled training class distribution:\n{y_train_resampled.value_counts(normalize=True)}") # Should be balanced
except Exception as e:
    print(f"ERROR during SMOTE: {e}")
    raise


--- Step 1.4: Apply SMOTE to Training Data ---
Original training shape: X=(800000, 10), y=(800000,)
Original training class distribution:
isFraud
0    0.999465
1    0.000535
Name: proportion, dtype: float64

Applying SMOTE (this might take a moment)...
SMOTE application complete.
Resampled training shape: X=(1599144, 10), y=(1599144,)

Resampled training class distribution:
isFraud
0    0.5
1    0.5
Name: proportion, dtype: float64


EXPLANATION ON 1.4
Original training shape: X=(800000, 10), y=(800000,)
This shows the size of your training dataset before applying SMOTE.
You had 800,000 samples (rows).
X_train had 10 feature columns.
y_train had 800,000 corresponding labels.
Original training class distribution:
isFraud
0 0.999465: Roughly 99.95% of your original training data was labelled 'Not Fraud' (class 0).
1 0.000535: Only about 0.05% of your original training data was labelled 'Fraud' (class 1).
This confirms the severe class imbalance that SMOTE is designed to address. The model would likely ignore the tiny 'Fraud' class if trained on this original data.
Applying SMOTE (this might take a moment)...
This indicates that the smote.fit_resample(X_train, y_train) command started executing. Since n_jobs was removed, it likely ran on a single CPU core.
SMOTE application complete.
Confirmation that the process finished without errors.
Resampled training shape: X=(1599144, 10), y=(1599144,)
This shows the size of your training dataset after applying SMOTE (X_train_resampled, y_train_resampled).
The number of samples has significantly increased to 1,599,144.
Why the increase? SMOTE works by oversampling the minority class. It doesn't remove majority samples. It creates new, synthetic samples for the minority ('Fraud') class until the number of minority samples equals the number of majority samples.
The number of features (10) remains the same, as SMOTE only creates new samples (rows), not new features.
Resampled training class distribution:
isFraud
0 0.5: Exactly 50% of the resampled training data is now 'Not Fraud'.
1 0.5: Exactly 50% of the resampled training data is now 'Fraud'.
This confirms the successful outcome of SMOTE: the training dataset is now perfectly balanced.
In essence: SMOTE successfully addressed the class imbalance by generating synthetic 'Fraud' examples, resulting in a larger, balanced training dataset (X_train_resampled, y_train_resampled). This balanced dataset will now be used to train the XGBoost model in the next step, forcing the model to pay equal attention to both 'Fraud' and 'Not Fraud' patterns, which is crucial for achieving good Recall on the minority class.

In [18]:
#Sub-Step 1.5: Train Baseline Model (XGBoost)
import xgboost as xgb
print("\n--- Step 1.5: Train Baseline XGBoost Model ---")


# Ensure resampled training data exists
if not 'X_train_resampled' in locals() or not 'y_train_resampled' in locals():
     print("ERROR: Resampled training data not found.")
     raise NameError("Resampled data missing.")


try:
    # Instantiate the classifier
    model_xgb_baseline = xgb.XGBClassifier(
        random_state=42,          # For reproducibility of internal randomness
        use_label_encoder=False,  # Avoids potential deprecation warnings
        eval_metric='logloss'     # Common metric, avoids potential warnings if not set
    )
    print("XGBoost Classifier instantiated.")
    print("Training XGBoost model on resampled data (takes time)...")


    # Train the model using the balanced (SMOTE'd) training data
    model_xgb_baseline.fit(X_train_resampled, y_train_resampled)


    print("Baseline model training complete.")
except Exception as e:
    print(f"ERROR during XGBoost training: {e}")
    raise


--- Step 1.5: Train Baseline XGBoost Model ---
XGBoost Classifier instantiated.
Training XGBoost model on resampled data (takes time)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Baseline model training complete.


--- Step 1.5: Train Baseline XGBoost Model ---: Indicates the start of this step.
XGBoost Classifier instantiated.: Confirms that the xgb.XGBClassifier(...) line executed correctly, creating the model object before training.
Training XGBoost model on resampled data (takes time)...: Shows that the model_xgb_baseline.fit(X_train_resampled, y_train_resampled) command started. This is where the actual learning happens, using the SMOTE-balanced data.
UserWarning: [...] Parameters: { "use_label_encoder" } are not used.:
What it means: XGBoost is telling you that even though you provided the parameter use_label_encoder=False when creating the classifier, this parameter is essentially ignored or not needed by the underlying training mechanism in newer versions, especially when you also specify eval_metric.
Why it happens: use_label_encoder was primarily relevant for older versions or specific internal label handling that is now deprecated or automatically handled differently. Setting it to False is the recommended practice to avoid potential future issues, but the library is just letting you know it didn't actively use that specific setting during this particular training run.
Is it a problem? No. This is a common warning and can be safely ignored. It does not mean the training failed or is incorrect.
Baseline model training complete.: This is the most important message! It confirms that the .fit() process finished successfully without crashing.

In [19]:
#Sub-Step 1.6: Evaluate Baseline Model
from sklearn.metrics import classification_report, recall_score, confusion_matrix
print("\n--- Step 1.6: Evaluate Baseline Model on Test Set ---")


# Ensure model and test data exist
if not 'model_xgb_baseline' in locals(): raise NameError("Baseline model not trained.")
if not 'X_test' in locals() or not 'y_test' in locals(): raise NameError("Test data not available.")


try:
    print("Making predictions on the original (imbalanced) test set...")
    # Use the trained model to predict on the unseen X_test
    y_pred_baseline = model_xgb_baseline.predict(X_test)


    print("\nConfusion Matrix (Baseline):")
    # Rows = Actual (0: Not Fraud, 1: Fraud)
    # Cols = Predicted (0: Not Fraud, 1: Fraud)
    cm = confusion_matrix(y_test, y_pred_baseline)
    print(cm)
    # You can manually interpret: TN=cm[0,0], FP=cm[0,1], FN=cm[1,0], TP=cm[1,1]


    print("\nClassification Report (Baseline):")
    print(classification_report(y_test, y_pred_baseline, target_names=['Not Fraud (0)', 'Fraud (1)'], digits=4))


    # Calculate recall specifically for the positive 'Fraud' class (label=1)
    recall_fraud = recall_score(y_test, y_pred_baseline, pos_label=1)
    print(f"\nRecall for Fraud Class (1): {recall_fraud:.4f}")


    # Check MVP goal
    MVP_RECALL_TARGET = 0.75 # Our defined target
    print(f"\nChecking against MVP Recall Target ({MVP_RECALL_TARGET})...")
    if recall_fraud >= MVP_RECALL_TARGET:
        print(f">>> Level 1 MVP Recall target MET! :) <<<")
    else:
        print(f">>> Level 1 MVP Recall target NOT MET. :( <<<")
        print(f"    (Further tuning in Level 2 or revisiting preprocessing might be needed)")
except Exception as e:
    print(f"ERROR during baseline model evaluation: {e}")
    raise


--- Step 1.6: Evaluate Baseline Model on Test Set ---
Making predictions on the original (imbalanced) test set...

Confusion Matrix (Baseline):
[[199141    752]
 [    13     94]]

Classification Report (Baseline):
               precision    recall  f1-score   support

Not Fraud (0)     0.9999    0.9962    0.9981    199893
    Fraud (1)     0.1111    0.8785    0.1973       107

     accuracy                         0.9962    200000
    macro avg     0.5555    0.9374    0.5977    200000
 weighted avg     0.9995    0.9962    0.9977    200000


Recall for Fraud Class (1): 0.8785

Checking against MVP Recall Target (0.75)...
>>> Level 1 MVP Recall target MET! :) <<<


!pip freeze > requirements.txt