In [1]:
import pandas as pd

df_path="data/creditcard.csv"
df = pd.read_csv(df_path)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Check class distribution
if 'Class' in df.columns:
    class_counts = df['Class'].value_counts()
    print("\nClass distribution:")
    print(class_counts)
    print(f"\nPercentage of fraud cases: {class_counts[1] / len(df) * 100:.4f}%")
    print(f"Imbalance ratio: 1:{class_counts[0] / class_counts[1]:.2f}")


Class distribution:
0    284315
1       492
Name: Class, dtype: int64

Percentage of fraud cases: 0.1727%
Imbalance ratio: 1:577.88


# Upload dataset to hugging face

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, login
import os
from dotenv import load_dotenv

#set seed
np.random.seed(42)
# Load environment variables from .env file
load_dotenv()

# Get HF API token from environment variables
hf_token = os.getenv("HF_API_TOKEN")
if not hf_token:
    print("Warning: HF_API_TOKEN not found in .env file")
else:
    print("Successfully loaded HF_API_TOKEN from .env file")

# 1. Normalize Time and Amount columns
print("Normalizing Time and Amount columns...")
scaler_time = StandardScaler()
scaler_amount = StandardScaler()

# Fit the scalers on the entire dataset
df['Time_norm'] = scaler_time.fit_transform(df['Time'].values.reshape(-1, 1))
df['Amount_norm'] = scaler_amount.fit_transform(df['Amount'].values.reshape(-1, 1))

# Store the normalization statistics for future reference
time_stats = {
    'mean': float(scaler_time.mean_[0]),
    'std': float(scaler_time.scale_[0])
}
amount_stats = {
    'mean': float(scaler_amount.mean_[0]),
    'std': float(scaler_amount.scale_[0])
}
normalization_stats = {
    'Time': time_stats,
    'Amount': amount_stats
}

print("Normalization statistics:")
print(f"Time: mean={time_stats['mean']:.2f}, std={time_stats['std']:.2f}")
print(f"Amount: mean={amount_stats['mean']:.2f}, std={amount_stats['std']:.2f}")

# 2. Drop the original Time and Amount columns and rename the normalized ones
df = df.drop(['Time', 'Amount'], axis=1)
df = df.rename(columns={'Time_norm': 'Time', 'Amount_norm': 'Amount'})

# 3. Add an index column to help track original indices
df['original_index'] = np.arange(len(df))

# 4. Create stratified train/validation/test splits (80/10/10)
# First split: 80% train, 20% temp
X = df.drop(['Class'], axis=1)
y = df['Class']

print("\nCreating stratified splits (80% train, 10% validation, 10% test)...")
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: split the temp data into test and validation (50% each, resulting in 10% of original data each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# 5. Create dataframes for each split
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# 6. Check the distribution in each split
print("\nClass distribution in splits:")
print(f"Train set - Total: {len(y_train)}, Fraud: {sum(y_train)}, Percentage: {sum(y_train)/len(y_train)*100:.4f}%")
print(f"Validation set - Total: {len(y_val)}, Fraud: {sum(y_val)}, Percentage: {sum(y_val)/len(y_val)*100:.4f}%")
print(f"Test set - Total: {len(y_test)}, Fraud: {sum(y_test)}, Percentage: {sum(y_test)/len(y_test)*100:.4f}%")

# 7. Convert to Hugging Face Dataset format
print("\nConverting to Hugging Face Dataset format...")
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# 8. Output some information about the datasets
print("\nDataset split information:")
print(f"Train: {train_dataset.shape}")
print(f"Validation: {val_dataset.shape}")
print(f"Test: {test_dataset.shape}")

# 9. Preview columns and first few examples
print("\nColumns in the dataset:")
print(train_dataset.column_names)
print("\nFirst example from train set:")
print(train_dataset[0])

# 10. Save the datasets locally (optional but useful for verification)
print("\nSaving datasets locally...")
dataset_dict.save_to_disk("credit_card_fraud_dataset")

# 11. To upload to HuggingFace
def upload_to_huggingface(dataset_dict, repo_name, token):
    # Login to HuggingFace
    login(token)
    
    # Get HF API
    api = HfApi()
    
    # Create repository if it doesn't exist
    try:
        api.create_repo(repo_id=repo_name, exist_ok=True)
        print(f"Repository {repo_name} ready.")
    except Exception as e:
        print(f"Error creating repository: {e}")
        return
    
    # Push dataset to HuggingFace
    dataset_dict.push_to_hub(repo_name)
    
    # Also save normalization stats as README.md
    readme_content = f"""# Credit Card Fraud Dataset

This dataset contains normalized credit card transaction data for fraud detection.

## Normalization Statistics
- Time: mean={time_stats['mean']:.4f}, std={time_stats['std']:.4f}
- Amount: mean={amount_stats['mean']:.4f}, std={amount_stats['std']:.4f}

## Class Distribution
- Train set: {sum(y_train)} fraud out of {len(y_train)} ({sum(y_train)/len(y_train)*100:.4f}%)
- Validation set: {sum(y_val)} fraud out of {len(y_val)} ({sum(y_val)/len(y_val)*100:.4f}%)
- Test set: {sum(y_test)} fraud out of {len(y_test)} ({sum(y_test)/len(y_test)*100:.4f}%)

## Features
- Original Time and Amount columns have been normalized
- 'original_index' column refers to the index in the original dataset
"""
    with open("README.md", "w") as f:
        f.write(readme_content)
    
    api.upload_file(
        path_or_fileobj="README.md",
        path_in_repo="README.md",
        repo_id=repo_name,
        commit_message="Add README with normalization stats"
    )
    
    print(f"Dataset successfully uploaded to https://huggingface.co/datasets/{repo_name}")

# Upload to Hugging Face with your username
if hf_token:
    username = "stanpony"
    repo_name = f"{username}/full_european_credit_card_fraud_dataset"
    upload_to_huggingface(dataset_dict, repo_name, hf_token)
else:
    print("Skipping upload to Hugging Face as no API token was found")

  from .autonotebook import tqdm as notebook_tqdm


Successfully loaded HF_API_TOKEN from .env file
Normalizing Time and Amount columns...
Normalization statistics:
Time: mean=94813.86, std=47488.06
Amount: mean=88.35, std=250.12

Creating stratified splits (80% train, 10% validation, 10% test)...

Class distribution in splits:
Train set - Total: 227845, Fraud: 394, Percentage: 0.1729%
Validation set - Total: 28481, Fraud: 49, Percentage: 0.1720%
Test set - Total: 28481, Fraud: 49, Percentage: 0.1720%

Converting to Hugging Face Dataset format...

Dataset split information:
Train: (227845, 33)
Validation: (28481, 33)
Test: (28481, 33)

Columns in the dataset:
['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Time', 'Amount', 'original_index', 'Class', '__index_level_0__']

First example from train set:
{'V1': 1.94674666728168, 'V2': -0.752525821492348, 'V3': -1.35512953289131, 'V4': -0.66162991323069

Saving the dataset (1/1 shards): 100%|██████████| 227845/227845 [00:01<00:00, 185547.92 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 28481/28481 [00:00<00:00, 171505.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 28481/28481 [00:00<00:00, 163003.34 examples/s]


Repository stanpony/full_european_credit_card_fraud_dataset ready.


Creating parquet from Arrow format: 100%|██████████| 228/228 [00:01<00:00, 140.52ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.60s/it]
Creating parquet from Arrow format: 100%|██████████| 29/29 [00:00<00:00, 232.42ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
Creating parquet from Arrow format: 100%|██████████| 29/29 [00:00<00:00, 125.83ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
- empty or missing yaml metadata in repo card


Dataset successfully uploaded to https://huggingface.co/datasets/stanpony/full_european_credit_card_fraud_dataset


In [11]:
from datasets import load_dataset
import pandas as pd

# Load the dataset without try/except
dataset = load_dataset("stanpony/full_european_credit_card_fraud_dataset")
print("Successfully loaded the dataset!")

# Print available splits
print("\nAvailable splits:", list(dataset.keys()))

# Print statistics for each split
for split_name, split_dataset in dataset.items():
    print(f"\n{split_name.upper()} split:")
    print(f"- Number of samples: {len(split_dataset)}")
    
    # Convert to pandas for easier analysis
    split_df = split_dataset.to_pandas()
    
    # Check class distribution
    if 'Class' in split_df.columns:
        class_counts = split_df['Class'].value_counts()
        print(f"- Class distribution: {dict(class_counts)}")
        fraud_percentage = class_counts.get(1, 0) / len(split_df) * 100
        print(f"- Fraud percentage: {fraud_percentage:.4f}%")
    
    # Check if Time and Amount are normalized
    if 'Time' in split_df.columns and 'Amount' in split_df.columns:
        print(f"- Time column stats: mean={split_df['Time'].mean():.4f}, std={split_df['Time'].std():.4f}")
        print(f"- Amount column stats: mean={split_df['Amount'].mean():.4f}, std={split_df['Amount'].std():.4f}")
    
    # Check if original_index column exists
    if 'original_index' in split_df.columns:
        print("- original_index column exists ✓")
    else:
        print("- original_index column missing ✗")
        
    # Check for any missing values
    missing_values = split_df.isnull().sum().sum()
    print(f"- Missing values: {missing_values}")

Generating train split: 100%|██████████| 227845/227845 [00:01<00:00, 124532.81 examples/s]
Generating validation split: 100%|██████████| 28481/28481 [00:00<00:00, 136459.30 examples/s]
Generating test split: 100%|██████████| 28481/28481 [00:00<00:00, 141953.99 examples/s]


Successfully loaded the dataset!

Available splits: ['train', 'validation', 'test']

TRAIN split:
- Number of samples: 227845
- Class distribution: {0: 227451, 1: 394}
- Fraud percentage: 0.1729%
- Time column stats: mean=0.0015, std=1.0000
- Amount column stats: mean=-0.0007, std=1.0024
- original_index column exists ✓
- Missing values: 0

VALIDATION split:
- Number of samples: 28481
- Class distribution: {0: 28432, 1: 49}
- Fraud percentage: 0.1720%
- Time column stats: mean=-0.0130, std=1.0002
- Amount column stats: mean=0.0075, std=1.0155
- original_index column exists ✓
- Missing values: 0

TEST split:
- Number of samples: 28481
- Class distribution: {0: 28432, 1: 49}
- Fraud percentage: 0.1720%
- Time column stats: mean=0.0010, std=0.9997
- Amount column stats: mean=-0.0020, std=0.9645
- original_index column exists ✓
- Missing values: 0
