In [None]:
# Standard library first: `sys` is needed to extend the import path below.
import sys

import numpy as np
import pandas as pd

# The parent directory must be on sys.path BEFORE the project-local import;
# otherwise `src` may not resolve on a fresh kernel (Restart & Run All).
# Assumes the notebook lives one level below the project root -- TODO confirm.
# The membership check keeps re-running this cell from piling up duplicates.
if '..' not in sys.path:
    sys.path.append('..')

from src.data_preprocessing_PROPER import task1

In [2]:
# Load a capped sample of the engineered fraud dataset, keeping only the
# columns needed downstream so the frame stays small in memory.
print("Loading data with selected columns only...")

# Single source of truth for the input file and the row cap, so the primary
# read and the fallback read cannot drift apart.
DATA_PATH = '../data/processed/fraud_data_with_features.csv'
N_ROWS = 50000

necessary_columns = [
    'user_id', 'purchase_value', 'age', 'time_since_signup_hours',
    'hour_of_day', 'day_of_week', 'source', 'browser', 'sex', 'country', 'class'
]

try:
    # A callable `usecols` silently skips names missing from the file instead
    # of raising, so the result may contain fewer than the requested columns.
    df = pd.read_csv(
        DATA_PATH,
        usecols=lambda col: col in necessary_columns,
        nrows=N_ROWS
    )
    print(f"Loaded {len(df)} rows with {len(df.columns)} columns")
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate, and show the cause instead of hiding it.
    print("Could not load with column selection, loading full then filtering...")
    print(f"   Reason: {type(exc).__name__}: {exc}")
    # Read all columns, then filter. The same row cap is applied here so the
    # fallback cannot pull the entire file into memory.
    df = pd.read_csv(DATA_PATH, nrows=N_ROWS)
    existing_cols = [col for col in necessary_columns if col in df.columns]
    df = df[existing_cols]
    print(f"Filtered to {len(df)} rows, {len(df.columns)} columns")

print(f"\nColumns available: {list(df.columns)}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Loading data with selected columns only...
Loaded 50000 rows with 11 columns

Columns available: ['user_id', 'purchase_value', 'source', 'browser', 'sex', 'age', 'class', 'country', 'time_since_signup_hours', 'hour_of_day', 'day_of_week']
Memory usage: 10.53 MB


In [3]:
# Run the project's preprocessing pipeline (`task1`, imported from
# src.data_preprocessing_PROPER) on the loaded frame and unpack its four
# results. The function body is not visible in this notebook; the meaning of
# each result is taken from the variable names and the log it prints.
# NOTE(review): the recorded log shows the undersampled training set at 3743
# fraud vs 1871 legitimate rows (66.7% fraud), i.e. the original majority
# class ends up as the minority -- confirm this ratio is intended.
# NOTE(review): the log lists feature scaling (step 4) before the train-test
# split (step 5); verify the scaler is not fit on rows that land in the test set.
X_train_bal, X_test, y_train_bal, y_test = task1(df)


1. Preparing data...
   Original shape: (50000, 11)
   Target distribution: {0: 45321, 1: 4679}

2. Selecting key features to prevent memory issues...
   Using columns: ['purchase_value', 'age', 'time_since_signup_hours', 'hour_of_day', 'day_of_week', 'source', 'browser', 'sex']

3. Applying efficient One-Hot Encoding...
   Categorical columns: ['source', 'browser', 'sex']
   Numerical columns: ['purchase_value', 'age', 'time_since_signup_hours', 'hour_of_day', 'day_of_week']
   Shape after encoding: (50000, 12)

4. Applying feature scaling...
   Scaled columns: ['purchase_value', 'age', 'time_since_signup_hours', 'hour_of_day', 'day_of_week']

5. Creating train-test split...
   Training set: (40000, 12)
   Test set: (10000, 12)
   Training fraud %: 9.36%

6. Handling class imbalance with undersampling...

Applying efficient undersampling...
Fraud samples: 3743
Legitimate samples: 36257
After undersampling:
  Total samples: 5614
  Fraud: 3743 (66.7%)
  Legitimate: 1871 (33.3%)

7. Sav

In [4]:
# Sanity-check the pipeline outputs: print shape, class counts and fraud rate
# for the resampled training split and for the untouched test split.
print("VERIFICATION")

# (heading, features, labels) for each split, reported in this order.
split_reports = (
    ("\nBalanced Training Data:", X_train_bal, y_train_bal),
    ("\nTest Data (real distribution):", X_test, y_test),
)

for heading, features, labels in split_reports:
    class_counts = pd.Series(labels).value_counts().to_dict()
    # Labels are 0/1 in the recorded output, so the mean is the fraud share.
    fraud_pct = labels.mean() * 100
    print(heading)
    print(f"Shape: {features.shape}")
    print(f"Class distribution: {class_counts}")
    print(f"Fraud %: {fraud_pct:.1f}%")

# Show only the first ten feature names to keep the output short.
print("\nFirst few columns of training data:")
print(X_train_bal.columns[:10].tolist())

VERIFICATION

Balanced Training Data:
Shape: (5614, 12)
Class distribution: {1: 3743, 0: 1871}
Fraud %: 66.7%

Test Data (real distribution):
Shape: (10000, 12)
Class distribution: {0: 9064, 1: 936}
Fraud %: 9.4%

First few columns of training data:
['purchase_value', 'age', 'time_since_signup_hours', 'hour_of_day', 'day_of_week', 'source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE', 'browser_Opera']
