# Feature Engineering - Credit Card Fraud Detection

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the raw train and test CSVs, in that order. `index_col=0` makes the
# first column of each file the row index instead of a regular column.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/raw/fraudTrain.csv', '../data/raw/fraudTest.csv')
)

# Quick sanity check: report (rows, columns) for both frames.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (1296675, 22)
Test shape: (555719, 22)


In [3]:
# Raw column names of interest and the name of the label column.
features = ['amt', 'city_pop', 'category', 'gender', 'state']
target = 'is_fraud'

# One LabelEncoder per categorical column. Each stays bound to its own name so
# the fitted mapping remains available after this cell.
le_category, le_gender, le_state = LabelEncoder(), LabelEncoder(), LabelEncoder()

# (source column, encoded column, encoder) for every categorical feature.
encoding_plan = (
    ('category', 'category_encoded', le_category),
    ('gender', 'gender_encoded', le_gender),
    ('state', 'state_encoded', le_state),
)

for source_col, encoded_col, encoder in encoding_plan:
    # Fit on the training split only, then reuse the same fitted mapping for
    # the test split so both frames share identical integer codes.
    train_df[encoded_col] = encoder.fit_transform(train_df[source_col])
    test_df[encoded_col] = encoder.transform(test_df[source_col])

print("Features encoded!")

Features encoded!


In [4]:
# Columns that make up the model input: `amt` and `city_pop` as loaded, plus
# the three `*_encoded` columns.
feature_cols = [
    'amt',
    'city_pop',
    'category_encoded',
    'gender_encoded',
    'state_encoded',
]

# Split each frame into its feature matrix (X) and its label column (y).
X_train, y_train = train_df[feature_cols], train_df[target]
X_test, y_test = test_df[feature_cols], test_df[target]

# Row counts should match the source frames; the column count is len(feature_cols).
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (1296675, 5)
X_test shape: (555719, 5)


In [5]:
# Save processed data
# `to_csv` does not create missing parent directories, so make sure the output
# folder exists before writing (it may be absent on a fresh checkout).
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# `index=False` keeps the row index out of the files, so each CSV holds only
# the feature/label columns.
X_train.to_csv(processed_dir / 'X_train.csv', index=False)
y_train.to_csv(processed_dir / 'y_train.csv', index=False)
X_test.to_csv(processed_dir / 'X_test.csv', index=False)
y_test.to_csv(processed_dir / 'y_test.csv', index=False)

print("Processed data saved!")

Processed data saved!
