In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load the data
df = pd.read_csv('/content/drive/MyDrive/creditcard.csv')

# 1. Data Cleaning
def clean_data(df):
    # Handle missing values
    imputer = SimpleImputer(strategy='median')
    df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

    # Remove duplicates
    df_imputed = df_imputed.drop_duplicates()

    # Handle outliers (using IQR method for Amount)
    Q1 = df_imputed['Amount'].quantile(0.25)
    Q3 = df_imputed['Amount'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_imputed['Amount'] = df_imputed['Amount'].clip(lower_bound, upper_bound)

    return df_imputed

# 2. Data Transformation
def transform_data(df):
    # Normalize numerical features
    scaler = StandardScaler()
    numerical_features = ['Time', 'Amount'] + [f'V{i}' for i in range(1, 29)]
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    # Create time-based features
    df['Hour'] = df['Time'].apply(lambda x: (x / 3600) % 24)
    df['Day'] = df['Time'].apply(lambda x: (x / 86400) % 7)

    return df

# 3. Feature Engineering
def engineer_features(df):
    # Create a feature for transaction frequency per card
    df['TransactionFreq'] = df.groupby('V1')['Time'].transform('count')

    # Create a feature for average transaction amount per card
    df['AvgAmount'] = df.groupby('V1')['Amount'].transform('mean')

    return df

# 4. Handle Imbalanced Data
def balance_data(X, y):
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

# 5. Data Splitting
def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    return X_train, X_test, y_train, y_test

# Main preprocessing pipeline
def preprocess_data(df):
    print("Original shape:", df.shape)

    # Clean data
    df_cleaned = clean_data(df)
    print("Shape after cleaning:", df_cleaned.shape)

    # Transform data
    df_transformed = transform_data(df_cleaned)
    print("Shape after transformation:", df_transformed.shape)

    # Engineer features
    df_engineered = engineer_features(df_transformed)
    print("Shape after feature engineering:", df_engineered.shape)

    # Prepare for modelling
    X = df_engineered.drop(['Class'], axis=1)
    y = df_engineered['Class']

    # Balance data
    X_resampled, y_resampled = balance_data(X, y)
    print("Shape after balancing:", X_resampled.shape)

    # Split data
    X_train, X_test, y_train, y_test = split_data(X_resampled, y_resampled)
    print("Training set shape:", X_train.shape)
    print("Testing set shape:", X_test.shape)

    return X_train, X_test, y_train, y_test

# Run the preprocessing pipeline
X_train, X_test, y_train, y_test = preprocess_data(df)

print("\nPreprocessing completed. Data is now ready for modelling.")

Original shape: (284807, 31)
Shape after cleaning: (283726, 31)
Shape after transformation: (283726, 33)
Shape after feature engineering: (283726, 35)
Shape after balancing: (566506, 34)
Training set shape: (453204, 34)
Testing set shape: (113302, 34)

Preprocessing completed. Data is now ready for modelling.


In [2]:
# Save the training and testing data to CSV files
X_train.to_csv('/content/drive/MyDrive/X_train.csv', index=False)
X_test.to_csv('/content/drive/MyDrive/X_test.csv', index=False)
y_train.to_csv('/content/drive/MyDrive/y_train.csv', index=False)
y_test.to_csv('/content/drive/MyDrive/y_test.csv', index=False)

print("\nTraining and testing data saved to Google Drive.")


Training and testing data saved to Google Drive.
