In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import os

# Make sure the output directory for the preprocessed artifacts exists.
os.makedirs('preprocessed_data', exist_ok=True)

# Read the raw dataset from disk.
print("Loading the dataset...")
data = pd.read_csv('./data/data.csv')

# Quick inspection: dimensions, a sample of rows, per-column missing counts
# and descriptive statistics. Each header and its value are printed on
# separate lines via sep="\n".
print(f"Dataset shape: {data.shape}")
print("\nFirst few rows:", data.head(), sep="\n")
print("\nMissing values count:", data.isnull().sum(), sep="\n")
print("\nBasic statistics:", data.describe(), sep="\n")

# Preprocessing steps
print("\nStarting preprocessing...")

# 1. Drop the ID column (typically not used as a feature).
data = data.drop(columns='ID')

# 2. Identify categorical and numerical columns.
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']
pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
numerical_cols = (
    ['LIMIT_BAL', 'AGE']
    + pay_cols
    + [f'BILL_AMT{month}' for month in range(1, 7)]
    + [f'PAY_AMT{month}' for month in range(1, 7)]
)

# 3. Handle missing values.
# Numerical columns are filled with their own column median (fillna with a
# Series of medians matches on column name).
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

# 4. Handle outliers in payment history columns.
# Payment status is expected to lie in [-2, 9]; anything outside that range is
# replaced with the column's mode. The mode is only computed when at least one
# out-of-range value exists.
for col in pay_cols:
    out_of_range = (data[col] > 9) | (data[col] < -2)
    if out_of_range.any():
        data.loc[out_of_range, col] = data[col].mode()[0]

# 5. Handle education and marriage values.
# Education should be 1-4, marriage should be 1-3. Both bounds are enforced:
# previously only values above the upper bound were remapped, so a code below 1
# (e.g. 0) would have survived and become its own one-hot column.
# Invalid codes are folded into the top code, as the upper-bound handling
# already did (presumably the "other" bucket -- confirm against the data
# dictionary). Missing values compare False on both sides and are left as-is.
category_bounds = {'EDUCATION': (1, 4), 'MARRIAGE': (1, 3)}
for col, (low, high) in category_bounds.items():
    invalid_code = (data[col] < low) | (data[col] > high)
    data.loc[invalid_code, col] = high

# 6. One-hot encode categorical variables.
# NOTE(review): the encoder and the scaler below are fitted on every row,
# before the train/validation/test split later in this script, so validation
# and test rows influence the fitted statistics. Consider fitting on the
# training rows only if strict leakage-free evaluation is required.
print("One-hot encoding categorical variables...")
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_cats = encoder.fit_transform(data[categorical_cols])

# Let the encoder report its own output column names ("<column>_<category>")
# rather than rebuilding them by hand, so the names always match whichever
# categories the encoder actually dropped.
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

# Create a DataFrame with encoded categorical variables. Reusing data's index
# keeps the rows aligned with `target` when the pieces are combined below.
encoded_df = pd.DataFrame(encoded_cats, columns=encoded_cols, index=data.index)

# 7. Scale numerical features to zero mean and unit variance.
print("Scaling numerical features...")
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(data[numerical_cols])
scaled_df = pd.DataFrame(scaled_numerical, columns=numerical_cols, index=data.index)

# 8. Combine all processed features, with the target as the last column under
# the shorter name 'default_payment'.
target = data['default payment next month']
preprocessed_data = pd.concat([scaled_df, encoded_df], axis=1)
preprocessed_data['default_payment'] = target

# 9. Persist the full preprocessed table.
print("Saving preprocessed data...")
preprocessed_data.to_csv('preprocessed_data/preprocessed_data.csv', index=False)

# 10. Stratified split into training (70%), validation (15%) and test (15%).
print("Splitting the data...")
y = preprocessed_data['default_payment']
X = preprocessed_data.drop(columns='default_payment')

# Hold out 30% of the rows first ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# ... then cut the held-out part in half for validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# 11. Re-attach the target to each feature split and write one CSV per split.
print("Saving data splits...")
train_data = X_train.join(y_train)
val_data = X_val.join(y_val)
test_data = X_test.join(y_test)

for split_frame, split_path in (
    (train_data, 'preprocessed_data/train_data.csv'),
    (val_data, 'preprocessed_data/validation_data.csv'),
    (test_data, 'preprocessed_data/test_data.csv'),
):
    split_frame.to_csv(split_path, index=False)

# 12. Pickle the fitted scaler and encoder for future use.
import pickle

for artifact, artifact_path in (
    (scaler, 'preprocessed_data/scaler.pkl'),
    (encoder, 'preprocessed_data/encoder.pkl'),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# Print a summary of the preprocessing run and the files it produced.
# NOTE(review): `data` has already had the 'ID' column dropped by this point,
# so the "Original data shape" column count is one less than the raw file's.
summary_lines = [
    "\nPreprocessing Summary:",
    f"Original data shape: {data.shape[0]} rows, {data.shape[1]} columns",
    f"Preprocessed data shape: {preprocessed_data.shape[0]} rows, {preprocessed_data.shape[1]} columns",
    f"Training set: {X_train.shape[0]} samples",
    f"Validation set: {X_val.shape[0]} samples",
    f"Test set: {X_test.shape[0]} samples",
    "\nFiles saved in the 'preprocessed_data' directory:",
    "- preprocessed_data.csv: Full preprocessed dataset",
    "- train_data.csv: Training set (70% of data)",
    "- validation_data.csv: Validation set (15% of data)",
    "- test_data.csv: Test set (15% of data)",
    "- scaler.pkl: StandardScaler fitted on numerical features",
    "- encoder.pkl: OneHotEncoder fitted on categorical features",
]
print("\n".join(summary_lines))

Loading the dataset...
Dataset shape: (30000, 25)

First few rows:
   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0   1      20000    2          2         1   24      2      2     -1     -1   
1   2     120000    2          2         2   26     -1      2      0      0   
2   3      90000    2          2         2   34      0      0      0      0   
3   4      50000    2          2         1   37      0      0      0      0   
4   5      50000    1          2         1   57     -1      0     -1      0   

   ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0  ...          0          0          0         0       689         0   
1  ...       3272       3455       3261         0      1000      1000   
2  ...      14331      14948      15549      1518      1500      1000   
3  ...      28314      28959      29547      2000      2019      1200   
4  ...      20940      19146      19131      2000     36681     10000   

   PAY_AMT4  PAY_AM