In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

# Load dataset
df = pd.read_csv("../data/raw/credit_risk_dataset.csv")

# Create 'default' column
df['default'] = (df['loan_status'] > 0).astype(int)

# Drop risk_score if exists
df = df.drop(['risk_score'], axis=1, errors='ignore')

# Encode categorical features
categorical_cols = df.select_dtypes(include='object').columns
df[categorical_cols] = df[categorical_cols].apply(LabelEncoder().fit_transform)

# Split
X = df.drop('default', axis=1)
y = df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save data and scaler
joblib.dump((X_train_scaled, X_test_scaled, y_train, y_test), "processed_data.pkl")
joblib.dump(scaler, "scaler.pkl")


import pickle

# Replace 'file.pkl' with the path to your pickle file
with open('scaler.pkl', 'rb') as f:
    data = pickle.load(f)

# Now 'data' contains the object that was stored in the .pkl file
print(data)


['person_age' 'person_income' 'person_home_ownership' 'person_emp_length'
 'loan_intent' 'loan_grade' 'loan_amnt' 'loan_int_rate' 'loan_status'
 'loan_percent_income' 'cb_person_default_on_file'
 'cb_person_cred_hist_length']
