In [1]:
import pandas as pd
import sqlite3
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# --- 1. CONNECT TO DATABASE (SQL Extraction) ---
# We go 'up' one level (..) to find the data folder
conn = sqlite3.connect('../data/credit_risk.db') 
query = "SELECT * FROM loans"
df = pd.read_sql(query, conn)
conn.close()

print("âœ… Data loaded from SQL Database.")

# --- 2. PREPROCESSING ---
# Convert 'Risk' to binary: good=0, bad=1 (Target Variable)
df['Risk_Code'] = df['Risk'].map({'good': 0, 'bad': 1})

# Select Features (X) and Target (y)
# For this MVP, we use the 3 numeric columns to keep it simple
features = ['Age', 'Credit amount', 'Duration']
X = df[features]
y = df['Risk_Code']

# --- 3. SPLIT & SMOTE (Balancing the Data) ---
# Split first to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to generate synthetic "Bad" loans for training
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

print(f"Original Training Count: {len(X_train)}")
print(f"Balanced Training Count: {len(X_train_bal)} (SMOTE Applied)")

# --- 4. TRAIN THE MODEL ---
model = LogisticRegression()
model.fit(X_train_bal, y_train_bal)

# Test Accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"ðŸ“Š Model Test Accuracy: {accuracy:.2%}")

# --- 5. SAVE THE MODEL (The 'Pickle') ---
# We save it into the /data folder so the App can find it easily
with open('../data/credit_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("âœ… Model saved as 'credit_model.pkl' in the data folder.")

âœ… Data loaded from SQL Database.
Original Training Count: 800
Balanced Training Count: 1100 (SMOTE Applied)
ðŸ“Š Model Test Accuracy: 57.50%
âœ… Model saved as 'credit_model.pkl' in the data folder.
