In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load datasets.
# Paths are relative to the notebook's working directory; they are named here
# so the data locations are easy to spot and change in one place.
TRAIN_CSV_PATH = "../data/processed/train_set_SMOTEd.csv"
TEST_CSV_PATH = "../data/processed/test_set.csv"

train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

### Logistic Regression Model

In [6]:
# Define feature columns: every training column except the target
# ('credit_status') and the row identifier ('id').
feature_columns = [col for col in train_df.columns if col not in ["credit_status", "id"]]

# Split features and target.
# The test features are selected with the *training* column list so both
# matrices share the same columns in the same order.
X_train = train_df[feature_columns]
X_test = test_df[feature_columns]

# Encode the target with a single encoder fitted on the training labels only.
# Fitting a second, independent LabelEncoder on the test labels would let the
# test set derive its own class -> integer mapping; if the test file contained
# a different set of labels (e.g. only one class), the integers in y_test would
# no longer mean the same thing as those the model was trained on and every
# metric computed from them would be silently wrong. `transform` instead raises
# ValueError when the test set contains a label never seen during fitting.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df["credit_status"])
y_test = label_encoder.transform(test_df["credit_status"])

# Standardize features: the scaler's statistics are learned from the training
# data only and then applied unchanged to the test data (no test-set leakage).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression model.
# NOTE(review): the training file name suggests the set was already resampled
# with SMOTE; if its classes are balanced, class_weight="balanced" has little
# or no additional effect -- confirm this combination is intended.
logreg = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")
logreg.fit(X_train_scaled, y_train);  # trailing ';' suppresses the estimator repr


### Evaluation

In [7]:
# Score the held-out test set with the fitted model.
y_pred = logreg.predict(X_test_scaled)

# Pair each customer id with its predicted (integer-encoded) credit status.
# `assign` returns a new frame, so `test_df` itself is left untouched.
results_df = test_df[["id"]].assign(predicted_credit_status=y_pred)

# Preview the first few predictions.
print(results_df.head())

# Compute the metrics first, then report them.
# NOTE(review): overall accuracy can look good on an imbalanced test set even
# when one class is predicted poorly -- read the per-class rows of the
# classification report alongside it.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)

        id  predicted_credit_status
0  5052720                        0
1  5087861                        1
2  5068206                        0
3  5137255                        0
4  5023163                        1
Accuracy: 0.8414701042238069

Confusion Matrix:
 [[  95  546]
 [ 610 6041]]

Classification Report:
               precision    recall  f1-score   support

           0       0.13      0.15      0.14       641
           1       0.92      0.91      0.91      6651

    accuracy                           0.84      7292
   macro avg       0.53      0.53      0.53      7292
weighted avg       0.85      0.84      0.84      7292

