In [None]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Attach Google Drive to the Colab runtime; later cells read and write files
# below this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Read the credit-scoring workbook from the mounted Drive folder into a DataFrame.
dataset_path = "/content/drive/My Drive/Project_Credit_Scoring/a_Dataset_CreditScoring.xlsx"
dataset = pd.read_excel(dataset_path)

In [None]:
# First look at the raw data: report its dimensions, then show the leading rows.
raw_shape = dataset.shape
print("Initial Dataset Shape:", raw_shape)

raw_preview = dataset.head()
print(raw_preview)

In [None]:
# Remove the customer identifier column 'ID' before modelling.
# NOTE: `dataset` is rebound here, so running this cell a second time raises
# KeyError because 'ID' has already been removed.
dataset = dataset.drop(columns=['ID'])
print("Shape after dropping ID:", dataset.shape)

In [None]:
# Count the NaN cells in every column before any imputation is applied.
missing_before = dataset.isna().sum()
print("Missing values before filling:")
print(missing_before)

In [None]:
# Impute missing cells with the mean of their own column.
# numeric_only=True restricts the means to numeric columns: on pandas >= 2.0 a
# bare DataFrame.mean() raises TypeError as soon as one text/object column is
# present, whereas older versions skipped such columns with a warning.
# Columns without a computed mean are simply left untouched by fillna.
# NOTE(review): the means are taken over the whole dataset, i.e. before the
# train/test split and across every numeric column -- confirm this is
# acceptable, because it lets test rows influence the imputed values.
column_means = dataset.mean(numeric_only=True)
dataset = dataset.fillna(column_means)

In [None]:
# Re-count the NaN cells per column to verify the result of the imputation step.
missing_after = dataset.isna().sum()
print("Missing values after filling:")
print(missing_after)

In [None]:
# Split the frame positionally into a target array and a feature array.
# NOTE(review): assumes column 0 holds the label and columns 1..28 hold the
# features -- confirm against the workbook layout; any column at position 29
# or beyond is not used.
target_column = dataset.iloc[:, 0]
feature_columns = dataset.iloc[:, 1:29]

y = target_column.values
X = feature_columns.values

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# repeatable, and stratify=y keeps the class proportions of y in both parts.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# rows only and is then applied, unchanged, to the test rows.
# NOTE: X_train / X_test are overwritten, so re-running this cell on its own
# would fit a new scaler on data that has already been scaled.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Persist the fitted scaler to the project folder on Drive using joblib.
scaler_path = '/content/drive/My Drive/Project_Credit_Scoring/f2_Normalisation_CreditScoring'
joblib.dump(sc, scaler_path)

In [None]:
# Fit a logistic-regression classifier (library default settings) on the
# training data, then predict class labels for the held-out test rows.
classifier = LogisticRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Persist the fitted classifier to the project folder on Drive using joblib.
model_path = '/content/drive/My Drive/Project_Credit_Scoring/f1_Classifier_CreditScoring'
joblib.dump(classifier, model_path)

In [None]:
# Confusion matrix for the test set (rows: actual class, columns: predicted class).
test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(test_confusion)

In [None]:
# Share of test rows whose predicted label equals the actual label.
test_accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy Score:", test_accuracy)

In [None]:
# Class-membership probability estimates for every test row; the result has
# one column per class known to the classifier.
predictions = classifier.predict_proba(X=X_test)

In [None]:
# Assemble a per-row prediction table for the test set and save it to Drive.
# prob_0 / prob_1 name the two columns of the predict_proba output.
df_prediction_prob = pd.DataFrame(predictions, columns=['prob_0', 'prob_1'])
df_prediction_target = pd.DataFrame(classifier.predict(X_test), columns=['predicted_TARGET'])
df_test_dataset = pd.DataFrame(y_test, columns=['Actual Outcome'])

# All three frames are built from plain arrays and therefore share the default
# 0..n-1 index, so a column-wise concat lines the rows up one-to-one.
dfx = pd.concat([df_test_dataset, df_prediction_prob, df_prediction_target], axis=1)

# The target file is named .xlsx, so write a real Excel workbook. Writing it
# with to_csv stored comma-separated text under an .xlsx name, which
# spreadsheet tools reject as an invalid workbook. The row index is still
# written as the first column.
dfx.to_excel("/content/drive/My Drive/Project_Credit_Scoring/c1_Model_Prediction.xlsx", index=True)
print("\nSample predictions:")
print(dfx.head())

In [None]:
# Create and save accuracy table: one row per class plus the 'accuracy',
# 'macro avg' and 'weighted avg' summary rows.
report_dict = classification_report(y_test, y_pred, output_dict=True)
accuracy_table = pd.DataFrame(report_dict).transpose()

print("\nClassification Report (Accuracy Table):")
print(accuracy_table)

accuracy_table.to_excel("/content/drive/My Drive/Project_Credit_Scoring/c2_Accuracy_Report.xlsx", index=True)

# Plot classification report as a heatmap.
# Keep every row except the overall 'accuracy' row instead of hard-coding the
# class rows '0' and '1': the report keys are the string form of the labels,
# so labels stored as floats show up as '0.0' / '1.0' and a hard-coded lookup
# would raise KeyError. For labels 0 and 1 the selected rows and their order
# are unchanged.
metric_columns = ['precision', 'recall', 'f1-score']
plot_metrics = accuracy_table.drop(index='accuracy', errors='ignore')[metric_columns]

# Explicit figure/axes so the heatmap and its labels target the same Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(plot_metrics, annot=True, cmap='YlGnBu', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Classification Report Metrics')
ax.set_ylabel('Classes / Averages')
ax.set_xlabel('Metric')
fig.tight_layout()
plt.show()

In [None]:
# Optional: line plot of the actual test-set outcomes in row order, with the
# top and right borders of the axes hidden.
outcome_ax = df_test_dataset['Actual Outcome'].plot(kind='line', figsize=(8, 4), title='Actual Outcome')
outcome_ax.spines[['top', 'right']].set_visible(False)
plt.show()
