In [170]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split,KFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import pickle


# 1. Explore the Data

In [171]:
# Load the CSV file uploaded by the user
csv_file_path = '/home/aimssn-it/Desktop/Databeez/german_credit_data.csv'

# Read the CSV file
data = pd.read_csv(csv_file_path)

# Display the first few rows and basic information about the dataset
data_info = {
    "head": data.head(),
    "info": data.info(),
    "description": data.describe(include='all').T,
}

data_info


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
 10  Risk              1000 non-null   object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


{'head':    Unnamed: 0  Age     Sex  Job Housing Saving accounts Checking account  \
 0           0   67    male    2     own             NaN           little   
 1           1   22  female    2     own          little         moderate   
 2           2   49    male    1     own          little              NaN   
 3           3   45    male    2    free          little           little   
 4           4   53    male    2    free          little           little   
 
    Credit amount  Duration              Purpose  Risk  
 0           1169         6             radio/TV  good  
 1           5951        48             radio/TV   bad  
 2           2096        12            education  good  
 3           7882        42  furniture/equipment  good  
 4           4870        24                  car   bad  ,
 'info': None,
 'description':                    count unique     top freq      mean          std    min  \
 Unnamed: 0        1000.0    NaN     NaN  NaN     499.5   288.819436    0.0 

In [172]:
data=data.drop(['Unnamed: 0'],axis=1)

In [173]:
# Generate a profiling report
profile = ProfileReport(data, title="Credit Risk Germany Data Profiling Report", explorative=True)

# Display the profiling report
profile.to_file("credit_risk_germany_data_profiling_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Since Age and Duration are left skewed, we are going to make them normally distributed with:
- data['Age']= np.log1p(data['Age']) or 
- data['Age'] = np.sqrt(data['Age'])

In [174]:
sns.histplot(data['Age'], kde = True)
plt.show()

In [175]:
data['Age'] = np.log1p(data['Age'])

In [176]:
sns.histplot(data['Age'], kde = True)
plt.show()

In [177]:
sns.histplot(data['Credit amount'], kde = True)
plt.show()

In [178]:
data['Credit amount'] = np.log1p(data['Credit amount'])

In [179]:
sns.histplot(data['Credit amount'], kde = True)
plt.show()

In [180]:
# Data types of each column
data_types = data.dtypes

# Count of missing values in each column
missing_values = data.isnull().sum()

# Summary statistics
summary_statistics = data.describe(include='all')

data_analysis = {
    "data_types": data_types,
    "missing_values": missing_values,
    "summary_statistics": summary_statistics
}

data_analysis

{'data_types': Age                 float64
 Sex                  object
 Job                   int64
 Housing              object
 Saving accounts      object
 Checking account     object
 Credit amount       float64
 Duration              int64
 Purpose              object
 Risk                 object
 dtype: object,
 'missing_values': Age                   0
 Sex                   0
 Job                   0
 Housing               0
 Saving accounts     183
 Checking account    394
 Credit amount         0
 Duration              0
 Purpose               0
 Risk                  0
 dtype: int64,
 'summary_statistics':                 Age   Sex          Job Housing Saving accounts  \
 count   1000.000000  1000  1000.000000    1000             817   
 unique          NaN     2          NaN       3               4   
 top             NaN  male          NaN     own          little   
 freq            NaN   690          NaN     713             603   
 mean       3.554569   NaN     1.904000 

In [181]:
data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,4.219508,male,2,own,,little,7.064759,6,radio/TV,good
1,3.135494,female,2,own,little,moderate,8.691483,48,radio/TV,bad
2,3.912023,male,1,own,little,,7.648263,12,education,good
3,3.828641,male,2,free,little,little,8.972464,42,furniture/equipment,good
4,3.988984,male,2,free,little,little,8.491055,24,car,bad


# Preprocessing

In [182]:

# Handle missing values
data = data.assign(**{
    'Saving accounts': data['Saving accounts'].fillna('unknown'),
    'Checking account': data['Checking account'].fillna('unknown')
})

# Encode categorical variables
data_encoded = data.copy()
data_encoded['Sex'] = data_encoded['Sex'].map({'male': 0, 'female': 1})
data_encoded['Housing'] = data_encoded['Housing'].map({'own': 0, 'rent': 1, 'free': 2})
data_encoded['Saving accounts'] = data_encoded['Saving accounts'].astype('category').cat.codes
data_encoded['Checking account'] = data_encoded['Checking account'].astype('category').cat.codes
data_encoded['Purpose'] = data_encoded['Purpose'].astype('category').cat.codes
data_encoded['Risk'] = data_encoded['Risk'].map({'good': 1, 'bad': 0})

# Normalize or scale numerical features
scaler = StandardScaler()
numerical_features = ['Age', 'Job', 'Credit amount', 'Duration']
data_encoded[numerical_features] = scaler.fit_transform(data_encoded[numerical_features])



In [183]:
data_encoded.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Credit amount', 'Duration', 'Purpose', 'Risk'],
      dtype='object')

In [184]:
data_encoded.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,2.282879,0,0.146949,0,4,0,-0.933992,-1.236478,5,1
1,-1.438777,1,0.146949,0,0,1,1.163149,2.248194,5,0
2,1.227217,0,-1.383771,0,0,3,-0.18175,-0.738668,3,1
3,0.94095,0,0.146949,2,0,0,1.525385,1.750384,4,1
4,1.491441,0,0.146949,2,0,0,0.904761,0.256953,1,0


In [185]:
# Generate a profiling report
profile = ProfileReport(data_encoded, title="Credit Risk Germany Data Profiling Report", explorative=True)

# Display the profiling report
profile.to_file("preprocessed_credit_risk_germany_data_profiling_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [186]:

# Create correlation matrix
correlation_matrix = data_encoded.corr()

# Display the correlation matrix
print(correlation_matrix)

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Check correlation with the target variable 'Risk'
correlation_with_target = correlation_matrix['Risk'].sort_values(ascending=False)
print("Correlation with target variable 'Risk':")
print(correlation_with_target)

                       Age       Sex       Job   Housing  Saving accounts  \
Age               1.000000 -0.194999  0.028823  0.093109         0.092608   
Sex              -0.194999  1.000000 -0.070298  0.033818        -0.034982   
Job               0.028823 -0.070298  1.000000  0.101939         0.011709   
Housing           0.093109  0.033818  0.101939  1.000000        -0.003262   
Saving accounts   0.092608 -0.034982  0.011709 -0.003262         1.000000   
Checking account  0.077562 -0.025578  0.040663 -0.121380         0.222867   
Credit amount     0.023974 -0.116756  0.304628  0.163745         0.064227   
Duration         -0.027167 -0.081432  0.210910  0.137434         0.047661   
Purpose          -0.083564  0.063231 -0.025326 -0.086839        -0.053225   
Risk              0.102463 -0.075493 -0.032735 -0.127789         0.178943   

                  Checking account  Credit amount  Duration   Purpose  \
Age                       0.077562       0.023974 -0.027167 -0.083564   
Sex   

Identify key variables influencing credit scoring using correlation matrices and visualizations (e.g., heatmaps, pair plots).

In [187]:

# Pair plot for selected features
selected_features = ['Age', 'Job', 'Credit amount', 'Duration', 'Saving accounts','Purpose','Checking account','Risk']
sns.pairplot(data_encoded[selected_features], hue='Risk', palette='coolwarm')
plt.show()

In [None]:
!pip install imbalanced-learn



In [None]:
from imblearn.over_sampling import SMOTE

# Separate the features and target variable
X = data_encoded.drop('Risk', axis=1)
y = data_encoded['Risk']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Update data_encoded with the resampled data
data_encoded = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['Risk'])], axis=1)

# Verify the balance of the target variable
print(data_encoded['Risk'].value_counts())


Risk
1    700
0    700
Name: count, dtype: int64


In [190]:
# Split the dataset into training and testing sets
X = data_encoded.drop('Risk', axis=1)
y = data_encoded['Risk']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Modelling

In [146]:

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Bernoulli Naive Bayes": BernoulliNB(),
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
    "Extra Trees": ExtraTreesClassifier(),

}

# Train and evaluate models
best_model = None
best_score = 0
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred)
    
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("\n")
    
    if auc_roc > best_score:
        best_score = auc_roc
        best_model = model

# Save the best-performing model using pickle
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print(f"Best model saved: {best_model}")

Model: Logistic Regression
Accuracy: 0.7550
Precision: 0.7771
Recall: 0.9149
F1-score: 0.8404
AUC-ROC: 0.6439
[[ 22  37]
 [ 12 129]]
              precision    recall  f1-score   support

           0       0.65      0.37      0.47        59
           1       0.78      0.91      0.84       141

    accuracy                           0.76       200
   macro avg       0.71      0.64      0.66       200
weighted avg       0.74      0.76      0.73       200



Model: Random Forest
Accuracy: 0.7550
Precision: 0.7875
Recall: 0.8936
F1-score: 0.8372
AUC-ROC: 0.6587
[[ 25  34]
 [ 15 126]]
              precision    recall  f1-score   support

           0       0.62      0.42      0.51        59
           1       0.79      0.89      0.84       141

    accuracy                           0.76       200
   macro avg       0.71      0.66      0.67       200
weighted avg       0.74      0.76      0.74       200



Model: XGBoost
Accuracy: 0.7800
Precision: 0.8089
Recall: 0.9007
F1-score: 0.8523


## Model optimizations

In [157]:

# Define the XGBClassifier wrapper for sklearn compatibility
class XGBClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, **kwargs):
        # Define the parameters explicitly in the constructor
        self.model = XGBClassifier(  **kwargs)

        
    def fit(self, X, y):
        # Fit the XGBClassifier model
        self.model.fit(X, y)
        return self

    def predict(self, X):
        # Predict using the fitted model
        return self.model.predict(X)

    def predict_proba(self, X):
        # Predict probabilities using the fitted model
        return self.model.predict_proba(X)

    def score(self, X, y):
        # Return the accuracy score using the model
        return self.model.score(X, y)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifierWrapper(eval_metric='logloss'),  # Using the wrapped XGBClassifier
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Bernoulli Naive Bayes": BernoulliNB(),
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
    "Extra Trees": ExtraTreesClassifier(),
}

# Train and evaluate models
best_model = None
best_score = 0
for name, model in models.items():
    kfold = KFold(n_splits=10, random_state=42, shuffle=True)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='roc_auc')
    print(f"{name}: {cv_results.mean():.4f} ({cv_results.std():.4f})")
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred)
    
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("\n")
    
    if auc_roc > best_score:
        best_score = auc_roc
        best_model = model

# Save the best-performing model using pickle
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print(f"Best model saved: {best_model}")


Logistic Regression: 0.7561 (0.0399)
Model: Logistic Regression
Accuracy: 0.7550
Precision: 0.7771
Recall: 0.9149
F1-score: 0.8404
AUC-ROC: 0.6439
[[ 22  37]
 [ 12 129]]
              precision    recall  f1-score   support

           0       0.65      0.37      0.47        59
           1       0.78      0.91      0.84       141

    accuracy                           0.76       200
   macro avg       0.71      0.64      0.66       200
weighted avg       0.74      0.76      0.73       200



Random Forest: 0.7425 (0.0534)
Model: Random Forest
Accuracy: 0.7500
Precision: 0.7791
Recall: 0.9007
F1-score: 0.8355
AUC-ROC: 0.6453
[[ 23  36]
 [ 14 127]]
              precision    recall  f1-score   support

           0       0.62      0.39      0.48        59
           1       0.78      0.90      0.84       141

    accuracy                           0.75       200
   macro avg       0.70      0.65      0.66       200
weighted avg       0.73      0.75      0.73       200



XGBoost: nan (