In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [None]:
# loading data
# Mount Google Drive under /content/drive/ so the dataset CSV stored in
# 'My Drive' can be opened by path from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Read the German credit dataset from the mounted Drive folder, then preview
# the first five rows as the cell's rich output.
data = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/german_credit_data.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [None]:
# Drop the index column that was written into the CSV. errors='ignore' keeps
# this cell re-runnable: on a second run the column is already gone and a plain
# drop would raise KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# Single seed for the split and for every estimator that uses randomness, so a
# fresh "Restart & Run All" reproduces the same scores and best parameters.
SEED = 42

# Categorical features and numerical features:
categorical_features = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
numerical_features = ['Age', 'Job', 'Credit amount', 'Duration']

# Preprocessing pipeline:
#   numeric     -> median imputation, then standardisation
#   categorical -> missing values become their own 'missing' category, then one-hot
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    # instead of raising at predict time.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Columns not listed here are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Defining our models
models = [
    ('knn', KNeighborsClassifier()),
    ('dt', DecisionTreeClassifier(random_state=SEED)),
    ('lr', LogisticRegression(max_iter=5000)),  # increase max_iter
    # increase max_iter and adjust learning_rate_init; seeded because the
    # weight initialisation is random.
    ('mlp', MLPClassifier(max_iter=5000, learning_rate_init=0.01, random_state=SEED)),
    ('svm', SVC())
]

# Hyperparameter grid per model name. A lookup table (instead of an if/elif
# chain) makes a model without a grid fail loudly with KeyError rather than
# silently reusing the previous model's grid.
param_grids = {
    'knn': {
        'classifier__n_neighbors': [3, 5, 11],
    },
    'dt': {
        'classifier__max_depth': [3, 5, 7],
        'classifier__min_samples_split': [2, 3, 4],
    },
    'lr': {
        'classifier__C': [0.1, 1.0, 10.0],
    },
    'mlp': {
        'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'classifier__alpha': [0.0001, 0.001, 0.01],
    },
    'svm': {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__gamma': [0.1, 1.0, 10.0],
    },
}

# Splitting our data into training and test sets:
X = data.drop('Risk', axis=1)
y = data['Risk']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Training our models with hyperparameter tuning. Every fitted search is kept
# by model name so the tuned estimators stay available after the loop.
grid_searches = {}
for name, model in models:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])
    param_grid = param_grids[name]

    # 5-fold CV on the training split only; n_jobs=-1 evaluates candidates in
    # parallel and does not change the result.
    search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    grid_searches[name] = search

    print(f"For {name}, Best parameter (CV score={search.best_score_}): {search.best_params_}")
    print(f"Test score: {search.score(X_test, y_test)}\n")


For knn, Best parameter (CV score=0.73125): {'classifier__n_neighbors': 11}
Test score: 0.76

For dt, Best parameter (CV score=0.6799999999999999): {'classifier__max_depth': 5, 'classifier__min_samples_split': 4}
Test score: 0.71

For lr, Best parameter (CV score=0.7375): {'classifier__C': 1.0}
Test score: 0.76

For mlp, Best parameter (CV score=0.71125): {'classifier__alpha': 0.001, 'classifier__hidden_layer_sizes': (100,)}
Test score: 0.69

For svm, Best parameter (CV score=0.74): {'classifier__C': 1.0, 'classifier__gamma': 0.1}
Test score: 0.76



In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Best parameters from GridSearchCV, transcribed from the grid-search output
# recorded in this notebook (dt: min_samples_split=4, mlp: alpha=0.001).
# NOTE(review): re-check these against the search output whenever the
# grid-search cell is re-run.
best_params = {
    'knn': {'n_neighbors': 11},
    'dt': {'max_depth': 5, 'min_samples_split': 4},
    'lr': {'C': 1.0},
    'mlp': {'hidden_layer_sizes': (100,), 'alpha': 0.001},
    'svm': {'C': 1.0, 'gamma': 0.1},
}

# Models with best parameters. The tree and the MLP are seeded so the reported
# accuracies are reproducible from run to run.
models_best_params = [
    ('knn', KNeighborsClassifier(**best_params['knn'])),
    ('dt', DecisionTreeClassifier(random_state=42, **best_params['dt'])),
    ('lr', LogisticRegression(max_iter=5000, **best_params['lr'])),
    ('mlp', MLPClassifier(max_iter=5000, learning_rate_init=0.01, random_state=42,
                          **best_params['mlp'])),
    ('svm', SVC(**best_params['svm'])),
]

# Fit models with best parameters
for name, model in models_best_params:
    # Pipeline fits its steps in place, so each pipeline gets its own copy of
    # the preprocessor instead of sharing (and refitting) one instance.
    pipe_best = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                ('classifier', model)])
    pipe_best.fit(X_train, y_train)

    y_train_pred = pipe_best.predict(X_train)
    y_test_pred = pipe_best.predict(X_test)

    # Train vs. test accuracy side by side exposes over-fitting; the report and
    # confusion matrix show per-class behaviour that plain accuracy hides.
    print(f"{name} Model:")
    print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
    print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
    print("\nClassification Report (Test Data):")
    print(classification_report(y_test, y_test_pred))
    print("\nConfusion Matrix (Test Data):")
    print(confusion_matrix(y_test, y_test_pred))
    print("\n===========================\n")


knn Model:
Training Accuracy: 0.76375
Test Accuracy: 0.76

Classification Report (Test Data):
              precision    recall  f1-score   support

         bad       0.69      0.34      0.45        59
        good       0.77      0.94      0.85       141

    accuracy                           0.76       200
   macro avg       0.73      0.64      0.65       200
weighted avg       0.75      0.76      0.73       200


Confusion Matrix (Test Data):
[[ 20  39]
 [  9 132]]


dt Model:
Training Accuracy: 0.78
Test Accuracy: 0.71

Classification Report (Test Data):
              precision    recall  f1-score   support

         bad       0.51      0.31      0.38        59
        good       0.75      0.88      0.81       141

    accuracy                           0.71       200
   macro avg       0.63      0.59      0.60       200
weighted avg       0.68      0.71      0.68       200


Confusion Matrix (Test Data):
[[ 18  41]
 [ 17 124]]


lr Model:
Training Accuracy: 0.74875
Test Accuracy

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE

# Train a RandomForestClassifier.
# Each pipeline receives its own clone of the preprocessor: Pipeline fits its
# steps in place, so sharing one instance would let a later fit silently change
# an already-fitted pipeline.
rf = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                     ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])

rf.fit(X_train, y_train)
print("RandomForestClassifier Model:\n")
print("Training Accuracy: ", rf.score(X_train, y_train))
print("Test Accuracy: ", rf.score(X_test, y_test))

# Train a GradientBoostingClassifier
gb = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                     ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42))])

gb.fit(X_train, y_train)
print("\nGradientBoostingClassifier Model:\n")
print("Training Accuracy: ", gb.score(X_train, y_train))
print("Test Accuracy: ", gb.score(X_test, y_test))

# Apply preprocessing on entire dataset, using a separate preprocessor copy so
# the pipelines fitted above keep their training-set statistics.
# NOTE(review): the ranking below sees the test rows as well; if the selected
# features are later used in a model scored on X_test, fit this on X_train only.
rfe_preprocessor = clone(preprocessor)
preprocessed_data = rfe_preprocessor.fit_transform(X)

# Apply RFE
# Here we will use Logistic Regression as the model
selector = RFE(LogisticRegression(max_iter=5000), n_features_to_select=5, step=1)
selector = selector.fit(preprocessed_data, y)

# Map each transformed column back to a readable name so the ranking can be
# interpreted (rank 1 = kept by RFE). The column index is printed too.
feature_names = rfe_preprocessor.get_feature_names_out()
ranked_features = sorted(
    (int(rank), index, str(feature_names[index]))
    for index, rank in enumerate(selector.ranking_)
)

print("\nFeatures sorted by their rank:")
for rank, index, feature in ranked_features:
    print(f"rank {rank:>2}  column {index:>2}  {feature}")


RandomForestClassifier Model:

Training Accuracy:  1.0
Test Accuracy:  0.755

GradientBoostingClassifier Model:

Training Accuracy:  0.8775
Test Accuracy:  0.78

Features sorted by their rank:
[(1, 10), (1, 13), (1, 14), (1, 15), (1, 16), (2, 21), (3, 3), (4, 5), (5, 23), (6, 9), (7, 11), (8, 7), (9, 25), (10, 24), (11, 4), (12, 22), (13, 0), (14, 17), (15, 2), (16, 8), (17, 6), (18, 20), (19, 12), (20, 18), (21, 1), (22, 19)]
