In [43]:
import pandas as pd 

In [44]:
from google.colab import files
uploaded = files.upload()
df = pd.read_csv('german_credit_data.csv')

Saving german_credit_data.csv to german_credit_data (3).csv


In [45]:
df = df.drop('Unnamed: 0', axis=1)


In [46]:
columns = ['Saving accounts', 'Checking account', 'Sex', 'Housing']

for column in columns:
    unique_values = df[column].unique()
    print("Unique values in {}: {}".format(column, unique_values))


Unique values in Saving accounts: [nan 'little' 'quite rich' 'rich' 'moderate']
Unique values in Checking account: ['little' 'moderate' nan 'rich']
Unique values in Sex: ['male' 'female']
Unique values in Housing: ['own' 'free' 'rent']


In [47]:
df['Saving accounts'] = df['Saving accounts'].replace({'little': 0, 'moderate': 1, 'rich': 2, 'quite rich': 3})

df['Checking account'] = df['Checking account'].replace({'little': 1, 'moderate': 2, 'rich': 3})

df['Sex'] = df['Sex'].replace({'male': 1, 'female': 0})

df['Housing'] = df['Housing'].replace({'own': 2, 'free': 0, 'rent': 1})

df['Risk'] = df['Risk'].replace({'good': 1, 'bad': 0})

df = pd.get_dummies(df, columns=['Purpose'])


In [48]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)


In [49]:
from sklearn.model_selection import train_test_split

y = df_imputed['Risk']
X = df_imputed.drop(['Risk'], axis=1)

random_state = 42
test_size = 0.3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)


# **KNN**


In [50]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

knn_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

knn = KNeighborsClassifier()
knn_cv_model = GridSearchCV(knn, knn_params, cv=5).fit(X_train, y_train)
print(knn_cv_model.best_params_)

knn_tuned = KNeighborsClassifier(
    n_neighbors=knn_cv_model.best_params_['n_neighbors'],
    weights=knn_cv_model.best_params_['weights'],
    p=knn_cv_model.best_params_['p']
).fit(X_train, y_train)

y_pred = knn_tuned.predict(X_test)
print(accuracy_score(y_test, y_pred))


{'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
0.6533333333333333


# **Logistic Regression**

In [51]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

logreg_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}

logreg = LogisticRegression(max_iter=10000)

logreg_cv_model = GridSearchCV(logreg, logreg_params, cv=5).fit(X_train, y_train)
print(logreg_cv_model.best_params_)

logreg_tuned = LogisticRegression(
    penalty=logreg_cv_model.best_params_['penalty'],
    C=logreg_cv_model.best_params_['C'],
    solver=logreg_cv_model.best_params_['solver'],
    max_iter=10000
).fit(X_train, y_train)

y_pred = logreg_tuned.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(logreg_cv_model.best_params_)

{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.7033333333333334
{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}


# Decision Tree


In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

cart = DecisionTreeClassifier()
cart_params = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

cart_cv_model = GridSearchCV(cart, cart_params, cv=5).fit(X_train, y_train)
print(cart_cv_model.best_params_)

cart_tuned = DecisionTreeClassifier(
    max_depth=cart_cv_model.best_params_['max_depth'],
    min_samples_split=cart_cv_model.best_params_['min_samples_split'],
    criterion=cart_cv_model.best_params_['criterion']
).fit(X_train, y_train)

y_pred = cart_tuned.predict(X_test)
print(accuracy_score(y_test, y_pred))


{'criterion': 'entropy', 'max_depth': 7, 'min_samples_split': 10}
0.73


# **Bonus: Improving the Model with Feature Selection**

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

feature_importances = rf.feature_importances_

selector = SelectFromModel(rf, threshold=0.05)
selector.fit(X_train, y_train)

X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

model = LogisticRegression()
model.fit(X_train_selected, y_train)

y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("  Improving the Model with Feature Selection Score: {:.2f}".format(accuracy))


  Improving the Model with Feature Selection Score: 0.71
