In [1]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
from pathlib import Path

Info on dataset: [South German Credit Dataset](https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29)

In [2]:
# Location of the credit CSV, relative to the notebook's working directory
file = Path('..') / 'Resources' / 'german_credit.csv'

In [3]:
# Read the CSV into a DataFrame, then preview its first five rows
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0,laufkont,laufzeit,moral,verw,hoehe,sparkont,beszeit,rate,famges,buerge,...,verm,alter,weitkred,wohn,bishkred,beruf,pers,telef,gastarb,kredit
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1.0
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1.0
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1.0
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1.0
4,1,10,4,0,2241,1,2,1,3,1,...,1,48,3,1,2,2,1,1,1,1.0


In [4]:
# Number of missing values in each column (a per-column tally, not a row count)
df.isna().sum()

laufkont      0
laufzeit      0
moral         0
verw          0
hoehe         0
sparkont      0
beszeit       0
rate          0
famges        0
buerge        0
wohnzeit      0
verm          0
alter         0
weitkred      0
wohn          0
bishkred      0
beruf         0
pers          0
telef         0
gastarb       0
kredit      200
dtype: int64

In [5]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [6]:
# Split the frame into the feature matrix (every column but the label)
# and the label vector
target_column = 'kredit'
X = df.drop(columns=[target_column])
y = df[target_column]

In [7]:
# Hold out a test set; the fixed seed keeps the split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [8]:
# Pipeline stages, applied in order: standardise the features, reduce them
# with PCA (n_components=0.9), then fit a logistic-regression classifier.
steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.9)),
    # The hyper-parameter grid includes the stochastic 'sag' solver, so the
    # classifier is seeded (same seed as the train/test split) to make the
    # selected parameters and score reproducible on a fresh kernel.
    ('lr', LogisticRegression(random_state=1)),
]

In [9]:
# Chain the stages into a single estimator
pipe = Pipeline(steps=steps)

In [10]:
# Search grid; the 'lr__' prefix routes each setting to the pipeline's 'lr' step
params = dict(
    lr__C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    lr__solver=['sag', 'lbfgs'],
)

In [11]:
# Build the grid search around the pipeline, then fit it on the training
# split only so the test split stays unseen
cv = GridSearchCV(estimator=pipe, param_grid=params)
cv.fit(X_train, y=y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA(n_components=0.9)),
                                       ('lr', LogisticRegression())]),
             param_grid={'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'lr__solver': ['sag', 'lbfgs']})

In [12]:
# Score the fitted search object on the held-out test split
cv.score(X=X_test, y=y_test)

0.755

In [13]:
# Display the parameter combination stored on the fitted search object
# as its `best_params_` attribute
cv.best_params_

{'lr__C': 0.1, 'lr__solver': 'sag'}