In [21]:
#standard packages
import pandas as pd
import numpy as np
import sklearn.datasets

#preprocessing
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

#results analysis
from sklearn.metrics import confusion_matrix, precision_score, classification_report

# models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC

#pipeline
from sklearn.pipeline import Pipeline


#dataframe loaded
df = pd.read_csv('creditg.csv')


In [2]:
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,'<0',6,'critical/other existing credit',radio/tv,1169,'no known savings','>=7',4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,'0<=X<200',48,'existing paid',radio/tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,'<0',42,'existing paid',furniture/equipment,7882,'<100','4<=X<7',2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,'<0',24,'delayed previously','new car',4870,'<100','1<=X<4',3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


In [3]:
#check for missing values
df.isnull().sum(axis = 0)

checking_status           0
duration                  0
credit_history            0
purpose                   0
credit_amount             0
savings_status            0
employment                0
installment_commitment    0
personal_status           0
other_parties             0
residence_since           0
property_magnitude        0
age                       0
other_payment_plans       0
housing                   0
existing_credits          0
job                       0
num_dependents            0
own_telephone             0
foreign_worker            0
class                     0
dtype: int64

In [4]:
df.isna().sum()

checking_status           0
duration                  0
credit_history            0
purpose                   0
credit_amount             0
savings_status            0
employment                0
installment_commitment    0
personal_status           0
other_parties             0
residence_since           0
property_magnitude        0
age                       0
other_payment_plans       0
housing                   0
existing_credits          0
job                       0
num_dependents            0
own_telephone             0
foreign_worker            0
class                     0
dtype: int64

In [5]:
y = df.iloc[:, -1:]
y

Unnamed: 0,class
0,good
1,bad
2,good
3,good
4,bad
...,...
995,good
996,good
997,good
998,bad


In [6]:
X = df.drop('class',axis='columns')
X

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,'<0',6,'critical/other existing credit',radio/tv,1169,'no known savings','>=7',4,'male single',none,4,'real estate',67,none,own,2,skilled,1,yes,yes
1,'0<=X<200',48,'existing paid',radio/tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,2,'real estate',22,none,own,1,skilled,1,none,yes
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,3,'real estate',49,none,own,1,'unskilled resident',2,none,yes
3,'<0',42,'existing paid',furniture/equipment,7882,'<100','4<=X<7',2,'male single',guarantor,4,'life insurance',45,none,'for free',1,skilled,2,none,yes
4,'<0',24,'delayed previously','new car',4870,'<100','1<=X<4',3,'male single',none,4,'no known property',53,none,'for free',2,skilled,2,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,'no checking',12,'existing paid',furniture/equipment,1736,'<100','4<=X<7',3,'female div/dep/mar',none,4,'real estate',31,none,own,1,'unskilled resident',1,none,yes
996,'<0',30,'existing paid','used car',3857,'<100','1<=X<4',4,'male div/sep',none,4,'life insurance',40,none,own,1,'high qualif/self emp/mgmt',1,yes,yes
997,'no checking',12,'existing paid',radio/tv,804,'<100','>=7',4,'male single',none,4,car,38,none,own,1,skilled,1,none,yes
998,'<0',45,'existing paid',radio/tv,1845,'<100','1<=X<4',4,'male single',none,4,'no known property',23,none,'for free',1,skilled,1,yes,yes


In [11]:
#preprocessing and encoding columns

# numeric_features = ['duration',
#                     'credit_amount',
#                     'installment_commitment',
#                     'residence_since',
#                     'age',
#                     'existing_credits',
#                     'num_dependents']
# categorical_features = ['checking_status',
#                        'credit_history',
#                        'purpose',
#                        'savings_status',
#                       'employment',
#                       'personal_status',
#                       'other_parties',
#                        'property_magnitude',
#                        'other_payment_plans',
#                       'housing',
#                       'job',
#                       'own_telephone',
#                       'foreign_worker']


# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_features),
#         ('cat', categorical_transformer, categorical_features)])


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

subset_feature = ['duration',
                    'credit_amount',
                    'installment_commitment',
                    'residence_since',
                    'age',
                    'existing_credits',
                    'num_dependents',
                    'checking_status',
                    'credit_history',
                       'purpose',
                       'savings_status',
                      'employment',
                      'personal_status',
                      'other_parties',
                       'property_magnitude',
                       'other_payment_plans',
                      'housing',
                      'job',
                      'own_telephone',
                      'foreign_worker']
X = X[subset_feature]
X.info()



preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="object")),
    ('cat', categorical_transformer, selector(dtype_include="object"))
])




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   duration                1000 non-null   int64 
 1   credit_amount           1000 non-null   int64 
 2   installment_commitment  1000 non-null   int64 
 3   residence_since         1000 non-null   int64 
 4   age                     1000 non-null   int64 
 5   existing_credits        1000 non-null   int64 
 6   num_dependents          1000 non-null   int64 
 7   checking_status         1000 non-null   object
 8   credit_history          1000 non-null   object
 9   purpose                 1000 non-null   object
 10  savings_status          1000 non-null   object
 11  employment              1000 non-null   object
 12  personal_status         1000 non-null   object
 13  other_parties           1000 non-null   object
 14  property_magnitude      1000 non-null   object
 15  other

In [17]:
cross_val_score(clf, X, y, scoring='accuracy').mean()

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.744

In [36]:
#Pipeline

#models
# KNeighborsClassifier(n_neighbors=i)

# LogisticRegression(solver=lbfgs, C=i)
#     solver{‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’

# SVC(kernel='linear', C=i)
#     kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}, default=’rbf’)
#     degreeint, default=3 (for poly only)


model_choice = SVC(kernel='linear',C=i)

resultslist = []
for i in range(1,10):
    K = i
    C = i/10
    for j in range(1,10):
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model_choice)])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        clf.fit(X_train, y_train)
        resultslist.append(clf.score(X_test, y_test))
        y_pred = clf.predict(X_test)
    conf = confusion_matrix(y_test, y_pred)
    print(conf)
    print("Variable : %.0f" % i)
    print("Current model score: %.5f" % clf.score(X_test, y_test))
print("Average model score: %.5f" % np.mean(resultslist))


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[[ 30  32]
 [ 19 119]]
Variable : 1
Current model score: 0.74500


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[[ 26  28]
 [ 18 128]]
Variable : 2
Current model score: 0.77000


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[[ 28  32]
 [ 24 116]]
Variable : 3
Current model score: 0.72000


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[[ 27  39]
 [ 16 118]]
Variable : 4
Current model score: 0.72500


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[[ 23  33]
 [ 15 129]]
Variable : 5
Current model score: 0.76000


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[[ 31  37]
 [ 24 108]]
Variable : 6
Current model score: 0.69500


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[[ 23  40]
 [ 14 123]]
Variable : 7
Current model score: 0.73000


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[[ 25  35]
 [ 18 122]]
Variable : 8
Current model score: 0.73500


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[[ 28  24]
 [ 18 130]]
Variable : 9
Current model score: 0.79000
Average model score: 0.74969
