In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [46]:
# Load Data
credit_data = pd.read_csv('../input/german-credit-data-with-risk/german_credit_data.csv')

In [47]:
# First Look at data
credit_data.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
5,5,35,male,1,free,,,9055,36,education,good
6,6,53,male,2,own,quite rich,,2835,24,furniture/equipment,good
7,7,35,male,3,rent,little,moderate,6948,36,car,good
8,8,61,male,1,own,rich,,3059,12,radio/TV,good
9,9,28,male,3,own,little,moderate,5234,30,car,bad


In [48]:
# Renaming and Fixing Index
credit_data.rename(columns = {'Unnamed: 0' : 'Index'},inplace=True)
credit_data.set_index('Index',inplace=True)
credit_data.head()

Unnamed: 0_level_0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [49]:
# Checking Null values
credit_data.isnull().any()

Age                 False
Sex                 False
Job                 False
Housing             False
Saving accounts      True
Checking account     True
Credit amount       False
Duration            False
Purpose             False
Risk                False
dtype: bool

In [50]:
# Replacing Null values with string Unknown
credit_data.fillna('Unknown',inplace=True)
credit_data.head()

Unnamed: 0_level_0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,67,male,2,own,Unknown,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,Unknown,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [51]:
# Encoding Risk in 1 and 0, good = 1, bad = 0
credit_data[['Risk']]=credit_data.Risk.map({'good':1,'bad':0})
credit_data.head(3)

Unnamed: 0_level_0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,67,male,2,own,Unknown,little,1169,6,radio/TV,1
1,22,female,2,own,little,moderate,5951,48,radio/TV,0
2,49,male,1,own,little,Unknown,2096,12,education,1


In [52]:
# importing pakages
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

In [53]:
# Creating Feature Data
feature_data = credit_data.drop('Risk',axis =1)

In [54]:
feature_data.head(2)

Unnamed: 0_level_0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,67,male,2,own,Unknown,little,1169,6,radio/TV
1,22,female,2,own,little,moderate,5951,48,radio/TV


In [55]:
# Creating target Data
target_data = credit_data.Risk
target_data.head()

Index
0    1
1    0
2    1
3    1
4    0
Name: Risk, dtype: int64

In [56]:
# Checking column for data type to decide encoding. Ordinal Data will be label encode and non ordinal data will be one hot encode
credit_data.Housing.value_counts()
# Housing column to be one hot encode

own     713
rent    179
free    108
Name: Housing, dtype: int64

In [57]:
credit_data['Saving accounts'].value_counts()
# Saving accounts to be label encode

little        603
Unknown       183
moderate      103
quite rich     63
rich           48
Name: Saving accounts, dtype: int64

In [58]:
credit_data['Checking account'].value_counts()
# Checking account to be label encode

Unknown     394
little      274
moderate    269
rich         63
Name: Checking account, dtype: int64

In [59]:
credit_data['Purpose'].value_counts()
# Purpose to be one hot encode

car                    337
radio/TV               280
furniture/equipment    181
business                97
education               59
repairs                 22
domestic appliances     12
vacation/others         12
Name: Purpose, dtype: int64

In [60]:
class CustLabel(BaseEstimator, TransformerMixin):
    
    def fit(self,X,Y=None):
        return self
    
    def transform(self,X,Y=None):
        db = {'Unknown':0,'little':1,'moderate':2,'rich':3,'quite rich':4}
#         print (type(X))
        r = X.replace(db)
        return pd.DataFrame(r)

In [61]:
credit_data.head(3)

Unnamed: 0_level_0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,67,male,2,own,Unknown,little,1169,6,radio/TV,1
1,22,female,2,own,little,moderate,5951,48,radio/TV,0
2,49,male,1,own,little,Unknown,2096,12,education,1


In [62]:
# Creating pipelines

In [63]:
nums_pipeline = make_pipeline(MinMaxScaler())

In [64]:
custlab_pipeline = make_pipeline(CustLabel())

In [65]:
onehot_pipeline = make_pipeline(OneHotEncoder())

In [75]:
ct = ColumnTransformer(
      transformers=[
          ('number_data', nums_pipeline, ['Credit amount','Age','Duration']),
          ('salary_data', custlab_pipeline, ['Saving accounts','Checking account']),
          ('dept_data', onehot_pipeline, ['Sex','Housing','Purpose'])
      ]
)

In [67]:
trainX,testX,trainY,testY = train_test_split(feature_data,target_data)

In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
from sklearn.svm import SVC

In [69]:
AllModels = [SVC(kernel = 'linear'),
             LogisticRegression(solver='lbfgs'),
             RandomForestClassifier(n_estimators = 100),
             BaggingClassifier(),
             AdaBoostClassifier(),
             GradientBoostingClassifier(),
             DecisionTreeClassifier(),
             ExtraTreeClassifier(),
             KNeighborsClassifier(),
             NearestNeighbors(),
             BernoulliNB(),
#              GaussianNB(),
             MultinomialNB()
            ]
AllModelsName = ['SVC(kernel = linear)',
                 'LogisticRegression()',
                 'RandomForestClassifier()',
                 'BaggingClassifier()',
                 'AdaBoostClassifier()',
                 'GradientBoostingClassifier()',
                 'DecisionTreeClassifier()',
                 'ExtraTreeClassifier()',
                 'KNeighborsClassifier()',
                 'NearestNeighbors()',
                 'BernoulliNB()',
#                  'GaussianNB()',
                 'MultinomialNB()'
                ]


In [76]:
pipelines= []
for models in AllModels:
    pipeline = Pipeline(steps = [
        ('preprocessor',ct),
        ('Classifier',models)
    ])
    pipelines.append(pipeline)

In [77]:
for pipeline in pipelines:    
    pipeline.fit(trainX,trainY)
#     print(pipeline)


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [79]:
for index,pipeline in enumerate(pipelines):        
    try:
        print(str(AllModelsName[index])," = ",pipeline.score(testX,testY))
    except:
        pass

SVC(kernel = linear)  =  0.684
LogisticRegression()  =  0.676
RandomForestClassifier()  =  0.716
BaggingClassifier()  =  0.74
AdaBoostClassifier()  =  0.736
GradientBoostingClassifier()  =  0.752
DecisionTreeClassifier()  =  0.712
ExtraTreeClassifier()  =  0.588
KNeighborsClassifier()  =  0.66
BernoulliNB()  =  0.696
MultinomialNB()  =  0.68


In [None]:
#  base_estimator=SVC
ab = AdaBoostClassifier(algorithm='SAMME',base_estimator=SVC(kernel='rbf',C = 10000, gamma = 'auto'),n_estimators=100)
pipeline = make_pipeline(ct,ab)
pipeline.fit(trainX,trainY)
print('AdaBoost Accuracy with SVC = ',(pipeline.score(testX,testY)*100))