## Import the Minimum Required Libraries

In [3]:
import pandas as pd
import numpy as np

## Read the Data

In [70]:
data = pd.read_csv("trainCredit.csv")

## Check for NaN (Missing Values)

In [72]:
data.isnull().sum()

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

## Extract the Data

In [73]:
def extractData(path="trainCredit.csv", target="default payment next month"):
    """Load the credit data set and split it into features and target.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to ``"trainCredit.csv"``.
    target : str, optional
        Name of the label column. Defaults to
        ``"default payment next month"``.

    Returns
    -------
    x : pandas.DataFrame
        Every column of the file except ``target``.
    y : pandas.Series
        The ``target`` column.
    data : pandas.DataFrame
        The complete frame as read from ``path``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the file.
    """
    data = pd.read_csv(path)
    y = data[target]
    x = data.drop([target], axis=1)

    return x, y, data

In [74]:
x, y, data = extractData()

## The Benchmark (Majority-Class Baseline)

In [13]:
y.value_counts(normalize=True)

0    0.7788
1    0.2212
Name: default payment next month, dtype: float64

## Dummy data

In [31]:
def extractNumerical(x):
    """Return ``x`` without its three categorical columns.

    ``SEX``, ``EDUCATION`` and ``MARRIAGE`` are dropped; every other column
    is returned unchanged. ``x`` itself is not modified.
    """
    categorical_columns = ("SEX", "EDUCATION", "MARRIAGE")
    return x.drop(columns=list(categorical_columns))

In [54]:
def sexData(x):
    """Translate the ``SEX`` codes of ``x`` into text labels.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing a ``SEX`` column.

    Returns
    -------
    pandas.DataFrame
        Single column ``sexDummy`` holding ``"MALE"`` where ``SEX == 1`` and
        ``"FEMALE"`` for every other value, indexed like ``x``.
    """
    labels = np.where(x["SEX"] == 1, "MALE", "FEMALE")

    # np.where returns a bare array; re-attach x's index so a later
    # column-wise concat with other frames derived from x lines up by row.
    return pd.DataFrame({'sexDummy': labels}, index=x.index)

In [55]:
def educData(x):
    """Translate the ``EDUCATION`` codes of ``x`` into text labels.

    Codes 1, 2, 3 and 4 become ``"S2"``, ``"S1"``, ``"SMA"`` and ``"LAIN"``.
    Any other code is kept as its string form (e.g. ``0`` -> ``"0"``).

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing an ``EDUCATION`` column.

    Returns
    -------
    pandas.DataFrame
        Single column ``educDummy``, indexed like ``x``.
    """
    codes = x["EDUCATION"]

    # Start from the stringified raw codes so unmapped values pass through
    # as text, then overwrite the known codes one by one.
    educ = np.asarray(codes.astype(str), dtype=object)
    for code, label in ((1, "S2"), (2, "S1"), (3, "SMA"), (4, "LAIN")):
        educ = np.where(codes == code, label, educ)

    # Re-attach x's index so a later column-wise concat with other frames
    # derived from x lines up by row.
    return pd.DataFrame({'educDummy': educ}, index=x.index)

In [59]:
def marriageData(x):
    """Translate the ``MARRIAGE`` codes of ``x`` into text labels.

    Codes 1, 2 and 3 become ``"MARRIED"``, ``"SINGLE"`` and ``"OTHER"``.
    Any other code is kept as its string form (e.g. ``0`` -> ``"0"``).

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing a ``MARRIAGE`` column.

    Returns
    -------
    pandas.DataFrame
        Single column ``marriageDummy``, indexed like ``x``.
    """
    codes = x["MARRIAGE"]

    # Start from the stringified raw codes so unmapped values pass through
    # as text, then overwrite the known codes one by one.
    marriage = np.asarray(codes.astype(str), dtype=object)
    for code, label in ((1, "MARRIED"), (2, "SINGLE"), (3, "OTHER")):
        marriage = np.where(codes == code, label, marriage)

    # Re-attach x's index so a later column-wise concat with other frames
    # derived from x lines up by row.
    return pd.DataFrame({'marriageDummy': marriage}, index=x.index)

In [76]:
# Turn each coded categorical column into a one-column frame of text labels,
# then place the three frames side by side.
sex, educ, marriage = sexData(x), educData(x), marriageData(x)
dummy = pd.concat((sex, educ, marriage), axis="columns")

In [63]:
def extractDummy(x):
    """One-hot encode the three text-label columns of ``x``.

    Only ``sexDummy``, ``educDummy`` and ``marriageDummy`` are read; the
    result of ``pd.get_dummies`` on those columns is returned.
    """
    label_columns = ["sexDummy", "educDummy", "marriageDummy"]
    labels_only = x.loc[:, label_columns]
    return pd.get_dummies(labels_only)

In [77]:
numerical = extractNumerical(x)
# Encode from the label frames rather than from `dummy` itself: feeding the
# already-encoded `dummy` back into extractDummy on a re-run of this cell
# would fail, because the sexDummy/educDummy/marriageDummy columns are gone.
dummy = extractDummy(pd.concat([sex, educ, marriage], axis=1))
x_data = pd.concat([numerical, dummy], axis=1)

In [78]:
dummy.head()

Unnamed: 0,sexDummy_FEMALE,sexDummy_MALE,educDummy_0,educDummy_5,educDummy_6,educDummy_LAIN,educDummy_S1,educDummy_S2,educDummy_SMA,marriageDummy_0,marriageDummy_MARRIED,marriageDummy_OTHER,marriageDummy_SINGLE
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


# The Classifiers before Feature Engineering

In [79]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_data, y, test_size=0.25, random_state=123)

In [80]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier

from mlxtend.classifier import StackingClassifier
# cross_val_score moved to sklearn.model_selection (sklearn.cross_validation
# was removed in scikit-learn 0.20).
from sklearn.model_selection import cross_val_score


In [107]:
# Baseline models; every estimator that accepts a seed uses random_state=123.

# Instance-based
knn = KNeighborsClassifier()

# Linear
logreg = LogisticRegression(random_state=123)

# Support vector machines
linearSVM = LinearSVC(random_state=123)
kernelSVM = SVC(random_state=123)

# Single trees
extratree = ExtraTreeClassifier(random_state=123)
decisiontree = DecisionTreeClassifier(random_state=123)

# Ensembles, 100 estimators each
bagging = BaggingClassifier(n_estimators=100, random_state=123)
randomforest = RandomForestClassifier(n_estimators=100, random_state=123)
boosting = GradientBoostingClassifier(n_estimators=100, random_state=123)
adaboost = AdaBoostClassifier(n_estimators=100, random_state=123)



In [108]:
# Two more gradient-boosting models that differ only in the number of stages.
boosting2 = GradientBoostingClassifier(n_estimators=150, random_state=123)
boosting3 = GradientBoostingClassifier(n_estimators=200, random_state=123)


In [109]:
# All baseline estimators in one list.
classifiers = [
    knn,
    logreg,
    linearSVM,
    kernelSVM,
    decisiontree,
    extratree,
    bagging,
    randomforest,
    boosting,
    adaboost,
]

In [110]:
# for i in classifiers:
#     scores = cross_val_score(i, x_data, y, cv=7, scoring='accuracy')
#     print("Accuracy: %0.4f (+/- %0.4f)" 
#           % (scores.mean(), scores.std())), i

In [113]:
# Stack the three gradient-boosting models; their class probabilities are
# passed (not averaged) to a cross-validated logistic regression meta-learner.
logregCVStacking = LogisticRegressionCV(random_state=123)
sclf = StackingClassifier(
    classifiers=[boosting, boosting2, boosting3],
    use_probas=True,
    average_probas=False,
    meta_classifier=logregCVStacking
)

print('7-fold cross validation:\n')

# (label, estimator) pairs; the linear and kernel SVMs are left out of this run.
labelled_models = [
    ('KNN', knn),
    ('Logistic Regression', logreg),
    ('Decision Trees', decisiontree),
    ('Extra Trees', extratree),
    ('Bagging', bagging),
    ('Random Forest', randomforest),
    ('Stochastic Gradient Boosting', boosting),
    ('Stochastic Gradient Boosting 2', boosting2),
    ('Stochastic Gradient Boosting 3', boosting3),
    ('Adaptive Boosting', adaboost),
    ('Stacking    : Stacking All Model with Logistic Reg', sclf),
]

for label, clf in labelled_models:
    # Accuracy over 7 folds of the full feature matrix.
    scores = cross_val_score(clf, x_data, y, cv=7, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))

7-fold cross validation:

Accuracy: 0.7552 (+/- 0.0069) [KNN]
Accuracy: 0.7788 (+/- 0.0003) [Logistic Regression]
Accuracy: 0.7239 (+/- 0.0093) [Decision Trees]
Accuracy: 0.7265 (+/- 0.0137) [Extra Trees]
Accuracy: 0.8152 (+/- 0.0103) [Bagging]
Accuracy: 0.8164 (+/- 0.0096) [Random Forest]
Accuracy: 0.8210 (+/- 0.0106) [Stochastic Gradient Boosting]
Accuracy: 0.8209 (+/- 0.0110) [Stochastic Gradient Boosting 2]
Accuracy: 0.8209 (+/- 0.0107) [Stochastic Gradient Boosting 3]
Accuracy: 0.8169 (+/- 0.0087) [Adaptive Boosting]
Accuracy: 0.8135 (+/- 0.0110) [Stacking    : Stacking All Model with Logistic Reg]


# Feature Engineering

In [46]:
def featureEngineering():
    """Derive eight product/ratio features from the global ``x_data`` frame.

    Takes no arguments; reads the module-level ``x_data`` and returns a new
    DataFrame holding only the derived columns.

    NOTE(review): the columns read here (``satisfaction_level``,
    ``average_montly_hours``, ``number_project``, ``salary numerical``) are not
    among the columns of the credit data this notebook loads and encodes, so
    the first lookup would raise ``KeyError`` as written. This looks carried
    over from a different data set -- confirm and replace with features built
    from the credit columns.
    """
    
    data = x_data
    feature = pd.DataFrame()
    
    # Product of satisfaction and monthly hours, and the remainder of the hours.
    feature["satisfaction_hour"] = data["satisfaction_level"] * data["average_montly_hours"]
    feature["disatisfaction_hour"] = data["average_montly_hours"] - feature["satisfaction_hour"]
    
    # Same split applied to the project count.
    feature["satisfaction_project"] =  data["satisfaction_level"] * data["number_project"]
    feature["disatisfaction_project"] =  data["number_project"] - feature["satisfaction_project"]
    
    # Per-project ratios; no guard here against number_project == 0.
    feature["project_hour_average"] = data["average_montly_hours"] / data["number_project"]
    feature["time_per_project"] = 1/data["number_project"]
    
    # Salary ratios; no guard here against a zero denominator.
    feature["salary_per_hour"] = data["salary numerical"] / data["average_montly_hours"]
    feature["salary_per_project"] = data["salary numerical"] / data["number_project"]

    return feature
    

In [47]:
# Keep the result under its own name: rebinding `featureEngineering` to the
# returned frame would leave the function uncallable on a re-run of this cell.
engineered_features = featureEngineering()
x_data_feature_engineering = pd.concat([x_data, engineered_features], axis=1)

# The Classifiers after Feature Engineering

In [48]:
# Repeat the 7-fold accuracy comparison on the feature-engineered matrix,
# this time including both SVMs and a stack of forest/boosting models.
# NOTE(review): `randomforest2` and `randomforest3` are not assigned anywhere
# in the visible notebook (only `boosting2`/`boosting3` are), so this cell
# raises NameError unless they are defined elsewhere -- confirm.
logregCVStacking = LogisticRegressionCV(random_state= 123)
sclf = StackingClassifier(classifiers=[randomforest, boosting, randomforest2, randomforest3],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier= logregCVStacking)

print('7-fold cross validation:\n')

# Estimators and labels are paired by position; both lists hold 13 entries.
for clf, label in zip([knn, logreg, linearSVM, kernelSVM, decisiontree, extratree, bagging, randomforest, randomforest2, randomforest3, boosting, adaboost, sclf], 
                      ['KNN',
                       'Logistic Regression', 
                       'Linear SVM', 
                       'Kernel SVM',
                       'Decision Trees',
                       'Extra Trees',
                       'Bagging',
                       'Random Forest 1',
                       'Random Forest 2',
                       'Random Forest 3',
                       'Stochastic Gradient Boosting',
                       'Adaptive Boosting',
                       'Stacking    : Stacking All Model with Logistic Reg']):

    # Accuracy over 7 folds of the feature-engineered matrix.
    scores = cross_val_score(clf, x_data_feature_engineering, y, cv=7, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" 
          % (scores.mean(), scores.std(), label))

7-fold cross validation:

Accuracy: 0.9441 (+/- 0.0103) [KNN]
Accuracy: 0.8823 (+/- 0.0111) [Logistic Regression]
Accuracy: 0.7880 (+/- 0.0517) [Linear SVM]
Accuracy: 0.9706 (+/- 0.0143) [Kernel SVM]
Accuracy: 0.9833 (+/- 0.0099) [Decision Trees]
Accuracy: 0.9667 (+/- 0.0202) [Extra Trees]
Accuracy: 0.9911 (+/- 0.0087) [Bagging]
Accuracy: 0.9922 (+/- 0.0078) [Random Forest 1]
Accuracy: 0.9922 (+/- 0.0082) [Random Forest 2]
Accuracy: 0.9923 (+/- 0.0077) [Random Forest 3]
Accuracy: 0.9765 (+/- 0.0022) [Stochastic Gradient Boosting]
Accuracy: 0.9667 (+/- 0.0032) [Adaptive Boosting]
Accuracy: 0.9924 (+/- 0.0075) [Stacking    : Stacking All Model with Logistic Reg]


## GridSearch Optim

# KNN Optim

In [86]:
# Candidate neighbourhood sizes: every integer from 1 to 20 inclusive.
param_knn = {'n_neighbors': list(range(1, 21))}

In [87]:
# GridSearchCV moved to sklearn.model_selection (sklearn.grid_search was
# removed in scikit-learn 0.20).
from sklearn.model_selection import GridSearchCV

In [88]:
# 7-fold grid search over n_neighbors, fitted on the training split only.
gridKNN = GridSearchCV(estimator=knn, param_grid=param_knn, cv=7)
gridKNN.fit(x_train, y_train)

GridSearchCV(cv=7, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [89]:
gridKNN.best_params_

{'n_neighbors': 20}

In [90]:
gridKNN.best_score_

0.77617777777777774

# Logreg Optim

In [91]:
# Candidate values for C (inverse regularisation strength), kept in the
# original order.
param_logreg = {
    'C': [
        1750, 2000, 1500, 1200, 1000,
        333, 100, 33, 10, 3.33, 1,
        0.33, 0.1, 0.033, 0.01, 0.0033, 0.001, 0.00033, 0.0001,
    ]
}

In [92]:
# 7-fold grid search over C, fitted on the training split only.
gridLogReg = GridSearchCV(estimator=logreg, param_grid=param_logreg, cv=7)
gridLogReg.fit(x_train, y_train)

GridSearchCV(cv=7, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1750, 2000, 1500, 1200, 1000, 333, 100, 33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 0.001, 0.00033, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [93]:
gridLogReg.best_score_

0.77768888888888887

In [94]:
gridLogReg.best_params_

{'C': 1500}

# Decision Tree Optim

In [119]:
# Candidate tree depths: every integer from 1 to 12 inclusive.
param_dectree = {'max_depth': list(range(1, 13))}

In [120]:
# 7-fold grid search over max_depth for the seeded decision tree.
gridDecTree = GridSearchCV(estimator=decisiontree, param_grid=param_dectree, cv=7)

In [121]:
gridDecTree.fit(x_train, y_train)

GridSearchCV(cv=7, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=123, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [122]:
gridDecTree.best_score_

0.82057777777777774

In [123]:
gridDecTree.best_params_

{'max_depth': 3}

# Bagging with the Best Decision Tree Parameters

In [124]:
# Bag 1000 depth-3 trees (the depth picked by the grid search above).
baggBest = BaggingClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=1000,
    random_state=123
)

In [125]:
# 7-fold accuracy of the bagged trees on the training split.
baggBestScore = cross_val_score(
    estimator=baggBest,
    X=x_train,
    y=y_train,
    cv=7,
    scoring='accuracy'
)

In [126]:
baggBestScore.mean()

0.82146726554229044

In [127]:
baggBestScore.std()

0.0063269953427523952

# Gradient Boosting

In [129]:
# RandomizedSearchCV moved to sklearn.model_selection (sklearn.grid_search was
# removed in scikit-learn 0.20).
from sklearn.model_selection import RandomizedSearchCV

In [130]:
# Search space for gradient boosting: 6 learning rates x 14 ensemble sizes.
param_Boosting = {
    'learning_rate': [1, 0.33, 0.1, 0.033, 0.01, 0.0033],
    'n_estimators': [
        100, 200, 300, 400, 500, 700, 900,
        1000, 1200, 1500, 2000, 3000, 4000, 5000,
    ],
}

In [131]:
# Randomised search (30 draws, 7-fold CV) over the gradient-boosting space in
# `param_Boosting`. Both the estimator and the sampler are seeded with 123,
# like the other models in this notebook, so the search is reproducible.
boostOptim = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=123),
    param_Boosting,
    n_iter=30,
    cv=7,
    random_state=123
)

In [None]:
boostOptim.fit(x_train, y_train)

In [None]:
boostOptim.best_score_

In [None]:
boostOptim.best_params_