# import

In [181]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# read file

In [182]:
all_df = pd.read_csv('C:/Users/caban/Desktop/data_science/Naver_Data_Competition_2018/project/german_credit/동형이.csv')

In [183]:
all_df.head()

Unnamed: 0,﻿Class,DM_status(Q)_A11,DM_status(Q)_A12,DM_status(Q)_A13,DM_status(Q)_A14,Duration(N),Credit_history(Q)_A30,Credit_history(Q)_A31,Credit_history(Q)_A32,Credit_history(Q)_A33,...,Housing(Q)_A152,Job(Q)_A171,Job(Q)_A172,Job(Q)_A173,Job(Q)_A174,Telephone(Q)_A191,Telephone(Q)_A192,Foreign_worker(Q)_A201,Foreign_worker(Q)_A202,Age in years
0,1,1,0,0,0,6,0,0,0,0,...,1,0,0,1,0,0,1,1,0,67
1,2,0,1,0,0,48,0,0,1,0,...,1,0,0,1,0,1,0,1,0,22
2,1,0,0,0,1,12,0,0,0,0,...,1,0,1,0,0,1,0,1,0,49
3,1,1,0,0,0,42,0,0,1,0,...,0,0,0,1,0,1,0,1,0,45
4,2,1,0,0,0,24,0,0,0,1,...,0,0,0,1,0,1,0,1,0,53


# data handling

In [184]:
y_series = all_df['﻿Class']

In [185]:
all_df.drop(['﻿Class', 'Age in years'], axis=1, inplace=True)

In [186]:
all_df['Credit_amount(N)'] = np.log(all_df['Credit_amount(N)'])

# Logistic regression (L1 regularization)

In [187]:
train_X, test_X, train_y, test_y = train_test_split(all_df, y_series, test_size=0.2)

In [188]:
regr = LogisticRegression(fit_intercept = True, penalty='l1', intercept_scaling = 5)

In [189]:
regr.fit(train_X, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=5, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [190]:
regr.coef_

array([[ 0.28769178,  0.        , -1.01667646, -1.21103549,  0.03571946,
         0.55611963,  0.69176195,  0.        , -0.3191999 , -0.65816474,
         0.57032313, -0.86737584, -0.10856769,  0.        , -0.33841752,
         0.        ,  0.        ,  0.46808973,  0.        ,  0.        ,
         0.03231639,  0.56273395,  0.38886412,  0.        , -0.6158778 ,
        -0.14064718, -0.04728687,  0.03421318,  0.05938439, -0.51460416,
         0.        , -0.35189378, -0.04667123,  0.        ,  0.13061951,
         0.        , -0.43536094,  0.        ,  0.18928311, -0.55092468,
        -0.38242254,  0.        ,  0.        ,  0.        , -0.29423293,
         0.        , -0.3713832 , -0.00176065,  0.06700599,  0.        ,
         0.        ,  0.        , -0.16650522,  0.        , -0.70386731]])

In [191]:
regr.intercept_

array([-0.8296392])

In [192]:
pred = regr.predict(test_X)

In [193]:
regr.score(test_X, test_y)

0.775

In [194]:
confusion_matrix(test_y, pred)

array([[125,   9],
       [ 36,  30]], dtype=int64)

# tree based model function

In [195]:
def fit_trees(algo, n_jobs, max_depth, n_estimators): 
    if algo == "Decision Trees": 
        estimator = DecisionTreeClassifier(criterion = 'entropy')
    else:
        estimator = RandomForestClassifier(criterion = 'entropy')
        
    cv = ShuffleSplit(train_X.shape[0], n_iter=10, test_size=0.2) 
        
    if algo == "Decision Trees": 
        classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=dict(max_depth=max_depth), n_jobs=n_jobs, scoring='f1')
    else: 
        classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=dict(n_estimators=n_estimators, max_depth=max_depth), n_jobs=n_jobs, scoring='f1')
            
    classifier.fit(train_X, train_y)
    
    print ("Best Estimator learned through GridSearch")
    print (classifier.best_estimator_) 
    
    return cv, classifier.best_estimator_.max_depth, classifier.best_estimator_.n_estimators
    

# random forest

In [196]:
max_depth = np.linspace(5, 10, 5)

In [197]:
n_estimators = [10, 100, 500, 1000] 

In [198]:
cv,max_depth,n_estimators=fit_trees('Random Forests', n_jobs=10, max_depth=max_depth, n_estimators=n_estimators) 

Best Estimator learned through GridSearch
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=8.75, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [199]:
estimator = RandomForestClassifier(max_depth = max_depth, n_estimators = n_estimators, n_jobs=10, criterion='entropy')

In [200]:
estimator.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=8.75, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=10,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [201]:
pred = estimator.predict(test_X)

In [202]:
confusion_matrix(test_y, pred)

array([[129,   5],
       [ 47,  19]], dtype=int64)

In [203]:
print(classification_report(test_y, pred))

             precision    recall  f1-score   support

          1       0.73      0.96      0.83       134
          2       0.79      0.29      0.42        66

avg / total       0.75      0.74      0.70       200



In [204]:
accuracy_score(test_y, pred)

0.74

# decision tree

In [205]:
max_depth = np.linspace(5, 10, 5)

In [206]:
n_estimators = [10, 100, 500, 1000]

In [207]:
cv,max_depth,n_estimators=fit_trees('Decision Trees', n_jobs=10, max_depth=max_depth, n_estimators=False ) 

Best Estimator learned through GridSearch
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=7.5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


AttributeError: 'DecisionTreeClassifier' object has no attribute 'n_estimators'

In [208]:
estimator = DecisionTreeClassifier(max_depth = max_depth)

In [209]:
estimator.fit(train_X, train_y)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()