Import packages

In [1]:
import numpy as np 
import pandas as pd 
import sklearn as sk 
import matplotlib.pyplot as plt 
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

Load dataset

In [2]:
df = pd.read_csv('credit.csv')

Question 2

In [3]:
df['default'] = (df['default']>1).astype(int)
print(df.columns)
target_column = ['default']
numeric_features =['months_loan_duration',
                   'amount','age','installment_rate',
                  'dependents']
categorical_features =['checking_balance', 'credit_history', 'purpose',
                       'savings_balance', 'employment_length',
                       'personal_status', 'other_debtors', 'residence_history', 'property',
                       'installment_plan', 'housing', 'existing_credits',
                        'telephone', 'foreign_worker', 'job']

Index(['checking_balance', 'months_loan_duration', 'credit_history', 'purpose',
       'amount', 'savings_balance', 'employment_length', 'installment_rate',
       'personal_status', 'other_debtors', 'residence_history', 'property',
       'age', 'installment_plan', 'housing', 'existing_credits', 'default',
       'dependents', 'telephone', 'foreign_worker', 'job'],
      dtype='object')


In [4]:
y = df.pop('default')
X = df

In [5]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [7]:
clf_tree = DecisionTreeClassifier()

In [8]:
pipe_tree = Pipeline([('process', preprocessor), ('clf', clf_tree)])
pipe_tree = pipe_tree.fit(X_train, y_train)

y_pred_train_tree = pipe_tree.predict(X_train)
print('train score accuracy',100*(y_pred_train_tree == y_train).mean())


y_pred_test_tree = pipe_tree.predict(X_test)
print('test score accuracy',100*(y_pred_test_tree == y_test).mean())

recall = metrics.recall_score(y_test, y_pred_test_tree)
print('recall',recall)

train score accuracy 100.0
test score accuracy 70.0
recall 0.47368421052631576


Question 3

In [9]:
clf_rf = RandomForestClassifier(n_estimators = 50, random_state = 40)

In [10]:
pipe_rf = Pipeline([('process', preprocessor), ('clf', clf_rf)])
pipe_rf = pipe_rf.fit(X_train, y_train)

y_pred_train_rf = pipe_rf.predict(X_train)
print('train score accuracy',100*(y_pred_train_rf == y_train).mean())

y_pred_test_rf = pipe_rf.predict(X_test)
print('test score accuracy',100*(y_pred_test_rf == y_test).mean())

recall_rf = metrics.recall_score(y_test, y_pred_test_rf)
print('recall', recall_rf)

train score accuracy 100.0
test score accuracy 76.0
recall 0.38596491228070173


Question 4

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

param_grid = {'n_estimators': list(range(10, 100, 5))}
clf = RandomForestClassifier(random_state = 40)

grid = GridSearchCV(clf, param_grid, cv = 5, scoring = 'recall')

pipe = make_pipeline(preprocessor, grid)

pipe.fit(X_train, y_train)
y_pred_train = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)
recall = metrics.recall_score(y_test, y_pred_test)

print("Best parameter found on development set:")
print(grid.best_params_)
print()

print('train score accuracy',100*(y_pred_train_rf == y_train).mean())
print('test score accuracy',100*(y_pred_test_rf == y_test).mean())
print('recall', recall)
print()

Best parameter found on development set:
{'n_estimators': 25}

train score accuracy 100.0
test score accuracy 76.0
recall 0.43859649122807015

