In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, entry) for entry in files):
        print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Read the HMEQ CSV into a DataFrame and print its column dtypes / non-null counts.
hmeq_csv = '../input/hmeq-data/hmeq.csv'
data = pd.read_csv(hmeq_csv)
data.info()

In [None]:
# Preview the first ten rows of the loaded frame.
data.head(n=10)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
describe_kwargs = {'include': 'all'}
data.describe(**describe_kwargs)

In [None]:
# Per-column missing-value report: absolute count and percentage of rows (1 decimal).
null_counts = data.isnull().sum()
null_share = (null_counts / len(data.index) * 100).round(1)
missing = pd.DataFrame({'Total missing': null_counts, '%': null_share})
missing

In [None]:
# Impute missing values by carrying the previous row's value forward, then
# back-filling whatever is still missing at the top of the frame.
# `fillna(method=...)` is deprecated in current pandas; `ffill()` / `bfill()` are
# the direct replacements, and chaining them avoids in-place mutation.
# NOTE(review): this imputation depends on row order — confirm that is intended.
data = data.ffill().bfill()

In [None]:
# Count plot of the BAD column.
fig, ax = plt.subplots(figsize=(10, 5))
# Pass the column through `x=`: newer seaborn releases interpret the first
# positional argument as `data`, so a positional Series is no longer counted per value.
sns.countplot(x=data['BAD'], ax=ax)
ax.set_title("Target Variable Distribution")
plt.show()

In [None]:
# Count plot of the REASON column.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(10, 5))
# Pass the column through `x=`: newer seaborn releases interpret the first
# positional argument as `data`, so a positional Series is no longer counted per value.
sns.countplot(x=data['REASON'], ax=ax)
ax.set_title("Reason Variable Distribution")
plt.show()

In [None]:
# Count plot of the JOB column.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(10, 5))
# Pass the column through `x=`: newer seaborn releases interpret the first
# positional argument as `data`, so a positional Series is no longer counted per value.
sns.countplot(x=data['JOB'], ax=ax)
ax.set_title("Job Variable Distribution")
plt.show()

In [None]:
# Names of all numeric columns, in their original order, and the matching sub-frame.
# `select_dtypes` is used instead of comparing each dtype with the strings
# 'int' / 'float': those strings resolve to the platform's default widths, so the
# comparison can silently skip numeric columns stored with a different width.
numerical_cols = data.select_dtypes(include='number').columns.tolist()
num = data[numerical_cols]

In [None]:

sns.set_style("whitegrid")

# One histogram per numeric feature, laid out on a two-column grid.
# BAD is left out because it is used as the target further down
# (presumably the column the earlier `numerical_cols[1:]` slice meant to skip).
feature_cols = [col for col in numerical_cols if col != 'BAD']
n_grid_cols = 2
n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    ncols=n_grid_cols,
    nrows=n_grid_rows,
    figsize=(12, 4 * n_grid_rows),
    squeeze=False,  # always a 2-D array, even for a single row
)

for col, ax in zip(feature_cols, axes.flat):
    ax.hist(data[col].dropna(), bins=30, alpha=0.5, linewidth=1)
    ax.set_title("{col_name} Variable Distribution".format(col_name=col))

# Hide grid slots that have no feature to show (odd number of features).
for ax in axes.flat[len(feature_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Separate the BAD target from the features and one-hot encode the
# non-numeric feature columns.
y = data['BAD']
X = data.drop(['BAD'], axis=1)
X = pd.get_dummies(X)

# Resample with SMOTE (imblearn.over_sampling) and plot the resulting class counts.
smo = SMOTE(random_state=0)
X_resampled, y_resampled = smo.fit_resample(X, y)

# `x=` is passed explicitly: newer seaborn releases interpret the first
# positional argument as `data`, so a positional Series is no longer counted per value.
sns.countplot(x=y_resampled)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the feature matrix; `X` is rebound to the scaler's output.
sc = StandardScaler()
X = sc.fit(X).transform(X)

In [None]:
# Resample the scaled features with SMOTE and plot the resulting class counts.
smo = SMOTE(random_state=0)
X_resampled, y_resampled = smo.fit_resample(X, y)

# `x=` is passed explicitly: newer seaborn releases interpret the first
# positional argument as `data`, so a positional Series is no longer counted per value.
sns.countplot(x=y_resampled)
plt.show()

In [None]:
# Hold out the validation set from the ORIGINAL samples first, then oversample
# only the training portion. Splitting after SMOTE leaks information: synthetic
# training points are interpolated from rows that end up in the validation set,
# and the validation set itself contains synthetic rows, which inflates the scores.
# `stratify=y` keeps the class ratio the same in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=101, stratify=y
)
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import itertools

In [None]:
# `plot_confusion_matrix` is deprecated and removed in newer scikit-learn releases;
# `confusion_matrix` + `ConfusionMatrixDisplay` give the same figure.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Spot Check Algorithms
# Fit each candidate with default hyper-parameters on the training split, record
# its accuracy on the validation split and draw its confusion matrix.
models = [
    ('Logistic Regression', LogisticRegression(random_state=0)),
    ('Bagging', BaggingClassifier(random_state=0)),
    ('Random Forest', RandomForestClassifier(random_state=0)),
    ('AdaBoost', AdaBoostClassifier(random_state=0)),
    ('GBM', GradientBoostingClassifier(random_state=0)),
    ('XGB', XGBClassifier(random_state=0)),
]
results_v = []   # validation accuracy per model, same order as `names`
names = []       # model display names
score_rows = []  # one {'model', 'accuracy_valid'} record per model
skf = StratifiedKFold(n_splits=5)
for name, model in models:
    # The grid is empty, so the search only evaluates the estimator's defaults
    # with `skf` before predicting on the validation split.
    param_grid = {}
    my_model = GridSearchCV(model, param_grid, cv=skf)
    my_model.fit(X_train, y_train)
    predictions_v = my_model.predict(X_valid)
    accuracy_valid = accuracy_score(y_valid, predictions_v)
    results_v.append(accuracy_valid)
    names.append(name)
    score_rows.append({'model': name, 'accuracy_valid': accuracy_valid})

    # Plot non-normalized confusion matrix
    cm = confusion_matrix(y_valid, predictions_v, labels=my_model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=my_model.classes_)
    disp.plot(values_format='d', cmap=plt.cm.Blues, xticks_rotation='horizontal')
    disp.ax_.set_title(str(name) + ' Model Confusion Matrix')
    # Turn the grid off on the matrix axes explicitly rather than on whatever
    # axes happens to be current.
    disp.ax_.grid(False)

plt.show()

# Built once from the collected records; the list and the frame use separate names.
score = pd.DataFrame(score_rows, columns=['model', 'accuracy_valid'])

In [None]:
# Display the per-model validation accuracy table (columns: model, accuracy_valid).
score

In [None]:
# Models ranked from best to worst validation accuracy.
score.sort_values('accuracy_valid', ascending=False)