In [357]:
try:
    from xgboost import XGBClassifier
except:
    !conda install -c conda-forge py-xgboost

In [358]:
import pandas as pd
import numpy as np
import seaborn as sns;
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
    
from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, average_precision_score

from sklearn.preprocessing import MinMaxScaler,StandardScaler


In [359]:
class Prediction:
    
    def __init__(self,X,y):
        self.X = X
        self.y = y
        self.X_train, self.X_test, self.y_train, self.y_test,self.y_pred = None,None,None,None,None
        self.model = None
    
    
    def feature_selection(self):
        
        # select features with missing values less than 60%
        req_cols, final_req_cols = [],[]
        X = self.X
        for col in self.X.columns:
            miss_rate = round(X[col].isna().sum()/len(X),2)
            if miss_rate < 0.6:
                req_cols.append(col)
        
        # select features with correlation > 0.2
        '''
        Improvements:
        1. Add visualizations
        2. Create different correlations for different type of variables (refer notebook)
        '''
        df = X[req_cols]
        df['target'] = self.y
#         g = sns.pairplot(df,hue = 'target', diag_kind= 'hist',
#                      vars=df.columns[:-1],
#                      plot_kws=dict(alpha=0.5), 
#                      diag_kws=dict(alpha=0.5))
#         plt.show()
        corr_matrix = df.corr()
        for col in req_cols:
            if abs(corr_matrix["target"][col])>0.2:
                final_req_cols.append(col)

        # update X dataframe which contain only selected features
        self.X = X[final_req_cols]
        
    
    def data_split(self, split=0.2):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X,self.y,train_size=split)
        
    
    def data_normalization(self):
        cols = self.X.columns
        scaler = MinMaxScaler()
        scaler.fit(self.X)
        self.X = pd.DataFrame(scaler.transform(self.X))
        self.X.columns = cols
    
    def logistic_regression(self):
        self.model = LogisticRegression()
    
    
    def decision_tree(self):
        self.model = DecisionTreeClassifier()
    
    
    def multinomial_naive_bayes(self):
        self.model = MultinomialNB()
       
    
    def gaussian_naive_bayes(self):
        self.model = GaussianNB()
    
    
    def knn(self):
        self.model = KNeighborsClassifier()
    
    
    def rf(self,n_trees=100,criteria='gini',max_depth=None):
        self.model = RandomForestClassifier(n_estimators=n_trees, criterion=criteria, max_depth=max_depth)
    
    
    def xgb(self):
        self.model = XGBClassifier(objective="binary:logistic")
    
    
    def svm(self):
        self.model = SVC(gamma='auto')
    
    def gradient_boost(self,n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0):
        self.model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate,max_depth=max_depth, random_state=random_state)
        
    
    
    def parameter_tuning(self,model='knn',scoring='accuracy',cv=5,given_params=False):
        
        # check if the parameter grid is given or have to use the the default one
        if given_params==False:
            # use predefined parameters for each model
            if model=='knn':
                self.knn()
                params = [{'n_neighbors':[3,5,7,9], 
                           'weights':['uniform','distance'],
                           'leaf_size':[15,20,30]}] 
            if model=='logistic_regression':
                self.logistic_regression()
                params = [{'penalty':['none','l2','elasticnet','l1'], 'C':[0.001,0.01,0.1,1,10,100,1000], 'fit_intercept':[True,False]}]
            if model== "decision_tree":
                self.decision_tree()
                params = [{'criterion':['gini','entropy'],'max_depth':[3,5,10,15,20,50]}]
            if model=='multinomial_naive_bayes':
                self.multinomial_naive_bayes()
                params = [{'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}]
            if model=="gaussian_naive_bayes":
                self.gaussian_naive_bayes()
                print("Message: No hyperparameter to tune for gaussian naive bayes, use predict() function to get predictions!")
                return
            if model=="rf":
                self.rf()
                params = [{'n_estimators':[10,50,100,200],
                           'criterion':['gini','entropy'],
                           'max_features': ['auto', 'sqrt', 'log2'],
                           'max_depth':[3,5,10,20]}]
            if model=="xgb":
                self.xgb()
                params=[{'max_depth': [3,6,9,12],
                        'subsample': [0.8,0.9,1.0]}]
            if model=='svm':
                self.svm()
                params = [{'C': [1, 10], 'kernel': ('linear', 'rbf')}]
            
        # use parameter grid given
        else:
            params = given_params
            if model=='knn':
                self.knn()
            if model=='logistic_regression':
                self.logistic_regression()
            if model== "decision_tree":
                self.decision_tree()
            if model=='multinomial_naive_bayes':
                self.multinomial_naive_bayes()
            if model=="gaussian_naive_bayes":
                self.gaussian_naive_bayes()
                print("Message: No hyperparameter to tune for gaussian naive bayes, use predict() function to get predictions!")
                return        
            if model=="rf":
                self.rf()
            if model=="xgb":
                self.xgb()
            if model=='svm':
                self.svm()
                
        # initialise grid search
        gs = GridSearchCV(estimator=self.model,
                  param_grid = params,
                  scoring=scoring,
                  cv=cv,
                  verbose=0)
        
        
        # fit the data and get results
        try:
            gs.fit(self.X_train,self.y_train)
            print("best params: ",gs.best_params_)
            print("score: ",gs.score(self.X_train,self.y_train))
            self.model = gs
        except:
            print("Message: The parameters you entered doesn't match the input format. Please refer to the parameter_tuning function to understand the input format for parameter ranges")
            return
        

    def predict(self):
        # fit/train the model
        clf = self.model.fit(self.X_train, self.y_train)
        
        # make predictions
        self.y_pred = clf.predict(self.X_test)
    
    
    def performance(self,threshold=0.5):
        '''
        Improvements
        1. Add visualisations
        2. Read and explain the performannce matrix
        '''
        
        
#         # convert probability to binary output using given threshold (parameter)
#         y_pred_binary = (self.y_pred>threshold).astype(int)
#         print(y_pred_binary)
#         print(self.y_pred)
        
        # accuracy
        accuracy = accuracy_score(self.y_pred, self.y_test)
        print("accuracy:",accuracy)
        
        # confusion mat/rix
        cm = confusion_matrix(self.y_pred, self.y_test)
        print("confusion matrix:\n",cm)
        
        # roc_auc
        roc_auc = roc_auc_score(self.y_pred, self.y_test)
        print("ROC AUC:",roc_auc)
        
        # pr_auc
        pr_auc = average_precision_score(self.y_pred, self.y_test)
        print("PR AUC:",pr_auc)
        
        return {'accuracy':accuracy, 'confusion_matrix':cm, 'roc_auc':roc_auc, 'pr_auc':pr_auc}

In [354]:
data = pd.read_csv('../input/heart.csv')
X = data[data.columns[:-1]]
y = data[data.columns[-1]]

In [355]:
p.model

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight

In [None]:
def main(data='data_file_path'):
    
    # read the data
    data = pd.read_csv('../input/heart.csv')
    X = data[data.columns[:-1]]
    y = data[data.columns[-1]]
    
    # with parameter tuning

    # call class
    p = Prediction(X,y)

    # data normalization
    p.data_normalization()

    # feature engineering
    p.feature_selection()

    # split data into train and test
    p.data_split()

    # parameter tuning
    p.parameter_tuning(model='svm')

    # make predictions
    p.predict()

    # get model performance
    performance = p.performance()
    
    performance.to_txt('')
    
    

In [None]:
if __name__ == '__main__':
    main()

In [356]:
# with parameter tuning

# call class
p = Prediction(X,y)

# data normalization
p.data_normalization()

# feature engineering
p.feature_selection()

# split data into train and test
p.data_split()

# parameter tuning
p.parameter_tuning(model='svm')

# make predictions
p.predict()

# get model performance
performance = p.performance()

best params:  {'C': 1, 'kernel': 'linear'}
score:  0.8333333333333334
accuracy: 0.8477366255144033
confusion matrix:
 [[ 85  11]
 [ 26 121]]
ROC AUC: 0.8542729591836735
PR AUC: 0.8615310321659527


In [352]:
performance

{'accuracy': 0.7777777777777778,
 'confusion_matrix': array([[ 85,  30],
        [ 24, 104]]),
 'roc_auc': 0.7758152173913043,
 'pr_auc': 0.7293624470241385}

In [291]:
# with parameter tuning

# call class
p = Prediction(X,y)

# data normalization
p.data_normalization()

# feature engineering
p.feature_selection()

# split data into train and test
p.data_split()

# parameter tuning
params = [{'n_neighbors':[3], 
           'weights':['uniform','distance'],
           'leaf_size':[15,20,30]}] 
p.parameter_tuning(model='knn',given_params=params)

# make predictions
p.predict()

# get model performance
performace = p.performance()

params:  [{'n_neighbors': [3], 'weights': ['uniform', 'distance'], 'leaf_size': [15, 20, 30]}]
{'leaf_size': 15, 'n_neighbors': 3, 'weights': 'uniform'}
0.9
accuracy: 0.7777777777777778
confusion matrix:
 [[ 81  22]
 [ 32 108]]
ROC AUC: 0.7789181692094314
PR AUC: 0.7725663636774748


In [220]:
# without parameter tuning
p = Prediction(X,y)
p.data_normalization()
p.feature_selection()
p.data_split()
p.knn()
p.predict()

In [221]:
# svm
p.performance()

accuracy: 0.7983539094650206
confusion matrix:
 [[ 78  18]
 [ 31 116]]
ROC AUC: 0.8008078231292517
PR AUC: 0.8106870535205052


{'accuracy': 0.7983539094650206,
 'confusion_matrix': array([[ 78,  18],
        [ 31, 116]]),
 'roc_auc': 0.8008078231292517,
 'pr_auc': 0.8106870535205052}

In [214]:
#xgb
p.performance()

accuracy: 0.823045267489712
confusion matrix:
 [[ 86  18]
 [ 25 114]]
ROC AUC: 0.8235334809075816
PR AUC: 0.8111867408430158


{'accuracy': 0.823045267489712,
 'confusion_matrix': array([[ 86,  18],
        [ 25, 114]]),
 'roc_auc': 0.8235334809075816,
 'pr_auc': 0.8111867408430158}

In [None]:
# xgboost
p.performance()

In [167]:
#gb
p.performance()

[0 1 1 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 1 1 0 0 1 0 0 1
 0 0 1 0 1 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 0 1 0
 1 1 1 1 0 0 1 1 1 0 0 1 1 1 0 1 1 1 0 0 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0
 0 0 1 1 0 0 0 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0
 0 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 1
 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0 1 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 1 1
 1 1 1 1 1 1 1 0 0 1 1 0 0 1 1 1 1 1 0 1 0]
accuracy: 0.6872427983539094
confusion matrix:
 [[71 37]
 [39 96]]
ROC AUC: 0.6842592592592593
PR AUC: 0.6737770351805439


{'accuracy': 0.6872427983539094,
 'confusion_matrix': array([[71, 37],
        [39, 96]]),
 'roc_auc': 0.6842592592592593,
 'pr_auc': 0.6737770351805439}

In [164]:
#rf
p.performance()

[1 0 1 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 1 0 0 0 1 1 0 0 1
 0 0 1 1 1 0 1 0 1 0 1 1 1 1 0 0 1 1 0 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1
 0 0 0 1 0 0 0 1 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1
 1 1 1 0 0 0 1 1 1 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 1
 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 0
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0]
accuracy: 0.6790123456790124
confusion matrix:
 [[ 63  28]
 [ 50 102]]
ROC AUC: 0.6816801619433198
PR AUC: 0.7322795354959099


{'accuracy': 0.6790123456790124,
 'confusion_matrix': array([[ 63,  28],
        [ 50, 102]]),
 'roc_auc': 0.6816801619433198,
 'pr_auc': 0.7322795354959099}

In [158]:
#KNN(5)
p.performance()

[0 1 0 1 1 0 0 0 0 1 1 1 1 1 0 0 0 1 1 0 0 0 1 0 1 1 1 0 1 1 1 0 0 0 0 1 1
 1 1 0 0 0 1 0 1 1 0 1 0 1 1 0 0 1 0 0 0 1 1 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0
 0 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 0 0 1 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 1 0
 1 0 0 1 0 1 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0
 0 0 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0]
accuracy: 0.6790123456790124
confusion matrix:
 [[84 56]
 [22 81]]
ROC AUC: 0.6932038834951456
PR AUC: 0.5554913964035296


{'accuracy': 0.6790123456790124,
 'confusion_matrix': array([[84, 56],
        [22, 81]]),
 'roc_auc': 0.6932038834951456,
 'pr_auc': 0.5554913964035296}

In [128]:
#GNB
p.performance()

accuracy: 0.8106995884773662
confusion matrix:
 [[ 92  32]
 [ 14 105]]
ROC AUC: 0.8121442125237192
PR AUC: 0.7338690725456622


In [123]:
# MNB
p.performance()

accuracy: 0.8148148148148148
confusion matrix:
 [[ 91  24]
 [ 21 107]]
ROC AUC: 0.8136209239130435
PR AUC: 0.7692083981245877


In [114]:
#DT
p.performance()

accuracy: 0.7283950617283951
confusion matrix:
 [[83 41]
 [25 94]]
ROC AUC: 0.729635402548116
PR AUC: 0.6528962202164816
