In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import sklearn
import math
from sklearn.ensemble import VotingClassifier


# Normalize Features so that every feature is on the same scale

In [39]:
def normalize(x):
    y=x*x
    
    y=math.sqrt(y.sum())
    return(x/y)

# read_normalize_data function
    1.Requires the path from which the data is to be fetched and numerical features that has to be scaled  
    2.Reads the data
    3.Fills the Nans and Null with mean value
    4.Removes the data within 3 standard deviation using normal distribution
    5.Normalises the data to same scale

In [40]:
def read_normalize_data(path,Numerical_data_dict):
    df=pd.read_csv(path)
    
    for i in df.keys():
            if(i in Numerical_data_dict):
                df[i]=df[i].fillna(df[i].mean())
                df=df[((df[i] - df[i].mean()) / df[i].std()).abs() < 3]
                df[i]=normalize(df[i])
    return(df)


# split_Data Function
    1.Requires the data frame,features and the target
    2.Creates an imputer to fill Nan for categorical variable
    3.Split data into train and test with 70% train and 30% test
    4.Reshapes the data 
    5.Returns the train and test data

In [41]:
def split_Data(df,xaxis,yaxis):
    xtrain, xtest ,ytrain,ytest= train_test_split(df[xaxis],df[yaxis], test_size = 0.3)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(xtrain)

    # Impute our data, then train
    
    c, r = ytrain.shape
    ytrain = np.array(ytrain).reshape(c,)
    c, r = ytest.shape
    ytest = np.array(ytest).reshape(c,)
    return(xtrain,ytrain,xtest,ytest,imp)

# get_Random_Forest function
    1.Gets train,test and imputes
    2.creates and Random Forest
    3.Creates a grid to search for hyper parameters for the best accuracy model using crossvalidation
    4.Fits the train and test data
    5.Returns the Accuracy,Roc Auc score and the confusion matrix

In [42]:
def get_Random_Forest(X_train_imp,ytrain,xtest,ytest,imp):
    ans=[]
    rfc = RandomForestClassifier(n_jobs=-1) 
    param_grid1 = { 
    'n_estimators': [50,75,100],
    'criterion':['gini','entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],

    }
    CV_rfc = sklearn.model_selection.GridSearchCV(estimator=rfc, param_grid=param_grid1, cv= 5)
    CV_rfc.fit(X_train_imp, ytrain)
    X_test_imp = imp.transform(xtest)
    CV_rfc_y = CV_rfc.predict(X_test_imp)
    ans.append(accuracy_score(ytest,CV_rfc_y))
    ans.append(roc_auc_score(ytest,CV_rfc_y))
    ans.append(sklearn.metrics.confusion_matrix(ytest,CV_rfc_y))
    ans.append('Random Forest')
    return(ans,CV_rfc)

# get_Gradient_Boosted_Tree function
    1.Gets train,test and imputes
    2.creates and Gradient boosted trees
    3.Creates a grid to search for hyper parameters for the best accuracy model using crossvalidation  
    4.Fits the train and test data
    5.Returns the Accuracy,Roc Auc score and the confusion matrix

In [43]:
def get_Gradient_Boosted_Tree(X_train_imp,ytrain,xtest,ytest,imp):
    ans=[]
    gradient_boosted_classifier = sklearn.ensemble.GradientBoostingClassifier() 
    param_grid1 = { 
    'loss': ['deviance', 'exponential'],
    'n_estimators':[50,75,100],
    'max_features': ['auto', 'sqrt', 'log2'],

    }
    gradient_cv = sklearn.model_selection.GridSearchCV(estimator=gradient_boosted_classifier, param_grid=param_grid1, cv= 5)
    gradient_cv.fit(X_train_imp, ytrain)
    X_test_imp = imp.transform(xtest)
    gradient_cv_y = gradient_cv.predict(X_test_imp)
    ans.append(accuracy_score(ytest,gradient_cv_y))
    ans.append(roc_auc_score(ytest,gradient_cv_y))
    ans.append(sklearn.metrics.confusion_matrix(ytest,gradient_cv_y))
    ans.append('Gradient descent')
    return(ans,gradient_cv)


# get_Adaboost function
    1.Gets train,test and imputes
    2.creates and Gradient Adaboosted trees
    3.Creates a grid to search for hyper parameters for the best accuracy model using crossvalidation
    4.Fits the train and test data
    5.Returns the Accuracy,Roc Auc score and the confusion matrix

In [44]:
def get_Adaboost(X_train_imp,ytrain,xtest,ytest,imp):
    ans=[]
    ada_boost = sklearn.ensemble.AdaBoostClassifier()
    param_grid1 = { 
    'algorithm': ['SAMME', 'SAMME.R'],
    'n_estimators':[50,75,100],

    }
    ada_cv = sklearn.model_selection.GridSearchCV(estimator=ada_boost, param_grid=param_grid1, cv= 5)
    ada_cv.fit(X_train_imp, ytrain)
    X_test_imp = imp.transform(xtest)
    ada_cv_y = ada_cv.predict(X_test_imp)
    ans.append(accuracy_score(ytest,ada_cv_y))
    ans.append(roc_auc_score(ytest,ada_cv_y))
    ans.append(sklearn.metrics.confusion_matrix(ytest,ada_cv_y))
    ans.append('Adaboost')
    return(ans,ada_cv)


# get_Voting_Classifier function
    1.Gets train,test and different classifers for voting
    2.creates voting classifier
    3.Fits the train and test data
    4.Returns the Accuracy,Roc Auc score and the confusion matrix

In [45]:
def get_Voting_Classifier(classifiers,X_train_imp,ytrain,xtest,ytest,imp):
    ans=[]
    Voting_classifier = VotingClassifier(estimators=classifiers, voting='soft')
    Voting_classifier.fit(X_train_imp, ytrain)
    X_test_imp = imp.transform(xtest)
    Voting_classifier_y = Voting_classifier.predict(X_test_imp)
    ans.append(accuracy_score(ytest,Voting_classifier_y))
    ans.append(roc_auc_score(ytest,Voting_classifier_y))
    ans.append(sklearn.metrics.confusion_matrix(ytest,Voting_classifier_y))
    ans.append('Voting classifier')
    return(ans)

# get_Best_Classifier function
    1.gets the dataframe,features and class feature
    2.Calls train test split to split data
    3.calls random forest function to get the answer and the classifier
    4.calls gradient boosted function to get the answer and the classifier
    5.calls adaboost boosted function to get the answer and the classifier
    6.Calls the voting classifier with the above three classifier
    7.Returns the best classifier with their accuracy,auc_roc,confusion matrix
    

In [46]:
def get_Best_Classifier(df,xaxis,yaxis):
    xtrain,ytrain,xtest,ytest,imp=split_Data(df=df,
    xaxis=xaxis,
    yaxis=yaxis
    )
    X_train_imp = imp.transform(xtrain)
    ada_Ans,ada_Classifier=get_Adaboost(X_train_imp,ytrain,xtest,ytest,imp)
    random_Ans,random_Classifier=get_Random_Forest(X_train_imp,ytrain,xtest,ytest,imp)
    gradient_Ans,gradient_Classifier=get_Gradient_Boosted_Tree(X_train_imp,ytrain,xtest,ytest,imp)
    voting_ans=get_Voting_Classifier([('Random', ada_Classifier), ('gradient', random_Classifier), ('ada', gradient_Classifier)],
                                    X_train_imp,ytrain,xtest,ytest,imp
                                    )
    final_ans=pd.DataFrame([ada_Ans,
                  random_Ans,
                  gradient_Ans,
                  voting_ans
                 ],
                  columns=['Accuracy','Roc','Confusion','Name'])
    
    return(final_ans.sort_values(by='Roc',ascending=False).head(1))

# Create Data frame and call the best classifier with restricted and all features

In [47]:
df=read_normalize_data('Training Data (N=2000).csv',
                      {'Level_of_Hemoglobin':1,
                        'Genetic_Pedigree_Coefficient':1,
                        'Age':1,
                        'BMI':1,
                        'Physical_activity':1,
                        'salt_content_in_the_diet':1,
                        'alcohol_consumption_per_day':1}
                      
                      )
print('###########')
print('Getting Best Model For Less Features')
print(get_Best_Classifier(df=df,
    xaxis=list((set(['Patient_Number', 'Blood_Pressure_Abnormality', 'Level_of_Hemoglobin','Genetic_Pedigree_Coefficient'] 
                   ))-set(['Blood_Pressure_Abnormality','Patient_Number'])),
    yaxis=['Blood_Pressure_Abnormality']
                   ))
print('###########')
print('\n')
print('Best Model For All The features')
print(get_Best_Classifier(df=df,
    xaxis=list((set(df.keys()))-set(['Blood_Pressure_Abnormality','Patient_Number'])),
    yaxis=['Blood_Pressure_Abnormality']
                   ))


###########
Getting Best Model For Less Features
   Accuracy       Roc               Confusion              Name
2  0.896667  0.897093  [[275, 21], [41, 263]]  Gradient descent
###########


Best Model For All The features
   Accuracy       Roc               Confusion           Name
1  0.933333  0.933204  [[287, 18], [22, 273]]  Random Forest
