In [57]:
import numpy as np
from catboost import CatBoostClassifier, FeaturesData
import pandas as pd
import csv as csv
from sklearn.metrics import confusion_matrix

In [58]:
def csv2data(fileStr):
    dataStr = csv.reader(open(fileStr), delimiter=' ', quotechar='|')
    data = []
    for row in dataStr:
        eachRow = ','.join(row);
        rowArr = list(map(float, eachRow.split(',')));
        data.append(rowArr[:-1])
    
    return data

def csv2label(fileStr):
    dataStr = csv.reader(open(fileStr), delimiter=' ', quotechar='|')
    data = []
    for row in dataStr:
        eachRow = ','.join(row);
        rowArr = eachRow.split(',');
        label = int(rowArr[-1]);
        data.append(label-1)
    
    return data

# import data
fold1 = csv2data("fold1.csv")
fold2 = csv2data("fold2.csv")
fold3 = csv2data("fold3.csv")
fold4 = csv2data("fold4.csv")
fold5 = csv2data("fold5.csv")

fold1Label = csv2label('fold1.csv')
fold2Label = csv2label("fold2.csv")
fold3Label = csv2label("fold3.csv")
fold4Label = csv2label("fold4.csv")
fold5Label = csv2label("fold5.csv")

In [59]:
fold_len = 100000

all_data = np.concatenate([fold1, fold2,fold3,fold4,fold5])
all_labels = np.concatenate([fold1Label,fold2Label,fold3Label,fold4Label,fold5Label])

feature_names = ['all', 'age', 'gender', 'bloodGroup', 'diastolicBP', 'systolicBP', 'weight', 'height', 'BMI', 'exerciseHabit', 'bloodCholestrol', 'parentalHistory', 'alcohol', 'heartRate']

for i in range(len(feature_names)):
    accuaracy =[]
    if i!=0:
        depricated_data = np.delete(all_data, np.s_[(i-1):i], axis=1)
        print('\nWithout the feature ', feature_names[i])
    else:
        depricated_data = all_data
        print('\n\nTaking all the features ')
    
    print("All data shape: ", depricated_data.shape)
    #print(all_label.shape)
    #print(depricated_data[0])
    
    for i in range (0,4):
        #preprocessing the data for k-fold
        #print("Iteration: ", i, "started")
        test_data = depricated_data[i*fold_len:(i+1)*fold_len]
        train_data = np.delete(depricated_data, np.s_[i*fold_len:(i+1)*fold_len], axis=0) #removing the test_data from all_data
        print("Train data shape: ", train_data.shape)
        print("Test data shape: ", test_data.shape)

        #preprocessing the labels for k-fold
        test_labels = all_labels[i*fold_len:(i+1)*fold_len]
        train_labels = np.delete(all_labels, np.s_[i*fold_len:(i+1)*fold_len], axis=0)
        #print("Train label shape: ", train_labels.shape)
        #print("Test label shape: ", test_labels.shape)

        # Initialize CatBoostClassifier
        model = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=6, loss_function='MultiClass')
        # Fit model
        #model.fit(train_data, train_labels, plot=True)
        model.fit(train_data, train_labels, plot=False, logging_level='Silent')

        # Get predicted classes
        preds_class = model.predict(test_data)
        # Get predicted probabilities for each class
        preds_proba = model.predict_proba(test_data)
        # Get predicted RawFormulaVal
        preds_raw = model.predict(test_data, prediction_type='RawFormulaVal')


        # Get Confusion matrix
        confusionMat=confusion_matrix(test_labels, preds_class)
        accuaracy.append(np.sum(np.diagonal(confusionMat))/np.sum(confusionMat))
        print("Iteration: ", i, "finished")


    avg_acc=np.average(accuaracy)
    print("accuracy")
    print(avg_acc)
    std = np.std(accuaracy)
    print("Std deviation")
    print(std)



Taking all the features 
All data shape:  (500000, 13)
Train data shape:  (400000, 13)
Test data shape:  (100000, 13)
Iteration:  0 finished
Train data shape:  (400000, 13)
Test data shape:  (100000, 13)
Iteration:  1 finished
Train data shape:  (400000, 13)
Test data shape:  (100000, 13)
Iteration:  2 finished
Train data shape:  (400000, 13)
Test data shape:  (100000, 13)
Iteration:  3 finished
accuracy
0.9869075
Std deviation
0.0003509540568222615

Without the feature  age
All data shape:  (500000, 12)
Train data shape:  (400000, 12)
Test data shape:  (100000, 12)
Iteration:  0 finished
Train data shape:  (400000, 12)
Test data shape:  (100000, 12)
Iteration:  1 finished
Train data shape:  (400000, 12)
Test data shape:  (100000, 12)
Iteration:  2 finished
Train data shape:  (400000, 12)
Test data shape:  (100000, 12)
Iteration:  3 finished
accuracy
0.9484125000000001
Std deviation
0.0006120610672146835

Without the feature  gender
All data shape:  (500000, 12)
Train data shape:  (4