Importing all the modules/packages required to complete the given task

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier,StackingClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import  GaussianProcessClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,f1_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler



The model-1 performs the Logistic-Regression on the given task and tuning is done using simple loops and the accuracy comes around 98.25 percent

In [2]:
def model1(x_train,y_train,x_test,y_test):
    print("Model-1 : Logistic Regression ")
    param_grid={'solver':['liblinear','newton-cg','sag'],'max_iter':[10000]}
    grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
    grid.fit(x_train,y_train)
    y_test_pred=grid.predict(x_test)
    acc_test=round(accuracy_score(y_test_pred,y_test),4)*100
    print("Accuracy of Model 1 on test set : ",acc_test)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100
    print("Precision of Model 1 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100
    print("Recall of Model 1 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100
    print("F1-score of Model-1 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])
    print("Confusion Matrix of Model-1 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))
    return grid.best_params_

    
    

This is the model-2 which performs the KNearest algorithm.In this model-1,the performance of KNN on the given task is improved Firstly,the datasets is provided with all necessary cleaning and converting to numpy array.I have set a range of values for which ,it will tune the value of k on the validation set such that we can know the value of k whichs performs the best on the validation set.Then we used the best value of k for testing purposes and calculate the accuracy of the model,which comes around 96.75

In [3]:
def model2(x_train,y_train,x_test,y_test):
    print("Model-2 : K-Nearest Neighbors ")
    k_range=[3,7,9,13,15,19,23,25]
    param_grid=dict(n_neighbors=k_range)
    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
    grid.fit(x_train,y_train)
    y_test_pred=grid.predict(x_test)
    acc_t=round(accuracy_score(y_test_pred,y_test),4)*100
    print("Accuracy of Model 2 on test set : ",acc_t)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100
    print("Precision of Model 2 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100
    print("Recall of Model 2 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100
    print("F1-score of Model-2 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])
    print("Confusion Matrix of Model-2 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))
    return grid.best_params_

    
    
    

Support Vector Machine Algorithm is performed onn the given model and accuracy obtained was 99.25 percent

In [4]:
def model3(x_train,y_train,x_test,y_test):
    print("Model-3 : Support Vector Machine ")
    param_grid={'C':[1,10,100,100],'gamma':['auto',0.1,0.01,0.001],'kernel':['rbf','poly']}
    grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
    grid.fit(x_train,y_train)
    y_test_pred=grid.predict(x_test)
    acc_test=round(accuracy_score(y_test_pred,y_test),4)*100
    print("Accuracy of Model 3 on test set : ",acc_test)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100
    print("Precision of Model 3 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100
    print("Recall of Model 3 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100
    print("F1-score of Model-3 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])
    print("Confusion Matrix of Model-3 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))
    return grid.best_params_

    

The below model implements the DecisionTree Classification.Here we tune its different parameter and try to get the best parameter by performing holdout validation.Then we used the best parameters of the model on test-set and obtain an accuracy around 95 percent.

In [5]:
def model4(x_train,y_train,x_test,y_test):
    print("Model-4 : Decision Tree ")
    param_grid={'criterion':['gini','entropy'],'max_depth':[5,7,9,13,15,23],'min_samples_leaf':[1,2,5]}
    grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy')
    grid.fit(x_train,y_train)
    y_test_pred=grid.predict(x_test)
    acc_test=round(accuracy_score(y_test_pred,y_test),4)*100
    print("Accuracy of Model 4 on test set : ",acc_test)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100
    print("Precision of Model 4 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100
    print("Recall of Model 4 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100
    print("F1-score of Model-4 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])
    print("Confusion Matrix of Model-4 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))
    return grid.best_params_

    


Given below function perform the Random Forest Classification on the given data.Hypereparameters are tuned according to the validation set and we test the best parameter on test data so that we can get an accuracy of nearly 98.3 percent.

In [6]:
def model5(x_train,y_train,x_test,y_test):
    print("Model-5 : Random Forest Classification  ")
    param_grid={'n_estimators':[10,50,100,200],'criterion':['gini','entropy'],'max_depth':[5,6,8,13,15,23,25]}
    grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
    grid.fit(x_train,y_train)
    y_test_pred=grid.predict(x_test)
    acc_test=round(accuracy_score(y_test_pred,y_test),4)*100
    print("Accuracy of Model 5 on test set : ",acc_test)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100
    print("Precision of Model 5 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100
    print("Recall of Model 5 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100
    print("F1-score of Model-5 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])
    print("Confusion Matrix of Model-5 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))
    return grid.best_params_

    

Given below function perform the ExtraTrees Classification on the given data.Hypereparameters are tuned according to the validation set and we test the best parameter on test data so that we can get an accuracy more than 98.3 percent.

In [7]:
def model6(x_train,y_train,x_test,y_test):
    print("Model-6 : ExtraTrees Classification  ")
    param_grid={'n_estimators':[10,50,100,200],'criterion':['gini','entropy'],'max_depth':[5,6,8,13,15,23,25]}
    grid = GridSearchCV(ExtraTreesClassifier(), param_grid, cv=5, scoring='accuracy')
    grid.fit(x_train,y_train)
    y_test_pred=grid.predict(x_test)
    acc_test=round(accuracy_score(y_test_pred,y_test),4)*100
    print("Accuracy of Model 6 on test set : ",acc_test)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100
    print("Precision of Model 6 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100
    print("Recall of Model 6 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100
    print("F1-score of Model-6 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])
    print("Confusion Matrix of Model-6 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))
    return grid.best_params_

    

Given below is the VotingClassifier which trains on an ensemble of numerous models and predicts an output (class) based on their highest probability of chosen class as the output. It simply aggregates the findings of each classifier passed into Voting Classifier and predicts the output class based on the highest majority of voting. In this hard voting method was used to train the model,the predicted output class was class with highest majority of votes. The accuracy obtained was slightly less than the accuracy obtained from SVM.

In [8]:
def model7(x_train,x_test,y_train,y_test,estimator):
    e=[('lr',LogisticRegression(solver=estimator[0]['solver'])),
       ('knn',KNeighborsClassifier(n_neighbors=estimator[1]['n_neighbors'])),
      ('svm',SVC(C=estimator[2]['C'],gamma=estimator[2]['gamma'],kernel='rbf')),
      ('dt',DecisionTreeClassifier(criterion=estimator[3]['criterion'],max_depth=estimator[3]['max_depth'])),
      ('rf',RandomForestClassifier(n_estimators=estimator[4]['n_estimators'],criterion=estimator[4]['criterion'],max_depth=estimator[4]['max_depth'])),
      ('et',ExtraTreesClassifier(n_estimators=estimator[5]['n_estimators'],criterion=estimator[5]['criterion'],max_depth=estimator[5]['max_depth']))]
    print("Model-7 : VotingClassifier")
    vot_hard=VotingClassifier(estimators = e, voting ='hard') 
    vot_hard.fit(x_train, y_train)
    y_test_pred=vot_hard.predict(x_test)
    acc_t=round(accuracy_score(y_test_pred,y_test),4)*100    #accuracy of the predicting model
    print("Accuracy of Model 7 on test set : ",acc_t)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100   #precision of the predicting model
    print("Precision of Model 7 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100      #recall_Score of the model
    print("Recall of Model 7 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100      #f1_score of the model
    print("F1-score of Model-7 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])   #confusion matrix of the model
    print("Confusion Matrix of Model-7 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))


A Bagging classifier is an ensemble meta-estimator that fits base classifiers each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.

The accuracy obtained was greater than 98.98 percent



In [9]:
def model8(x_train,x_test,y_train,y_test,estimator):
    print("Model-8 : BaggingClassifier")
    bag=BaggingClassifier(base_estimator=SVC(C=estimator[2]['C'],gamma=estimator[2]['gamma'],kernel='rbf'),n_estimators=200,random_state=5) 
    bag.fit(x_train, y_train)
    y_test_pred=bag.predict(x_test)
    acc_t=round(accuracy_score(y_test_pred,y_test),4)*100    #accuracy of the predicting model
    print("Accuracy of Model 8 on test set : ",acc_t)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100   #precision of the predicting model
    print("Precision of Model 8 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100      #recall_Score of the model
    print("Recall of Model 8 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100      #f1_score of the model
    print("F1-score of Model-8 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])   #confusion matrix of the model
    print("Confusion Matrix of Model-8 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))


Below is the boosting method in which I have used GradientBoosting and obtain an accuracy around 98.5 percent.

In [10]:
def model9(x_train,x_test,y_train,y_test,estimator):
    print("Model-9 Gradient-Boosting ")
    gbc=GradientBoostingClassifier(n_estimators=200,learning_rate=0.1,subsample=1,random_state=0)
    gbc.fit(x_train,y_train)
    y_test_pred=gbc.predict(x_test)
    acc_t=round(accuracy_score(y_test_pred,y_test),4)*100    #accuracy of the predicting model
    print("Accuracy of Model 9 on test set : ",acc_t)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100   #precision of the predicting model
    print("Precision of Model 9 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100      #recall_Score of the model
    print("Recall of Model 9 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100      #f1_score of the model
    print("F1-score of Model-9 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])   #confusion matrix of the model
    print("Confusion Matrix of Model-9 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))


Below is the implementation of StackingClassifier.Here,we stacked the results obtain from different classifier and stacked together and passed to a final classifier which is SVM and the result obtained was around 99.3 percent.

In [11]:
def model10(x_train,x_test,y_train,y_test,estimator):
    esti=[('lr',LogisticRegression(solver=estimator[0]['solver'])),
          ('knn',KNeighborsClassifier(n_neighbors=estimator[1]['n_neighbors'])),
          ('svm',SVC(C=estimator[2]['C'],gamma=estimator[2]['gamma'],kernel='rbf')),
          ('dt',DecisionTreeClassifier(criterion=estimator[3]['criterion'],max_depth=estimator[3]['max_depth'])),
          ('rf',RandomForestClassifier(n_estimators=estimator[4]['n_estimators'],criterion=estimator[4]['criterion'],max_depth=estimator[4]['max_depth'])),
          ('et',ExtraTreesClassifier(n_estimators=estimator[5]['n_estimators'],criterion=estimator[5]['criterion'],max_depth=estimator[5]['max_depth']))]
    print("Model-10 : Stacking Classifier ")
    stc=StackingClassifier(estimators=esti,final_estimator=SVC(C=estimator[2]['C'],gamma=estimator[2]['gamma'],kernel='rbf'),cv=3)
    stc.fit(x_train,y_train)
    y_test_pred=stc.predict(x_test)
    acc_t=round(accuracy_score(y_test_pred,y_test),4)*100    #accuracy of the predicting model
    print("Accuracy of Model 10 on test set : ",acc_t)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100   #precision of the predicting model
    print("Precision of Model 10 on test set : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100      #recall_Score of the model
    print("Recall of Model 10 on test set : ",rec)
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100      #f1_score of the model
    print("F1-score of Model-10 on test set : ",f_score)
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8])   #confusion matrix of the model
    print("Confusion Matrix of Model-10 on test set : ")
    print(con_mat)
    print(classification_report(y_test,y_test_pred,digits=8))


    

In [12]:
if __name__ == '__main__':
    train=pd.read_excel('cTTD_features_with_Labels/S5/trainset_60.xls')  #reading the xls file into dataframe
    validate=pd.read_excel('cTTD_features_with_Labels/S5/validate_20.xls')
    test=pd.read_excel('cTTD_features_with_Labels/S5/testset_20.xls')
    
    x_tr=train.drop(43,axis=1)    #separating the target values
    y_tr=train[43]
    x_v=validate.drop(43,axis=1)
    y_v=validate[43]
    x_te=test.drop(43,axis=1)
    y_te=test[43]
    
    x_train=x_tr.to_numpy()        # converting dataframe to numpy array
    y_train=y_tr.to_numpy()
    x_val=x_v.to_numpy()
    y_val=y_v.to_numpy()
    x_test=x_te.to_numpy()
    y_test=y_te.to_numpy()
    
    sc=StandardScaler()
    x_train=sc.fit_transform(x_train)       #standardizing the features for better traing process
    x_val=sc.fit_transform(x_val)
    x_test=sc.fit_transform(x_test)
    estimator=[]
    
    x_train=np.concatenate((x_train,x_val))  #Now combining both training and validation data
    y_train=np.concatenate((y_train,y_val))  #Now combing the target values of training and validation data
    
    
    estimator.append(model1(x_train,y_train,x_test,y_test))
    print("==========================================================================================================")
    
    estimator.append(model2(x_train,y_train,x_test,y_test))
    print("==========================================================================================================")
    
    
    estimator.append(model3(x_train,y_train,x_test,y_test))
    print("==========================================================================================================")
    
    estimator.append(model4(x_train,y_train,x_test,y_test))
    print("==========================================================================================================")
    
    
    estimator.append(model5(x_train,y_train,x_test,y_test))
    print("==========================================================================================================")
    
    estimator.append(model6(x_train,y_train,x_test,y_test))
    print("==========================================================================================================")
    
    print(estimator)
    
   
    model7(x_train,x_test,y_train,y_test,estimator)
    print("===========================================================================================================")
    
    model8(x_train,x_test,y_train,y_test,estimator)
    print("===========================================================================================================")
    
    model9(x_train,x_test,y_train,y_test,estimator)
    print("===========================================================================================================")
    
    model10(x_train,x_test,y_train,y_test,estimator)
    print("===========================================================================================================")

Model-1 : Logistic Regression 
Accuracy of Model 1 on test set :  94.64
Precision of Model 1 on test set :  94.66
Recall of Model 1 on test set :  94.73
F1-score of Model-1 on test set :  94.67999999999999
Confusion Matrix of Model-1 on test set : 
[[877   0   0   2   0   0   0   0]
 [  0 887   1   0   0   0   0   0]
 [  1   0 863   5   0   9   0   0]
 [  0   0   7 820   0   7  23  40]
 [  0   0   0   0 874   0   0   0]
 [  0   0   3   1   0 805  66   8]
 [  0   0   0   5   0  85 770  38]
 [  1   0   0   9   0   9  57 767]]
              precision    recall  f1-score   support

           1  0.99772469 0.99772469 0.99772469       879
           2  1.00000000 0.99887387 0.99943662       888
           3  0.98741419 0.98291572 0.98515982       878
           4  0.97387173 0.91415831 0.94307073       897
           5  1.00000000 1.00000000 1.00000000       874
           6  0.87978142 0.91166478 0.89543938       883
           7  0.84061135 0.85746102 0.84895259       898
           8  0.

Accuracy of Model 7 on test set :  97.76
Precision of Model 7 on test set :  97.75
Recall of Model 7 on test set :  97.76
F1-score of Model-7 on test set :  97.75
Confusion Matrix of Model-7 on test set : 
[[879   0   0   0   0   0   0   0]
 [  0 888   0   0   0   0   0   0]
 [  1   0 876   0   0   1   0   0]
 [  0   0   1 888   0   0   5   3]
 [  0   0   0   0 874   0   0   0]
 [  0   0   0   0   0 862  12   9]
 [  0   0   0   4   0  51 824  19]
 [  0   0   0   6   0   3  43 791]]
              precision    recall  f1-score   support

           1  0.99886364 1.00000000 0.99943150       879
           2  1.00000000 1.00000000 1.00000000       888
           3  0.99885975 0.99772210 0.99829060       878
           4  0.98886414 0.98996656 0.98941504       897
           5  1.00000000 1.00000000 1.00000000       874
           6  0.94002181 0.97621744 0.95777778       883
           7  0.93212670 0.91759465 0.92480359       898
           8  0.96228710 0.93831554 0.95015015       843

 