In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,f1_score,roc_auc_score


In [2]:
def model2(x_train,y_train,x_val,y_val,x_test,y_test):
    lr=LogisticRegression(max_iter=1000)  #again training the model with best hyperparameter
    lr.fit(x_train,y_train)
    y_test_pred=lr.predict(x_test)   #fitting the training data into model
    print("Model-2 : Logistic Regression")  
    acc_t=round(accuracy_score(y_test_pred,y_test),4)*100 #accuracy of the model
    print("Accuracy of Model 2 on test set before tuning : ",acc_t)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100   #precision score
    print("Precision of Model 2 on test set before tuning : ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100
    print("Recall of Model 2 on test set before tuning: ",rec)      #recall_score
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100
    print("F1-score of Model-2 on test set before tuning : ",f_score)    #f1_Score
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8]) #confusion matrix
    print("Confusion Matrix of Model-2 on test set before tuning: ")
    print(con_mat)
    print("-------------------------------------------------------------------------------------------------")
    print("Tuning the hyperparameter with the help of validation set")
    solver=['saga','liblinear']             #taking a list which contains different algorithm that can be used to optimize 
    multi_class=['auto','ovr']              #list containing different multi_class parameter
    c=[10,100]                              #list containing different inverse regularization values
    feature=[]
    a=[]
    for i in solver:
        for j in multi_class:
            for k in c:
                lr=LogisticRegression(solver=i,max_iter=10000,multi_class=j,penalty='l1',C=k)   #tuning the model each time with a new hyperparameter
                lr.fit(x_train,y_train)
                y_val_pred=lr.predict(x_val)
                acc=round(accuracy_score(y_val_pred,y_val),4)*100
                feature.append((i,j,k))
                a.append(acc)
    m=max(a)       #getting the maximum accuracy from validation set
    p=a.index(m)   #getting the index of the max_accuracy
    lr=LogisticRegression(solver=feature[p][0],multi_class=feature[p][1],max_iter=10000,penalty='l1',C=feature[p][2])  #again training the model with best hyperparameter
    x_train=np.concatenate((x_train,x_val))  #Now combining both training and validation data
    y_train=np.concatenate((y_train,y_val))  #Now combing the target values of training and validation data
    lr.fit(x_train,y_train)
    y_test_pred=lr.predict(x_test)   #fitting the training data into model  
    acc_t=round(accuracy_score(y_test_pred,y_test),4)*100 #accuracy of the model
    print("Accuracy of Model 2 on test set after tuning: ",acc_t)
    pre=round(precision_score(y_test_pred,y_test,average='macro'),4)*100   #precision score
    print("Precision of Model 2 on test set after tuning: ",pre)
    rec=round(recall_score(y_test_pred,y_test,average='macro'),4)*100
    print("Recall of Model 2 on test set after tuning: ",rec)      #recall_score
    f_score=round(f1_score(y_test_pred,y_test,average='macro'),4)*100
    print("F1-score of Model-2 on test set after tuning: ",f_score)    #f1_Score
    con_mat=confusion_matrix(y_test,y_test_pred,labels=[1,2,3,4,5,6,7,8]) #confusion matrix
    print("Confusion Matrix of Model-2 on test set after tuning: ")
    print(con_mat)
    

In [3]:
if __name__ == '__main__':
    train=pd.read_excel('cTTD_features_with_Labels/S2/trainset_60.xls')  #reading the xls file into dataframe
    validate=pd.read_excel('cTTD_features_with_Labels/S2/validate_20.xls')
    test=pd.read_excel('cTTD_features_with_Labels/S2/testset_20.xls')
    
    x_tr=train.drop(43,axis=1)    #separating the target values
    y_tr=train[43]
    x_v=validate.drop(43,axis=1)
    y_v=validate[43]
    x_te=test.drop(43,axis=1)
    y_te=test[43]
    
    x_train=x_tr.to_numpy()        # converting dataframe to numpy array
    y_train=y_tr.to_numpy()
    x_val=x_v.to_numpy()
    y_val=y_v.to_numpy()
    x_test=x_te.to_numpy()
    y_test=y_te.to_numpy()
    
    model2(x_train,y_train,x_val,y_val,x_test,y_test)
    print("==========================================================================================================")
    


Model-2 : Logistic Regression
Accuracy of Model 2 on test set before tuning :  92.4
Precision of Model 2 on test set before tuning :  92.47
Recall of Model 2 on test set before tuning:  92.44
F1-score of Model-2 on test set before tuning :  92.44
Confusion Matrix of Model-2 on test set before tuning: 
[[870   0   7   0   2   0   0   0]
 [  0 880   0   1   0   6   1   0]
 [  2   0 849  16   7   3   0   1]
 [  0   2  23 723   7  16 109  17]
 [  0   3   8   4 814  41   4   0]
 [  0  21  10  20  23 775  34   0]
 [  0   0   0  70  13  47 768   0]
 [  0   0   1  15   0   1   0 826]]
-------------------------------------------------------------------------------------------------
Tuning the hyperparameter with the help of validation set
Accuracy of Model 2 on test set after tuning:  97.56
Precision of Model 2 on test set after tuning:  97.59
Recall of Model 2 on test set after tuning:  97.58
F1-score of Model-2 on test set after tuning:  97.58
Confusion Matrix of Model-2 on test set after tun

This is the total analysis done using Logistic Regression.The model is properly tuned with the help of validation set.Several parameters were tuned to increase the model effciency.As it can be seen that accuracy before tuning the parameter was 92.44 percent and after tuning it jumps to 97.56 percent.The precision of the model also raise to 97.59 percent after tuning the model.The recall also rose to 97.58 after tuning the hyperparameter.F1 score before tuning was 92.44 whereas after tuning it became 97.58 percent.As there is not much difference between precision and recall,accuracy will be ideal metric choice.

Summary
 1. This is the total analysis done using Logistic Regression.
 2. First ,datasets was converted into numpy array so that it can be easily put in the model.
 3. Then we normally train the model without tuning any of its hyperparameter and then test it on test dataset and observe the       metrics 
 4. After that we tuned the hyperparameters by varying with different values and checked it on validation set and after gettig       the best hyperparameters and after that we combine both train and validate and train as a whole.
 5. After that we checked the accuracy,precision,recall,confusion-matrix as well as F1 score of the testing data with predicted     value
 