In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import metrics

# Binary Relevance

Read the 'processed_data.csv' file

In [None]:
# Load the preprocessed dataset and keep only the columns named '0' through '9'.
data = pd.read_csv("processed_data.csv")
selected_columns = [str(column_index) for column_index in range(10)]
df = data[selected_columns]
df.head()

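A stratified 75:25 train-test split is created separately for each label.
Each label is then modelled independently with a Multinomial Naive Bayes, a Logistic Regression and a Support Vector Machine classifier, and the per-label test predictions are stored for the evaluation below.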

Three feature representations are compared for every classifier: raw counts from the Count Vectorizer, term frequency (TF without IDF), and TF-IDF.

In [None]:
X = np.array(df[['0']])  # Column '0' is passed to CountVectorizer below (presumably the document text)
yarg_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9']  # One independent classification problem per label column

# Every predicted_lol_* list receives one list of test-set predictions per label,
# appended in the order of yarg_list.
predicted_lol_MNB = [] # Prediction list of lists for MNB for Count Vectorizer
predicted_lol_MNB_tf = [] # Prediction list of lists for MNB for TF without IDF
predicted_lol_MNB_tfidf = [] # Prediction list of lists for MNB for TF with IDF

predicted_lol_LR = [] # Prediction list of lists for LR for Count Vectorizer
predicted_lol_LR_tf = [] # Prediction list of lists for LR for TF without IDF
predicted_lol_LR_tfidf = [] # Prediction list of lists for LR for TF with IDF

predicted_lol_SVM = [] # Prediction list of lists for SVM for Count Vectorizer
predicted_lol_SVM_tf = [] # Prediction list of lists for SVM for TF without IDF
predicted_lol_SVM_tfidf = [] # Prediction list of lists for SVM for TF with IDF
true_labels = []  # True test-set labels, one list per label column

for yarg in yarg_list: # Training over all the labels and making consequent predictions
    Y = np.array(df[[yarg]])
    # random_state makes the split reproducible across runs.
    # NOTE(review): the split is redone (and stratified) for every label, so test
    # position k of one label is not guaranteed to be the same document as test
    # position k of another label.
    XTrain, XTest, YTrain, YTest = train_test_split(
        X, Y, stratify=Y, test_size=0.25, random_state=0
    )
    YTrain = YTrain.ravel()  # Estimators expect 1-D targets rather than an (n, 1) column
    true_labels.append(list(YTest.flatten())) # Getting true set of labels

    # max_df=1.0 (a float, i.e. a proportion) keeps every term. The integer 1 is an
    # absolute document count and would discard every n-gram that occurs in more
    # than one training document.
    cv = CountVectorizer(ngram_range = (1,2), min_df = 1, max_df = 1.0)
    tf = TfidfTransformer(use_idf = False)
    tfidf = TfidfTransformer(use_idf = True)

    # Fit the vectorizer/transformers on the training split only, then reuse them
    # unchanged on the test split.
    XTrain_counts = cv.fit_transform(XTrain[:, 0])
    XTrain_tf = tf.fit_transform(XTrain_counts)
    XTrain_tfidf = tfidf.fit_transform(XTrain_counts)

    XTest_counts = cv.transform(XTest[:, 0])
    XTest_tf = tf.transform(XTest_counts)
    XTest_tfidf = tfidf.transform(XTest_counts)

    # Multinomial Naive Bayes
    classifier_MNB = MultinomialNB(alpha = 0.01).fit(XTrain_counts, YTrain)
    classifier_MNB_tf = MultinomialNB(alpha = 0.01).fit(XTrain_tf, YTrain)
    classifier_MNB_tfidf = MultinomialNB(alpha = 0.01).fit(XTrain_tfidf, YTrain)
    # Each feature representation is scored by the model trained on that representation.
    predicted_MNB = classifier_MNB.predict(XTest_counts) # Predictions for MNB with Count Vectorizer
    predicted_MNB_tf = classifier_MNB_tf.predict(XTest_tf) # Predictions for MNB with TF
    predicted_MNB_tfidf = classifier_MNB_tfidf.predict(XTest_tfidf) # Predictions for MNB with TF-IDF

    predicted_lol_MNB.append(list(predicted_MNB))
    predicted_lol_MNB_tf.append(list(predicted_MNB_tf))
    predicted_lol_MNB_tfidf.append(list(predicted_MNB_tfidf))

    # Logistic Regression
    classifier_LR = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(XTrain_counts, YTrain)
    classifier_LR_tf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(XTrain_tf, YTrain)
    classifier_LR_tfidf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(XTrain_tfidf, YTrain)
    predicted_LR = classifier_LR.predict(XTest_counts) # Predictions for Logistic Regression with Count Vectorizer
    predicted_LR_tf = classifier_LR_tf.predict(XTest_tf) # Predictions for Logistic Regression with TF
    predicted_LR_tfidf = classifier_LR_tfidf.predict(XTest_tfidf) # Predictions for Logistic Regression with TF-IDF

    predicted_lol_LR.append(list(predicted_LR))
    predicted_lol_LR_tf.append(list(predicted_LR_tf))
    predicted_lol_LR_tfidf.append(list(predicted_LR_tfidf))

    # Support Vector Machine
    classifier_SVM = svm.SVC().fit(XTrain_counts, YTrain)
    classifier_SVM_tf = svm.SVC().fit(XTrain_tf, YTrain)
    classifier_SVM_tfidf = svm.SVC().fit(XTrain_tfidf, YTrain)
    predicted_SVM = classifier_SVM.predict(XTest_counts) # Predictions for SVM with Count Vectorizer
    predicted_SVM_tf = classifier_SVM_tf.predict(XTest_tf) # Predictions for SVM with TF
    predicted_SVM_tfidf = classifier_SVM_tfidf.predict(XTest_tfidf) # Predictions for SVM with TF-IDF

    predicted_lol_SVM.append(list(predicted_SVM))
    predicted_lol_SVM_tf.append(list(predicted_SVM_tf))
    predicted_lol_SVM_tfidf.append(list(predicted_SVM_tfidf))

In [None]:
# Sanity check: number of test-set predictions stored for the first label.
first_label_predictions = predicted_lol_MNB[0]
print(len(first_label_predictions))

# Evaluation Methods

Hamming Loss

In [None]:
# Transpose true_labels: from one list per label to one list of labels per test position.
true_labels1 = [list(label_row) for label_row in zip(*true_labels)]

In [None]:
def _transpose_predictions(per_label_predictions):
    """Turn a per-label list of prediction lists into per-sample label vectors.

    Parameters
    ----------
    per_label_predictions : list of lists
        Outer index is the label, inner index is the test position.

    Returns
    -------
    list of lists
        Outer index is the test position, inner index is the label. The number
        of labels is taken from the input instead of being hard-coded.
    """
    return [list(sample_labels) for sample_labels in zip(*per_label_predictions)]


# Multinomial Naive Bayes Classifier

predicted_output_MNB = _transpose_predictions(predicted_lol_MNB) # Per-sample predicted label vectors
predicted_output_MNB_tf = _transpose_predictions(predicted_lol_MNB_tf)
predicted_output_MNB_tfidf = _transpose_predictions(predicted_lol_MNB_tfidf)

# Logistic Regression

predicted_output_LR = _transpose_predictions(predicted_lol_LR) # Per-sample predicted label vectors
predicted_output_LR_tf = _transpose_predictions(predicted_lol_LR_tf)
predicted_output_LR_tfidf = _transpose_predictions(predicted_lol_LR_tfidf)

# Support Vector Machine

predicted_output_SVM = _transpose_predictions(predicted_lol_SVM) # Per-sample predicted label vectors
predicted_output_SVM_tf = _transpose_predictions(predicted_lol_SVM_tf)
predicted_output_SVM_tfidf = _transpose_predictions(predicted_lol_SVM_tfidf)


In [None]:
# Hamming Loss = (sum_N(sum_L(xor(T, P)))) / (N * L)
# N (number of test samples) and L (number of labels) are derived from the data
# instead of being hard-coded, so the cell works for any dataset size.

def _mismatches_per_sample(true_rows, predicted_rows):
    """Count, for every sample, how many labels differ between truth and prediction.

    Parameters
    ----------
    true_rows, predicted_rows : list of lists
        Outer index is the sample, inner index is the label.

    Returns
    -------
    list of int
        One mismatch count per sample.
    """
    return [
        sum(1 for true_value, predicted_value in zip(true_row, predicted_row)
            if true_value != predicted_value)
        for true_row, predicted_row in zip(true_rows, predicted_rows)
    ]


num_samples = len(true_labels1)
num_labels = len(true_labels1[0]) if true_labels1 else 0
total_label_slots = num_samples * num_labels  # The N * L denominator
if total_label_slots == 0:
    raise ValueError("Cannot compute Hamming loss: true_labels1 is empty.")

# Per-sample mismatch counts for every classifier / feature combination.
N_MNB = _mismatches_per_sample(true_labels1, predicted_output_MNB)
N_MNB_tf = _mismatches_per_sample(true_labels1, predicted_output_MNB_tf)
N_MNB_tfidf = _mismatches_per_sample(true_labels1, predicted_output_MNB_tfidf)
N_LR = _mismatches_per_sample(true_labels1, predicted_output_LR)
N_LR_tf = _mismatches_per_sample(true_labels1, predicted_output_LR_tf)
N_LR_tfidf = _mismatches_per_sample(true_labels1, predicted_output_LR_tfidf)
N_SVM = _mismatches_per_sample(true_labels1, predicted_output_SVM)
N_SVM_tf = _mismatches_per_sample(true_labels1, predicted_output_SVM_tf)
N_SVM_tfidf = _mismatches_per_sample(true_labels1, predicted_output_SVM_tfidf)

hamming_loss_MNB = sum(N_MNB) / total_label_slots
hamming_loss_LR = sum(N_LR) / total_label_slots
hamming_loss_SVM = sum(N_SVM) / total_label_slots
hamming_loss_MNB_tf = sum(N_MNB_tf) / total_label_slots
hamming_loss_LR_tf = sum(N_LR_tf) / total_label_slots
hamming_loss_SVM_tf = sum(N_SVM_tf) / total_label_slots
hamming_loss_MNB_tfidf = sum(N_MNB_tfidf) / total_label_slots
hamming_loss_LR_tfidf = sum(N_LR_tfidf) / total_label_slots
hamming_loss_SVM_tfidf = sum(N_SVM_tfidf) / total_label_slots

# Rows: Count Vectorizer, TF, TF-IDF. Columns: MNB, LR, SVM.
print(hamming_loss_MNB, hamming_loss_LR, hamming_loss_SVM)  #Optimal value is zero.
print(hamming_loss_MNB_tf, hamming_loss_LR_tf, hamming_loss_SVM_tf)
print(hamming_loss_MNB_tfidf, hamming_loss_LR_tfidf, hamming_loss_SVM_tfidf)

Precision, Recall, F-Score

In [None]:
# Macro-Average and Micro-Average Precision, Recall, F-Score

from sklearn.metrics import precision_recall_fscore_support

# The prfs_* names are reassigned on every iteration, so after the loop they hold
# the scores of the last label only. prfs_per_label keeps the scores of every
# label (one dict per label, in loop order) so none of them is lost.
prfs_per_label = []

for i in range(len(true_labels)):
    y_true = true_labels[i]

    # Macro Average
    prfs_mnb_ma = precision_recall_fscore_support(y_true, predicted_lol_MNB[i], average='macro') # For MNB with Count Vectorizer
    prfs_mnb_tf_ma = precision_recall_fscore_support(y_true, predicted_lol_MNB_tf[i], average='macro') # For MNB with TF
    prfs_mnb_tfidf_ma = precision_recall_fscore_support(y_true, predicted_lol_MNB_tfidf[i], average='macro') # For MNB with TF-IDF

    prfs_lr_ma = precision_recall_fscore_support(y_true, predicted_lol_LR[i], average='macro') # For LR with Count Vectorizer
    prfs_lr_tf_ma = precision_recall_fscore_support(y_true, predicted_lol_LR_tf[i], average='macro') # For LR with TF
    prfs_lr_tfidf_ma = precision_recall_fscore_support(y_true, predicted_lol_LR_tfidf[i], average='macro') # For LR with TF-IDF

    prfs_svm_ma = precision_recall_fscore_support(y_true, predicted_lol_SVM[i], average='macro') # For SVM with Count Vectorizer
    prfs_svm_tf_ma = precision_recall_fscore_support(y_true, predicted_lol_SVM_tf[i], average='macro') # For SVM with TF
    prfs_svm_tfidf_ma = precision_recall_fscore_support(y_true, predicted_lol_SVM_tfidf[i], average='macro') # For SVM with TF-IDF

    # Micro Average
    prfs_mnb_mi = precision_recall_fscore_support(y_true, predicted_lol_MNB[i], average='micro') # For MNB with Count Vectorizer
    prfs_mnb_tf_mi = precision_recall_fscore_support(y_true, predicted_lol_MNB_tf[i], average='micro') # For MNB with TF
    prfs_mnb_tfidf_mi = precision_recall_fscore_support(y_true, predicted_lol_MNB_tfidf[i], average='micro') # For MNB with TF-IDF

    prfs_lr_mi = precision_recall_fscore_support(y_true, predicted_lol_LR[i], average='micro') # For LR with Count Vectorizer
    prfs_lr_tf_mi = precision_recall_fscore_support(y_true, predicted_lol_LR_tf[i], average='micro') # For LR with TF
    prfs_lr_tfidf_mi = precision_recall_fscore_support(y_true, predicted_lol_LR_tfidf[i], average='micro') # For LR with TF-IDF

    prfs_svm_mi = precision_recall_fscore_support(y_true, predicted_lol_SVM[i], average='micro') # For SVM with Count Vectorizer
    prfs_svm_tf_mi = precision_recall_fscore_support(y_true, predicted_lol_SVM_tf[i], average='micro') # For SVM with TF
    prfs_svm_tfidf_mi = precision_recall_fscore_support(y_true, predicted_lol_SVM_tfidf[i], average='micro') # For SVM with TF-IDF

    prfs_per_label.append({
        'mnb_ma': prfs_mnb_ma, 'mnb_tf_ma': prfs_mnb_tf_ma, 'mnb_tfidf_ma': prfs_mnb_tfidf_ma,
        'lr_ma': prfs_lr_ma, 'lr_tf_ma': prfs_lr_tf_ma, 'lr_tfidf_ma': prfs_lr_tfidf_ma,
        'svm_ma': prfs_svm_ma, 'svm_tf_ma': prfs_svm_tf_ma, 'svm_tfidf_ma': prfs_svm_tfidf_ma,
        'mnb_mi': prfs_mnb_mi, 'mnb_tf_mi': prfs_mnb_tf_mi, 'mnb_tfidf_mi': prfs_mnb_tfidf_mi,
        'lr_mi': prfs_lr_mi, 'lr_tf_mi': prfs_lr_tf_mi, 'lr_tfidf_mi': prfs_lr_tfidf_mi,
        'svm_mi': prfs_svm_mi, 'svm_tf_mi': prfs_svm_tf_mi, 'svm_tfidf_mi': prfs_svm_tfidf_mi,
    })

    

In [None]:
# Printing precision, recall, F-Score
# One printed line per (classifier, averaging) pair; within a line the order is
# Count Vectorizer, TF, TF-IDF.
# NOTE: the prfs_* variables are reassigned on every pass of the loop that
# computes them, so the values shown here belong to the last label processed.
score_rows = (
    (prfs_mnb_ma, prfs_mnb_tf_ma, prfs_mnb_tfidf_ma),
    (prfs_mnb_mi, prfs_mnb_tf_mi, prfs_mnb_tfidf_mi),
    (prfs_lr_ma, prfs_lr_tf_ma, prfs_lr_tfidf_ma),
    (prfs_lr_mi, prfs_lr_tf_mi, prfs_lr_tfidf_mi),
    (prfs_svm_ma, prfs_svm_tf_ma, prfs_svm_tfidf_ma),
    (prfs_svm_mi, prfs_svm_tf_mi, prfs_svm_tfidf_mi),
)
for score_row in score_rows:
    print(*score_row)