In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from collections import Counter
from sklearn.linear_model import LogisticRegression

# Label Powerset

Read the processed CSV file prepared for the Label Powerset technique.

In [None]:
# Load the preprocessed dataset from a CSV file given by a relative path
# (resolved against the notebook's working directory). Later cells read the
# 'Class' and 'Abstract' columns from this frame.
df = pd.read_csv('processed_data_lp.csv')

Counting how often each label occurs in the Label Powerset dataframe.

In [None]:
# Per-class occurrence counts, in order of first appearance of each class.
# (Still defined in case a later cell inspects them.)
df_class = df['Class'].to_numpy()
label_count = list(Counter(df_class).values())

# Attach to every row the number of rows that share its 'Class' value.
# This is computed per row via groupby/transform, so it is correct regardless
# of row order. Expanding the Counter values into one flat list would only
# line up with the rows if the frame were already grouped by class in
# first-appearance order.
df['Label Count'] = df.groupby('Class')['Class'].transform('size')


Removing labels which occur only once in the Label Powerset dataframe, as a single sample cannot be further split between the training and testing sets.

In [None]:
# Keep only rows whose class occurs more than once: drop every row whose
# 'Label Count' equals 1.
is_not_singleton = df['Label Count'].ne(1)
df = df.loc[is_not_singleton]

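Using stratified sampling to create a 75:25 train-test split and vectorizing the abstracts. The cells below then measure the accuracy of predicting the Label Powerset labels with Multinomial Naive Bayes, Logistic Regression, and Support Vector Machine classifiers.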

In [None]:
X = np.array(df['Abstract'])
Y = np.array(df['Class'])

# Stratified 75:25 split so every class is represented in both sets.
# A fixed random_state makes the split (and every score below) reproducible
# on Restart & Run All.
XTrain, XTest, YTrain, YTest = train_test_split(
    X, Y, stratify=Y, test_size=0.25, random_state=0
)

# Unigram + bigram counts. max_df must be the float 1.0 (a proportion, i.e.
# "no upper limit"). The integer 1 is an absolute document count, and together
# with min_df=1 it would keep only terms occurring in exactly one training
# document.
cv = CountVectorizer(ngram_range=(1, 2), min_df=1, max_df=1.0)
tf = TfidfTransformer(use_idf=False)    # term frequency only
tfidf = TfidfTransformer(use_idf=True)  # term frequency weighted by IDF

# Fit the vocabulary and the weightings on the training set only.
XTrain_counts = cv.fit_transform(XTrain)
XTrain_tf = tf.fit_transform(XTrain_counts)
XTrain_tfidf = tfidf.fit_transform(XTrain_counts)

# Re-use the fitted transformers on the test set (no refitting).
XTest_counts = cv.transform(XTest)
XTest_tf = tf.transform(XTest_counts)
XTest_tfidf = tfidf.transform(XTest_counts)



In [None]:
# Multinomial Naive Bayes
# One model per feature representation, all with the same smoothing (alpha).
classifier_MNB = MultinomialNB(alpha=0.01).fit(XTrain_counts, YTrain)  # trained on raw counts
classifier_MNB_tf = MultinomialNB(alpha=0.01).fit(XTrain_tf, YTrain)  # trained on TF without IDF
classifier_MNB_tfidf = MultinomialNB(alpha=0.01).fit(XTrain_tfidf, YTrain)  # trained on TF-IDF

# Each model predicts on the test features in the representation it was
# trained on. Using the count-trained model for the TF / TF-IDF predictions
# would make them disagree with the accuracies reported by the matching
# models' .score().
predicted_MNB = classifier_MNB.predict(XTest_counts)
predicted_MNB_tf = classifier_MNB_tf.predict(XTest_tf)
predicted_MNB_tfidf = classifier_MNB_tfidf.predict(XTest_tfidf)


In [None]:
# Logistic Regression
# Fit one model per feature representation, in the order:
# raw counts, TF without IDF, TF-IDF.
classifier_LR, classifier_LR_tf, classifier_LR_tfidf = [
    LogisticRegression(random_state=0).fit(train_features, YTrain)
    for train_features in (XTrain_counts, XTrain_tf, XTrain_tfidf)
]

# Predict with each model on the test features of its own representation.
predicted_LR, predicted_LR_tf, predicted_LR_tfidf = [
    fitted_model.predict(test_features)
    for fitted_model, test_features in (
        (classifier_LR, XTest_counts),
        (classifier_LR_tf, XTest_tf),
        (classifier_LR_tfidf, XTest_tfidf),
    )
]

In [None]:
# Support Vector Machine
# One SVC (default hyper-parameters) per feature representation.
classifier_SVM = svm.SVC().fit(XTrain_counts, YTrain)  # trained on raw counts
classifier_SVM_tf = svm.SVC().fit(XTrain_tf, YTrain)  # trained on TF without IDF
classifier_SVM_tfidf = svm.SVC().fit(XTrain_tfidf, YTrain)  # trained on TF-IDF

# Each model predicts on the test features in the representation it was
# trained on. Using the count-trained model for the TF / TF-IDF predictions
# would make them disagree with the accuracies reported by the matching
# models' .score().
predicted_SVM = classifier_SVM.predict(XTest_counts)
predicted_SVM_tf = classifier_SVM_tf.predict(XTest_tf)
predicted_SVM_tfidf = classifier_SVM_tfidf.predict(XTest_tfidf)

# Evaluation Metrics

Accuracy

In [None]:
# Report the test-set accuracy of every classifier. Each entry pairs the
# message prefix with the fitted model and the test features in the
# representation that model was trained on.
_accuracy_rows = [
    ('Accuracy of Multinomial Naive Bayes Classifier: ', classifier_MNB, XTest_counts),
    ('Accuracy of Logistic Regression: ', classifier_LR, XTest_counts),
    ('Accuracy of Support Vector Machine: ', classifier_SVM, XTest_counts),
    ('Accuracy of Multinomial Naive Bayes Classifier with TF: ', classifier_MNB_tf, XTest_tf),
    ('Accuracy of Logistic Regression with TF: ', classifier_LR_tf, XTest_tf),
    ('Accuracy of Support Vector Machine with TF: ', classifier_SVM_tf, XTest_tf),
    ('Accuracy of Multinomial Naive Bayes Classifier with TF-IDF: ', classifier_MNB_tfidf, XTest_tfidf),
    ('Accuracy of Logistic Regression with TF-IDF: ', classifier_LR_tfidf, XTest_tfidf),
    ('Accuracy of Support Vector Machine with TF-IDF: ', classifier_SVM_tfidf, XTest_tfidf),
]

for _prefix, _fitted, _test_features in _accuracy_rows:
    print(_prefix + str(_fitted.score(_test_features, YTest)))

Precision, Recall, F-Score

In [None]:
from sklearn.metrics import precision_recall_fscore_support
import warnings

# NOTE: this silences *all* warnings for the rest of the session, not only the
# undefined-metric warnings raised for classes that are never predicted.
warnings.filterwarnings('ignore')

# Every result below is a tuple (precision, recall, f-score, support).
# Because `average` is set, `support` is None.
# Each variable is computed from the predictions of the matching
# classifier / feature representation (counts, TF, TF-IDF).

# Macro average: unweighted mean of the per-class metrics.
prfs_mnb_ma = precision_recall_fscore_support(YTest, predicted_MNB, average='macro')
prfs_mnb_tf_ma = precision_recall_fscore_support(YTest, predicted_MNB_tf, average='macro')
prfs_mnb_tfidf_ma = precision_recall_fscore_support(YTest, predicted_MNB_tfidf, average='macro')

prfs_lr_ma = precision_recall_fscore_support(YTest, predicted_LR, average='macro')
prfs_lr_tf_ma = precision_recall_fscore_support(YTest, predicted_LR_tf, average='macro')
prfs_lr_tfidf_ma = precision_recall_fscore_support(YTest, predicted_LR_tfidf, average='macro')

prfs_svm_ma = precision_recall_fscore_support(YTest, predicted_SVM, average='macro')
prfs_svm_tf_ma = precision_recall_fscore_support(YTest, predicted_SVM_tf, average='macro')
prfs_svm_tfidf_ma = precision_recall_fscore_support(YTest, predicted_SVM_tfidf, average='macro')

# Micro average: metrics computed globally over all individual predictions.
prfs_mnb_mi = precision_recall_fscore_support(YTest, predicted_MNB, average='micro')
prfs_mnb_tf_mi = precision_recall_fscore_support(YTest, predicted_MNB_tf, average='micro')
prfs_mnb_tfidf_mi = precision_recall_fscore_support(YTest, predicted_MNB_tfidf, average='micro')

prfs_lr_mi = precision_recall_fscore_support(YTest, predicted_LR, average='micro')
prfs_lr_tf_mi = precision_recall_fscore_support(YTest, predicted_LR_tf, average='micro')
prfs_lr_tfidf_mi = precision_recall_fscore_support(YTest, predicted_LR_tfidf, average='micro')

prfs_svm_mi = precision_recall_fscore_support(YTest, predicted_SVM, average='micro')
prfs_svm_tf_mi = precision_recall_fscore_support(YTest, predicted_SVM_tf, average='micro')
prfs_svm_tfidf_mi = precision_recall_fscore_support(YTest, predicted_SVM_tfidf, average='micro')
