In [1]:
# package import
import copy
import random
import operator
import os, math
import re, string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from os import path
from PIL import Image
from stop_words import get_stop_words
from collections import Counter, defaultdict
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk.corpus import stopwords 

import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# The six label columns that are scored in this notebook.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Same names with a "_y" suffix; output() drops these columns from its merged frame.
y_labels = [label_name + '_y' for label_name in labels]

In [3]:
def load_data(data_file):
    """Read the CSV at `data_file` and return its contents as a pandas DataFrame.

    `data_file` is passed straight to `pd.read_csv`, with no extra options.
    """
    frame = pd.read_csv(data_file)
    return frame

In [4]:
# Load, in this order: the true test labels, the baseline (NBSVM) submission,
# and the proposed model's submission.
test_label, baseline, proposed = (
    load_data(csv_name)
    for csv_name in (
        "test_labels.csv",
        "NBSVM_submission.csv",
        "model_submission.csv",
    )
)

In [5]:
# Keep only the rows used for testing: a row survives only when none of the
# six label columns holds -1.
is_scored = (test_label[labels] != -1).all(axis=1)
test_label = test_label[is_scored]

In [6]:
def output(data, threshold=0.5):
    """Score a submission against the true test labels.

    The predictions in `data` are inner-joined with the module-level
    `test_label` frame on ``id``. Both the true labels and the predicted
    scores are then read from that joined frame, so every prediction is
    compared with the label that belongs to the same ``id``. This holds even
    when the two frames list their rows in a different order or `data` is
    missing some ids. Columns are looked up by name, so the order of the
    label columns inside `data` does not matter either.

    Parameters
    ----------
    data : pandas.DataFrame
        Predictions with an ``id`` column and one score column per entry of
        the module-level `labels` list.
    threshold : float, default 0.5
        Cut-off for turning a score into a binary prediction: scores below
        it become 0, all others become 1.

    Returns
    -------
    tuple of (float, float, float)
        Mean ROC AUC, mean precision and mean recall over all labels, in that
        order. The first value is ROC AUC (computed on the raw scores), not
        classification accuracy.

    Raises
    ------
    ValueError
        If `data` and `test_label` share no ids.
    """
    # Explicit suffixes keep prediction and ground-truth columns apart after
    # the join; every label name exists in both frames.
    merged = data.merge(test_label, on='id', suffixes=('_pred', '_true'))
    if merged.empty:
        raise ValueError("no ids shared between the predictions and test_label")

    auc = []
    precision = []
    recall = []

    # Compute ROC AUC, precision and recall for each label column.
    for label in labels:
        y_true = np.asarray(merged[label + '_true'])
        y_score = np.asarray(merged[label + '_pred'])

        # Convert scores to binary predictions using the cut-off.
        y_binary = np.where(y_score < threshold, 0, 1)

        # ROC AUC uses the raw scores; precision/recall use the binarised ones.
        auc.append(roc_auc_score(y_true, y_score))
        precision.append(precision_score(y_true, y_binary))
        recall.append(recall_score(y_true, y_binary))

    # Average each metric over all label columns.
    auc_scores = sum(auc) / len(auc)
    precision_scores = sum(precision) / len(precision)
    recall_scores = sum(recall) / len(recall)
    return auc_scores, precision_scores, recall_scores

In [7]:
# Score the baseline submission; output() returns its three averaged metrics as a tuple.
(
    baseline_acc_scores,
    baseline_precision_scores,
    baseline_recall_scores,
) = output(baseline)

In [8]:
# Report the baseline metrics, one per line.
# NOTE(review): the figure printed under "accurary" is the mean ROC AUC
# returned by output(), not classification accuracy.
print(
    "baseline accurary: {}".format(baseline_acc_scores),
    "baseline precision: {}".format(baseline_precision_scores),
    "baseline recall: {}".format(baseline_recall_scores),
    sep="\n",
)

baseline accurary: 0.9624293529808764
baseline precision: 0.61825263005296
baseline recall: 0.4605871765562332


In [9]:
# Score the proposed model's submission; output() returns its three averaged metrics as a tuple.
(
    proposed_acc_scores,
    proposed_precision_scores,
    proposed_recall_scores,
) = output(proposed)

In [10]:
# Report the proposed model's metrics, one per line.
# NOTE(review): the figure printed under "accurary" is the mean ROC AUC
# returned by output(), not classification accuracy.
print(
    "proposed accurary: {}".format(proposed_acc_scores),
    "proposed precision: {}".format(proposed_precision_scores),
    "proposed recall: {}".format(proposed_recall_scores),
    sep="\n",
)

proposed accurary: 0.9276368080283678
proposed precision: 0.367051421326654
proposed recall: 0.6724089318160916
