In [None]:
# Colab-only: mount Google Drive at /content/drive; the data files are copied
# out of it by the `!cp` cell further down.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import textblob
import json
from nltk.tokenize import word_tokenize
import nltk

In [None]:
# `word_tokenize` needs the Punkt tokenizer data. NLTK 3.8.2+ loads it from the
# `punkt_tab` package instead of the pickled `punkt` one, so fetch both; on an
# older NLTK the extra package is simply unused.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Copy the practice/train CSVs and the bad-word list from the mounted Drive
# into the working directory, where the later `pd.read_csv` calls read them.
!cp "/content/drive/My Drive/toxic span/toxic_span_practice.csv" "./toxic_span_practice.csv"
!cp "/content/drive/My Drive/toxic span/toxic_span_train.csv" "./toxic_span_train.csv"
!cp "/content/drive/My Drive/toxic span/bad-words.csv" "./bad-words.csv"

In [None]:
def f1(preds,trues):
    """Set-based F1 between predicted and gold offsets for one example.

    Args:
        preds: collection of predicted offsets (duplicates are ignored).
        trues: collection of gold offsets (duplicates are ignored).

    Returns:
        2 * |preds ∩ trues| / (|preds| + |trues|) as a float. When `trues` is
        empty the score is 1.0 if `preds` is empty too, otherwise 0.0; an
        empty `preds` against a non-empty `trues` scores 0.0.
    """
    if len(trues) == 0:
        # Nothing to find: only an empty prediction is perfect.
        if len(preds) == 0:
            return 1.
        return 0.
    if len(preds) == 0:
        return 0.
    predicted = set(preds)
    gold = set(trues)
    overlap = predicted & gold
    return float(2 * len(overlap)) / float(len(predicted) + len(gold))

In [None]:
def avg_f1(preds,trues):
    """Return the mean of `f1` over paired predictions and gold spans.

    Args:
        preds: sequence with one collection of predicted offsets per example.
        trues: sequence with one collection of gold offsets per example,
            aligned with `preds`.

    Returns:
        The mean per-example F1 as a float, or 0.0 when there are no examples.

    Raises:
        ValueError: if `preds` and `trues` have different lengths (zip would
            otherwise drop the surplus examples while still dividing by
            len(preds), giving a silently wrong average).
    """
    if len(preds) != len(trues):
        raise ValueError(
            "preds and trues must have the same length (got %d and %d)"
            % (len(preds), len(trues))
        )
    if len(preds) == 0:
        # Avoid ZeroDivisionError on an empty evaluation set.
        return 0.0
    return sum(f1(pred, true) for pred, true in zip(preds, trues)) / len(preds)

In [None]:
# Load both splits; the `spans` column is parsed with json.loads into Python
# lists right after reading.
test_set = pd.read_csv("toxic_span_practice.csv")
test_set['spans'] = test_set['spans'].apply(json.loads)
train_set = pd.read_csv("toxic_span_train.csv")
train_set['spans'] = train_set['spans'].apply(json.loads)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
dataset = pd.concat([test_set, train_set], ignore_index=True)

# Lower-case the text so lexicon lookups are case-insensitive.
# NOTE(review): assumes lower-casing never changes string length (true for
# ASCII); otherwise the gold offsets in `spans` would no longer line up.
dataset['text'] = dataset['text'].str.lower()

# Hate-word list baseline

In [None]:
# The file appears to have no header row: the old column label 'jigaboo' was
# really its first entry, so reading with the default header dropped that word.
# NOTE(review): assumes a single-column CSV — confirm against the file.
# A set gives O(1) membership tests inside the per-token loops.
hate_words = set(
    pd.read_csv("./bad-words.csv", header=None, names=["word"])["word"].dropna()
)

## Tokenize on whitespace

In [None]:
# Predict, for every post, the character offsets of whitespace-separated tokens
# that appear in `hate_words`.
# NOTE: the F1 value saved in this notebook's output predates the offset fix
# below; re-run to refresh it.
indices_all = []
for text in dataset['text']:
    cursor = 0  # offset in `text` just past the previously located token
    indices = []
    for word in text.split():
        # Locate every token, flagged or not. Advancing the cursor only on
        # flagged tokens let `find` match a flagged word inside an earlier
        # token that merely contains it (e.g. "ass" inside "class").
        word_start = text.find(word, cursor)
        cursor = word_start + len(word)
        if word in hate_words:
            indices.extend(range(word_start, cursor))
    indices_all.append(indices)

In [None]:
# Mean per-post F1 of the predicted offsets against the `spans` column.
toxic_f1 = avg_f1(indices_all,dataset['spans'].to_numpy())

In [None]:
# Report the averaged F1 (%f prints six decimal places).
print("F1 : %f"%(toxic_f1))

F1 : 0.224819


## Tokenize with NLTK `word_tokenize`

In [None]:
# Predict, for every post, the character offsets of NLTK tokens that appear in
# `hate_words`.
# NOTE: the F1 value saved in this notebook's output predates the offset fix
# below; re-run to refresh it.
indices_all = []
for text in dataset['text']:
    cursor = 0  # offset in `text` just past the previously located token
    indices = []
    for word in word_tokenize(text):
        # Locate every token, flagged or not. Advancing the cursor only on
        # flagged tokens let `find` match a flagged word inside an earlier
        # token that merely contains it (e.g. "ass" inside "class").
        word_start = text.find(word, cursor)
        if word_start == -1:
            # The tokenizer can rewrite characters (e.g. quotation marks), so
            # the token may not occur verbatim; skip it and keep the cursor.
            continue
        cursor = word_start + len(word)
        if word in hate_words:
            indices.extend(range(word_start, cursor))
    indices_all.append(indices)

In [None]:
# Mean per-post F1 of the predicted offsets against the `spans` column.
toxic_f1 = avg_f1(indices_all,dataset['spans'].to_numpy())

In [None]:
# Report the averaged F1 (%f prints six decimal places).
print("F1 : %f"%(toxic_f1))

F1 : 0.332807


# Sentiment baseline

## Tokenize on whitespace

In [None]:
# Predict, for every post, the character offsets of whitespace-separated tokens
# whose TextBlob polarity is negative.
# NOTE: the F1 value saved in this notebook's output predates the offset fix
# below; re-run to refresh it.
indices_all = []
for text in dataset['text']:
    cursor = 0  # offset in `text` just past the previously located token
    indices = []
    for word in text.split():
        # Locate every token, flagged or not. Advancing the cursor only on
        # flagged tokens let `find` match a flagged word inside an earlier
        # token that merely contains it (e.g. "bad" inside "badge").
        word_start = text.find(word, cursor)
        cursor = word_start + len(word)
        if textblob.TextBlob(word).polarity < 0:
            indices.extend(range(word_start, cursor))
    indices_all.append(indices)

In [None]:
# Mean per-post F1 of the predicted offsets against the `spans` column.
toxic_f1 = avg_f1(indices_all,dataset['spans'].to_numpy())

In [None]:
# Report the averaged F1 (%f prints six decimal places).
print("F1 : %f"%(toxic_f1))

F1 : 0.364640


## Tokenize with NLTK `word_tokenize`

In [None]:
# Predict, for every post, the character offsets of NLTK tokens whose TextBlob
# polarity is negative.
# NOTE: the F1 value saved in this notebook's output predates the offset fix
# below; re-run to refresh it.
indices_all = []
for text in dataset['text']:
    cursor = 0  # offset in `text` just past the previously located token
    indices = []
    for word in word_tokenize(text):
        # Locate every token, flagged or not. Advancing the cursor only on
        # flagged tokens let `find` match a flagged word inside an earlier
        # token that merely contains it (e.g. "bad" inside "badge").
        word_start = text.find(word, cursor)
        if word_start == -1:
            # The tokenizer can rewrite characters (e.g. quotation marks), so
            # the token may not occur verbatim; skip it and keep the cursor.
            continue
        cursor = word_start + len(word)
        if textblob.TextBlob(word).polarity < 0:
            indices.extend(range(word_start, cursor))
    indices_all.append(indices)

In [None]:
# Mean per-post F1 of the predicted offsets against the `spans` column.
toxic_f1 = avg_f1(indices_all,dataset['spans'].to_numpy())

In [None]:
# Report the averaged F1 (%f prints six decimal places).
print("F1 : %f"%(toxic_f1))

F1 : 0.378191


# Combined sentiment and hate-word baseline

In [None]:
# The file appears to have no header row: the old column label 'jigaboo' was
# really its first entry, so reading with the default header dropped that word.
# NOTE(review): assumes a single-column CSV — confirm against the file.
# A set gives O(1) membership tests inside the per-token loops.
hate_words = set(
    pd.read_csv("./bad-words.csv", header=None, names=["word"])["word"].dropna()
)

## Tokenize on whitespace

In [None]:
# Predict, for every post, the character offsets of whitespace-separated tokens
# that are in `hate_words` or have a negative TextBlob polarity.
# NOTE: the F1 value saved in this notebook's output predates the offset fix
# below; re-run to refresh it.
indices_all = []
for text in dataset['text']:
    cursor = 0  # offset in `text` just past the previously located token
    indices = []
    for word in text.split():
        # Locate every token, flagged or not. Advancing the cursor only on
        # flagged tokens let `find` match a flagged word inside an earlier
        # token that merely contains it (e.g. "ass" inside "class").
        word_start = text.find(word, cursor)
        cursor = word_start + len(word)
        # Cheap lexicon lookup first; the TextBlob call only runs on a miss.
        if word in hate_words or textblob.TextBlob(word).polarity < 0:
            indices.extend(range(word_start, cursor))
    indices_all.append(indices)

In [None]:
# Mean per-post F1 of the predicted offsets against the `spans` column.
toxic_f1 = avg_f1(indices_all,dataset['spans'].to_numpy())

In [None]:
# Report the averaged F1 (%f prints six decimal places).
print("F1 : %f"%(toxic_f1))

F1 : 0.386213


## Tokenize with NLTK `word_tokenize`

In [None]:
# Predict, for every post, the character offsets of NLTK tokens that are in
# `hate_words` or have a negative TextBlob polarity.
# NOTE: the F1 value saved in this notebook's output predates the offset fix
# below; re-run to refresh it.
indices_all = []
for text in dataset['text']:
    cursor = 0  # offset in `text` just past the previously located token
    indices = []
    for word in word_tokenize(text):
        # Locate every token, flagged or not. Advancing the cursor only on
        # flagged tokens let `find` match a flagged word inside an earlier
        # token that merely contains it (e.g. "ass" inside "class").
        word_start = text.find(word, cursor)
        if word_start == -1:
            # The tokenizer can rewrite characters (e.g. quotation marks), so
            # the token may not occur verbatim; skip it and keep the cursor.
            continue
        cursor = word_start + len(word)
        # Cheap lexicon lookup first; the TextBlob call only runs on a miss.
        if word in hate_words or textblob.TextBlob(word).polarity < 0:
            indices.extend(range(word_start, cursor))
    indices_all.append(indices)

In [None]:
# Mean per-post F1 of the predicted offsets against the `spans` column.
toxic_f1 = avg_f1(indices_all,dataset['spans'].to_numpy())

In [None]:
# Report the averaged F1 (%f prints six decimal places).
print("F1 : %f"%(toxic_f1))

F1 : 0.418540
