In [1]:
# %pip install sumy
# %pip install matplotlib
# %pip install gensim
# %pip install num2words

import numpy as np
import pandas as pd

import string

import contractions
import gensim
import gensim.downloader as api
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from num2words import num2words

nltk.download("punkt", quiet=True)

from sklearn.linear_model import LinearRegression, LogisticRegression, Perceptron, SGDClassifier
from sklearn.metrics import f1_score
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.utils import get_stop_words

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 1000)

In [2]:
# thread = [(toxicity, (words))]
import os
import re
import json

data_path = 'data'
conversations = [
    'conversations-part1', 
    'conversations-part2',
    'conversations-part3',
]

dfs = []
total_number_of_conversations = sum(
    [len(os.listdir(os.path.join(data_path, conversation))) 
     for conversation in conversations])
number_of_conversations = 0

for conversation in conversations:
    path = os.path.join(data_path, conversation)
    for f in os.listdir(path):
        true_path = os.path.join(path, f)
        
        c_df = pd.read_json(true_path, lines=True)
        c_id = re.search('([0-9]+)', f).group(0)
        c_thread = []
        for t, c in zip(c_df['toxicity'], c_df['cleaned_content']):
            c_thread.append((float(t), str(c)))
        
        df = pd.DataFrame({'id': c_id, 'thread': c_thread})
        dfs.append(df)
        
        number_of_conversations += 1

thread_dfs = pd.concat(dfs)

In [3]:
LANGUAGE = "english"
NUM_SENTENCES = 3

tokenizer = Tokenizer(LANGUAGE)
stemmer = Stemmer(LANGUAGE)
summarizer = LuhnSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

def summarize_text(text: str) -> str:
    if not text: return ""
    parser = PlaintextParser.from_string(text, tokenizer)
    return "".join([x._text for x in summarizer(parser.document, NUM_SENTENCES)])

def numbers_to_words(text: str) -> str:
    t = text.split()
    for ind, word in enumerate(t):
        if all(c.isdigit() for c in word):
            t[ind] = num2words(word)
        elif (
            len(word) > 2
            and all(c.isdigit() for c in word[:-2])
            and word[-2:] in ["st", "nd", "rd", "th"]
        ):
            t[ind] = num2words(int(word[:-2]), to="ordinal")

    return " ".join(t)

In [4]:
grouped_thread_dfs = thread_dfs.groupby(['id'])['thread'].apply(list).reset_index()
# let's sample .01% of the datasets for testing
grouped_thread_dfs = grouped_thread_dfs.sample(frac=.01)

In [5]:
import re
from nltk.tokenize import word_tokenize

grouped_threads = []

for thread in grouped_thread_dfs["thread"]:
    grouped_comments = []
    
    for comment in thread:
        
        stripped_comment = str(comment[1]).strip().lower()
        no_http_comment = re.sub(r'\s*https?://\S+(\s+|$)', '', stripped_comment)        
        alphabetical_comment = re.sub(r'[^a-zA-Z\s+]', '', no_http_comment)
        whitespaced_comment = re.sub(r'\s\s+/g', ' ', alphabetical_comment)
        expanded_comment = contractions.fix(whitespaced_comment)
        tokenized_comment = word_tokenize(expanded_comment)
            
        grouped_comments.append(' '.join(tokenized_comment) + '.')

    grouped_threads.append(' '.join(grouped_comments))

grouped_thread_dfs['thread_text'] = grouped_threads

In [6]:
grouped_thread_dfs["summary"] = [summarize_text(text) for text in grouped_thread_dfs['thread_text']]

In [7]:
# get the labels
def calculate_overall_toxicity_boolean(thread):
    if not thread: return 0
    return sum([comment[0] for comment in thread])/len(thread) > .5

def calculate_overall_toxicity_continuous(thread):
    if not thread: return 0
    return sum([comment[0] for comment in thread])/len(thread)

In [8]:
grouped_thread_dfs['toxicity'] = [calculate_overall_toxicity_boolean(thread) 
                                  for thread in grouped_thread_dfs['thread']]

In [None]:
df = grouped_thread_dfs[['toxicity', 'summary']]
# of df == 1, select random == .3*len(df)
# of df == 0, select random == .3*len(df)
yes_toxic = df[df['toxicity'] == 1].sample(.02*len(df))
no_toxic = df[df['toxicity'] == 0].sample(.02*len(df))
balanced_df = pd.concat([yes_toxic, no_toxic])

train_df = balanced_df.sample(frac=0.6, random_state=200)
dev_df = balanced_df.drop(train_df.index).sample(frac=.5)
test_df = balanced_df.drop(train_df.index).drop(dev_df.index)

In [10]:
word2vec = api.load("word2vec-google-news-300")

In [11]:
### Word2Vec
def get_average_w2v_vector(row):
    words = row["summary"].split()

    avg_text_w2v = np.sum([word2vec[w] for w in words if w in word2vec], axis=0) / (
        len(words) if words else 1
    )

    if avg_text_w2v.shape != (300,):
        avg_text_w2v = np.zeros((300,))

    embedding = np.concatenate((avg_text_w2v), axis=None)
    return embedding

f1_scores = pd.DataFrame(columns=['Target Column', 'Dev Dataset F1', 'Test Dataset F1'])

for target_col in ["toxicity",]:

    train_X = train_df.apply(
        lambda x: get_average_w2v_vector(x), axis=1, result_type="expand"
    )
    dev_X = dev_df.apply(
        lambda x: get_average_w2v_vector(x), axis=1, result_type="expand"
    )
    test_X = test_df.apply(
        lambda x: get_average_w2v_vector(x), axis=1, result_type="expand"
    )

    train_y = train_df[target_col]
    dev_y = dev_df[target_col]
    test_y = test_df[target_col]

    train_X = train_X[train_y.notna()]
    train_y = train_y[train_y.notna()]

    dev_X = dev_X[dev_y.notna()]
    dev_y = dev_y[dev_y.notna()]

    test_X = test_X[test_y.notna()]
    test_y = test_y[test_y.notna()]

    clf = LogisticRegression(max_iter=10000)
    clf.fit(train_X, train_y)

    pred_dev_y = clf.predict(dev_X)
    pred_test_y = clf.predict(test_X)

    print(f"{target_col} : ")
    
    print(f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0))
    print(f1_score(test_y, pred_test_y, average='weighted', zero_division=0))
    f1_scores.loc[len(f1_scores.index)] = [target_col, f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0), f1_score(test_y, pred_test_y, average='weighted', zero_division=0)] 


toxicity : 
0.955090283353451
0.9294933004774372


In [12]:
### TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

train_col = 'summary'
target_col = 'toxicity'

vectorizer = TfidfVectorizer()
transformer = vectorizer.fit_transform(train_df[train_col])

train_X = vectorizer.transform(train_df[train_col])
dev_X = vectorizer.transform(dev_df[train_col])
test_X = vectorizer.transform(test_df[train_col])

train_y = train_df[target_col]
dev_y = dev_df[target_col]
test_y = test_df[target_col]

train_X = train_X[train_y.notna()]
train_y = train_y[train_y.notna()]

dev_X = dev_X[dev_y.notna()]
dev_y = dev_y[dev_y.notna()]

test_X = test_X[test_y.notna()]
test_y = test_y[test_y.notna()]


# Logistic Regression
clf = LogisticRegression(max_iter=10000)
clf.fit(transformer, train_df['toxicity'])

pred_dev_y = clf.predict(dev_X)
pred_test_y = clf.predict(test_X)

print("TF-IDF Logistic Regression:")
print(f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0))
print(f1_score(test_y, pred_test_y, average='weighted', zero_division=0))
f1_scores.loc[len(f1_scores.index)] = [target_col, f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0), f1_score(test_y, pred_test_y, average='weighted', zero_division=0)] 


# Perceptron
clf = Perceptron(max_iter=10000)
clf.fit(transformer, train_df['toxicity'])

pred_dev_y = clf.predict(dev_X)
pred_test_y = clf.predict(test_X)

print("TF-IDF Perceptron:")
print(f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0))
print(f1_score(test_y, pred_test_y, average='weighted', zero_division=0))
f1_scores.loc[len(f1_scores.index)] = [target_col, f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0), f1_score(test_y, pred_test_y, average='weighted', zero_division=0)] 


# SDGClassifier
clf = SGDClassifier(max_iter=10000)
clf.fit(transformer, train_df['toxicity'])

pred_dev_y = clf.predict(dev_X)
pred_test_y = clf.predict(test_X)

print("TF-IDF SDGClassifier:")
print(f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0))
print(f1_score(test_y, pred_test_y, average='weighted', zero_division=0))
f1_scores.loc[len(f1_scores.index)] = [target_col, f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0), f1_score(test_y, pred_test_y, average='weighted', zero_division=0)] 

TF-IDF Logistic Regression:
0.938262466759876
0.9307862679955703
TF-IDF Perceptron:
0.9638968163808151
0.935813953488372
TF-IDF SDGClassifier:
0.955090283353451
0.9307862679955703


In [13]:
grouped_thread_dfs['toxicity'] = [calculate_overall_toxicity_continuous(thread) 
                                  for thread in grouped_thread_dfs['thread']]

In [14]:
# TF-IDF Continuous
from sklearn.neural_network import MLPRegressor

train_col = 'summary'
target_col = 'toxicity'

vectorizer = TfidfVectorizer()
transformer = vectorizer.fit_transform(train_df[train_col])

train_X = vectorizer.transform(train_df[train_col])
dev_X = vectorizer.transform(dev_df[train_col])
test_X = vectorizer.transform(test_df[train_col])

train_y = train_df[target_col]
dev_y = dev_df[target_col]
test_y = test_df[target_col]

train_X = train_X[train_y.notna()]
train_y = train_y[train_y.notna()]

dev_X = dev_X[dev_y.notna()]
dev_y = dev_y[dev_y.notna()]

test_X = test_X[test_y.notna()]
test_y = test_y[test_y.notna()]

# Linear Regression
clf = LinearRegression()
clf.fit(train_X, train_df[target_col])
print("Linear Regression:")
print(f'R^2: {clf.score(dev_X, dev_y)}')

# Perceptron
clf = MLPRegressor()
clf.fit(train_X, train_df[target_col])
print("MLP Regressor:")
print(f'R^2: {clf.score(dev_X, dev_y)}')

Linear Regression:
R^2: 0.1588517053595372
MLP Regressor:
R^2: 0.0018399695738918753


In [17]:
# %pip install transformers
# %pip install sentencepiece

# BART
from transformers import pipeline

bart = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text_with_bart(text: str) -> str:
    if not text: return ""
    try:
        summary_list = bart(text, max_length=10000, min_length=1, do_sample=False)
        return summary_list[0]['summary_text']
    except:
        return ""
        

# T5
# https://towardsdatascience.com/simple-abstractive-text-summarization-with-pretrained-t5-text-to-text-transfer-transformer-10f6d602c426
import torch
import json 
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
device = torch.device('cpu')

def summarize_text_with_t5(text: str) -> str:
    tokenized_text = tokenizer.encode(text, return_tensors="pt").to(device)
    summary_ids = model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=2,
                                 min_length=1,
                                 max_length=10000,
                                 early_stopping=True)
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output

In [None]:
grouped_thread_dfs['summary_bart'] = [summarize_text_with_bart(text) for text in grouped_thread_dfs['thread_text']]

In [None]:
### Word2Vec
def get_average_w2v_vector(row, col):
    words = row[col].split()

    avg_text_w2v = np.sum([word2vec[w] for w in words if w in word2vec], axis=0) / (
        len(words) if words else 1
    )

    if avg_text_w2v.shape != (300,):
        avg_text_w2v = np.zeros((300,))

    embedding = np.concatenate((avg_text_w2v), axis=None)
    return embedding

f1_scores = pd.DataFrame(columns=['Target Column', 'Dev Dataset F1', 'Test Dataset F1'])

for target_col in ["toxicity",]:

    train_X = train_df.apply(
        lambda x: get_average_w2v_vector(x, "summary_bart"), axis=1, result_type="expand"
    )
    dev_X = dev_df.apply(
        lambda x: get_average_w2v_vector(x, "summary_bart"), axis=1, result_type="expand"
    )
    test_X = test_df.apply(
        lambda x: get_average_w2v_vector(x, "summary_bart"), axis=1, result_type="expand"
    )

    train_y = train_df[target_col]
    dev_y = dev_df[target_col]
    test_y = test_df[target_col]

    train_X = train_X[train_y.notna()]
    train_y = train_y[train_y.notna()]

    dev_X = dev_X[dev_y.notna()]
    dev_y = dev_y[dev_y.notna()]

    test_X = test_X[test_y.notna()]
    test_y = test_y[test_y.notna()]

    clf = LogisticRegression(max_iter=10000)
    clf.fit(train_X, train_y)

    pred_dev_y = clf.predict(dev_X)
    pred_test_y = clf.predict(test_X)

    print(f"{target_col} : ")
    
    print(f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0))
    print(f1_score(test_y, pred_test_y, average='weighted', zero_division=0))
    f1_scores.loc[len(f1_scores.index)] = [target_col, f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0), f1_score(test_y, pred_test_y, average='weighted', zero_division=0)] 


In [None]:
grouped_thread_dfs['summary_t5'] = [summarize_text_with_t5(text) for text in grouped_thread_dfs['thread_text']]

In [None]:
### Word2Vec
def get_average_w2v_vector(row, col):
    words = row[col].split()

    avg_text_w2v = np.sum([word2vec[w] for w in words if w in word2vec], axis=0) / (
        len(words) if words else 1
    )

    if avg_text_w2v.shape != (300,):
        avg_text_w2v = np.zeros((300,))

    embedding = np.concatenate((avg_text_w2v), axis=None)
    return embedding

f1_scores = pd.DataFrame(columns=['Target Column', 'Dev Dataset F1', 'Test Dataset F1'])

for target_col in ["toxicity",]:

    train_X = train_df.apply(
        lambda x: get_average_w2v_vector(x, "summary_t5"), axis=1, result_type="expand"
    )
    dev_X = dev_df.apply(
        lambda x: get_average_w2v_vector(x, "summary_t5"), axis=1, result_type="expand"
    )
    test_X = test_df.apply(
        lambda x: get_average_w2v_vector(x, "summary_t5"), axis=1, result_type="expand"
    )

    train_y = train_df[target_col]
    dev_y = dev_df[target_col]
    test_y = test_df[target_col]

    train_X = train_X[train_y.notna()]
    train_y = train_y[train_y.notna()]

    dev_X = dev_X[dev_y.notna()]
    dev_y = dev_y[dev_y.notna()]

    test_X = test_X[test_y.notna()]
    test_y = test_y[test_y.notna()]

    clf = LogisticRegression(max_iter=10000)
    clf.fit(train_X, train_y)

    pred_dev_y = clf.predict(dev_X)
    pred_test_y = clf.predict(test_X)

    print(f"{target_col} : ")
    
    print(f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0))
    print(f1_score(test_y, pred_test_y, average='weighted', zero_division=0))
    f1_scores.loc[len(f1_scores.index)] = [target_col, f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0), f1_score(test_y, pred_test_y, average='weighted', zero_division=0)] 
