# Hate Speech Detection A

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from pandarallel import pandarallel
pandarallel.initialize(nb_workers=4, progress_bar=False)

import torch

from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Load Model

### Load Tokenizer and Classifier

In [None]:
# Hugging Face Hub checkpoint used for both the tokenizer and the classifier.
checkpoint_name = "Hate-speech-CNERG/dehatebert-mono-english"

# Tokenizer and sequence-classification model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)

### Load Model Label Mapping

In [None]:
# Class index -> label name; see the model's config for the reference mapping:
# https://huggingface.co/Hate-speech-CNERG/dehatebert-mono-english/blob/main/config.json
id2label = dict(enumerate(("NON_HATE", "HATE")))

## Load Data

### Load Tweets

In [None]:
# Load the English tweets and keep only the first 20 rows.
# NOTE(review): the 20-row cap looks like a debugging sample — confirm before a full run.
tweets_path = 'data/tweets/en/english_tweets.parquet'
df_tweets = pd.read_parquet(tweets_path)[:20]

### Load News IDs

In [None]:
# Read the header-less CSV of news IDs and flatten it into a 1-D array.
news_index_table = pd.read_csv('data/news/news_indexes.csv', header=None)
news_ids = news_index_table.to_numpy().ravel()

### Filter News

In [None]:
# Cast the tweet IDs to int before matching them against the news IDs.
tweet_ids = df_tweets['id'].astype(int)
df_tweets['id'] = tweet_ids

# Flag every tweet whose ID appears in the news ID array.
df_tweets['is_news'] = tweet_ids.isin(news_ids)

## Hate Speech Detection

### Pre-Processing

In [None]:
def preprocess(text):
    """Mask user mentions and links in *text*.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than one character becomes ``@user``; every token that
    starts with ``http`` becomes ``http``. All other tokens are kept as-is,
    and the tokens are re-joined with single spaces.
    """
    masked_tokens = [
        '@user' if (len(token) > 1 and token.startswith('@'))
        else 'http' if token.startswith('http')
        else token
        for token in text.split(' ')
    ]
    return ' '.join(masked_tokens)

In [None]:
# Collapse every run of whitespace into a single space, then run preprocess()
# on each tweet in parallel.
single_spaced = df_tweets['text'].str.replace(r'\s+', ' ', regex=True)
df_tweets['pre_process_text'] = single_spaced.parallel_apply(preprocess)

In [None]:
# Plain Python list of the pre-processed tweet texts.
texts = df_tweets['pre_process_text'].tolist()

In [None]:
### Classification

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

model = model.to(device)

# Report the memory figures for CUDA device 0, converted from bytes to GB (1024**3).
bytes_per_gb = 1024**3
allocated_gb = round(torch.cuda.memory_allocated(0) / bytes_per_gb, 1)
print(f'Allocated: {allocated_gb} GB')
reserved_gb = round(torch.cuda.memory_reserved(0) / bytes_per_gb, 1)
print(f'Cached:    {reserved_gb} GB')

In [None]:
step = 5  # batch size: number of tweets scored per forward pass
res = []  # one NumPy array of raw model outputs per batch

# Upper bound on tokens per text. Falls back to the tokenizer's own limit
# (max_length=None) when the model config does not expose one.
max_tokens = getattr(model.config, "max_position_embeddings", None)

# Inference only: without no_grad() every forward pass records an autograd
# graph that is never used and only costs memory.
with torch.no_grad():
    # tqdm reads the length of a range directly, so no list() is needed.
    for i in tqdm(range(0, len(texts), step)):
        _texts = texts[i:i + step]  # slicing clamps at the end of the list

        # Tokenize the batch: pad to its longest text and truncate over-long
        # texts rather than passing them to the model unbounded.
        encoded_input = tokenizer(
            _texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_tokens,
        ).to(device)

        output = model(**encoded_input)

        # First element of the model output, moved to the CPU as a NumPy array.
        scores = output[0].cpu().numpy()
        res.append(scores)

        # Drop the batch tensors and release cached GPU memory before the next batch.
        del encoded_input
        del output
        torch.cuda.empty_cache()

In [None]:
# Merge the per-batch arrays row-wise into one array (axis 0 is the default).
res = np.concatenate(res)

In [None]:
# Notebook display: show the concatenated model outputs.
res

In [None]:
# Raw model outputs as a frame aligned with the tweets' index; the integer
# column positions are renamed to their label names.
result_scores = pd.DataFrame(res, index=df_tweets.index)
result_scores = result_scores.rename(columns=id2label)

In [None]:
# Notebook display: show the raw score frame.
result_scores

In [None]:
# Row-wise softmax of the raw outputs, in a frame aligned with the tweets'
# index and with the same label-named columns.
softmax_values = softmax(res, axis=1)
result_softmax = pd.DataFrame(softmax_values, index=df_tweets.index)
result_softmax = result_softmax.rename(columns=id2label)

In [None]:
# Notebook display: show the softmax frame.
result_softmax

In [None]:
# Attach the raw score columns first, then the softmax columns; the latter
# share the same label names, so they receive the '_softmax' suffix.
tweets_with_scores = df_tweets.join(result_scores)
df_tweets = tweets_with_scores.join(result_softmax, rsuffix='_softmax')
df_tweets

In [None]:
# Notebook display: tweet ID together with the raw and softmax scores.
preview_columns = ['id', 'NON_HATE', 'HATE', 'NON_HATE_softmax', 'HATE_softmax']
df_tweets[preview_columns]

In [None]:
# Write the tweet ID together with the raw and softmax scores to Parquet.
export_columns = ['id', 'NON_HATE', 'HATE', 'NON_HATE_softmax', 'HATE_softmax']
export_frame = df_tweets[export_columns]
export_frame.to_parquet('data/hate_speech/hate_model_a.parquet')