In [1]:
import os

import cleansetext
import pandas as pd

In [2]:
# Directory holding the raw Latent Hatred TSV files, relative to the working
# directory. Built with os.path.join so the notebook runs on any OS and the
# string literals contain no backslash escapes ('\L', '\i' are invalid escape
# sequences and raise a SyntaxWarning on recent Python versions).
LATENT_HATRED_RAW_DIR = os.path.join('Raw_Data', 'Latent_Hatred')


def _read_latent_hatred_tsv(file_name):
    """Read one tab-separated file from ``LATENT_HATRED_RAW_DIR``.

    Parameters
    ----------
    file_name : str
        Bare file name (no directory part) of the TSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed file, using pandas' default header handling.
    """
    return pd.read_csv(os.path.join(LATENT_HATRED_RAW_DIR, file_name), sep='\t')


# Stage 1: one row per post with its 'class' column (read in a later cell).
df_stg_1_post_label = _read_latent_hatred_tsv('implicit_hate_v1_stg1_posts.tsv')
df_stg_1_id_label = _read_latent_hatred_tsv('implicit_hate_v1_stg1.tsv')

# Stage 2: posts with their 'implicit_class' column (read in a later cell).
df_stg_2_post_implicit_class = _read_latent_hatred_tsv('implicit_hate_v1_stg2_posts.tsv')
df_stg_2_id_implicit_class = _read_latent_hatred_tsv('implicit_hate_v1_stg2.tsv')

# Stage 3: posts with 'target' and 'implied_statement' columns (read in a later cell).
df_stg3_implicit_post_target_implied_statement = _read_latent_hatred_tsv('implicit_hate_v1_stg3_posts.tsv')

# Outer join of the stage-1 and stage-2 post tables on the post text.
# NOTE(review): not referenced by any cell visible here; kept in case a later
# cell depends on it.
df_mre = pd.merge(df_stg_1_post_label, df_stg_2_post_implicit_class, on='post', how='outer')

In [3]:
# Pull the columns needed downstream out of each stage's frame as plain
# Python lists, one list per column.

# Stage 1: post text and its class label.
df_stg_1_posts, df_stg_1_classes = (
    df_stg_1_post_label[column].tolist() for column in ('post', 'class')
)

# Stage 2: post text and its implicit-hate class.
df_stg_2_posts, df_stg_2_implicit_class = (
    df_stg_2_post_implicit_class[column].tolist() for column in ('post', 'implicit_class')
)

# Stage 3: post text, target and implied statement.
df_stg_3_posts, df_stg_3_targets, df_stg_3_implied_statement = (
    df_stg3_implicit_post_target_implied_statement[column].tolist()
    for column in ('post', 'target', 'implied_statement')
)

In [4]:
# Look-up table from a stage-2 post to its implicit class. setdefault keeps the
# class of the FIRST occurrence of a post, which is what list.index() would
# have returned, while replacing a full list scan per stage-1 post with a
# constant-time dictionary look-up.
stg_2_class_by_post = {}
for stg_2_post, stg_2_class in zip(df_stg_2_posts, df_stg_2_implicit_class):
    stg_2_class_by_post.setdefault(stg_2_post, stg_2_class)

# Placeholder class for implicit-hate posts that have no stage-2 entry.
NOT_ANNOTATED = 'implicit_but_not_implicit_class_annotated'

# Three parallel lists, one entry per stage-1 post:
#   final_posts          - the post text, unchanged
#   final_labels         - the stage-1 class, unchanged
#   final_implicit_class - for 'implicit_hate' posts the stage-2 implicit class
#                          (or the placeholder above); otherwise the stage-1 class
final_posts = []
final_labels = []
final_implicit_class = []
for post, label in zip(df_stg_1_posts, df_stg_1_classes):
    final_posts.append(post)
    final_labels.append(label)
    if label == 'implicit_hate':
        final_implicit_class.append(stg_2_class_by_post.get(post, NOT_ANNOTATED))
    else:
        final_implicit_class.append(label)

In [5]:
# Assemble the working frame from the parallel lists: one row per post with
# its text ('final_posts') and its class label ('final_labels').
df_latenthatred = pd.DataFrame(
    dict(final_posts=final_posts, final_labels=final_labels)
)

In [6]:
import cleansetext
from cleansetext.pipeline import Pipeline
from cleansetext.steps import *
from nltk.tokenize import TweetTokenizer

# Tokenizer used to split a post into tokens before cleaning.
tk = TweetTokenizer()

# Cleaning steps, applied to the token list in the order given.
# NOTE(review): punctuation is removed before URLs / HTML tags / usernames are
# replaced - confirm the later steps still recognise their targets.
preprocessing_steps = [
    RemoveEmojis(),
    RemoveAllPunctuations(),
    RemoveTokensWithOnlyPunctuations(),
    ReplaceURLsandHTMLTags(),
    ReplaceUsernames(),
    RemoveWhiteSpaceOrChunksOfWhiteSpace(),
]

# Pipeline that runs the steps above; diff tracking is switched off.
pipeline = Pipeline(preprocessing_steps, track_diffs=False)

def apply_preprocessing(text):
    """Clean a single post.

    The text is tokenized with the module-level ``tk``, the resulting tokens
    are passed through the module-level ``pipeline``, and whatever tokens come
    back are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The processed tokens joined by ``" "``.
    """
    tokens = tk.tokenize(text)
    cleaned_tokens = pipeline.process(tokens)
    return " ".join(cleaned_tokens)

In [7]:
# Clean every post. The column is overwritten with the processed text, so the
# raw text is no longer available from this frame afterwards.
cleaned_posts = df_latenthatred['final_posts'].apply(apply_preprocessing)
df_latenthatred['final_posts'] = cleaned_posts

In [8]:
# Keep only the two columns needed for the classification splits: the cleaned
# post text and its class label.
label_task_columns = ['final_posts', 'final_labels']
df_latenthatred_labels = df_latenthatred[label_task_columns]

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set; the fixed random_state keeps the
# split the same from run to run.
train_and_test = train_test_split(
    df_latenthatred_labels,
    test_size=0.25,
    random_state=42,
)
df_latenthatred_labels_train, df_latenthatred_labels_test = train_and_test

In [10]:
# Integer id for each stage-1 class label.
LABEL_TO_ID = {'not_hate': 0, 'implicit_hate': 1, 'explicit_hate': 2}


def _encode_labels(frame):
    """Return a new frame whose ``final_labels`` column holds integer ids.

    A new frame is built with ``assign`` instead of writing into ``frame``,
    because the split frames are derived from another DataFrame and assigning
    a column on them in place can raise pandas' SettingWithCopyWarning.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a ``final_labels`` column of class-name strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with ``final_labels`` mapped through ``LABEL_TO_ID``.

    Raises
    ------
    ValueError
        If any label is missing from ``LABEL_TO_ID``. ``Series.map`` would turn
        such labels into NaN silently; this also happens when the cell is run
        a second time on labels that are already encoded.
    """
    encoded = frame['final_labels'].map(LABEL_TO_ID)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(frame.loc[unmapped, 'final_labels'].astype(str)))
        raise ValueError(
            'final_labels contains values with no integer id: {} '
            '(was this cell run twice?)'.format(unknown)
        )
    return frame.assign(final_labels=encoded)


df_latenthatred_labels_train = _encode_labels(df_latenthatred_labels_train)
df_latenthatred_labels_test = _encode_labels(df_latenthatred_labels_test)

In [11]:
# Train / dev split: carve 25% of the current training rows off as the dev
# set, using the same fixed seed as the train / test split.
# NOTE: this rebinds df_latenthatred_labels_train to the smaller remainder, so
# running this cell again on its own splits the already-reduced frame once more.
train_and_dev = train_test_split(
    df_latenthatred_labels_train,
    test_size=0.25,
    random_state=42,
)
df_latenthatred_labels_train, df_latenthatred_labels_dev = train_and_dev

In [12]:
def _count_label(frame, label_id):
    """Return how many rows of ``frame`` have ``final_labels`` equal to ``label_id``."""
    return (frame['final_labels'] == label_id).sum()


# The three splits, in the order the counts are reported: train, dev, test.
label_splits = (
    df_latenthatred_labels_train,
    df_latenthatred_labels_dev,
    df_latenthatred_labels_test,
)

# Per-split row counts for each encoded class (1 = implicit, 2 = explicit, 0 = not hate).
implicit_hate_train, implicit_hate_dev, implicit_hate_test = (
    _count_label(split, 1) for split in label_splits
)
explicit_hate_train, explicit_hate_dev, explicit_hate_test = (
    _count_label(split, 2) for split in label_splits
)
non_hate_train, non_hate_dev, non_hate_test = (
    _count_label(split, 0) for split in label_splits
)

# One line per class, columns are train / dev / test.
print(implicit_hate_train, implicit_hate_dev, implicit_hate_test)
print(explicit_hate_train, explicit_hate_dev, explicit_hate_test)
print(non_hate_train, non_hate_dev, non_hate_test)

3991 1356 1753
590 228 271
7501 2444 3346


In [29]:
# Output directory for the preprocessed splits, relative to the working
# directory. Built with os.path.join so the path works on any OS and the
# string literals contain no backslash escapes ('\L' is an invalid escape
# sequence and raises a SyntaxWarning on recent Python versions).
LATENT_HATRED_OUT_DIR = os.path.join('PreProcessed_Data', 'Latent_Hatred')

# to_csv does not create missing directories, so make sure the target exists
# instead of failing at the very last step of the notebook.
os.makedirs(LATENT_HATRED_OUT_DIR, exist_ok=True)

# Write each split as a tab-separated file with a header row and no index column.
for split_frame, file_name in (
    (df_latenthatred_labels_train, 'LatentHatred_Training.txt'),
    (df_latenthatred_labels_dev, 'LatentHatred_Val.txt'),
    (df_latenthatred_labels_test, 'LatentHatred_Test.txt'),
):
    split_frame.to_csv(os.path.join(LATENT_HATRED_OUT_DIR, file_name), sep='\t', index=False)