In [1]:
import cleansetext
import pandas as pd

In [2]:
# Load the OLID training file (tab-separated) and keep only the columns used
# downstream: the tweet id, the tweet text and the 'subtask_a' label.
# The path uses forward slashes: the former backslash form contained the
# invalid escape sequence "\O" (a warning on recent Python versions) and only
# resolved on Windows. Forward slashes work on Windows and POSIX alike.
df_training = pd.read_csv('Raw_Data/OLID/OLID_Training.txt', sep='\t')
df_training = df_training[['id', 'tweet', 'subtask_a']]
df_training.columns = ['id', 'text', 'label']
# Encode the label numerically: 'OFF' -> 1, 'NOT' -> 0.
# Series.map turns any value missing from the dict into NaN.
df_training['label'] = df_training['label'].map({'OFF': 1, 'NOT': 0})

In [3]:
# Load the OLID test split, which is stored as two files: the tweets
# (tab-separated, with a header row) and the labels (comma-separated, read
# without a header row and given the column names 'id' and 'label').
# Paths use forward slashes: the former backslash form contained the invalid
# escape sequence "\O" (a warning on recent Python versions) and only resolved
# on Windows. Forward slashes work on Windows and POSIX alike.
df_test_sentences = pd.read_csv('Raw_Data/OLID/OLID_Test_Sentences.txt', sep='\t')
df_test_labels = pd.read_csv('Raw_Data/OLID/OLID_Test_Labels.csv', names=['id', 'label'])
# Attach each label to its tweet via the shared 'id' column. pd.merge defaults
# to an inner join, so ids present in only one of the two files are dropped.
df_test = pd.merge(df_test_sentences, df_test_labels, on='id')
df_test = df_test[['id', 'tweet', 'label']]
df_test.columns = ['id', 'text', 'label']
# Encode the label numerically: 'OFF' -> 1, 'NOT' -> 0.
# Series.map turns any value missing from the dict into NaN.
df_test['label'] = df_test['label'].map({'OFF': 1, 'NOT': 0})

In [4]:
import cleansetext
from cleansetext.pipeline import Pipeline
from cleansetext.steps import *
from nltk.tokenize import TweetTokenizer

# Tokenizer used to split each raw tweet into tokens before cleaning.
tk = TweetTokenizer()

# Cleaning steps handed to the cleansetext Pipeline, in this exact order.
# NOTE(review): what each step does is inferred from its class name only;
# confirm against the cleansetext documentation before reordering anything.
_preprocessing_steps = [
    RemoveEmojis(),
    RemoveAllPunctuations(),
    RemoveTokensWithOnlyPunctuations(),
    ReplaceURLsandHTMLTags(),
    ReplaceUsernames(),
    RemoveWhiteSpaceOrChunksOfWhiteSpace(),
]

# Diff tracking is explicitly switched off.
pipeline = Pipeline(_preprocessing_steps, track_diffs=False)

def apply_preprocessing(text):
    """Tokenize one text, clean the tokens and glue them back together.

    The text is tokenized with the module-level tokenizer ``tk``, the token
    sequence is passed through the module-level ``pipeline``, and the result
    of the pipeline is joined with single spaces.

    Args:
        text: The raw text to process (a tweet string in this notebook).

    Returns:
        str: The processed tokens joined by a single space.
    """
    tokens = tk.tokenize(text)
    cleaned_tokens = pipeline.process(tokens)
    return " ".join(cleaned_tokens)

In [5]:
# Replace every training text with its tokenized-and-cleaned version.
df_training['text'] = df_training['text'].map(apply_preprocessing)

In [6]:
# Replace every test text with its tokenized-and-cleaned version.
df_test['text'] = df_test['text'].map(apply_preprocessing)

In [7]:
# Train / validation split: hold out 20% of the training rows as a validation
# set, with a fixed seed so the split is reproducible.
# NOTE(review): `df_training` is rebound to the 80% remainder, so running this
# cell a second time without reloading the data splits the remainder again.
from sklearn.model_selection import train_test_split
df_training, df_val = train_test_split(
    df_training,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Class counts per split, in the order train / validation / test.
# Label 1 is the encoded 'OFF' class and label 0 the encoded 'NOT' class.
hate_train, hate_dev, hate_test = (
    split['label'].eq(1).sum() for split in (df_training, df_val, df_test)
)
non_hate_train, non_hate_dev, non_hate_test = (
    split['label'].eq(0).sum() for split in (df_training, df_val, df_test)
)

# First line: label-1 counts; second line: label-0 counts.
print(hate_train, hate_dev, hate_test)
print(non_hate_train, non_hate_dev, non_hate_test)

3485 915 240
7107 1733 620


In [16]:
# Write the three preprocessed splits as tab-separated files without the
# DataFrame index. Paths use forward slashes: the former backslash form
# contained the invalid escape sequence "\O" (a warning on recent Python
# versions) and only resolved on Windows. Forward slashes work on Windows and
# POSIX alike. The target directory is expected to exist already.
df_training.to_csv('PreProcessed_Data/OLID/OLID_Training.txt', sep='\t', index=False)
df_val.to_csv('PreProcessed_Data/OLID/OLID_Val.txt', sep='\t', index=False)
df_test.to_csv('PreProcessed_Data/OLID/OLID_Test.txt', sep='\t', index=False)