In [None]:
 !pip install simpletransformers


## OLID PRE-PROCESSED

In [None]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def preprocess_none(text):
    """Baseline variant: return the input exactly as received, with no cleaning."""
    untouched = text
    return untouched

def preprocess_extreme(text):
    """Aggressively clean a tweet: keep only lowercase letters and whitespace.

    Steps, in order: lowercase, drop URLs, drop @mentions, drop #hashtags
    (including the tag word), drop every remaining character that is not
    a-z or whitespace, then trim leading/trailing whitespace.

    Args:
        text: The raw tweet text (a str).

    Returns:
        The cleaned text. Internal runs of whitespace left behind by removed
        tokens are not collapsed.
    """
    text = text.lower()
    # The dot after "www" is escaped and anchored on a word boundary; the
    # unescaped form matched any character, so elongated words such as
    # "awwwww" were being deleted as if they were URLs.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text.strip()

def preprocess_keep_hashtags(text):
    """Clean a tweet but keep hashtags (the '#' sign and the tag word).

    Steps, in order: lowercase, drop URLs, drop @mentions, drop every
    remaining character that is not a-z, '#' or whitespace, then trim
    leading/trailing whitespace.

    Args:
        text: The raw tweet text (a str).

    Returns:
        The cleaned text with hashtags preserved.
    """
    text = text.lower()
    # Escaped dot + word boundary: the unescaped "www." pattern also matched
    # elongated words like "awwwww" and removed them.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-z#\s]', '', text)
    return text.strip()

def preprocess_keep_hashtags_and_users(text):
    """Clean a tweet but keep both hashtags and @mentions.

    Steps, in order: lowercase, drop URLs, drop every remaining character
    that is not a-z, '@', '#' or whitespace, then trim leading/trailing
    whitespace.

    Args:
        text: The raw tweet text (a str).

    Returns:
        The cleaned text with hashtags and user mentions preserved.
    """
    text = text.lower()
    # Escaped dot + word boundary: the unescaped "www." pattern also matched
    # elongated words like "awwwww" and removed them.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    text = re.sub(r'[^a-z@#\s]', '', text)
    return text.strip()

# Read the OLID training subset and the OLID test set from the Colab filesystem.
train_data = pd.read_csv("/content/olid-train-small.csv")
test_data = pd.read_csv("/content/olid-test.csv")

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split identical between runs.
train_df, val_df = train_test_split(
    train_data,
    random_state=42,
    test_size=0.2,
)

# (display name, cleaning function) pairs, in the order they are evaluated.
preprocess_functions = [
    ("No Preprocessing", preprocess_none),
    ("Extreme Preprocessing", preprocess_extreme),
    ("Keep Hashtags", preprocess_keep_hashtags),
    ("Keep Hashtags and Users", preprocess_keep_hashtags_and_users),
]

# train and evaluate model
def train_and_evaluate(train_df, val_df, preprocess_func, output_dir):
    """Fine-tune a classifier on preprocessed text and report validation metrics.

    The caller's DataFrames are left untouched: preprocessing is applied to
    copies, so the same raw frames can be reused for several preprocessing
    variants without one variant's cleaning leaking into the next.

    Args:
        train_df: Training DataFrame with a 'text' column (the label column is
            consumed by simpletransformers; presumably 'labels' - confirm
            against the CSV schema).
        val_df: Validation DataFrame with 'text' and 'labels' columns.
        preprocess_func: Callable applied to every value of the 'text' column.
        output_dir: Directory handed to simpletransformers as 'output_dir';
            existing contents may be overwritten.

    Returns:
        The report produced by sklearn's classification_report for the
        validation labels versus the model's predictions.
    """
    # .assign returns new frames, so the inputs are never mutated. Writing
    # into the inputs made every later call start from already-cleaned text
    # and raised SettingWithCopyWarning on frames from train_test_split.
    train_df = train_df.assign(text=train_df['text'].apply(preprocess_func))
    val_df = val_df.assign(text=val_df['text'].apply(preprocess_func))

    # BERT
    model = ClassificationModel('bert', 'bert-base-cased', use_cuda=True, num_labels=2)

    # hateBERT variant: the reported output comes from BERT; to reproduce the
    # hateBERT run, swap the model line above for the one below.
    # model = ClassificationModel('bert', 'GroNLP/hateBERT', use_cuda=True, num_labels=2)

    model.train_model(train_df, args={'output_dir': output_dir, 'overwrite_output_dir': True})

    # Evaluate on the validation set. The return values are not needed here;
    # the call is kept so whatever eval_model logs or writes is unchanged.
    model.eval_model(val_df)
    predictions, _ = model.predict(val_df['text'].tolist())

    return classification_report(val_df['labels'], predictions)

# now test each preprocessing method
for name, func in preprocess_functions:
    print(f"\nTesting: {name}")
    # Give every run its own copies of the raw split so that text cleaned for
    # one variant can never be fed into the next variant; otherwise all runs
    # after the first would be trained on cumulatively stripped text.
    report = train_and_evaluate(
        train_df.copy(),
        val_df.copy(),
        func,
        f'outputs_{name.lower().replace(" ", "_")}/',
    )
    print(report)

# Two-column (text, labels) views of the full training data and the test data.
# Note that this rebinds train_df from the 80% split to the full training set.
train_df = pd.DataFrame({
    'text': train_data['text'],
    'labels': train_data['labels']
})

test_df = pd.DataFrame({
    'text': test_data['text'],
    'labels': test_data['labels']
})


# HASOC TRAIN

In [None]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def preprocess_none(text):
    """Baseline variant: return the input exactly as received, with no cleaning."""
    untouched = text
    return untouched

def preprocess_extreme(text):
    """Aggressively clean a tweet: keep only lowercase letters and whitespace.

    Steps, in order: lowercase, drop URLs, drop @mentions, drop #hashtags
    (including the tag word), drop every remaining character that is not
    a-z or whitespace, then trim leading/trailing whitespace.

    Args:
        text: The raw tweet text (a str).

    Returns:
        The cleaned text. Internal runs of whitespace left behind by removed
        tokens are not collapsed.
    """
    text = text.lower()
    # The dot after "www" is escaped and anchored on a word boundary; the
    # unescaped form matched any character, so elongated words such as
    # "awwwww" were being deleted as if they were URLs.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text.strip()

def preprocess_keep_hashtags(text):
    """Clean a tweet but keep hashtags (the '#' sign and the tag word).

    Steps, in order: lowercase, drop URLs, drop @mentions, drop every
    remaining character that is not a-z, '#' or whitespace, then trim
    leading/trailing whitespace.

    Args:
        text: The raw tweet text (a str).

    Returns:
        The cleaned text with hashtags preserved.
    """
    text = text.lower()
    # Escaped dot + word boundary: the unescaped "www." pattern also matched
    # elongated words like "awwwww" and removed them.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-z#\s]', '', text)
    return text.strip()

def preprocess_keep_hashtags_and_users(text):
    """Clean a tweet but keep both hashtags and @mentions.

    Steps, in order: lowercase, drop URLs, drop every remaining character
    that is not a-z, '@', '#' or whitespace, then trim leading/trailing
    whitespace.

    Args:
        text: The raw tweet text (a str).

    Returns:
        The cleaned text with hashtags and user mentions preserved.
    """
    text = text.lower()
    # Escaped dot + word boundary: the unescaped "www." pattern also matched
    # elongated words like "awwwww" and removed them.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    text = re.sub(r'[^a-z@#\s]', '', text)
    return text.strip()


# Read the HASOC training set from the Colab filesystem.
train_data = pd.read_csv("/content/hasoc-train.csv")
# NOTE(review): the test file is the OLID test set, not a HASOC one -
# presumably a deliberate cross-dataset setup; confirm.
test_data = pd.read_csv("/content/olid-test.csv")

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split identical between runs.
train_df, val_df = train_test_split(
    train_data,
    random_state=42,
    test_size=0.2,
)

# (display name, cleaning function) pairs, in the order they are evaluated.
preprocess_functions = [
    ("No Preprocessing", preprocess_none),
    ("Extreme Preprocessing", preprocess_extreme),
    ("Keep Hashtags", preprocess_keep_hashtags),
    ("Keep Hashtags and Users", preprocess_keep_hashtags_and_users),
]


def train_and_evaluate(train_df, val_df, preprocess_func, output_dir):
    """Fine-tune a classifier on preprocessed text and report validation metrics.

    The caller's DataFrames are left untouched: preprocessing is applied to
    copies, so the same raw frames can be reused for several preprocessing
    variants without one variant's cleaning leaking into the next.

    Args:
        train_df: Training DataFrame with a 'text' column (the label column is
            consumed by simpletransformers; presumably 'labels' - confirm
            against the CSV schema).
        val_df: Validation DataFrame with 'text' and 'labels' columns.
        preprocess_func: Callable applied to every value of the 'text' column.
        output_dir: Directory handed to simpletransformers as 'output_dir';
            existing contents may be overwritten.

    Returns:
        The report produced by sklearn's classification_report for the
        validation labels versus the model's predictions.
    """
    # .assign returns new frames, so the inputs are never mutated. Writing
    # into the inputs made every later call start from already-cleaned text
    # and raised SettingWithCopyWarning on frames from train_test_split.
    train_df = train_df.assign(text=train_df['text'].apply(preprocess_func))
    val_df = val_df.assign(text=val_df['text'].apply(preprocess_func))

    # BERT
    model = ClassificationModel('bert', 'bert-base-cased', use_cuda=True, num_labels=2)

    # hateBERT variant: the reported output comes from BERT; to reproduce the
    # hateBERT run, swap the model line above for the one below. It must stay
    # before train_model, otherwise an untrained model would be evaluated.
    # model = ClassificationModel('bert', 'GroNLP/hateBERT', use_cuda=True, num_labels=2)

    model.train_model(train_df, args={'output_dir': output_dir, 'overwrite_output_dir': True})

    # Evaluate on the validation set. The return values are not needed here;
    # the call is kept so whatever eval_model logs or writes is unchanged.
    model.eval_model(val_df)
    predictions, _ = model.predict(val_df['text'].tolist())

    return classification_report(val_df['labels'], predictions)


for name, func in preprocess_functions:
    print(f"\nTesting: {name}")
    # Give every run its own copies of the raw split so that text cleaned for
    # one variant can never be fed into the next variant; otherwise all runs
    # after the first would be trained on cumulatively stripped text.
    report = train_and_evaluate(
        train_df.copy(),
        val_df.copy(),
        func,
        f'outputs_{name.lower().replace(" ", "_")}/',
    )
    print(report)

# Two-column (text, labels) views of the full training data and the test data.
# Note that this rebinds train_df from the 80% split to the full training set.
train_df = pd.DataFrame({
    'text': train_data['text'],
    'labels': train_data['labels']
})

test_df = pd.DataFrame({
    'text': test_data['text'],
    'labels': test_data['labels']
})
