In [2]:
import os
import pandas as pd

folder_path = r"C:\Users\raned\Documents\Github\POSTMODERATION\TrainingData"
print("Files in TrainingData folder:")
print(os.listdir(folder_path))

file_path = os.path.join(folder_path, "combined_dataset_emoji.csv")
training_data_df = pd.read_csv(file_path)

Files in TrainingData folder:
['combined_dataset.csv', 'combined_dataset_emoji.csv', 'combined_dataset_old.csv', 'HateSpeechDatasetBalanced.csv', 'labeled_data.csv']


In [4]:
# 1. Install necessary packages (if not already installed)
# !pip install transformers scikit-learn torch

# 2. Imports
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# 4. Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    training_data_df['CleanContent'].tolist(),
    training_data_df['Label'].tolist(),
    test_size=0.2,
    random_state=42
)

# 5. For quick prototyping, use only 7000 training samples
train_texts = train_texts[:7000]
train_labels = train_labels[:7000]

# Ensure all texts are strings and remove NaNs
train_texts = [str(t) if pd.notnull(t) else "" for t in train_texts]
val_texts = [str(t) if pd.notnull(t) else "" for t in val_texts]


# 6. Load DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# 7. Tokenize the datasets (you can lower max_length to 64 for faster speed if desired)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# 8. PyTorch Dataset wrapper
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)

# 9. Training arguments — optimized for speed
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",         # Evaluate every N steps
    save_strategy="steps",               # Save every N steps
    eval_steps=1000,
    save_steps=1000,
    logging_steps=500,
    num_train_epochs=2,
    per_device_train_batch_size=32,      # Try increasing to 32 for speed if GPU allows
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    save_total_limit=2,
    logging_dir='./logs',
    fp16=torch.cuda.is_available(),      # Enables half-precision (fast on GPU)
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # get class with highest probability
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')  # change to 'macro' or 'weighted' if needed
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# 10. Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


# 11. Train the model
trainer.train()

results = trainer.evaluate()
print(results)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


{'eval_loss': 0.43859291076660156, 'eval_accuracy': 0.8072672471533825, 'eval_precision': 0.8236289776574137, 'eval_recall': 0.7943192948090108, 'eval_f1': 0.8087086588000665, 'eval_runtime': 229.9671, 'eval_samples_per_second': 25.969, 'eval_steps_per_second': 0.813, 'epoch': 2.0}


In [None]:

# 1. Define new texts
new_texts = [
    "Fuck this place.", 
    "What a beautiful day, feeling grateful!",
    "I hate it here.",
    "You're a piece of shit",
    "Dumb ass bitch",
    "you’re such a dumbass 🤡 nobody wants you around 💩", #testing malicious tweets with emojis
    "go back to your country 🖕",
    "You're such a clown 🤡 lol",
    "I hate you"
]

# 2. Tokenize the new content
new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

# 3. Run the model in evaluation mode (no gradients)
model.eval()
with torch.no_grad():
    outputs = model(**new_encodings)
    predictions = torch.argmax(outputs.logits, dim=1)

# 4. Print the predictions
print(predictions)

tensor([1, 0, 0, 0, 1])
