<a href="https://colab.research.google.com/github/aawatif/Cyberbully_Detection/blob/main/transferlearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers torch gensim scikit-learn pandas numpy



In [10]:
from google.colab import drive

# Mount your Google Drive to the Colab runtime
drive.mount('/content/drive')

# Now you can access files in your Drive using the path '/content/drive/My Drive/'
# For example, if your file is named 'my_file.csv' and is located in the root of your Drive:
file_path = '/content/drive/My Drive/final_hatemalay_dataset.csv'

Mounted at /content/drive


In [4]:
# Install required libraries
!pip install transformers datasets scikit-learn pandas

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import torch

# Step 0: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
# Step 1: Upload the Dataset
# Load the dataset
file_path = '/content/drive/My Drive/final_hatemalay_dataset.csv'  # Update with your file path
df = pd.read_csv(file_path)

# Check the dataset
print(df.head())  # Ensure 'text' column for input and 'label' for labels are present

# Step 2: Preprocess the Dataset
# Convert string labels to integers (if needed)
# label_mapping = {label: idx for idx, label in enumerate(df['label'].unique())}
# df['label'] = df['label'].map(label_mapping)

# Split into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['texts'], df['cyberbully'], test_size=0.2, random_state=42
)

# Create Pandas DataFrames for HuggingFace Datasets
train_data = pd.DataFrame({'texts': train_texts, 'cyberbully': train_labels})
test_data = pd.DataFrame({'texts': test_texts, 'cyberbully': test_labels})

# Convert to HuggingFace Dataset objects
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Step 3: Tokenize the Dataset
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["texts"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["texts"])
test_dataset = test_dataset.remove_columns(["texts"])

# Rename the 'cyberbully' column to 'labels' for the Trainer to recognize it
train_dataset = train_dataset.rename_column("cyberbully", "labels")
test_dataset = test_dataset.rename_column("cyberbully", "labels")

# Set the format for PyTorch
train_dataset.set_format("torch")
test_dataset.set_format("torch")

# Step 4: Define the Model
num_labels = len(df['cyberbully'].unique())

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels  # Directly use the calculated number of labels
)


# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for model checkpoints
    evaluation_strategy="epoch",    # Evaluate every epoch
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=16, # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=4,             # Number of training epochs
    weight_decay=0.01,              # Weight decay for AdamW optimizer
    logging_dir="./logs",           # Log directory
    save_strategy="epoch",          # Save model every epoch
    logging_steps=10,               # Log every 10 steps
    load_best_model_at_end=True,    # Load best model at the end
    save_total_limit=2              # Save only the 2 most recent checkpoints
)

# Step 6: Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Step 7: Evaluate the Model
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# Evaluate the model
metrics = trainer.evaluate()
print("Trainer Metrics:", metrics)

# Get predictions from the model
predictions = trainer.predict(test_dataset)

# Extract predicted labels
preds = torch.argmax(torch.tensor(predictions.predictions), axis=1).cpu().numpy() # Convert preds to NumPy array
labels = predictions.label_ids

# Calculate metrics
# Move tensors to CPU and convert to NumPy arrays
preds = preds.cpu().numpy() if torch.is_tensor(preds) else preds
labels = labels.cpu().numpy() if torch.is_tensor(labels) else labels

# Calculate metrics
accuracy = accuracy_score(labels, preds)
precision = precision_score(labels, preds, average='weighted')  # Change 'weighted' to 'macro' if needed
recall = recall_score(labels, preds, average='weighted')
f1 = f1_score(labels, preds, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Detailed classification report
# Generate the classification report as a dictionary
report_dict = classification_report(labels, preds, target_names=[str(i) for i in range(num_labels)], output_dict=True)

# Format the dictionary values to percentages with 2 decimal places
formatted_report = {}
for label, metrics in report_dict.items():
    if isinstance(metrics, dict):  # If metrics is a dictionary (e.g., '0', '1', 'macro avg')
        formatted_report[label] = {
            metric: f"{value * 100:.2f}%" if isinstance(value, float) else value
            for metric, value in metrics.items()
        }
    else:  # If metrics is a float (e.g., 'accuracy')
        formatted_report[label] = f"{metrics * 100:.2f}%"

# Print the formatted classification report
for label, metrics in formatted_report.items():
    print(f"Class: {label}")
    if isinstance(metrics, dict):
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")
    else:
        print(f"  {metrics}")
    print("\n")

# Step 8: Save the Fine-Tuned Model
trainer.save_model("./distilbert_finetuned_hatemalay")
tokenizer.save_pretrained("./distilbert_finetuned_hatemalay")

                                               texts  cyberbully
0  ['hiyubyu aku prnah sekali panjang jalan baca ...           0
1  ['memang serabut ada manusia celaka macam ni d...           1
2  ['depan nti kalau panjang ya alhamdulilah aku ...           1
3  ['masa rakyat china cecah bilion dulu rakyat i...           0
4  ['wahbagus tuan jaga halal haram baik saya dul...           0


Map:   0%|          | 0/3817 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3883,0.385001
2,0.284,0.382699
3,0.3325,0.366634
4,0.3521,0.37174


Evaluation Metrics: {'eval_loss': 0.3666340708732605, 'eval_runtime': 13.0892, 'eval_samples_per_second': 72.961, 'eval_steps_per_second': 4.584, 'epoch': 4.0}
Trainer Metrics: {'eval_loss': 0.3666340708732605, 'eval_runtime': 13.2692, 'eval_samples_per_second': 71.971, 'eval_steps_per_second': 4.522, 'epoch': 4.0}
Accuracy: 0.8387434554973822
Precision: 0.8362216592542552
Recall: 0.8387434554973822
F1 Score: 0.8374089718845835
Class: 0
  precision: 89.55%
  recall: 90.48%
  f1-score: 90.01%
  support: 76700.00%


Class: 1
  precision: 59.44%
  recall: 56.91%
  f1-score: 58.15%
  support: 18800.00%


Class: accuracy
  83.87%


Class: macro avg
  precision: 74.50%
  recall: 73.70%
  f1-score: 74.08%
  support: 95500.00%


Class: weighted avg
  precision: 83.62%
  recall: 83.87%
  f1-score: 83.74%
  support: 95500.00%




('./distilbert_finetuned_hatemalay/tokenizer_config.json',
 './distilbert_finetuned_hatemalay/special_tokens_map.json',
 './distilbert_finetuned_hatemalay/vocab.txt',
 './distilbert_finetuned_hatemalay/added_tokens.json')

In [11]:
# Install required libraries
!pip install transformers datasets scikit-learn pandas

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import torch

# Step 1: Load the Dataset
file_path = '/content/drive/My Drive/final_hatemalay_dataset.csv'  # Update with your file path
df = pd.read_csv(file_path)

# Check the dataset structure
print(df.head())  # Ensure 'texts' column for input and 'cyberbully' column for labels

# Step 2: Preprocess the Dataset
# Split into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['texts'], df['cyberbully'], test_size=0.2, random_state=42
)

# Create Pandas DataFrames for HuggingFace Datasets
train_data = pd.DataFrame({'texts': train_texts, 'cyberbully': train_labels})
test_data = pd.DataFrame({'texts': test_texts, 'cyberbully': test_labels})

# Convert to HuggingFace Dataset objects
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Step 3: Tokenize the Dataset
tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")

def tokenize_function(examples):
    return tokenizer(examples["texts"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["texts"])
test_dataset = test_dataset.remove_columns(["texts"])

# Rename the 'cyberbully' column to 'labels' for the Trainer to recognize it
train_dataset = train_dataset.rename_column("cyberbully", "labels")
test_dataset = test_dataset.rename_column("cyberbully", "labels")

# Set the format for PyTorch
train_dataset.set_format("torch")
test_dataset.set_format("torch")

# Step 4: Define the Model
num_labels = len(df['cyberbully'].unique())  # Dynamically detect the number of labels

model = ElectraForSequenceClassification.from_pretrained(
    "google/electra-base-discriminator",
    num_labels=num_labels  # Specify the number of output classes
)

# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for model checkpoints
    evaluation_strategy="epoch",    # Evaluate every epoch
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=16, # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=4,             # Number of training epochs
    weight_decay=0.01,              # Weight decay for AdamW optimizer
    logging_dir="./logs",           # Log directory
    save_strategy="epoch",          # Save model every epoch
    logging_steps=10,               # Log every 10 steps
    load_best_model_at_end=True,    # Load best model at the end
    save_total_limit=2              # Save only the 2 most recent checkpoints
)

# Step 6: Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Step 7: Evaluate the Model
metrics = trainer.evaluate()
print("Trainer Metrics:", metrics)

# Get predictions from the model
predictions = trainer.predict(test_dataset)

# Extract predicted labels
preds = torch.argmax(torch.tensor(predictions.predictions), axis=1).cpu().numpy()  # Convert preds to NumPy array
labels = predictions.label_ids

# Calculate metrics
accuracy = accuracy_score(labels, preds)
precision = precision_score(labels, preds, average='weighted')  # Use 'macro' if you want unweighted metrics
recall = recall_score(labels, preds, average='weighted')
f1 = f1_score(labels, preds, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Detailed classification report
report_dict = classification_report(labels, preds, target_names=[str(i) for i in range(num_labels)], output_dict=True)

# Format the dictionary values to percentages with 2 decimal places
formatted_report = {}
for label, metrics in report_dict.items():
    if isinstance(metrics, dict):  # If metrics is a dictionary (e.g., '0', '1', 'macro avg')
        formatted_report[label] = {
            metric: f"{value * 100:.2f}%" if isinstance(value, float) else value
            for metric, value in metrics.items()
        }
    else:  # If metrics is a float (e.g., 'accuracy')
        formatted_report[label] = f"{metrics * 100:.2f}%"

# Print the formatted classification report
for label, metrics in formatted_report.items():
    print(f"Class: {label}")
    if isinstance(metrics, dict):
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")
    else:
        print(f"  {metrics}")
    print("\n")

# Step 8: Save the Fine-Tuned Model
trainer.save_model("./electra_finetuned_hatemalay")
tokenizer.save_pretrained("./electra_finetuned_hatemalay")

                                               texts  cyberbully
0  ['hiyubyu aku prnah sekali panjang jalan baca ...           0
1  ['memang serabut ada manusia celaka macam ni d...           1
2  ['depan nti kalau panjang ya alhamdulilah aku ...           1
3  ['masa rakyat china cecah bilion dulu rakyat i...           0
4  ['wahbagus tuan jaga halal haram baik saya dul...           0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Map:   0%|          | 0/3817 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.4147,0.409284
2,0.3269,0.395017
3,0.4266,0.381954
4,0.379,0.369742


Trainer Metrics: {'eval_loss': 0.3697424530982971, 'eval_runtime': 30.2056, 'eval_samples_per_second': 31.617, 'eval_steps_per_second': 1.986, 'epoch': 4.0}
Accuracy: 0.8439790575916231
Precision: 0.8442934237201049
Recall: 0.8439790575916231
F1 Score: 0.8441350944616703
Class: 0
  precision: 90.34%
  recall: 90.22%
  f1-score: 90.28%
  support: 76700.00%


Class: 1
  precision: 60.32%
  recall: 60.64%
  f1-score: 60.48%
  support: 18800.00%


Class: accuracy
  84.40%


Class: macro avg
  precision: 75.33%
  recall: 75.43%
  f1-score: 75.38%
  support: 95500.00%


Class: weighted avg
  precision: 84.43%
  recall: 84.40%
  f1-score: 84.41%
  support: 95500.00%




('./electra_finetuned_hatemalay/tokenizer_config.json',
 './electra_finetuned_hatemalay/special_tokens_map.json',
 './electra_finetuned_hatemalay/vocab.txt',
 './electra_finetuned_hatemalay/added_tokens.json')

In [12]:
import os

# Check if the directory exists
print(os.listdir('./'))

['.config', 'electra_finetuned_hatemalay', 'logs', 'wandb', 'drive', 'results', 'sample_data']


In [14]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification

# Save the model and tokenizer
model.save_pretrained('./electra_finetuned_hatemalay')
tokenizer.save_pretrained('./electra_finetuned_hatemalay')

('./electra_finetuned_hatemalay/tokenizer_config.json',
 './electra_finetuned_hatemalay/special_tokens_map.json',
 './electra_finetuned_hatemalay/vocab.txt',
 './electra_finetuned_hatemalay/added_tokens.json')

In [15]:
!zip -r electra_finetuned_hatemalay.zip electra_finetuned_hatemalay

  adding: electra_finetuned_hatemalay/ (stored 0%)
  adding: electra_finetuned_hatemalay/vocab.txt (deflated 53%)
  adding: electra_finetuned_hatemalay/tokenizer_config.json (deflated 75%)
  adding: electra_finetuned_hatemalay/model.safetensors (deflated 7%)
  adding: electra_finetuned_hatemalay/training_args.bin (deflated 51%)
  adding: electra_finetuned_hatemalay/special_tokens_map.json (deflated 42%)
  adding: electra_finetuned_hatemalay/config.json (deflated 53%)


In [16]:
print(os.listdir('./'))

['.config', 'electra_finetuned_hatemalay', 'logs', 'wandb', 'drive', 'electra_finetuned_hatemalay.zip', 'results', 'sample_data']


In [17]:
from google.colab import files
files.download('electra_finetuned_hatemalay.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
# Install required libraries
!pip install transformers datasets scikit-learn pandas

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import torch

# Step 1: Load the Dataset
file_path = '/content/drive/My Drive/final_hatemalay_dataset.csv'  # Update with your file path
df = pd.read_csv(file_path)

# Check the dataset structure
print(df.head())  # Ensure 'texts' column for input and 'cyberbully' column for labels

# Step 2: Preprocess the Dataset
# Split into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['texts'], df['cyberbully'], test_size=0.2, random_state=42
)

# Create Pandas DataFrames for HuggingFace Datasets
train_data = pd.DataFrame({'texts': train_texts, 'cyberbully': train_labels})
test_data = pd.DataFrame({'texts': test_texts, 'cyberbully': test_labels})

# Convert to HuggingFace Dataset objects
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Step 3: Tokenize the Dataset
tokenizer = RobertaTokenizer.from_pretrained("distilroberta-base")

def tokenize_function(examples):
    return tokenizer(examples["texts"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["texts"])
test_dataset = test_dataset.remove_columns(["texts"])

# Rename the 'cyberbully' column to 'labels' for the Trainer to recognize it
train_dataset = train_dataset.rename_column("cyberbully", "labels")
test_dataset = test_dataset.rename_column("cyberbully", "labels")

# Set the format for PyTorch
train_dataset.set_format("torch")
test_dataset.set_format("torch")

# Step 4: Define the Model
num_labels = len(df['cyberbully'].unique())  # Dynamically detect the number of labels

model = RobertaForSequenceClassification.from_pretrained(
    "distilroberta-base",
    num_labels=num_labels  # Specify the number of output classes
)

# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for model checkpoints
    evaluation_strategy="epoch",    # Evaluate every epoch
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=16, # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=4,             # Number of training epochs
    weight_decay=0.01,              # Weight decay for AdamW optimizer
    logging_dir="./logs",           # Log directory
    save_strategy="epoch",          # Save model every epoch
    logging_steps=10,               # Log every 10 steps
    load_best_model_at_end=True,    # Load best model at the end
    save_total_limit=2              # Save only the 2 most recent checkpoints
)

# Step 6: Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Step 7: Evaluate the Model
metrics = trainer.evaluate()
print("Trainer Metrics:", metrics)

# Get predictions from the model
predictions = trainer.predict(test_dataset)

# Extract predicted labels
preds = torch.argmax(torch.tensor(predictions.predictions), axis=1).cpu().numpy()  # Convert preds to NumPy array
labels = predictions.label_ids

# Calculate metrics
accuracy = accuracy_score(labels, preds)
precision = precision_score(labels, preds, average='weighted')  # Use 'macro' if you want unweighted metrics
recall = recall_score(labels, preds, average='weighted')
f1 = f1_score(labels, preds, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Detailed classification report
report_dict = classification_report(labels, preds, target_names=[str(i) for i in range(num_labels)], output_dict=True)

# Format the dictionary values to percentages with 2 decimal places
formatted_report = {}
for label, metrics in report_dict.items():
    if isinstance(metrics, dict):  # If metrics is a dictionary (e.g., '0', '1', 'macro avg')
        formatted_report[label] = {
            metric: f"{value * 100:.2f}%" if isinstance(value, float) else value
            for metric, value in metrics.items()
        }
    else:  # If metrics is a float (e.g., 'accuracy')
        formatted_report[label] = f"{metrics * 100:.2f}%"

# Print the formatted classification report
for label, metrics in formatted_report.items():
    print(f"Class: {label}")
    if isinstance(metrics, dict):
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")
    else:
        print(f"  {metrics}")
    print("\n")

# Step 8: Save the Fine-Tuned Model
trainer.save_model("./distilroberta_finetuned_hatemalay")
tokenizer.save_pretrained("./distilroberta_finetuned_hatemalay")

                                               texts  cyberbully
0  ['hiyubyu aku prnah sekali panjang jalan baca ...           0
1  ['memang serabut ada manusia celaka macam ni d...           1
2  ['depan nti kalau panjang ya alhamdulilah aku ...           1
3  ['masa rakyat china cecah bilion dulu rakyat i...           0
4  ['wahbagus tuan jaga halal haram baik saya dul...           0


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Map:   0%|          | 0/3817 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.4661,0.429201
2,0.3064,0.358961
3,0.4082,0.359569
4,0.3891,0.367545


Trainer Metrics: {'eval_loss': 0.35896098613739014, 'eval_runtime': 12.1782, 'eval_samples_per_second': 78.419, 'eval_steps_per_second': 4.927, 'epoch': 4.0}
Accuracy: 0.8418848167539267
Precision: 0.8397172145523533
Recall: 0.8418848167539267
F1 Score: 0.8407441987381811
Class: 0
  precision: 89.79%
  recall: 90.61%
  f1-score: 90.20%
  support: 76700.00%


Class: 1
  precision: 60.22%
  recall: 57.98%
  f1-score: 59.08%
  support: 18800.00%


Class: accuracy
  84.19%


Class: macro avg
  precision: 75.01%
  recall: 74.30%
  f1-score: 74.64%
  support: 95500.00%


Class: weighted avg
  precision: 83.97%
  recall: 84.19%
  f1-score: 84.07%
  support: 95500.00%




('./distilroberta_finetuned_hatemalay/tokenizer_config.json',
 './distilroberta_finetuned_hatemalay/special_tokens_map.json',
 './distilroberta_finetuned_hatemalay/vocab.json',
 './distilroberta_finetuned_hatemalay/merges.txt',
 './distilroberta_finetuned_hatemalay/added_tokens.json')