Sentiment Analysis for Cyberbullying Detection on Social Media Platforms using DeepSeek-coder-1.3b-base

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
import re
import string
import unicodedata

In [2]:
# Load dataset
df = pd.read_csv('../Datasets/preprocessed_Amazon_Updated.csv', encoding='ISO-8859-1')  
df = df[['text', 'sentiment']].dropna()

In [3]:
# Optional: Integrate sentiment score (you can use TextBlob or Vader here)
#from textblob import TextBlob

#def get_sentiment(text):
    #return TextBlob(text).sentiment.polarity

#df['sentiment'] = df['selected_text'].apply(get_sentiment)
#df['text'] = df['sentiment'].astype(str) + ' ' + df['selected_text']  # prepend sentiment to input

In [4]:
# Encode labels
labels = df['sentiment'].unique().tolist()
df['label'] = df['sentiment'].apply(lambda x: labels.index(x))

In [5]:
df

Unnamed: 0,text,sentiment,label
0,my lovely pat has one of the great voices of h...,positive,0
1,despite the fact that i have only played a sma...,positive,0
2,i bought this charger in jul and it worked ok ...,negative,1
3,check out maha energys website their powerex m...,positive,0
4,reviewed quite a bit of the combo players and ...,positive,0
...,...,...,...
41404,tea tree shampoo is my favorite i usually get ...,positive,0
41405,admittedly i am not a country music fan howeve...,positive,0
41406,i try to listen to something new everyday and ...,positive,0
41407,i bought this to go inside of the enclosed woo...,negative,1


In [6]:
# Train/test split
train_df, test_df = train_test_split(df[['text', 'label']], test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [12]:
from transformers import AutoTokenizer

model_name = "deepseek-ai/deepseek-coder-1.3b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

#from transformers import AutoModelForSequenceClassification, AutoTokenizer

#model = AutoModelForSequenceClassification.from_pretrained("./fine-tuned-deepseek-coder-1.3b-based-text-classification")
#tokenizer = AutoTokenizer.from_pretrained("./fine-tuned-deepseek-coder-1.3b-based-text-classification")

In [13]:
# Tokenize
def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/33127 [00:00<?, ? examples/s]

Map:   0%|          | 0/8282 [00:00<?, ? examples/s]

🧠 Customize DeepSeek Model for Classification

In [14]:
import torch
from torch import nn
from transformers import AutoModel, PreTrainedModel, PretrainedConfig

class DeepSeekForClassification(PreTrainedModel):
    def __init__(self, config, num_labels):
        super().__init__(config)
        self.num_labels = num_labels
        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state  # (batch_size, seq_len, hidden_dim)
        pooled_output = hidden_state[:, 0]  # take <s> token (CLS token)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}

⚙️ Training Setup

In [15]:
from transformers import TrainingArguments, Trainer, EvalPrediction
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [16]:
from transformers import PretrainedConfig

# Custom configuration class
class CustomConfig(PretrainedConfig):
    def __init__(self, num_labels=2, hidden_size=2048, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size  # default for DeepSeek 1.3B
        self.num_labels = num_labels

num_labels = 2  # for example, if you have 2 classes
config = CustomConfig(num_labels=num_labels)
model = DeepSeekForClassification(config=config, num_labels=num_labels)

In [17]:
# Metrics
def compute_metrics(eval_pred: EvalPrediction):
    logits, labels_ = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels_, predictions, average='weighted')
    acc = accuracy_score(labels_, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./deepseek_cyberbullying",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    # fp16=True,  # ❌ Comment out this line, as it's only for GPU/TPU (mixed precision)
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)



In [19]:
from transformers import TrainerCallback

class PrintEpochCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {int(state.epoch)} finished. Current loss: {state.log_history[-1]}")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[PrintEpochCallback()],
)

🚀 Train and Evaluate

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0,,0.519198,0.35488,0.269567,0.519198


Epoch 1 finished. Current loss: {'loss': 0.0, 'grad_norm': nan, 'learning_rate': 4.829751267809708e-09, 'epoch': 0.9997585124366095, 'step': 8280}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=8282, training_loss=0.0812245405575297, metrics={'train_runtime': 75611.5698, 'train_samples_per_second': 0.438, 'train_steps_per_second': 0.11, 'total_flos': 3.089506810150349e+16, 'train_loss': 0.0812245405575297, 'epoch': 1.0})

Save Model

In [21]:
# Save the fine-tuned model locally
#model_save_path = "./fine-tuned-deepseek-coder-1.3b-based-text-classification-IMDB"

# Save model and tokenizer
#model.save_pretrained(model_save_path)
#tokenizer.save_pretrained(model_save_path)

#print(f"Model and tokenizer saved to {model_save_path}")

Push to Hugging Face Hub

In [22]:
# Log in first if needed
#from huggingface_hub import notebook_login
#notebook_login()

# Push model and tokenizer
#trainer.push_to_hub("Fine-tuned DeepSeek Coder - Text Classification")

In [23]:
# Evaluation
import time
import psutil
from codecarbon import EmissionsTracker

# Start tracking energy consumption
tracker = EmissionsTracker()
tracker.start()

# Measure CPU and RAM usage before evaluation
process = psutil.Process()
cpu_before = psutil.cpu_percent(interval=None)
ram_before = process.memory_info().rss / (1024 * 1024)  # Convert to MB

# Measure inference time
start_time = time.time()

# Evaluation
metrics = trainer.evaluate()

# Measure inference time
end_time = time.time()
inference_time = end_time - start_time

# Measure CPU and RAM usage after evaluation
cpu_after = psutil.cpu_percent(interval=None)
ram_after = process.memory_info().rss / (1024 * 1024)  # Convert to MB

# Calculate differences
cpu_usage = cpu_after - cpu_before
ram_usage = ram_after - ram_before

# Stop energy tracking
emissions = tracker.stop()

# Print results
print(metrics)
print(f"Inference Time: {inference_time:.4f} seconds")
print(f"CPU Usage: {cpu_usage:.2f}%")
print(f"RAM Usage: {ram_usage:.2f} MB")
print(f"Energy Consumption: {emissions:.6f} kWh")

[codecarbon INFO @ 15:52:39] [setup] RAM Tracking...
[codecarbon INFO @ 15:52:39] [setup] GPU Tracking...
[codecarbon INFO @ 15:52:39] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 15:52:39] [setup] CPU Tracking...
[codecarbon INFO @ 15:52:41] CPU Model on constant consumption mode: 13th Gen Intel(R) Core(TM) i9-13900
[codecarbon INFO @ 15:52:41] >>> Tracker's metadata:
[codecarbon INFO @ 15:52:41]   Platform system: Windows-10-10.0.26100-SP0
[codecarbon INFO @ 15:52:41]   Python version: 3.11.5
[codecarbon INFO @ 15:52:41]   CodeCarbon version: 2.2.2
[codecarbon INFO @ 15:52:41]   Available RAM : 31.696 GB
[codecarbon INFO @ 15:52:41]   CPU count: 32
[codecarbon INFO @ 15:52:41]   CPU model: 13th Gen Intel(R) Core(TM) i9-13900
[codecarbon INFO @ 15:52:41]   GPU count: 1
[codecarbon INFO @ 15:52:41]   GPU model: 1 x NVIDIA GeForce RTX 4090


[codecarbon INFO @ 15:52:59] Energy consumed for RAM : 0.000050 kWh. RAM Power : 11.886009693145752 W
[codecarbon INFO @ 15:52:59] Energy consumed for all GPUs : 0.000253 kWh. Total GPU Power : 60.64 W
[codecarbon INFO @ 15:52:59] Energy consumed for all CPUs : 0.000178 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 15:52:59] 0.000481 kWh of electricity used since the beginning.
[codecarbon INFO @ 15:53:14] Energy consumed for RAM : 0.000099 kWh. RAM Power : 11.886009693145752 W
[codecarbon INFO @ 15:53:14] Energy consumed for all GPUs : 0.000507 kWh. Total GPU Power : 60.893 W
[codecarbon INFO @ 15:53:14] Energy consumed for all CPUs : 0.000355 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 15:53:14] 0.000961 kWh of electricity used since the beginning.
[codecarbon INFO @ 15:53:29] Energy consumed for RAM : 0.000149 kWh. RAM Power : 11.886009693145752 W
[codecarbon INFO @ 15:53:29] Energy consumed for all GPUs : 0.000757 kWh. Total GPU Power : 59.844 W
[codecarbon INFO @ 15:53:29]

{'eval_loss': nan, 'eval_accuracy': 0.5191982612895436, 'eval_f1': 0.3548803884191762, 'eval_precision': 0.2695668345260852, 'eval_recall': 0.5191982612895436, 'eval_runtime': 4105.9426, 'eval_samples_per_second': 2.017, 'eval_steps_per_second': 0.504, 'epoch': 1.0}
Inference Time: 4105.9466 seconds
CPU Usage: 3.50%
RAM Usage: -9728.59 MB
Energy Consumption: 0.001514 kWh
