# Importing the Libraries

In [5]:
!pip install sentence-transformers nlpaug numpy 
!pip install torch transformers scikit-learn pandas -q

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Coll

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import gdown

from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import nlpaug
import nlpaug.augmenter.word as naw

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample

# Download wordnet from nltk
import nltk
nltk.download('wordnet')


import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Mount google drive**

In [80]:
# # Mount google drive
# from google.colab import drive
# drive.mount('/content/drive')

# Loading the dataset from drive
!gdown --id 1wg1Rw8yPxiYtgeqYIIg7yh_SrF_JOvss

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading...
From: https://drive.google.com/uc?id=1wg1Rw8yPxiYtgeqYIIg7yh_SrF_JOvss
To: /kaggle/working/all_data_processed.csv
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.26M/1.26M [00:00<00:00, 128MB/s]


# Loading and exploring the Dataset

In [124]:
data = pd.read_csv(r'/kaggle/working/all_data_processed.csv', encoding='utf-8')
data.sample(10)

Unnamed: 0,post,label
1066,"['speaking', 'car', 'crash', 'learn', 'drive',...",1
11773,"['old', 'old', 'virgin']",0
9914,"['real', 'life', 'fantasy', 'fantasy', 'real',...",0
8343,"['walk', 'charity', 'time', 'fun', 'foot', 'so...",0
5868,"['many', 'bfs', 'middle', 'school']",0
8517,"['want', 'hear', 'cut', 'crash']",0
8551,"['next', 'zach', 'think']",0
5435,"['ever', 'take', 'greyhound', 'yes', 'lol']",0
8620,"['victor', 'homework', 'think']",0
6126,"['ready', 'bore', 'told', 'like', 'say', 'time...",0


**Explore the dataset**

In [125]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12834 entries, 0 to 12833
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   post    12834 non-null  object
 1   label   12834 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 200.7+ KB


In [126]:
print(f'The shape of the dataset is: {data.shape}')
print(f'The columns within the dataset are : {list(data.columns)}')
print(f'The classes within the dataset are: {list(data.label.value_counts().index)}')

The shape of the dataset is: (12834, 2)
The columns within the dataset are : ['post', 'label']
The classes within the dataset are: [0, 1]


In [127]:
# Showing the distribution of the data we have

# Prepare the data
label_counts = data['label'].value_counts().reset_index()
label_counts.columns = ['label', 'count']

# Create subplots (1 row, 2 columns)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Class Distribution (Bar Chart)", "Class Distribution (Pie Chart)"),
    specs=[[{"type": "bar"}, {"type": "pie"}]]
)

# Define colors
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

# --- Bar chart ---
fig.add_trace(
    go.Bar(
        x=label_counts['label'],
        y=label_counts['count'],
        text=label_counts['count'],
        textposition='outside',
        marker_color=colors[:len(label_counts)],
        name='Class Count'
    ),
    row=1, col=1
)

# --- Pie chart ---
fig.add_trace(
    go.Pie(
        labels=label_counts['label'],
        values=label_counts['count'],
        textinfo='percent+label',
        marker=dict(colors=colors[:len(label_counts)]),
        name='Class Percentage'
    ),
    row=1, col=2
)

# --- Layout ---
fig.update_layout(
    title_text="Class Distribution Overview",
    title_x=0.5,
    showlegend=False,
    template="plotly_white",
    height=500,
    width=1000,
    title_font_size=26
)

fig.show()


**From the Previous graph, we observed that the data is unbalanced and such a problemm has to be solved**

In [128]:
data.drop(data[data.post == '[]'].index, axis = 0,inplace = True)

In [129]:
import pandas as pd
import random

def balance_data_by_length(data: pd.DataFrame, min_length: int = 90, seed: int = 42):
    
    # Convert token lists into strings for length checking
    data = data.copy()
    data['text'] = data['post'].apply(lambda tokens: " ".join(tokens) if isinstance(tokens, list) else str(tokens))
    
    # Filter by minimum length
    data = data[data['text'].str.len() >= min_length]
    
    # Separate classes
    major_class = data[data['label'] == 0]
    minor_class = data[data['label'] == 1]
    
    # Get target count = minor class count
    target_count = len(minor_class)
    
    # Random sample from major class
    major_sample = major_class.sample(n=target_count, random_state=seed)
    
    # Combine and shuffle
    balanced_data = pd.concat([minor_class, major_sample]).sample(frac=1, random_state=seed).reset_index(drop=True)
    
    return balanced_data.drop(columns=['text'])


In [130]:
data = balance_data_by_length(data)
data

Unnamed: 0,post,label
0,"['true', 'city', 'folk', 'call', 'country', 'f...",0
1,"['answer', 'banana', 'question', 'wrong', 'ban...",1
2,"['lady', 'gaga', 'receive', 'nomination', 'org...",0
3,"['hey', 'everyone', 'follow', 'spam', 'want', ...",0
4,"['damn', 'person', 'get', 'hater', 'step', 'ne...",1
...,...,...
1505,"['like', 'pickle', 'lol', 'mean', 'dill', 'pic...",0
1506,"['home', 'alone', 'like', 'loser', 'time', 'al...",0
1507,"['hello', 'tabi', 'school', 'favorite', 'old',...",0
1508,"['size', 'matter', 'haha', 'depend', 'talk', '...",0


In [131]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [132]:
# ======================================================
# FOCAL LOSS
# ======================================================
class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        ce_loss = torch.nn.functional.cross_entropy(logits, targets, weight=self.alpha, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss


# ======================================================
# TRAINING FUNCTION
# ======================================================
def train_imbalanced_text_classifier_full(
    data,
    text_col='post',
    label_col='label',
    model_name="distilbert-base-uncased",
    batch_size=32,
    lr=2e-5,
    epochs=10,
    max_len=128,
    patience=1,
    gamma=2.0
):
    """
    Fine-tune DistilBERT on tokenized text data (list of words per post) with:
      - Focal Loss for class imbalance
      - Early stopping (on val loss)
      - Automatic threshold tuning (to maximize F1 for minority class)
    """

    # ======================================================
    # PREPARE TEXT
    # ======================================================
    texts = data[text_col].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x)).tolist()
    labels = data[label_col].astype(int).tolist()

    X_train, X_val, y_train, y_val = train_test_split(
        texts, labels, test_size=0.2, stratify=labels, random_state=42
    )

    # ======================================================
    # TOKENIZER
    # ======================================================
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

    class TextDataset(Dataset):
        def __init__(self, texts, labels, tokenizer, max_len):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_len = max_len
        def __len__(self):
            return len(self.texts)
        def __getitem__(self, idx):
            encoding = self.tokenizer(
                self.texts[idx],
                truncation=True,
                padding="max_length",
                max_length=self.max_len,
                return_tensors="pt"
            )
            item = {k: v.squeeze(0) for k, v in encoding.items()}
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item

    train_ds = TextDataset(X_train, y_train, tokenizer, max_len)
    val_ds = TextDataset(X_val, y_val, tokenizer, max_len)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size)

    # ======================================================
    # CLASS WEIGHTS AND MODEL SETUP
    # ======================================================
    class_weights = compute_class_weight("balanced", classes=np.unique(labels), y=labels)
    alpha = torch.tensor(class_weights, dtype=torch.float)

    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(np.unique(labels)))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = FocalLoss(alpha=alpha.to(device), gamma=gamma)

    best_val_loss = float('inf')
    patience_counter = 0
    best_state = None

    # ======================================================
    # TRAINING LOOP
    # ======================================================
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # Validation loss
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        # Early stopping check
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_state = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
            print(f"‚ö†Ô∏è No improvement for {patience_counter} epoch(s).")
            if patience_counter >= patience:
                print("üõë Early stopping triggered.")
                break

    # Restore best weights
    if best_state:
        model.load_state_dict(best_state)
        print("‚úÖ Restored best model weights.")

    # ======================================================
    # EVALUATION
    # ======================================================
    model.eval()
    probs, true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].cpu().numpy()
            outputs = model(input_ids, attention_mask=attention_mask)
            probs_batch = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()  # class 1 probs
            probs.extend(probs_batch)
            true_labels.extend(labels)

    probs = np.array(probs)
    true_labels = np.array(true_labels)

    # ======================================================
    # THRESHOLD TUNING
    # ======================================================
    best_thresh, best_f1 = 0.5, 0
    for t in np.linspace(0.1, 0.9, 17):
        preds = (probs >= t).astype(int)
        report = classification_report(true_labels, preds, digits=3, output_dict=True)
        f1 = report['1']['f1-score']
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = t

    print(f"\nüìà Best threshold for minority class F1: {best_thresh:.2f} (F1 = {best_f1:.3f})")

    # Final report at best threshold
    preds = (probs >= best_thresh).astype(int)
    print("\n--- Validation Report (Optimized Threshold) ---")
    print(classification_report(true_labels, preds, digits=3))
    report = classification_report(true_labels, preds, output_dict=True, digits=3)

    return model, tokenizer, best_thresh, report


In [133]:
model, tokenizer, best_thresh, report = train_imbalanced_text_classifier_full(data)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 | Train Loss: 0.1564 | Val Loss: 0.1585
Epoch 2/10 | Train Loss: 0.1328 | Val Loss: 0.1413
Epoch 3/10 | Train Loss: 0.1098 | Val Loss: 0.1421
‚ö†Ô∏è No improvement for 1 epoch(s).
üõë Early stopping triggered.
‚úÖ Restored best model weights.

üìà Best threshold for minority class F1: 0.45 (F1 = 0.730)

--- Validation Report (Optimized Threshold) ---
              precision    recall  f1-score   support

           0      0.733     0.709     0.721       151
           1      0.718     0.742     0.730       151

    accuracy                          0.725       302
   macro avg      0.725     0.725     0.725       302
weighted avg      0.725     0.725     0.725       302



**Saving the Results**

In [135]:
import joblib

# Save model and tokenizer in Hugging Face format
model.save_pretrained("bully_model")
tokenizer.save_pretrained("bully_model")

# Save threshold separately as .pkl
joblib.dump(best_thresh, "bully_threshold.pkl")

print("‚úÖ Model, tokenizer, and threshold saved successfully.")
import joblib

bundle = {
    "model_state": model.state_dict(),
    "tokenizer": tokenizer,
    "threshold": best_thresh
}
joblib.dump(bundle, "bully_model_bundle.pkl")

‚úÖ Model, tokenizer, and threshold saved successfully.


['bully_model_bundle.pkl']