In [3]:
# !pip install git+https://github.com/huggingface/setfit.git -q
# !pip install datasets -q
# !pip install pycm==3.8 -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Loading Dataset

In [4]:
import pandas as pd

# Read the prepared clickbait dataset from Drive, then summarise its columns and dtypes.
data = pd.read_parquet(
    path="/content/drive/MyDrive/Bangla Clickbaits/prepared_datasets/dataset_253070_17c_hl10k_cleaned.parquet"
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253070 entries, 0 to 253069
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   channel_id            253070 non-null  object
 1   channel_name          253070 non-null  object
 2   channel_url           253070 non-null  object
 3   video_id              253070 non-null  object
 4   publishedAt           253070 non-null  object
 5   title                 253070 non-null  object
 6   title_debiased        253070 non-null  object
 7   description           253070 non-null  object
 8   description_debiased  253070 non-null  object
 9   url                   253070 non-null  object
 10  viewCount             253070 non-null  int64 
 11  commentCount          253070 non-null  int64 
 12  likeCount             253070 non-null  int64 
 13  dislikeCount          253070 non-null  int64 
 14  thumbnails            253070 non-null  object
 15  auto_labeled     

In [5]:
# Class balance of the automatic labels next to that of the human labels.
data["auto_labeled"].value_counts(), data["human_labeled"].value_counts()

(Not Clickbait    223758
 Clickbait         29312
 Name: auto_labeled, dtype: int64,
 Clickbait        5644
 Not Clickbait    4356
 Name: human_labeled, dtype: int64)

In [6]:
# Encode the human labels numerically: "Clickbait" -> 1, "Not Clickbait" -> 0.
# Rows without a human label stay missing (NaN). The explicit float cast pins the
# resulting dtype instead of relying on pandas' deprecated silent downcasting of
# object columns inside `replace`; re-running the cell is still a no-op.
data["human_labeled"] = (
    data["human_labeled"]
    .replace({"Clickbait": 1, "Not Clickbait": 0})
    .astype("float64")
)

## Segregating Human Labeled Dataset

In [7]:
# Keep only the rows that carry a human label, renumbered from zero, as an independent copy.
data_hl = data.loc[data["human_labeled"].notna()].reset_index(drop=True).copy()
data_hl.shape

(10000, 17)

### Train, Validation, Test Dataset Preparation

In [8]:
from typing import Tuple
import pandas as pd
import numpy as np


def train_validation_test(data: pd.DataFrame, train_pct: float, validation_pct: float, stratification_col: str, random_state: int = 2023) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into stratified training, validation and test frames.

    Each class of ``stratification_col`` is sampled separately so that the
    three splits keep (approximately) the class proportions of ``data``.
    The test split receives whatever is left after the training and
    validation fractions are taken, i.e. ``1 - train_pct - validation_pct``.

    Rows are tracked by position rather than by index label, so the split is
    correct even when ``data`` has a non-unique index. Returned frames keep
    the original index labels of the selected rows.

    Args:
        data: Frame to split.
        train_pct: Fraction of each class assigned to the training split, in [0, 1].
        validation_pct: Fraction of each class assigned to the validation split, in [0, 1].
        stratification_col: Column whose values define the strata.
        random_state: Seed used for every per-class sample (defaults to 2023).

    Returns:
        ``(training_data, validation_data, test_data)``. Training and
        validation rows are ordered class by class in sampled order; test rows
        keep their original order. Rows whose stratification value is missing
        are never sampled and therefore end up in the test split.

    Raises:
        ValueError: If a fraction is outside [0, 1] or the two fractions sum to more than 1.
        KeyError: If ``stratification_col`` is not a column of ``data``.
    """
    for name, pct in (("train_pct", train_pct), ("validation_pct", validation_pct)):
        if not 0.0 <= pct <= 1.0:
            raise ValueError(f"{name} must be within [0, 1], got {pct!r}")

    test_pct = 1.0 - (train_pct + validation_pct)
    if test_pct < -1e-9:
        raise ValueError(
            "train_pct + validation_pct must not exceed 1, got "
            f"{train_pct!r} + {validation_pct!r}"
        )
    # Absorb floating-point noise such as 1.0 - (0.7 + 0.3) being slightly off zero.
    test_pct = max(test_pct, 0.0)

    # Validation is drawn from the rows left after training, so its fraction
    # has to be re-expressed relative to that remainder.
    remainder_pct = validation_pct + test_pct
    validation_share = validation_pct / remainder_pct if remainder_pct > 0 else 0.0

    # Positional view of the labels: the index of this Series is 0..n-1, so
    # sampled index values are row positions in ``data``.
    labels = data[stratification_col].reset_index(drop=True)

    def _sample_positions(candidates: pd.Series, frac: float) -> np.ndarray:
        """Return positions of a per-class sample of ``candidates`` (class by class)."""
        picked = [
            group.sample(frac=frac, random_state=random_state).index.to_numpy()
            for _, group in candidates.groupby(candidates)
        ]
        if not picked:
            return np.empty(0, dtype=np.intp)
        return np.concatenate(picked).astype(np.intp)

    assigned = np.zeros(len(data), dtype=bool)

    train_positions = _sample_positions(labels, train_pct)
    assigned[train_positions] = True
    training_data = data.iloc[train_positions]

    validation_positions = _sample_positions(labels[~assigned], validation_share)
    assigned[validation_positions] = True
    validation_data = data.iloc[validation_positions]

    # Everything not claimed by training or validation, in original row order.
    test_data = data.iloc[np.flatnonzero(~assigned)]

    return training_data, validation_data, test_data

In [9]:
# 60 % train / 20 % validation / 20 % test, stratified on the human label.
training_data, validation_data, test_data = train_validation_test(
    data=data_hl,
    train_pct=0.60,
    validation_pct=0.20,
    stratification_col="human_labeled",
)

In [10]:
# (rows, columns) of the train, validation and test splits, in that order.
tuple(split.shape for split in (training_data, validation_data, test_data))

((6000, 17), (2000, 17), (2000, 17))

In [11]:
# Class counts per split, to confirm the stratification held.
tuple(split.human_labeled.value_counts() for split in (training_data, validation_data, test_data))

(1.0    3386
 0.0    2614
 Name: human_labeled, dtype: int64,
 1.0    1129
 0.0     871
 Name: human_labeled, dtype: int64,
 1.0    1129
 0.0     871
 Name: human_labeled, dtype: int64)

In [12]:
from datasets import Dataset, DatasetDict

# Wrap each split as a Hugging Face Dataset.
# The splits come out of per-class sampling, so their pandas index is a shuffled
# subset of row numbers rather than a default RangeIndex. `preserve_index=False`
# keeps that index from being stored as an extra `__index_level_0__` feature.
datasets = DatasetDict({
    "train": Dataset.from_pandas(training_data, preserve_index=False),
    "eval": Dataset.from_pandas(validation_data, preserve_index=False),
    "test": Dataset.from_pandas(test_data, preserve_index=False),
})

## Modeling

In [14]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset

In [13]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

### paraphrase-multilingual-MiniLM-L12-v2

In [57]:
# Pull the MiniLM sentence-transformer from the Hub as a SetFit model, caching the
# download on Drive. Other checkpoints that can be swapped in:
#   "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
#   "sentence-transformers/paraphrase-xlm-r-multilingual-v1"
model_minilm = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    cache_dir="/content/drive/MyDrive/Bangla Clickbaits/saved_models/",
)
# Move the loaded model onto the selected device.
model_minilm = model_minilm.to(device)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [60]:
# Build the SetFit trainer for the MiniLM model.
trainer_minilm = SetFitTrainer(
    model=model_minilm,
    # Data: train on the training split, score on the validation split.
    train_dataset=datasets["train"],
    eval_dataset=datasets["eval"],
    # Rename our columns to the text/label names the trainer expects.
    column_mapping={"title_debiased": "text", "human_labeled": "label"},
    # Contrastive fine-tuning setup.
    loss_class=CosineSimilarityLoss,
    num_iterations=20,  # number of text pairs to generate for contrastive learning
    num_epochs=5,  # number of epochs to use for contrastive learning
    batch_size=16,
    # Metric reported by `evaluate()`.
    metric="accuracy",
)

In [61]:
# Fine-tune the model with the trainer configured above, then score it on the
# trainer's evaluation dataset using the configured metric ("accuracy").
# Order matters: evaluation must run after training has updated the model.
trainer_minilm.train()
metrics_minilm = trainer_minilm.evaluate()

Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 240000
  Num epochs = 5
  Total optimization steps = 75000
  Total train batch size = 16


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/15000 [00:00<?, ?it/s]

Iteration:   0%|          | 0/15000 [00:00<?, ?it/s]

Iteration:   0%|          | 0/15000 [00:00<?, ?it/s]

Iteration:   0%|          | 0/15000 [00:00<?, ?it/s]

Iteration:   0%|          | 0/15000 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [66]:
# trainer_minilm.model._save_pretrained(save_directory="/content/drive/MyDrive/Bangla Clickbaits/exported_models/")

In [16]:
# from setfit import SetFitModel

# model_minilm = SetFitModel.from_pretrained("/content/drive/MyDrive/Bangla Clickbaits/exported_models/paraphrase-multilingual-MiniLM-L12-v2").to(device)

In [19]:
# Predict on every test title. Predictions must stay aligned one-to-one with
# `test_data.human_labeled`, so missing titles are rejected up front instead of
# being silently dropped (dropping would shift predictions against the labels).
test_titles = test_data["title_debiased"]
if test_titles.isna().any():
    raise ValueError("test_data.title_debiased contains missing values; clean them before predicting")
preds_minilm = model_minilm(test_titles.to_list())

In [23]:
from pycm import *

# Confusion matrix of the human labels (actual) against the MiniLM predictions
# on the test split; statistics are reported with two digits.
cm = ConfusionMatrix(
    actual_vector=test_data.human_labeled.tolist(),
    predict_vector=preds_minilm.tolist(),
    digit=2,
)

cm.overall_stat["Overall ACC"]

0.985

In [24]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split.
# Label 0 is reported as "negative" and label 1 as "positive".
print(
    classification_report(
        test_data.human_labeled.tolist(),
        preds_minilm.tolist(),
        target_names=["negative", "positive"],
    )
)

              precision    recall  f1-score   support

    negative       0.99      0.98      0.98       871
    positive       0.98      0.99      0.99      1129

    accuracy                           0.98      2000
   macro avg       0.99      0.98      0.98      2000
weighted avg       0.99      0.98      0.98      2000



## Others

In [None]:
# data_hl = data[~data.human_labeled.isna()].reset_index(drop=True).copy()

# data_hl[['channel_name', 'title_debiased', 'human_labeled']].groupby(['channel_name', 'human_labeled']).count()
# data_hl[['channel_name', 'title_debiased', 'human_labeled']].query("channel_name=='Dr Tasnim Jara'")

# cbd = data_hl.query("human_labeled=='Clickbait'").head(5644).copy()
# ncbd = data_hl.query("human_labeled=='Not Clickbait'")
# hl_10k = pd.concat([cbd, ncbd]).reset_index(drop=True)

# data.loc[~data['video_id'].isin(hl_10k['video_id']), 'human_labeled'] = None

# data.to_parquet("/content/drive/MyDrive/Bangla Clickbaits/prepared_datasets/dataset_253070_17c_hl10k_cleaned.parquet")