In [None]:
# Environment setup (left commented out): uncomment these lines on a fresh
# runtime to install setfit from its GitHub repository, datasets, and pycm 3.8.
# Only pycm is version-pinned here.
# !pip install git+https://github.com/huggingface/setfit.git -q
# !pip install datasets -q
# !pip install pycm==3.8 -q

## Loading Dataset

In [None]:
import pandas as pd

# Read the prepared parquet file into `data`, then print a summary of its
# columns, dtypes and non-null counts.
data = pd.read_parquet(
    path="/content/drive/MyDrive/Bangla Clickbaits/prepared_datasets/dataset_253070_17c_hl10k_cleaned.parquet",
)
data.info()

In [None]:
# Show how often each label occurs in the automatically and the manually
# labeled columns (displayed as a pair of value-count tables).
tuple(data[label_column].value_counts() for label_column in ("auto_labeled", "human_labeled"))

In [None]:
# Encode the human labels numerically: "Clickbait" becomes 1 and
# "Not Clickbait" becomes 0; any other value is left as it is.
data["human_labeled"] = data["human_labeled"].replace(
    to_replace={"Clickbait": 1, "Not Clickbait": 0}
)

## Segregating Human Labeled Dataset

In [None]:
# Keep only the rows that have a human label, renumber them from 0 and take an
# independent copy so later changes cannot touch `data`.
has_human_label = data["human_labeled"].notna()
data_hl = data.loc[has_human_label].reset_index(drop=True).copy()
data_hl.shape

### Train, Validation, Test Dataset Preparation

In [None]:
from typing import Tuple
import pandas as pd
import numpy as np


def _stratified_positions(labels: pd.Series, frac: float, random_state: int) -> np.ndarray:
    """Return the 0-based row positions of a per-class sample of ``labels``.

    Every distinct value in ``labels`` forms one group; ``frac`` of each group
    is drawn without replacement using ``random_state``. Rows whose label is
    missing belong to no group and are therefore never selected.
    """
    if len(labels) == 0:
        return np.empty(0, dtype=np.intp)

    # Sample row numbers instead of the frame itself: this does not depend on
    # the frame's index being unique and avoids groupby.apply on a frame that
    # still contains the grouping column.
    row_numbers = pd.Series(np.arange(len(labels)))
    picked = [
        members.sample(frac=frac, random_state=random_state).to_numpy()
        for _, members in row_numbers.groupby(labels.to_numpy())
    ]
    if not picked:
        return np.empty(0, dtype=np.intp)
    return np.concatenate(picked).astype(np.intp, copy=False)


def train_validation_test(data: pd.DataFrame, train_pct: float, validation_pct: float, stratification_col: str, random_state: int = 2023) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into stratified train, validation and test frames.

    Args:
        data: Frame to split. Its index does not have to be unique.
        train_pct: Fraction of each class that goes to the training split (0..1).
        validation_pct: Fraction of each class that goes to the validation
            split (0..1). Whatever is left after train and validation becomes
            the test split.
        stratification_col: Column whose values define the classes that are
            sampled separately.
        random_state: Seed used for every per-class draw. Defaults to 2023.

    Returns:
        ``(training_data, validation_data, test_data)``: three disjoint frames
        that together contain every row of ``data`` and keep the original
        index labels and columns.

    Raises:
        ValueError: If a fraction is outside ``[0, 1]`` or the two fractions
            add up to more than 1.
    """
    if not (0.0 <= train_pct <= 1.0 and 0.0 <= validation_pct <= 1.0):
        raise ValueError("train_pct and validation_pct must both be within [0, 1]")
    if train_pct + validation_pct > 1.0 + 1e-9:
        raise ValueError("train_pct + validation_pct must not exceed 1")

    # Clamp at 0 so floating-point noise cannot yield a tiny negative share.
    test_pct = max(0.0, 1.0 - (train_pct + validation_pct))
    labels = data[stratification_col]
    n_rows = len(data)

    train_pos = _stratified_positions(labels, train_pct, random_state)
    in_train = np.zeros(n_rows, dtype=bool)
    in_train[train_pos] = True
    rest_pos = np.flatnonzero(~in_train)

    # Share of the *remaining* rows that validation needs so that it ends up
    # with validation_pct of the whole frame. With nothing left over
    # (train_pct == 1) there is nothing to divide by, so the share is 0.
    remaining_pct = validation_pct + test_pct
    validation_share = min(1.0, validation_pct / remaining_pct) if remaining_pct > 0.0 else 0.0

    validation_pos = rest_pos[_stratified_positions(labels.iloc[rest_pos], validation_share, random_state)]
    in_validation = np.zeros(n_rows, dtype=bool)
    in_validation[validation_pos] = True

    # Everything not drawn for train or validation is the test split, in its
    # original row order.
    test_pos = np.flatnonzero(~(in_train | in_validation))

    return data.iloc[train_pos], data.iloc[validation_pos], data.iloc[test_pos]

In [None]:
# 60% train, 20% validation, the remaining 20% test, stratified on the label.
training_data, validation_data, test_data = train_validation_test(
    data=data_hl,
    train_pct=0.60,
    validation_pct=0.20,
    stratification_col="human_labeled",
)

In [None]:
# (rows, columns) of the train, validation and test splits, in that order.
tuple(split.shape for split in (training_data, validation_data, test_data))

In [None]:
# Class balance of the train, validation and test splits, in that order.
tuple(split["human_labeled"].value_counts() for split in (training_data, validation_data, test_data))

In [None]:
from datasets import Dataset, DatasetDict

# Wrap the three pandas splits as Hugging Face datasets.
# preserve_index=False: the splits carry the shuffled row labels left over from
# sampling, and from_pandas would otherwise store them as an extra
# "__index_level_0__" column next to the real features.
datasets = DatasetDict({
    "train": Dataset.from_pandas(training_data, preserve_index=False),
    "eval": Dataset.from_pandas(validation_data, preserve_index=False),
    "test": Dataset.from_pandas(test_data, preserve_index=False),
})

## Modeling

In [None]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset

In [None]:
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

### paraphrase-multilingual-MiniLM-L12-v2

In [None]:
# Fetch the pretrained sentence-transformer body from the Hub (downloads are
# cached on Drive) and move the resulting SetFit model to the chosen device.
# Other multilingual checkpoints that can be swapped in here:
#   "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
#   "sentence-transformers/paraphrase-xlm-r-multilingual-v1"
model_minilm = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    cache_dir="/content/drive/MyDrive/Bangla Clickbaits/saved_models/",
)
model_minilm = model_minilm.to(device)

In [None]:
# Build the SetFit trainer for the MiniLM model.
trainer_minilm = SetFitTrainer(
    model=model_minilm,
    # Data: train on the "train" split, evaluate on the "eval" split.
    train_dataset=datasets["train"],
    eval_dataset=datasets["eval"],
    # The trainer expects "text" and "label" columns; map ours onto them.
    column_mapping={"title_debiased": "text", "human_labeled": "label"},
    # Contrastive fine-tuning setup.
    loss_class=CosineSimilarityLoss,
    num_iterations=20,  # number of text pairs to generate for contrastive learning
    num_epochs=5,  # number of epochs to use for contrastive learning
    batch_size=16,
    # Metric reported by evaluate().
    metric="accuracy",
)

In [None]:
# Fine-tune the model with the trainer configured above, then score it on the
# eval_dataset the trainer was built with (metric="accuracy"). The evaluation
# result is kept in metrics_minilm; training must run before evaluation.
trainer_minilm.train()
metrics_minilm = trainer_minilm.evaluate()

In [None]:
# trainer_minilm.model._save_pretrained(save_directory="/content/drive/MyDrive/Bangla Clickbaits/exported_models/")

In [None]:
# from setfit import SetFitModel

# model_minilm = SetFitModel.from_pretrained("/content/drive/MyDrive/Bangla Clickbaits/exported_models/paraphrase-multilingual-MiniLM-L12-v2").to(device)

In [None]:
# Predict a label for every test title. The metric cells below pair these
# predictions row by row with the full test_data.human_labeled column, so rows
# must not be dropped from the model input alone: that would shift or shorten
# the predictions relative to the labels. Stop with a clear message instead.
if test_data["title_debiased"].isna().any():
    raise ValueError(
        "test_data contains rows without title_debiased; remove those rows from "
        "test_data itself before predicting so labels and predictions stay aligned."
    )
preds_minilm = model_minilm(test_data["title_debiased"].to_list())

In [None]:
from pycm import *

# Confusion matrix of human labels (actual) against model predictions, with
# statistics rounded to two digits; the cell displays the overall accuracy.
cm = ConfusionMatrix(
    actual_vector=test_data["human_labeled"].tolist(),
    predict_vector=preds_minilm.tolist(),
    digit=2,
)

cm.overall_stat["Overall ACC"]

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split; the first target name
# labels class 0 and the second labels class 1.
print(
    classification_report(
        test_data["human_labeled"].tolist(),
        preds_minilm.tolist(),
        target_names=["negative", "positive"],
    )
)

## Others: scratch code used to build the 10k human-labeled dataset (kept for reference)

In [None]:
# data_hl = data[~data.human_labeled.isna()].reset_index(drop=True).copy()

# data_hl[['channel_name', 'title_debiased', 'human_labeled']].groupby(['channel_name', 'human_labeled']).count()
# data_hl[['channel_name', 'title_debiased', 'human_labeled']].query("channel_name=='Dr Tasnim Jara'")

# cbd = data_hl.query("human_labeled=='Clickbait'").head(5644).copy()
# ncbd = data_hl.query("human_labeled=='Not Clickbait'")
# hl_10k = pd.concat([cbd, ncbd]).reset_index(drop=True)

# data.loc[~data['video_id'].isin(hl_10k['video_id']), 'human_labeled'] = None

# data.to_parquet("/content/drive/MyDrive/Bangla Clickbaits/prepared_datasets/dataset_253070_17c_hl10k_cleaned.parquet")