<div style="text-align: center;">
    <h3>Applied Data Science Project</h3>
    <h2><b>Patient Preferences Studies Classification System</b></h2>
    <h1><b>Train Binary Classifier Model</b></h1>
    <h5>Francesco Giuseppe Gillio</h5>
    <h5>César Augusto Seminario Yrigoyen</h5>
</div>

<div style="text-align: center;">
    <img src="https://upload.wikimedia.org/wikipedia/it/4/47/Logo_PoliTo_dal_2021_blu.png" width="250">
</div>

https://github.com/adsp-polito/2024-P8-PPS

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# notebook can read and write files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# *Drive* **Setup**

In [None]:
import os
import warnings
# Install an "ignore" filter so Python warnings raised from here on are not shown.
warnings.filterwarnings('ignore')

In [None]:
# Patient Preference Studies Binary Classifier
# Name of the project folder; the Drive paths built below are relative to it.
root = 'PPS-BC'

In [None]:
# Create the project root on Drive. `exist_ok=True` makes the call a no-op
# when the folder is already there and avoids the race between a separate
# existence check and the creation.
path = os.path.join('/content/drive/MyDrive', root)
os.makedirs(path, exist_ok=True)

In [None]:
# Map each notebook sub-folder name to its location relative to Drive's root:
# '<root>/<sub-folder>'.
paths = {
    folder: f'{root}/{folder}'
    for folder in ('input-data', 'bert-embeddings', 'notebook-results')
}

In [None]:
# Make sure every project sub-folder exists on Drive. Only the relative
# locations are needed, so iterate over the values; `exist_ok=True` turns the
# call into a no-op for folders that already exist.
for value in paths.values():
    path = os.path.join('/content/drive/MyDrive', value)
    os.makedirs(path, exist_ok=True)

**Notebook Directory Structure**
```
/PPS-BC
├── /input-data
│   └── ...
├── /bert-embeddings
│   └── ...
├── /notebook-results
│   └── ...
└── /models
    ├── /biomed-bert-base
    │   └── ...
    ├── /pubmed-bert-base
    │   └── ...
    ├── biomed-svc-pipeline.joblib
    └── pubmed-knn-pipeline.joblib
```

# **Train** *Binary Classifier Model*

**Requirements**

In [None]:
import gc
import sys
import torch
import joblib

import numpy as np
import pandas as pd

from typing import List, Tuple

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModel

In [None]:
# Seed the NumPy and PyTorch random number generators with a fixed value so
# repeated runs draw the same random numbers.
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7ef49e18d610>

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Location of the articles CSV inside the 'input-data' folder on Drive.
# NOTE: the name shadows the builtin `input`; it is kept because the
# `train(...)` call further down passes it by this name.
input = os.path.join(
    '/content/drive/MyDrive',
    paths['input-data'],
    'clean-articles-2023.csv',
)

In [None]:
# Folder that receives the saved transformer checkpoints and classifier
# pipelines. `exist_ok=True` makes the creation a no-op when it already exists.
output = os.path.join(f"/content/drive/MyDrive/{root}", 'models')
os.makedirs(output, exist_ok=True)

**Train Binary Classifier Model**

In [None]:
def train(
    input: str,
    device: torch.device,
    threshold: float,
    weights: Tuple[float, float],
    path: str = 'models',
    test_size: float = 260 / 1215
) -> None:
    """Train the two embedding + classifier pipelines and report their soft vote.

    The CSV at ``input`` is read and split (stratified on the label,
    ``random_state=42``) into train and test sets. For each of the two base
    transformer models the function

    1. loads the tokenizer and model and moves the model to ``device``;
    2. encodes every article (title and abstract) of both splits, one row at
       a time;
    3. saves the tokenizer and model under ``path``;
    4. fits a ``StandardScaler`` + classifier pipeline on the train embeddings
       and dumps it with joblib under ``path``;
    5. stores the pipeline's ``predict_proba`` output for the test set.

    Finally the two probability matrices are combined as a weighted sum, the
    second column is compared against ``threshold`` and a classification
    report for the test set is printed.

    Embedding recipes:

    * ``NeuML/pubmedbert-base-embeddings`` (paired with KNN): the model's
      ``pooler_output`` for title and abstract, concatenated on the last axis.
    * ``microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract`` (paired with
      SVC): attention-mask-weighted mean pooling, combined as
      ``0.2 * title + 0.8 * abstract``.

    Args:
        input: Path of the CSV file; it must provide the columns ``title``,
            ``abstract`` and ``label``. (The name shadows the builtin and is
            kept only for backward compatibility with existing callers.)
        device: Device the transformer models and token tensors are moved to.
            Anything accepted by ``Tensor.to`` works, e.g. a ``torch.device``
            or a device string.
        threshold: Minimum weighted probability in column 1 for a sample to be
            predicted as class 1.
        weights: ``(a, b)`` multipliers applied to the probabilities of the
            PubMed/KNN pipeline and of the BiomedBERT/SVC pipeline,
            respectively.
        path: Directory that receives the saved models and pipelines; created
            when missing.
        test_size: Fraction of the dataset held out for testing. The default
            is the value that used to be hard-coded in this function.

    Raises:
        ValueError: If a base model without an embedding recipe is configured.
    """

    def release() -> None:
        # Only reclaims objects that are no longer referenced anywhere, so the
        # caller has to drop its own references *before* calling this.
        gc.collect()
        torch.cuda.empty_cache()

    def meanpooling(
        output: Tuple[torch.Tensor, ...],
        mask: torch.Tensor
    ) -> torch.Tensor:
        # Average the token embeddings, counting only positions where the
        # attention mask is set; the clamp guards against division by zero.
        embeddings = output[0]
        mask = mask.unsqueeze(-1).expand(embeddings.size()).float()
        return torch.sum(embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

    def encode(
        text: List[str],
        pooling: bool,
        model,
        tokenizer
    ) -> torch.Tensor:
        # Tokenize (truncating to 512 tokens), run the model without gradient
        # tracking and return either the mean-pooled or the pooler output.
        inputs = tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=512
        )
        inputs = {
            key: value.to(device)
            for key, value in inputs.items()
        }
        with torch.no_grad():
            output = model(**inputs)
        if pooling:
            return meanpooling(output, inputs['attention_mask'])
        return output.pooler_output

    def embed(
        title,
        abstract,
        base: str,
        model,
        tokenizer
    ) -> np.ndarray:
        # Build the embedding of one article following the recipe tied to
        # the base model (see the function docstring).
        if base == 'NeuML/pubmedbert-base-embeddings':
            title = encode([title], False, model, tokenizer)
            abstract = encode([abstract], False, model, tokenizer)
            embeddings = torch.cat((title, abstract), dim=-1)
        elif base == 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract':
            title = encode([title], True, model, tokenizer)
            abstract = encode([abstract], True, model, tokenizer)
            embeddings = 0.2 * title + 0.8 * abstract
        else:
            raise ValueError(f"unknown base model: {base}")
        return embeddings.cpu().numpy()

    def embed_split(
        titles,
        abstracts,
        split: str,
        base: str,
        model,
        tokenizer
    ) -> List[np.ndarray]:
        # Encode one split row by row while rewriting a single progress line.
        total = len(titles)
        vectors = []
        for idx, (title, abstract) in enumerate(zip(titles, abstracts)):
            percentage = (idx + 1) / total * 100
            sys.stdout.write(f"\rencoding {split} data... {percentage:.2f}%")
            sys.stdout.flush()
            vectors.append(embed(title, abstract, base, model, tokenizer))
        return vectors

    os.makedirs(path, exist_ok=True)

    print(f"reading dataset from {input}...")
    dataset = pd.read_csv(input)
    print(f"dataset size: {len(dataset)}")
    labels = dataset['label'].tolist()
    print(f"\nsplitting dataset into train and test sets...")
    titles_train, titles_test, abstracts_train, abstracts_test, y_train, y_test = train_test_split(
        dataset['title'], dataset['abstract'], labels,
        test_size=float(test_size),
        random_state=42,
        stratify=labels
    )
    print(f"train set size: {len(y_train)}, test set size: {len(y_test)}")
    results = {}
    for base, desc, rec, classifier in [
        ('NeuML/pubmedbert-base-embeddings',
         'pubmed-knn-pipeline',
         'pubmed-bert-base',
         KNeighborsClassifier(
             n_neighbors=5,
             weights="distance",
             metric="euclidean"
        )),
        ('microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract',
         'biomed-svc-pipeline',
         'biomed-bert-base',
         SVC(
             kernel="rbf",
             probability=True,
             class_weight="balanced",
             random_state=42,
             max_iter=1000,
             gamma='auto',
             C=1e1
        ))
    ]:
        print(f"\nprocessing {desc}...")
        print(f"loading {base}...")
        tokenizer = AutoTokenizer.from_pretrained(base)
        model = AutoModel.from_pretrained(base)
        model = model.to(device)
        x_train = embed_split(titles_train, abstracts_train, 'train', base, model, tokenizer)
        print()
        x_test = embed_split(titles_test, abstracts_test, 'test', base, model, tokenizer)
        print(f"\nstacking data for {desc}...")
        x_train = np.vstack(x_train)
        x_test = np.vstack(x_test)
        print(f"train data shape: {x_train.shape}, test data shape: {x_test.shape}")
        output = f'{path}/{rec.replace(" ", "-")}'
        os.makedirs(output, exist_ok=True)
        tokenizer.save_pretrained(output)
        model.save_pretrained(output)
        print(f"saving pre-trained {base} model as {rec} into {output}")
        # Drop the references held in THIS scope before reclaiming memory;
        # deleting a name inside a helper would leave these bindings (and the
        # model they keep alive) untouched.
        del model, tokenizer
        release()
        print(f"training classifier for {desc}...")
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', classifier)
        ])
        pipeline.fit(x_train, y_train)
        output = f'{path}/{desc.replace(" ", "-")}.joblib'
        joblib.dump(pipeline, output)
        print(f"saving trained classifier pipeline as {desc} into {output}")
        print(f"computing predictions for {desc}...")
        preds = pipeline.predict_proba(x_test)
        results[base] = preds
    print(f"\nprocessing soft majority vote...")
    alpha, beta = results['NeuML/pubmedbert-base-embeddings'], results['microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract']
    a, b = weights
    # Weighted sum of the two pipelines' class probabilities; column 1 is the
    # score compared against the decision threshold.
    probs = (alpha * a) + (beta * b)
    preds = (probs[:, 1] >= threshold).astype(int)
    print(classification_report(y_test, preds))

In [None]:
# Run the full training: decision threshold 0.3875, soft-vote weights
# 0.4375 (PubMed/KNN) and 0.5625 (BiomedBERT/SVC), artifacts saved to `output`.
train(
    input=input,
    device=device,
    threshold=0.3875,
    weights=(0.4375, 0.5625),
    path=output,
)

reading dataset from /content/drive/MyDrive/PPS-BC/input-data/clean-articles-2023.csv...
dataset size: 1215

splitting dataset into train and test sets...
train set size: 955, test set size: 260

processing pubmed-knn-pipeline...
loading NeuML/pubmedbert-base-embeddings...


tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

encoding train data... 100.00%
encoding test data... 100.00%
stacking data for pubmed-knn-pipeline...
train data shape: (955, 1536), test data shape: (260, 1536)
saving pre-trained NeuML/pubmedbert-base-embeddings model as pubmed-bert-base into /content/drive/MyDrive/PPS-BC/models/pubmed-bert-base
training classifier for pubmed-knn-pipeline...
saving trained classifier pipeline as pubmed-knn-pipeline into /content/drive/MyDrive/PPS-BC/models/pubmed-knn-pipeline.joblib
computing predictions for pubmed-knn-pipeline...

processing biomed-svc-pipeline...
loading microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract...


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

encoding train data... 100.00%
encoding test data... 100.00%
stacking data for biomed-svc-pipeline...
train data shape: (955, 768), test data shape: (260, 768)
saving pre-trained microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract model as biomed-bert-base into /content/drive/MyDrive/PPS-BC/models/biomed-bert-base
training classifier for biomed-svc-pipeline...
saving trained classifier pipeline as biomed-svc-pipeline into /content/drive/MyDrive/PPS-BC/models/biomed-svc-pipeline.joblib
computing predictions for biomed-svc-pipeline...

processing soft majority vote...
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       211
           1       0.87      0.96      0.91        49

    accuracy                           0.97       260
   macro avg       0.93      0.96      0.95       260
weighted avg       0.97      0.97      0.97       260

