<div style="text-align: center;">
    <h1>PROJECT P8</h1>
    <h1><b>Patient Preferences Studies Categorization System</b></h1>
    <h3>Applied Data Science Project</h3>
    <h5>Francesco Giuseppe Gillio</h5>
    <h5>César Augusto Seminario Yrigoyen</h5>
</div>

<div style="text-align: center;">
    <img src="https://upload.wikimedia.org/wikipedia/it/4/47/Logo_PoliTo_dal_2021_blu.png" width="250">
</div>

https://github.com/adsp-polito/2024-P8-PPS

# **set-up**

In [1]:
import os

In [2]:
# Mount Google Drive at /content/drive; the cells below create and write to folders under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Root folder on the mounted Drive for everything this notebook writes.
main = '/content/drive/MyDrive/classifier-params'
# exist_ok=True makes the cell safe to re-run and removes the separate
# check-then-create step.
os.makedirs(main, exist_ok=True)

In [4]:
# Create the 'data' and 'tests' sub-folders under the root output folder.
# The root comes from `main` (defined in the previous cell) instead of a
# repeated path literal; `branch` ends up pointing at the 'tests' folder,
# exactly as before.
for subfolder in ('data', 'tests'):
    branch = os.path.join(main, subfolder)
    os.makedirs(branch, exist_ok=True)

In [5]:
import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook — presumably including any convergence warnings raised by the
# scikit-learn fits further down. Confirm that is intended.
warnings.filterwarnings('ignore')

# **sentence-transformers/pubmedbert-base-embeddings**

In [6]:
import pandas as pd

In [7]:
# Set up Git LFS before cloning; the clone cell below runs `git lfs pull`.
!git lfs install

Git LFS initialized.


In [8]:
# Clone the project repository and make the clone the working directory.
!git clone https://github.com/adsp-polito/2024-P8-PPS.git
%cd 2024-P8-PPS

# Fetch the LFS-tracked files of the clone.
!git lfs pull

Cloning into '2024-P8-PPS'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects: 100% (83/83), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 178 (delta 27), reused 58 (delta 13), pack-reused 95 (from 1)[K
Receiving objects: 100% (178/178), 190.24 MiB | 16.80 MiB/s, done.
Resolving deltas: 100% (54/54), done.
/content/2024-P8-PPS


In [9]:
# Load the pickled articles frame shipped inside the cloned repository.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code — only load pickles that come from a trusted source.
embeddings = '/content/2024-P8-PPS/PPS-BC/bert-base-embeddings/models/sentence-transformers/pubmedbert-base-embeddings-articles-2023.pkl'
data = pd.read_pickle(embeddings)

pubmedbert-base-embeddings-**text-concatenation**

In [10]:
# Keep only the text, label and text-concatenation embedding columns; `data` is rebound to the reduced frame.
data = data[['Title', 'Abstract', 'Label', 'pubmedbert-base-embeddings-text-concatenation']]

In [11]:
# Preview the first rows to confirm the four retained columns.
data.head()

Unnamed: 0,Title,Abstract,Label,pubmedbert-base-embeddings-text-concatenation
0,Preferences for disease-related information an...,PURPOSE: The transition towards adult-focused ...,0,"[-0.38423112, -0.30039778, -0.29578313, -0.637..."
1,Advance Care Planning in South Korea.,South Korea is an Asian country with a very lo...,0,"[-0.38796002, -0.056877185, 0.90112704, -0.154..."
2,Stakeholders' preferences for the design and d...,This systematic review aimed to synthesise evi...,1,"[0.15675779, -0.44729438, -0.13902816, -0.1211..."
3,Discrete Choice Experiments in Health State Va...,BACKGROUND: Discrete choice experiments (DCEs)...,1,"[0.20540375, -0.696071, -0.4694253, 0.22151224..."
4,Improving methods of clinical practice guideli...,BACKGROUND: Current methods for developing cli...,0,"[-0.23814784, -0.18566483, -0.26961714, -0.276..."


In [12]:
# Persist the reduced frame to the Drive 'data' folder created in the set-up section.
data.to_pickle('/content/drive/MyDrive/classifier-params/data/pubmedbert-base-embeddings-text-concatenation-articles-2023.pkl')

# **scikit-learn/multi-layer-perceptron**
<img src="https://scikit-learn.org/1.5/_images/multilayerperceptron_network.png" width="250">

In [13]:
import time

In [14]:
import joblib

In [15]:
import numpy as np
import pandas as pd

In [16]:
from contextlib import redirect_stdout

In [17]:
from itertools import product
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [18]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

* scikit-learn/multi-layer-perceptron-**fine-tune**

In [19]:
def tune(data, path):
    """Grid-search MLP hyperparameters and log the metrics of every fit to ``path``.

    The data is split once into stratified train (60%), validation (20%) and
    test (20%) parts with ``random_state=42``. For every combination of the
    grid below (4 * 2 * 2 * 3 * 1 * 3 * 2 * 2 = 576 combinations) a
    ``StandardScaler`` + ``MLPClassifier`` pipeline is fitted on the train
    part, and one record is written to the log: the parameter tuple,
    validation accuracy, test accuracy, macro and weighted
    precision/recall/F1 on the test part, the elapsed time as ``MM:SS`` and a
    separator line of 50 dashes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Label'`` column and a
        ``'pubmedbert-base-embeddings-text-concatenation'`` column holding one
        vector per row (presumably equal-length numeric vectors — they are
        stacked with ``np.array(...tolist())``).
    path : str
        Log file to create; an existing file is overwritten.

    Returns
    -------
    None
        All results are written to ``path``.

    Notes
    -----
    NOTE(review): test-set metrics are logged for every combination. Ranking
    combinations by those columns selects hyperparameters on the test split;
    the validation split is the unbiased choice for model selection.
    """

    hyperparameters = {
        'hidden_layer_sizes': [
            (64,),
            (128, 64),
            (256, 128, 64),
            (512, 256, 128, 64)
          ],  # default=(100,)
        'activation': [
            'relu',
            'tanh'
        ],  # default='relu'
        'solver': [
            'adam',
            'sgd'
        ],  # default='adam'
        'alpha': [
            0.001,
            0.0001,
            0.00001
        ],  # default=0.0001
        'learning_rate': [
            'constant'
        ],  # default='constant'
        'learning_rate_init': [
            0.01,
            0.001,
            0.0001
        ],  # default=0.001
        'max_iter': [
            200,
            500
        ],  # default=200
        'early_stopping': [
            True,
            False
        ],  # default=False
    }

    # Build the arrays and the split before touching the log file, so that a
    # frame with missing columns fails without truncating an existing log.
    y = data['Label']
    x = np.array(data['pubmedbert-base-embeddings-text-concatenation'].tolist())

    # split data into train (60%), validation (20%) and test (20%) sets
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y, test_size=0.4, random_state=42, stratify=y
    )
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
    )

    names = tuple(hyperparameters)

    with open(
        path, "w",
        encoding="utf-8") as file:

        with redirect_stdout(file):

            # iterate over combinations of hyperparameters
            for combination in product(*hyperparameters.values()):
                start = time.time()

                # Pair each value with its parameter name instead of relying
                # on a positional unpack that must mirror the dict order.
                params = dict(zip(names, combination))

                pipeline = Pipeline([
                    ('scaler', StandardScaler()),
                    ('mlp', MLPClassifier(random_state=42, **params))
                ])

                pipeline.fit(x_train, y_train)
                validation = pipeline.score(x_val, y_val)

                y_pred = pipeline.predict(x_test)

                test = accuracy_score(y_test, y_pred)

                print(f"params: {combination}")
                print(f"validation accuracy: {validation:.4f}")
                print(f"test accuracy: {test:.4f}")

                # compute macro averages
                precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                           average='macro')
                print(f"macro precision: {precision:.4f}")
                print(f"macro recall: {recall:.4f}")
                print(f"macro f1-score: {f1:.4f}")

                # compute weighted averages
                precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                           average='weighted')
                print(f"weighted precision: {precision:.4f}")
                print(f"weighted recall: {recall:.4f}")
                print(f"weighted f1-score: {f1:.4f}")

                # elapsed time for this combination (fit, predictions and metrics)
                end = time.time()
                total = end - start
                minutes = int(total // 60)
                seconds = int(total % 60)
                print(f"classification time: {minutes:02d}:{seconds:02d}")

                print("-" * 50)

                # Push the finished record out of the write buffer so an
                # interrupted run keeps the combinations already evaluated.
                file.flush()

In [20]:
def extract(text):
    """Parse a grid-search log into a ranked DataFrame.

    The log consists of records that start with a ``params:`` line followed by
    ``<metric>: <value>`` lines; blank lines and lines starting with ``-`` are
    ignored, as are metric lines that appear before the first ``params:``
    line.

    Parameters
    ----------
    text : str
        Path of the log file (read as UTF-8, the encoding it is written in).

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``Params`` (the text after
        ``params: ``), the eight numeric metric columns and
        ``Classification Time`` (the ``MM:SS`` string). Rows are sorted by
        Macro F1, Macro Recall, Macro Precision, Test Accuracy and Validation
        Accuracy (descending), then Classification Time (ascending). An empty
        or incomplete log yields a frame that still has every column (missing
        values are NaN) instead of raising ``KeyError`` during the sort.

    Notes
    -----
    NOTE(review): the leading sort keys are the ``macro ...`` / ``test
    accuracy`` values from the log; if those were measured on a test split,
    this ranking selects hyperparameters on test data — confirm.
    """

    # log-line prefix -> output column, for the metrics parsed as floats
    numeric = {
        'validation accuracy:': 'Validation Accuracy',
        'test accuracy:': 'Test Accuracy',
        'macro precision:': 'Macro Precision',
        'macro recall:': 'Macro Recall',
        'macro f1-score:': 'Macro F1',
        'weighted precision:': 'Weighted Precision',
        'weighted recall:': 'Weighted Recall',
        'weighted f1-score:': 'Weighted F1',
    }
    columns = ['Params', *numeric.values(), 'Classification Time']

    with open(text, 'r', encoding='utf-8') as file:
        content = file.read().split('\n')

    records = []
    current = None  # record being filled; None until the first 'params:' line

    for line in content:
        if not line.strip() or line.startswith('-'):
            continue

        if line.startswith('params:'):
            # maxsplit=1 keeps the whole parameter text even if it contains ': '
            current = {'Params': line.split(': ', 1)[1].strip()}
            records.append(current)
            continue

        if current is None:
            continue

        if line.startswith('classification time:'):
            # split on the first ':' only, so the 'MM:SS' value stays intact
            elapsed = line.split(':', 1)[1].strip()
            current['Classification Time'] = elapsed
            continue

        for prefix, column in numeric.items():
            if line.startswith(prefix):
                current[column] = float(line.split(': ')[1].strip())
                break

    # Fixing the column set guarantees that every sort key exists.
    data = pd.DataFrame(records, columns=columns)

    data = data.sort_values(
        by=['Macro F1', 'Macro Recall', 'Macro Precision', 'Test Accuracy',
            'Validation Accuracy', 'Classification Time'],
        ascending=[False, False, False, False, False, True]
    )

    return data

* **Store** into /content/drive/MyDrive/classifier-params/tests/

In [21]:
# Run the grid search over every combination in tune's grid (one fit per combination) and write the log to the Drive 'tests' folder.
tune(data, '/content/drive/MyDrive/classifier-params/tests/params-logs.txt')

In [22]:
# Parse the grid-search log into a ranked table.
# NOTE: `extract` is defined twice in this notebook; this call needs the first
# definition (the one that reads `params:` records), so it must run before the
# later `extract` cell is executed.
output = extract('/content/drive/MyDrive/classifier-params/tests/params-logs.txt')

In [23]:
# Save the ranked grid-search table as CSV, without the index column.
output.to_csv('/content/drive/MyDrive/classifier-params/tests/classifier-params.csv', index=False)

* scikit-learn/multi-layer-perceptron-**train**

In [31]:
def train(data, path):
    """Fit the scaler + MLP pipeline with the fixed configuration and log its test metrics.

    The frame is split into stratified train (80%) and test (20%) parts with
    ``random_state=42``. A ``StandardScaler`` + ``MLPClassifier`` pipeline is
    fitted on the train part, and accuracy plus macro and weighted
    precision/recall/F1 on the test part are written to ``path``, followed by
    the elapsed time as ``MM:SS`` and a separator line of 50 dashes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Label'`` column and a
        ``'pubmedbert-base-embeddings-text-concatenation'`` column holding one
        vector per row.
    path : str
        Log file to create; an existing file is overwritten.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.
    """

    # optimal hyperparameters:
    chosen = dict(
        hidden_layer_sizes=(128, 64),
        activation='tanh',
        solver='sgd',
        alpha=0.001,
        learning_rate='constant',
        learning_rate_init=0.01,
        max_iter=500,
        early_stopping=False,
    )

    with open(path, "w", encoding="utf-8") as file, redirect_stdout(file):

        labels = data['Label']
        features = np.array(data['pubmedbert-base-embeddings-text-concatenation'].tolist())

        # split data into train (80%) and test (20%) sets
        features_train, features_test, labels_train, labels_test = train_test_split(
            features, labels, test_size=0.2, random_state=42, stratify=labels
        )

        # scaler followed by the classifier built from the chosen configuration
        classifier = MLPClassifier(random_state=42, **chosen)
        steps = [('scaler', StandardScaler()), ('mlp', classifier)]
        pipeline = Pipeline(steps)

        # the clock covers the fit, the predictions and the metric computation
        started = time.time()

        pipeline.fit(features_train, labels_train)

        # evaluate the model on the test set
        predicted = pipeline.predict(features_test)
        print(f"accuracy: {accuracy_score(labels_test, predicted):.4f}")

        # compute macro averages
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels_test, predicted, average='macro'
        )
        print(f"macro precision: {precision:.4f}")
        print(f"macro recall: {recall:.4f}")
        print(f"macro f1-score: {f1:.4f}")

        # compute weighted averages
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels_test, predicted, average='weighted'
        )
        print(f"weighted precision: {precision:.4f}")
        print(f"weighted recall: {recall:.4f}")
        print(f"weighted f1-score: {f1:.4f}")

        # compute the classification time
        elapsed = time.time() - started
        minutes = int(elapsed // 60)
        seconds = int(elapsed % 60)
        print(f"classification time: {minutes:02d}:{seconds:02d}")

        print("-" * 50)

    return pipeline

In [32]:
def extract(text):
    """Parse the single-run metrics log into a one-row DataFrame.

    Blank lines and lines starting with ``-`` are ignored. Recognised
    ``<metric>: <value>`` lines fill the metric columns; the first column,
    ``Params``, is the fixed hyperparameter tuple hard-coded below.

    Parameters
    ----------
    text : str
        Path of the log file.

    Returns
    -------
    pandas.DataFrame
        A single row with ``Params`` first, followed by the metrics found in
        the log in their order of appearance.
    """

    with open(text, 'r') as file:
        lines = file.read().split('\n')

    # log-line prefix -> output column, for the metrics parsed as floats
    numeric = {
        'accuracy:': 'Accuracy',
        'macro precision:': 'Macro Precision',
        'macro recall:': 'Macro Recall',
        'macro f1-score:': 'Macro F1',
        'weighted precision:': 'Weighted Precision',
        'weighted recall:': 'Weighted Recall',
        'weighted f1-score:': 'Weighted F1',
    }

    parsed = {}

    for line in lines:
        if line.startswith('-') or not line.strip():
            continue

        if line.startswith('classification time:'):
            # split on the first ':' only, so the 'MM:SS' value stays intact
            parsed['Classification Time'] = line.split(':', 1)[1].strip()
            continue

        for prefix, column in numeric.items():
            if line.startswith(prefix):
                parsed[column] = float(line.split(': ')[1].strip())
                break

    # 'Params' goes in first so it becomes the leading column
    row = {'Params': ((128, 64), 'tanh', 'sgd', 0.001, 'constant', 0.01, 500, False)}
    row.update(parsed)

    return pd.DataFrame([row])

* **Store** into /content/drive/MyDrive/classifier-params/tests/

In [33]:
# Fit the final pipeline, write its metrics log to the Drive 'tests' folder and keep the fitted pipeline in `model`.
model = train(data, '/content/drive/MyDrive/classifier-params/tests/optimal-params-logs.txt')

In [34]:
# Parse the final-run log into a one-row table.
# NOTE: this call needs the second `extract` definition (the one that reads a
# single `accuracy:` record), so that cell must have been executed last.
output = extract('/content/drive/MyDrive/classifier-params/tests/optimal-params-logs.txt')

In [35]:
# Save the final-run metrics table as CSV, without the index column.
output.to_csv('/content/drive/MyDrive/classifier-params/tests/optimal-classifier-params.csv', index=False)

* **Store** into /content/drive/MyDrive/classifier-params/

In [36]:
# Serialize the fitted pipeline (scaler + MLP) returned by train() to Drive with joblib.
joblib.dump(model, '/content/drive/MyDrive/classifier-params/model.mdl')

['/content/drive/MyDrive/classifier-params/model.mdl']