Experiment 3

Perform document classification based on document embeddings, by finetuning the model first.

We can reuse the already pretrained document embedder, pool its sentence embeddings, and attach a logistic regression/SVM head on top of it. This gives a model that can be trained end-to-end with backpropagation. If we want to try other approaches such as decision trees or random forests, we can detach the logistic regression/SVM head and redo the training as in Experiment 2.

# Prepare environment

In [None]:
!git clone https://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:x-oauth-basic@github.com/alexpod1000/Document-Subjectivity.git
%cd Document-Subjectivity/

In [None]:
# Get the dataset folder: the loaders in this notebook read their CSV files
# from paths under SubjectivITA/datasets/.
!git clone https://github.com/francescoantici/SubjectivITA.git

In [None]:
# Install transformers and install/upgrade (-U) sentence-transformers, the
# package the SentenceTransformer class is imported from.
!pip install transformers
!pip install -U sentence-transformers

# Prepare the data

In [None]:
# Number of samples whose gradients are accumulated before an optimizer step.
GRADIENT_ACCUMULATION = 1

# Names of the sentence-embedding checkpoints that can be selected.
POSSIBLE_EMBEDDERS = [
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-multilingual-mpnet-base-v2",
    "distiluse-base-multilingual-cased-v1",
]
# Checkpoint used by the rest of the notebook (first entry of the list).
SENTENCE_EMBEDDER_MODEL = POSSIBLE_EMBEDDERS[0]

In [None]:
import torch

import numpy as np
import pandas as pd

from functools import partial
from sentence_transformers import SentenceTransformer, util
# Sklearn imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight

In [None]:
def get_sentences(split):
    """
    Load the sentence-level CSV file of one dataset split.

    The sentence and article tags are converted to integers ("SOG" -> 0,
    "OGG" -> 1) and the FONTE column is replaced by integer category codes.
    Note that the codes are derived from the sources found in this split's
    file alone.

    Args:
        split (str): split name (train, val, test); it is capitalized to
            build the file name.
    Returns:
        df: dataframe with one row per sentence, restricted to the columns
        ID_ARTICOLO, ID_FRASE, FRASE, TAG_FRASE, TAG_ARTICOLO and FONTE.
    """
    csv_path = "SubjectivITA/datasets/sentences/sentences{}.csv".format(
        split.capitalize()
    )
    sentences = pd.read_csv(csv_path)
    # Same label encoding for the sentence-level and article-level tags
    label_codes = {"SOG": 0, "OGG": 1}
    for tag_column in ("TAG_FRASE", "TAG_ARTICOLO"):
        sentences[tag_column] = sentences[tag_column].replace(label_codes)
    sentences["FONTE"] = sentences["FONTE"].astype('category').cat.codes
    kept_columns = ["ID_ARTICOLO", "ID_FRASE", "FRASE", "TAG_FRASE", "TAG_ARTICOLO", "FONTE"]
    return sentences[kept_columns]

In [None]:
def load_split_ids(split):
    """
    Read the article ids belonging to a split (train, test).

    Args:
        split (str): split name; it is capitalized to build the file name.
    Returns:
        article_ids (List[int]): values of the ID_ARTICOLO column of the
        split's articles file, cast to 32-bit integers, in file order.
    """
    csv_path = "SubjectivITA/datasets/articles/articles{}.csv".format(split.capitalize())
    article_ids = pd.read_csv(csv_path)["ID_ARTICOLO"].astype(np.int32)
    return article_ids.tolist()

In [None]:
# Load the article ids for the train and test splits; they are used later to
# reassign the sentences of all the loaded splits to train/test by article id.
articles_ids_train = load_split_ids("train")
articles_ids_test = load_split_ids("test")

In [None]:
def get_statistical_data(df, train_sources=None):
    """
    Given an article dataframe, return an array containing its statistical features.

    The first feature is the fraction of sentences tagged as objective
    (TAG_FRASE == 1). When train_sources is given, it is followed by a one-hot
    encoding of the article source: slot 0 is reserved for sources that
    weren't in the initial training dataset ("unknown"), and slot i + 1
    corresponds to train_sources[i].

    Args:
        df: dataframe with the sentences of a single article. It needs the
            TAG_FRASE column and, when train_sources is given, the FONTE
            column (the source is read from the first row).
        train_sources: optional sequence of the sources seen in the training
            data; its order defines the layout of the one-hot encoding.
    Returns:
        np.array: 1-D vector with one element when train_sources is None,
        otherwise with len(train_sources) + 2 elements.
    Raises:
        ValueError: if df has no rows.
    """
    if len(df) == 0:
        raise ValueError("cannot compute statistical features of an empty article")
    # Mean of the boolean mask == share of objective sentences
    obj_ratio = (df["TAG_FRASE"] == 1).mean()
    if train_sources is None:
        return np.array([obj_ratio])

    train_sources = list(train_sources)
    source = df["FONTE"].iloc[0]
    # Index 0 is the "unknown" slot; known sources are shifted by one
    slot = train_sources.index(source) + 1 if source in train_sources else 0
    source_encoding = np.zeros(len(train_sources) + 1, dtype=np.float32)
    source_encoding[slot] = 1.0
    # Concatenate the ratio and the one-hot encoding into one flat vector
    return np.hstack([obj_ratio, source_encoding])

def build_articles_embeddings_data(df, train_sources, model, use_statistical_data = False, aggr_fn = partial(np.mean, axis=0)):
    """
    Given a dataframe, a document embedding model and an embedding aggregation
    function, returns embeddings and subjectivity label as a numpy array.

    The input dataframe is not modified.

    Args:
        df: dataframe containing sentences (FRASE), their article id
            (ID_ARTICOLO) and the article tag (TAG_ARTICOLO).
        train_sources: sequence of the article sources of the training data;
            only used when use_statistical_data is True.
        model: a document embedding model exposing
            encode(sentences, convert_to_tensor=True).
        use_statistical_data: when True, the output of get_statistical_data
            is appended to every document embedding.
        aggr_fn: function to combine the sentence embeddings of one article
            into a single representation.

    Returns:
        X (np.array): float32 matrix with one row per article (rows follow
            the article-id order produced by groupby); each row is the
            aggregated embedding plus the optional statistical features.
        y (np.array): float32 vector with the article tag of each article.
    """
    # Embed all the sentences at once. The embeddings are kept in a local
    # array (float64, like the aggregation input) rather than written into
    # a new dataframe column, so the caller's dataframe stays untouched.
    sentence_embeddings = model.encode(
        df["FRASE"].tolist(), convert_to_tensor=True
    ).cpu().numpy().astype(np.float64)
    # A fresh 0..n-1 index lets each group's index address its embedding
    # rows positionally, even when the index of df contains duplicates.
    positional_df = df.reset_index(drop=True)

    document_rows = []
    article_tags = []
    # Group by the bare column name: with a one-element list pandas >= 2
    # yields 1-tuples as group keys.
    for _, article_df in positional_df.groupby("ID_ARTICOLO"):
        # Document statistical features
        if use_statistical_data:
            doc_stat = get_statistical_data(article_df, train_sources)
        else:
            doc_stat = []
        # Get embeddings for a single document (from sentence embeddings)
        document_embedding = aggr_fn(sentence_embeddings[article_df.index.to_numpy()])
        document_rows.append(np.hstack([document_embedding, doc_stat]))
        # All the sentences of an article carry the same article tag
        article_tags.append(article_df["TAG_ARTICOLO"].iloc[0])

    X = np.array(document_rows, dtype=np.float32)
    y = np.array(article_tags, dtype=np.float32)
    return X, y

In [None]:
def batch_to_device(batch, target_device):
    """
    Send a pytorch batch to a device (CPU/GPU).

    Every torch.Tensor value of the dictionary is replaced, in place, by the
    result of calling .to(target_device) on it; other values are left as-is.

    Args:
        batch (dict): mapping whose tensor values must be moved.
        target_device: device accepted by torch.Tensor.to.
    Returns:
        dict: the same dictionary object that was passed in.
    """
    for key, value in batch.items():
        if isinstance(value, torch.Tensor):
            batch[key] = value.to(target_device)
    return batch

class FinetuneDocumentEmbedderModel(torch.nn.Module):
    """
    Sentence embedder followed by a pooling step and a one-logit linear head.

    The sentence embeddings of one article are aggregated into a single
    document embedding, which is fed to the classification layer.
    """

    def __init__(self, sentence_embedder_model, aggr="mean"):
        """
        Args:
            sentence_embedder_model: module exposing tokenize(), device,
                get_sentence_embedding_dimension() and returning a dict with
                a "sentence_embedding" entry when called.
            aggr (str): pooling of the sentence embeddings; only "mean" is
                implemented.
        Raises:
            ValueError: if aggr is not a supported aggregation.
        """
        super().__init__()
        if aggr != "mean":
            # Fail at construction time instead of leaving aggr_fn undefined
            raise ValueError(f"Unsupported aggregation {aggr!r}; only 'mean' is implemented")
        self.sentence_embedder_model = sentence_embedder_model
        self.aggr_fn = partial(torch.mean, dim=0)
        self.classification_layer = torch.nn.Linear(
            in_features=self.sentence_embedder_model.get_sentence_embedding_dimension(),
            out_features=1
        )

    def forward(self, sentences, train=None):
        """
        Note: this model assumes that sentences come from the same article.

        Args:
            sentences (List[str]): sentences of a single article.
            train: when None (default) the embedder keeps its current mode,
                i.e. the one set through .train()/.eval() on this module;
                pass True/False to force the embedder into train/eval mode.
        Returns:
            torch.Tensor: the logit produced by the classification layer.
        """
        if train is not None:
            self.sentence_embedder_model.train(bool(train))
        # forward pass on the document embedder
        features = self.sentence_embedder_model.tokenize(sentences)
        features_device = batch_to_device(features, self.sentence_embedder_model.device)
        out_features = self.sentence_embedder_model(features_device)
        embeddings = out_features["sentence_embedding"]
        # aggregate multiple sentence embeddings into a single one
        aggr_embeddings = self.aggr_fn(embeddings)
        # apply classification head
        logits = self.classification_layer(aggr_embeddings)
        return logits

In [None]:
def finetune_model_train_epoch(model, loss_fn, optimizer, dataloader, gradient_accumulation=1, device="cuda"):
    """
    Run one training epoch, processing one article per forward pass.

    Args:
        model: module called as model(sentences) and returning the logits.
        loss_fn: callable computing the loss from (logits, label tensor).
        optimizer: object whose step() applies the accumulated gradients.
        dataloader: iterable of (article_id, sentences:List[str], label:Int)
            triples; the article id is not used.
        gradient_accumulation (int): number of samples whose gradients are
            accumulated before each optimizer step; values below 1 behave
            like 1.
        device: device on which the label tensor is created.
    Returns:
        dict: {"loss": [...]} with the unscaled loss value of every sample.
    """
    stats_collector = {"loss": []}
    accumulation_steps = max(gradient_accumulation, 1)
    pending_samples = 0
    model.train()
    model.zero_grad()
    for _article_id, sentences, label_gt in dataloader:
        # prepare label
        label_gt = torch.Tensor([label_gt]).to(device)
        # forward pass
        label_logits = model(sentences)
        # compute loss
        loss_val = loss_fn(label_logits, label_gt)
        # normalize only the backpropagated loss (assume averaging), so the
        # logged value stays comparable across accumulation settings
        (loss_val / accumulation_steps).backward()
        pending_samples += 1
        if pending_samples == accumulation_steps:
            optimizer.step()
            model.zero_grad()
            pending_samples = 0
        stats_collector["loss"].append(loss_val.item())
    if pending_samples:
        # Apply the gradients of a trailing, incomplete accumulation group
        # instead of silently discarding them. They keep the
        # 1/accumulation_steps scaling, so this last update is smaller.
        optimizer.step()
        model.zero_grad()
    return stats_collector

In [None]:
import random

def finetune_dataloader_fn(df, shuffle=True):
    """
    Build the list of per-article training samples.

    Args:
        df: dataframe with the ID_ARTICOLO, FRASE and TAG_ARTICOLO columns.
        shuffle (bool): when True (default) the samples are shuffled with
            the global `random` generator; when False they keep the
            article-id order produced by groupby.
    Returns:
        list: one (article_id, sentences:List[str], article_tag) tuple per
        article; the tag is taken from the article's first sentence row.
    """
    # A list is built (rather than a generator) so samples can be shuffled.
    # Grouping by the bare column name keeps article_id a plain value: with a
    # one-element list pandas >= 2 yields 1-tuples as group keys.
    samples = [
        (
            article_id,
            article_df["FRASE"].tolist(),
            article_df["TAG_ARTICOLO"].tolist()[0],
        )
        for article_id, article_df in df.groupby("ID_ARTICOLO")
    ]
    if shuffle:
        random.shuffle(samples)
    return samples

In [None]:
# Load the sentence dataframe for each split
df_train = get_sentences("train")
df_val = get_sentences("val")
df_test = get_sentences("test")
# Combine all the splits into a single dataframe
# (each frame keeps its own row index, so index labels may repeat in df_all)
df_all = pd.concat([df_train, df_val, df_test])

In [None]:
# Redistribute data according to the article splits: a sentence goes to
# train/test when its article id is in the corresponding id list, regardless
# of the sentence-level file it was loaded from.
df_train = df_all.query('ID_ARTICOLO in @articles_ids_train')
df_test = df_all.query('ID_ARTICOLO in @articles_ids_test')

Note: the Adam and AdamW optimizers were also tried, but they led to severe overfitting.

In [None]:
def prepare_data_for_classical(df_train, df_test, sentence_embedder_model):
    """
    Build the feature matrices and labels used by the classical classifiers.

    Both dataframes are turned into per-article embeddings with
    build_articles_embeddings_data, using mean pooling of the sentence
    embeddings and no statistical features. The shapes of the two feature
    matrices are printed.

    Args:
        df_train: sentence dataframe of the training articles.
        df_test: sentence dataframe of the test articles.
        sentence_embedder_model: model used to embed the sentences.
    Returns:
        tuple: (X_train, y_train, X_test, y_test).
    """
    # Sources seen in training (only relevant to the statistical features)
    known_sources = list(set(df_train["FONTE"]))
    shared_options = {
        "use_statistical_data": False,
        "aggr_fn": partial(np.mean, axis=0),
    }
    X_train, y_train = build_articles_embeddings_data(
        df_train, known_sources, sentence_embedder_model, **shared_options
    )
    X_test, y_test = build_articles_embeddings_data(
        df_test, known_sources, sentence_embedder_model, **shared_options
    )
    print(X_train.shape, X_test.shape)
    return X_train, y_train, X_test, y_test

In [None]:
def train_classifical_classifiers(X_train, y_train, X_test, y_test):
    """
    Fit several scikit-learn classifiers and print their test-set reports.

    Balanced class weights are computed from y_train for the labels 0 and 1
    and passed to a linear SVM, a logistic regression, a random forest and a
    decision tree. Each model is fitted on the training data and its
    classification report on the test data is printed.

    Args:
        X_train: training feature matrix.
        y_train: training labels (0/1); both labels must be present.
        X_test: test feature matrix.
        y_test: test labels.
    """
    # `classes` is documented as an ndarray; scikit-learn releases that
    # validate parameter types reject a plain Python list here.
    weights = compute_class_weight(
        class_weight='balanced', classes=np.array([0.0, 1.0]), y=y_train
    )
    class_weights = {0: weights[0], 1: weights[1]}
    print(f"Class weights are {class_weights}")

    classifier_models_list = {
        "svm": LinearSVC(class_weight=class_weights),
        "logistic": LogisticRegression(class_weight=class_weights),
        "random-forest": RandomForestClassifier(class_weight=class_weights),
        "decision-tree": DecisionTreeClassifier(class_weight=class_weights)
    }

    for model_name, classifier_model in classifier_models_list.items():
        classifier_model.fit(X_train, y_train)
        print(f"Results for model {model_name}")
        print(classification_report(y_test, classifier_model.predict(X_test)))

In [None]:
# Get document embedder
sentence_embedder_model = SentenceTransformer(SENTENCE_EMBEDDER_MODEL)
# NOTE: sentence_embedder_model model works only on cuda (apparently)
# The wrapper stores the embedder object itself, so fine-tuning
# finetune_model also updates sentence_embedder_model.
finetune_model = FinetuneDocumentEmbedderModel(sentence_embedder_model).to("cuda")
# Alternative optimizer, kept for reference:
#optimizer = torch.optim.SGD(finetune_model.parameters(), lr=0.001) #
# NOTE(review): the notebook text states that Adam/AdamW led to huge
# overfitting, yet AdamW is the active optimizer - confirm which is intended.
optimizer = torch.optim.AdamW(finetune_model.parameters(), lr=0.0001, weight_decay=0.001)
# The model outputs a single raw logit, hence the with-logits binary loss
loss_fn = torch.nn.BCEWithLogitsLoss()

In [None]:
# Baseline: embed the articles with the not-yet-finetuned embedder and
# train/evaluate the classical classifiers on those embeddings.
print("Results BEFORE finetuning")
train_classifical_classifiers(*prepare_data_for_classical(df_train, df_test, sentence_embedder_model))

In [None]:
# Number of passes over the training articles
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    # The sample list is rebuilt (and reshuffled) at every epoch
    stats = finetune_model_train_epoch(
        finetune_model, loss_fn, optimizer, 
        finetune_dataloader_fn(df_train),
        gradient_accumulation=GRADIENT_ACCUMULATION
    )
    # Average of the per-sample losses collected during the epoch
    print(f"Epoch {epoch+1}, train_loss_avg: {np.mean(stats['loss'])}")

We can now reuse our code from Experiment 1 and Experiment 2, this time with the fine-tuned **sentence_embedder_model**.

In [None]:
# Same evaluation as before the training loop, now that the embedder held by
# finetune_model (the same sentence_embedder_model object) has been trained.
print("Results AFTER finetuning")
train_classifical_classifiers(*prepare_data_for_classical(df_train, df_test, sentence_embedder_model))