Experiment 1

Perform document classification based on document embeddings. A document embedding is obtained by embedding each sentence of the document with a sentence-embedding model and then combining the sentence embeddings with an aggregation function (e.g. the mean).

# Prepare environment

In [None]:
!git clone https://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:x-oauth-basic@github.com/alexpod1000/Document-Subjectivity.git
%cd Document-Subjectivity/

In [None]:
# Clone the SubjectivITA repository into the current directory; the loaders
# below read its CSV files from the relative path "SubjectivITA/datasets/...".
!git clone https://github.com/francescoantici/SubjectivITA.git

In [None]:
!pip install transformers
!pip install -U sentence-transformers

# Prepare the data

In [None]:
# Names of the sentence-embedding models that can be selected for this
# experiment (the chosen name is later passed to SentenceTransformer).
POSSIBLE_EMBEDDERS = [
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-multilingual-mpnet-base-v2",
    "distiluse-base-multilingual-cased-v1"
]
# Model used in this run: change the index to try another embedder.
SENTENCE_EMBEDDER_MODEL = POSSIBLE_EMBEDDERS[2]

In [None]:
import numpy as np
import pandas as pd

from functools import partial
from sentence_transformers import SentenceTransformer, util
# Sklearn imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight

In [None]:
def get_sentences(split):
    """
    Read the sentence-level CSV of a dataset split and encode its labels.

    The sentence tag and the article tag are converted from the strings
    "SOG" / "OGG" to the integers 0 / 1, and the source column is replaced
    by integer category codes computed on the loaded file.

    Args:
        split (str): name of the split (train, val, test); it is capitalized
        to build the file name.
    Returns:
        df: dataframe with one row per sentence, restricted to the columns
        ID_ARTICOLO, ID_FRASE, FRASE, TAG_FRASE, TAG_ARTICOLO and FONTE.
    """
    label_codes = {"SOG": 0, "OGG": 1}
    csv_path = "SubjectivITA/datasets/sentences/sentences{}.csv".format(split.capitalize())
    sentences = pd.read_csv(csv_path)
    # Both tag columns share the same string -> integer encoding
    for tag_column in ("TAG_FRASE", "TAG_ARTICOLO"):
        sentences[tag_column] = sentences[tag_column].replace(label_codes)
    # Source names become integer codes (one code per distinct source in this file)
    sentences["FONTE"] = sentences["FONTE"].astype("category").cat.codes
    return sentences[
        ["ID_ARTICOLO", "ID_FRASE", "FRASE", "TAG_FRASE", "TAG_ARTICOLO", "FONTE"]
    ]

In [None]:
def load_split_ids(split):
    """
    Read the article-level CSV of a dataset split and return its article ids.

    Args:
        split (str): name of the split (train, val, test); it is capitalized
        to build the file name.
    Returns:
        article_ids (List[int]): article ids of the split, cast to 32-bit
        integers and returned in file order.
    """
    csv_path = "SubjectivITA/datasets/articles/articles{}.csv".format(split.capitalize())
    id_column = pd.read_csv(csv_path)["ID_ARTICOLO"]
    return id_column.astype(np.int32).tolist()

In [None]:
# Load the article ids of the article-level train and test splits.
# Only these two splits are loaded here; they are used further down to
# reassign sentences to train/test according to the article they belong to.
articles_ids_train = load_split_ids("train")
articles_ids_test = load_split_ids("test")

In [None]:
def build_articles_embeddings_data(df, model, aggr_fn = partial(np.mean, axis=0)):
    """
    Given a dataframe, a sentence embedding model and an embedding aggregation
    function, returns one embedding and one subjectivity label per article.

    The input dataframe is not modified.

    Args:
        df: dataframe with one row per sentence and the columns FRASE
            (sentence text), ID_ARTICOLO (article id) and TAG_ARTICOLO
            (article label).
        model: embedding model exposing `encode(sentences, convert_to_tensor=True)`
            and returning an object convertible with `.cpu().numpy()`.
        aggr_fn: function mapping the (n_sentences, emb_dim) matrix of one
            article to a single document representation.

    Returns:
        X (np.array): float32 matrix in the shape of (n_articles, emb_dim),
            with articles ordered by ascending article id.
        y (np.array): float32 vector with the article tag of each article,
            in the same order as X. Both arrays are empty if df has no rows.
    """
    # Nothing to embed: return empty arrays instead of failing later on
    if len(df) == 0:
        return np.empty((0, 0), dtype=np.float32), np.empty((0,), dtype=np.float32)
    # Get all the sentences in the dataframe as a list
    sentences_list = df["FRASE"].tolist()
    # Embed all the sentences at once. The embeddings are kept in a local array
    # (not written back into df) so the caller's dataframe is left untouched.
    # They are promoted to float64 so the aggregation runs in double precision
    # before the final cast to float32.
    sentence_embeddings = np.asarray(
        model.encode(sentences_list, convert_to_tensor=True).cpu().numpy(),
        dtype=np.float64,
    )
    article_ids = df["ID_ARTICOLO"].to_numpy()
    article_tags = df["TAG_ARTICOLO"].to_numpy()
    # Group row positions (not index labels) by article id, so that a
    # non-unique dataframe index cannot mix up sentences.
    row_positions = pd.Series(np.arange(len(df)))
    document_embeddings = []
    labels = []
    for _, group_positions in row_positions.groupby(article_ids):
        positions = group_positions.to_numpy()
        # Get embeddings for a single document (from sentence embeddings)
        document_embeddings.append(aggr_fn(sentence_embeddings[positions]))
        # All the sentences of an article carry the article tag: take the first
        labels.append(article_tags[positions[0]])
    # Convert to numpy
    X = np.array(document_embeddings, dtype=np.float32)
    y = np.array(labels, dtype=np.float32)
    return X, y

In [None]:
# Load the sentence dataframe for each sentence-level split
df_train = get_sentences("train")
df_val = get_sentences("val")
df_test = get_sentences("test")
# Combine all the splits into a single dataframe (the original row index of
# each file is kept, so index labels are repeated across splits).
# NOTE(review): FONTE codes are computed separately for each file in
# get_sentences, so the same code may denote different sources in df_all.
df_all = pd.concat([df_train, df_val, df_test])

In [None]:
# Reassign every sentence to train or test according to the article-level
# split of the article it belongs to (sentences of other articles are dropped).
in_train_articles = df_all["ID_ARTICOLO"].isin(articles_ids_train)
in_test_articles = df_all["ID_ARTICOLO"].isin(articles_ids_test)
df_train = df_all[in_train_articles]
df_test = df_all[in_test_articles]

In [None]:
# Instantiate the sentence embedder selected in SENTENCE_EMBEDDER_MODEL
sentence_embedder_model = SentenceTransformer(SENTENCE_EMBEDDER_MODEL)
# A document embedding is the mean of its sentence embeddings (axis 0 = sentences)
aggr_fn = partial(np.mean, axis=0)
# Build the (embedding matrix, label vector) pairs used by the classical models
X_train, y_train = build_articles_embeddings_data(df_train, sentence_embedder_model, aggr_fn=aggr_fn)
X_test, y_test = build_articles_embeddings_data(df_test, sentence_embedder_model,  aggr_fn=aggr_fn)

In [None]:
# Sanity check: display the shapes of the train and test embedding matrices
X_train.shape, X_test.shape

In [None]:
# Compute "balanced" class weights from the training labels, so that the
# less frequent class is not under-weighted by the classifiers.
# `classes` is passed as a numpy array: recent scikit-learn versions validate
# this parameter and reject a plain Python list.
weights = compute_class_weight(class_weight = 'balanced', classes = np.array([0.0, 1.0]), y = y_train)
# Map each label (0 and 1) to its weight, in the format expected by the
# `class_weight` argument of the estimators.
class_weights = {0 : weights[0], 1: weights[1]}
print(f"Class weights are {class_weights}")

In [None]:
# Fixed seed for the estimators that use randomness, so that the reported
# results are the same on every run of the notebook.
RANDOM_STATE = 42
# Classifiers to compare, keyed by the name printed in the evaluation report.
# All of them use the class weights computed from the training labels.
classifier_models_list = {
    "svm": LinearSVC(class_weight=class_weights, random_state=RANDOM_STATE),
    "logistic": LogisticRegression(class_weight=class_weights),
    "random-forest": RandomForestClassifier(class_weight=class_weights, random_state=RANDOM_STATE),
    "decision-tree": DecisionTreeClassifier(class_weight=class_weights, random_state=RANDOM_STATE)
}

In [None]:
# Fit every classifier on the training articles and report its metrics on
# the test articles.
for model_name, classifier_model in classifier_models_list.items():
    fitted_model = classifier_model.fit(X_train, y_train)
    print(f"Results for model {model_name}")
    test_predictions = fitted_model.predict(X_test)
    print(classification_report(y_test, test_predictions))