Experiment 1

Perform document classification based on document embeddings. Each document embedding is obtained by encoding the document's sentences with a sentence-embedding model and then combining the resulting sentence embeddings with an aggregation function (e.g. the mean).

# Prepare environment

In [None]:
!git clone https://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:x-oauth-basic@github.com/alexpod1000/Document-Subjectivity.git
%cd Document-Subjectivity/

In [None]:
!pip install transformers
!pip install -U sentence-transformers

# Prepare the data

In [None]:
# Fetch the SubjectivITA repository; the CSV files under SubjectivITA/datasets/
# are read by the loading functions defined further down.
!git clone https://github.com/francescoantici/SubjectivITA.git

In [None]:
import numpy as np
import pandas as pd

from functools import partial
from sentence_transformers import SentenceTransformer, util

In [None]:
# Name of the pretrained sentence-transformers checkpoint used to embed sentences.
SENTENCE_EMBEDDER_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
sentence_embedder_model = SentenceTransformer(SENTENCE_EMBEDDER_NAME)

In [None]:
def get_sentences(split):
    """Load the sentence-level CSV of one split and encode its label columns.

    Parameters
    ----------
    split : str
        Split name; it is capitalized and inserted into the file name
        ``SubjectivITA/datasets/sentences/sentences<Split>.csv``.

    Returns
    -------
    pandas.DataFrame
        The columns ID_ARTICOLO, ID_FRASE, FRASE, TAG_FRASE, TAG_ARTICOLO and
        FONTE, where "SOG"/"OGG" tags are replaced by 0/1 and FONTE is replaced
        by its categorical integer code.
    """
    label_codes = {"SOG": 0, "OGG": 1}
    csv_path = "SubjectivITA/datasets/sentences/sentences{}.csv".format(split.capitalize())
    sentences = pd.read_csv(csv_path)
    # Both the sentence-level and the article-level tag share the same encoding.
    for tag_column in ("TAG_FRASE", "TAG_ARTICOLO"):
        sentences[tag_column] = sentences[tag_column].replace(label_codes)
    # TODO(alexo): check better if different splits have overlapping ids for different FONTE field
    # The codes are derived from the categories present in this file only.
    sentences["FONTE"] = sentences["FONTE"].astype('category').cat.codes
    selected_columns = ["ID_ARTICOLO", "ID_FRASE", "FRASE", "TAG_FRASE", "TAG_ARTICOLO", "FONTE"]
    return sentences[selected_columns]

In [None]:
def load_split_ids(split):
    """Return the article ids listed in the article-level CSV of one split.

    Parameters
    ----------
    split : str
        Split name; it is capitalized and inserted into the file name
        ``SubjectivITA/datasets/articles/articles<Split>.csv``.

    Returns
    -------
    list of int
        Values of the ID_ARTICOLO column, cast to 32-bit integers, in file order.
    """
    csv_path = "SubjectivITA/datasets/articles/articles{}.csv".format(split.capitalize())
    article_ids = pd.read_csv(csv_path)["ID_ARTICOLO"]
    return article_ids.astype(np.int32).tolist()

In [None]:
# Article ids belonging to the article-level train and test splits.
articles_ids_train, articles_ids_test = (
    load_split_ids(split_name) for split_name in ("train", "test")
)

In [None]:
def build_articles_embeddings_data(df, model, aggr_fn = partial(np.mean, axis=0)):
    """Build one embedding and one label per article from sentence-level rows.

    Every sentence in ``df`` is embedded with ``model``; the embeddings of the
    sentences sharing the same ID_ARTICOLO are then combined with ``aggr_fn``.

    Parameters
    ----------
    df : pandas.DataFrame
        Sentence-level data with the columns "FRASE", "ID_ARTICOLO" and
        "TAG_ARTICOLO". The frame is only read, never modified.
    model : object
        Sentence embedder. It is called as
        ``model.encode(list_of_sentences, convert_to_tensor=True)`` and the
        result must support ``.cpu().numpy()``.
    aggr_fn : callable, optional
        Receives the float64 array holding the sentence embeddings of a single
        article (expected to be one row per sentence) and returns the document
        embedding. Defaults to the mean over sentences.

    Returns
    -------
    X : numpy.ndarray of float32
        One document embedding per article, ordered by ascending ID_ARTICOLO.
    y : numpy.ndarray of float32
        TAG_ARTICOLO of the first sentence of each article, in the same order.
    """
    sentences_list = df["FRASE"].tolist()
    # Keep the embeddings in a local array instead of writing a new column into
    # `df`: the caller's frame stays untouched and no SettingWithCopyWarning is
    # raised when `df` is a filtered view. float64 keeps the aggregation
    # numerically identical to aggregating Python floats.
    sentence_embeddings = np.asarray(
        model.encode(sentences_list, convert_to_tensor=True).cpu().numpy(),
        dtype=np.float64,
    )
    # Group on row positions rather than index labels, because `df` may carry
    # duplicated index labels (e.g. after concatenating several splits).
    rows = pd.DataFrame({
        "article_id": df["ID_ARTICOLO"].to_numpy(),
        "article_tag": df["TAG_ARTICOLO"].to_numpy(),
        "position": np.arange(len(df)),
    })
    document_embeddings = []
    document_tags = []
    # Grouping by a plain column name (not a one-element list) yields scalar
    # keys; groups come out sorted by article id with row order preserved.
    for _, article_rows in rows.groupby("article_id"):
        positions = article_rows["position"].to_numpy()
        # Get embeddings for a single document (from sentence embeddings)
        document_embeddings.append(aggr_fn(sentence_embeddings[positions]))
        document_tags.append(article_rows["article_tag"].iloc[0])
    # convert to numpy
    X = np.array(document_embeddings, dtype=np.float32)
    y = np.array(document_tags, dtype=np.float32)
    return X, y

In [None]:
# Load the three sentence-level splits shipped with the dataset ...
df_train, df_val, df_test = (
    get_sentences(split_name) for split_name in ("train", "val", "test")
)
# ... and stack them into one dataframe so they can be re-split by article id.
df_all = pd.concat([df_train, df_val, df_test])

In [None]:
# redistribute data according to articles splits
df_train = df_all.query('ID_ARTICOLO in @articles_ids_train')
df_test = df_all.query('ID_ARTICOLO in @articles_ids_test')

In [None]:
# Document embedding = mean of the sentence embeddings of the article.
aggr_fn = partial(np.mean, axis=0)
# Build features and labels for the train split first, then for the test split.
(X_train, y_train), (X_test, y_test) = (
    build_articles_embeddings_data(split_df, sentence_embedder_model, aggr_fn=aggr_fn)
    for split_df in (df_train, df_test)
)

In [None]:
# Sanity check: shapes of the train and test feature matrices.
X_train.shape, X_test.shape

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# `classes` is passed as a NumPy array: recent scikit-learn releases validate
# the argument type and reject a plain Python list.
weights = compute_class_weight(class_weight = 'balanced', classes = np.array([0.0, 1.0]), y = y_train)
# Map each label to its balancing weight, in the same order as `classes` above.
class_weights = {0 : weights[0], 1: weights[1]}

In [None]:
# Display the per-class weights computed from the training labels.
class_weights

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fixed seed so that the estimators relying on randomness give the same
# results on every run of the notebook.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name printed during evaluation.
# NOTE(review): `class_weights` is applied to the linear models only; the
# tree-based models are trained unweighted — confirm this asymmetry is intended.
classifier_models_list = {
    "svm": LinearSVC(class_weight=class_weights, random_state=RANDOM_STATE),
    "logistic": LogisticRegression(class_weight=class_weights),
    "random-forest": RandomForestClassifier(random_state=RANDOM_STATE),
    # NOTE(review): presumably disabled because MultinomialNB rejects negative
    # feature values, which embeddings can contain — confirm before re-enabling.
    #"naive-bayes": MultinomialNB(),
    "decision-tree": DecisionTreeClassifier(random_state=RANDOM_STATE)
}

In [None]:
# Train every candidate on the training split and print its classification
# report computed on the test split.
for model_name in classifier_models_list:
    estimator = classifier_models_list[model_name]
    print(f"Fitting model {model_name}")
    estimator.fit(X_train, y_train)
    test_predictions = estimator.predict(X_test)
    print(classification_report(y_test, test_predictions))