# Prepare environment

In [None]:
!git clone https://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:x-oauth-basic@github.com/alexpod1000/Document-Subjectivity.git
%cd Document-Subjectivity/

In [None]:
!pip install transformers
!pip install -U sentence-transformers

# Prepare the data

In [None]:
# Fetch the SubjectivITA repository; the loaders in this notebook read CSV files from
# SubjectivITA/datasets/ relative to the current working directory.
!git clone https://github.com/francescoantici/SubjectivITA.git

In [None]:
import numpy as np
import pandas as pd

from functools import partial
from sentence_transformers import SentenceTransformer, util

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFBertModel
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

# Fixed number of tokens per sentence: used both as the padding/truncation length in
# prepare_data and as the width of the model's input layers.
maxSentenceLen = 20

def prepare_data(X, y):
    """Tokenize sentences and encode their labels for the BERT sentence classifier.

    Args:
        X: indexable sequence of sentence strings.
        y: indexable sequence of labels aligned with X, each 'SOG' or 'OGG'.

    Returns:
        A pair ([input_ids, token_type_ids, attention_mask], labels). The three
        inputs are int32 arrays of shape (len(X), maxSentenceLen); labels is a float
        array holding 1.0 for 'SOG' and 0.0 for 'OGG'.

    Raises:
        KeyError: if a label is neither 'SOG' nor 'OGG'.
    """
    lbls = {
        'SOG' : 1.0,
        'OGG' : 0.0
    }
    labels = np.array([lbls[y[i]] for i in range(len(X))])
    if len(X) == 0:
        # Nothing to tokenize: return correctly shaped empty inputs.
        empty = np.zeros((0, maxSentenceLen), dtype = np.int32)
        return [empty, empty.copy(), empty.copy()], labels

    tokenizer = AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
    # Let the tokenizer pad and truncate. The previous implementation tokenized without
    # truncation and then relied on pad_sequences, which truncates from the FRONT by
    # default, so every sentence longer than maxSentenceLen lost its leading [CLS] token
    # (and its first words). Truncating here keeps the special tokens in place.
    encoded = tokenizer(
        [X[i] for i in range(len(X))],
        padding = 'max_length',
        truncation = True,
        max_length = maxSentenceLen,
    )
    # int32 matches the dtype declared by the model's Input layers.
    input_ids = np.array(encoded['input_ids'], dtype = np.int32)
    token_type_ids = np.array(encoded['token_type_ids'], dtype = np.int32)
    attention_mask = np.array(encoded['attention_mask'], dtype = np.int32)

    return [input_ids, token_type_ids, attention_mask], labels

def create_sentences_model(useAlberto = False):
    """Build and compile the sentence-level binary classifier.

    The network feeds three integer inputs of width maxSentenceLen (input ids, token
    type ids, attention mask) to the pretrained AlBERTo encoder, applies dropout to the
    last element of the encoder output (presumably the pooled sentence representation
    — TODO confirm) and ends with a single sigmoid unit.

    Args:
        useAlberto: currently unused; kept for interface compatibility.

    Returns:
        A compiled tf.keras.Model (Adam with lr 1e-5, binary cross-entropy, accuracy).
    """
    pretrained_name = "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"

    def token_input():
        # One fixed-width int32 input per tokenizer field.
        return tf.keras.layers.Input(shape=(maxSentenceLen,), dtype=tf.int32)

    input_ids = token_input()
    token_type_ids = token_input()
    attention_mask = token_input()

    encoder = TFBertModel.from_pretrained(pretrained_name)
    encoder_outputs = encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    bertModel = encoder_outputs[-1]

    regularized = tf.keras.layers.Dropout(0.1)(bertModel)
    out = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(regularized)

    model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
    model.compile(
        optimizer = tf.optimizers.Adam(1e-5),
        loss = tf.keras.losses.BinaryCrossentropy(),
        metrics = ['accuracy'],
    )
    return model

def train_sentences_model(model, Xtrain, ytrain, validation_data, save_weights = True):
  """Fine-tune the sentence classifier with class-balanced weights and early stopping.

  Args:
    model: compiled Keras model, as returned by create_sentences_model.
    Xtrain: training sentences (strings).
    ytrain: training labels ('SOG' / 'OGG').
    validation_data: pair (sentences, labels) used for validation.
    save_weights: when True, store the weights in weights/sentencesModelWeights.h5.

  Returns:
    The trained model, or False when any step raised (the error is printed).
  """
  try:
    import os  # local import: only needed to create the weights directory

    Xtrain, ytrain = prepare_data(Xtrain, ytrain)
    val_inputs, val_labels = prepare_data(validation_data[0], validation_data[1])
    # `classes` must be a NumPy array: recent scikit-learn releases validate the
    # argument type and reject a plain Python list.
    weights = compute_class_weight(class_weight = 'balanced', classes = np.array([0.0, 1.0]), y = ytrain)
    class_weights = {0 : weights[0], 1: weights[1]}
    # Stop when validation accuracy stalls for 2 epochs and roll back to the best epoch.
    callback = tf.keras.callbacks.EarlyStopping(monitor = 'val_accuracy', mode = 'max', patience = 2, restore_best_weights = True)
    model.fit(Xtrain, ytrain, validation_data = (val_inputs, val_labels), batch_size = 16, epochs = 4, callbacks = [callback], class_weight = class_weights)
    if save_weights:
      # Without this, a missing directory makes the save fail and the except branch
      # below throws the freshly trained model away.
      os.makedirs('weights', exist_ok = True)
      model.save_weights('weights/sentencesModelWeights.h5')
    print("Sentences model trained successfully!")
    return model
  except Exception as e:
    print("Error in training sentences model: {}".format(e))
    return False

def evaluate_sentences_model(model, Xtest, ytest):
  """Write a classification report for the sentence model to results/reports_sentences.txt.

  The test sentences are tokenized with prepare_data, the model's scores and the encoded
  gold labels are both turned into 'SOG'/'OGG' strings with toLabels, and the resulting
  report overwrites the output file.
  """
  model_inputs, encoded_gold = prepare_data(Xtest, ytest)
  predicted_labels = toLabels(model.predict(model_inputs))
  gold_labels = toLabels(encoded_gold)
  with open('results/reports_sentences.txt', 'w') as report_file:
    report_file.write(classification_report(gold_labels, y_pred = predicted_labels)+ "\n")

def get_sentences(split):
  """Load one split of the sentences dataset.

  Args:
    split: split name; it is capitalized to build the CSV file name.

  Returns:
    A pair (sentences, tags) with the values of the FRASE and TAG_FRASE columns.
  """
  csv_path = "SubjectivITA/datasets/sentences/sentences{}.csv".format(split.capitalize())
  sentences_df = pd.read_csv(csv_path)
  texts = sentences_df['FRASE'].values
  tags = sentences_df['TAG_FRASE'].values
  return texts, tags
  
def toLabels(data, subT = 0.5):
    """Map scores to label strings.

    Args:
        data: iterable of scores.
        subT: threshold; scores greater than or equal to it become 'SOG'.

    Returns:
        A list with 'SOG' or 'OGG' for every score, in input order.
    """
    return ['SOG' if pred >= subT else 'OGG' for pred in data]

def main(train = False):
  """Train (or load) the sentence classifier and evaluate it on the test split.

  Args:
    train: when True the model is trained from scratch; otherwise previously saved
      weights are loaded from weights/sentencesModelWeights.h5.

  Note: this relies on the get_sentences defined in the same cell, which returns a
  (sentences, tags) pair.
  """
  sentencesModel = create_sentences_model()
  sentencesXtrain, sentencesytrain = get_sentences(split = 'train')
  sentencesXval, sentencesyval = get_sentences(split = 'val')
  sentencesXtest, sentencesytest = get_sentences(split = 'test')
  if train:
    sentencesModel = train_sentences_model(sentencesModel, sentencesXtrain, sentencesytrain, validation_data = (sentencesXval, sentencesyval))
    if sentencesModel is False:
      # Training failed and already reported its error; evaluating `False` would only
      # raise an unrelated AttributeError.
      return
  else:
    try:
      sentencesModel.load_weights('weights/sentencesModelWeights.h5')
    except Exception as e:
      # Best effort, as before: the (not fine-tuned) model is still evaluated. A bare
      # `except:` would also have swallowed KeyboardInterrupt.
      print("No weights found! ({})".format(e))
  evaluate_sentences_model(sentencesModel, sentencesXtest, sentencesytest)

# Train the sentence classifier from scratch, then evaluate it on the test split.
main(True)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def create_articles_model(modelName = 'random-forest'):
  """Instantiate a scikit-learn classifier, with default parameters, from its short name.

  Args:
    modelName: one of "svm", "logistic", "decision-tree", "random-forest", "naive-bayes".

  Raises:
    KeyError: if modelName is not one of the names above.
  """
  estimator_classes = {
      "svm": LinearSVC,
      "logistic": LogisticRegression,
      "decision-tree": DecisionTreeClassifier,
      "random-forest": RandomForestClassifier,
      "naive-bayes": MultinomialNB,
  }
  estimator_class = estimator_classes[modelName]
  return estimator_class()

def train_articles_model(model, Xtrain, ytrain):
  """Fit `model` on the given features and labels.

  Returns:
    The fitted model, or False when fitting raised (the error is printed).
  """
  try:
    model.fit(X = Xtrain, y = ytrain)
  except Exception as err:
    print("Error in training articles model: {}".format(err))
    return False
  else:
    print("Articles model trained successfully!")
    return model

def evaluate_articles_model(model, Xtest, ytest):
  """Append a classification report for `model` to results/reports_articles.txt.

  Predictions and gold labels are expected to be 0/1 codes; they are converted to
  "OGG"/"SOG" before the report is built.
  """
  toLabel = {1:"SOG", 0:"OGG"}
  predicted_labels = [toLabel[code] for code in model.predict(Xtest)]
  gold_labels = [toLabel[code] for code in ytest]
  with open('results/reports_articles.txt', 'a') as report_file:
    report_file.write(classification_report(gold_labels, y_pred = predicted_labels)+ "\n")

def get_articles(split):
  """Load one split of the articles dataset as a feature frame and a label array.

  The features are every CSV column except ID_ARTICOLO, TAG_ARTICOLO and FRASI, with
  FONTE replaced by an integer code and FRASI_SOG / FRASI_OGG divided by FRASI.

  Args:
    split: split name; it is capitalized to build the CSV file name.

  Returns:
    (X, y): the feature DataFrame and an integer array with 0 for "OGG", 1 for "SOG".

  Raises:
    KeyError: if TAG_ARTICOLO contains a value other than "OGG" / "SOG".
  """
  df = pd.read_csv("SubjectivITA/datasets/articles/articles{}.csv".format(split.capitalize()))
  tags = {"OGG" : 0, "SOG" : 1}
  # Encode sources by their sorted order instead of their order of first appearance:
  # the latter depends on the row order of each CSV, so the same source could receive
  # different codes in the train and test splits. (Codes are still only comparable
  # across splits that contain the same set of sources.)
  fonte_codes = {fonte: code for code, fonte in enumerate(sorted(df['FONTE'].dropna().unique()))}
  X = df.drop(['ID_ARTICOLO', 'TAG_ARTICOLO'], axis = 1)
  X['FONTE'] = X['FONTE'].map(fonte_codes)
  # Turn the raw sentence counts into fractions of the article's total sentences.
  X['FRASI_SOG'] = X['FRASI_SOG']/X['FRASI']
  X['FRASI_OGG'] = X['FRASI_OGG']/X['FRASI']
  X = X.drop(['FRASI'], axis = 1)
  print(X)
  y = np.array([tags[tag] for tag in df['TAG_ARTICOLO'].values])
  return X, y

def main():
    """Train and evaluate every classical article-level classifier.

    Each model is fitted on the train split and its classification report is appended
    to the articles report file by evaluate_articles_model.
    """
    articlesXtrain, articlesytrain = get_articles(split='train')
    articlesXTest, articlesytest = get_articles(split='test')
    for model_name in ["svm", "logistic", "random-forest", "naive-bayes", "decision-tree"]:
      articlesModel = create_articles_model(model_name)
      articlesModel = train_articles_model(articlesModel, articlesXtrain, articlesytrain)
      if articlesModel is False:
        # Fitting failed (the error was already printed); skip this model instead of
        # crashing on `False.predict` and losing the remaining models.
        continue
      evaluate_articles_model(articlesModel, articlesXTest, articlesytest)

# Fit and evaluate all the article-level baseline classifiers.
main()

In [None]:
"""
OUTLINE

1. creare un dataset pyt a cui passiamo il dataframe, e che ci sputa fuori frasi come stringhe oppure già elaborate da un tokenizzatore
2. 

OUTLINE (per il task più semplice, pretrained model doc embeddings + modello ml classico):
1. passare tutte le stringhe dei documenti ad una funzione che tira fuori doc per doc gli embeddings di tutte le frasi.
2. trovare una strategia per mergere insieme gli embeddings nei singoli doc (mean, lstm, attention(?)).
3. fittare sopra questa rappresentazione, un modello di ML classico (https://bytepawn.com/svm-with-pytorch.html, deeplearning.net/wp-content/uploads/2013/03/dlsvm.pdf).
"""

In [None]:
def get_sentences(split):
    """Load one split of the sentences dataset as a dataframe with numeric tags.

    Both tag columns are recoded with "SOG" -> 0 and "OGG" -> 1, and FONTE is replaced
    by its categorical code computed on this split only.

    Args:
        split: split name; it is capitalized to build the CSV file name.

    Returns:
        A DataFrame restricted to ID_ARTICOLO, ID_FRASE, FRASE, TAG_FRASE,
        TAG_ARTICOLO and FONTE.
    """
    csv_path = "SubjectivITA/datasets/sentences/sentences{}.csv".format(split.capitalize())
    tag_mapper = {"SOG": 0, "OGG": 1}
    df = pd.read_csv(csv_path)
    for tag_column in ("TAG_FRASE", "TAG_ARTICOLO"):
        df[tag_column] = df[tag_column].replace(tag_mapper)
    # TODO(alexo): check better if different splits have overlapping ids for different FONTE field
    df["FONTE"] = df["FONTE"].astype('category').cat.codes
    return df[["ID_ARTICOLO", "ID_FRASE", "FRASE", "TAG_FRASE", "TAG_ARTICOLO", "FONTE"]]

In [None]:
def load_split_ids(split):
    """Return the article ids listed in one split of the articles dataset.

    Args:
        split: split name; it is capitalized to build the CSV file name.

    Returns:
        The ID_ARTICOLO column, cast to 32-bit integers, as a Python list.
    """
    csv_path = "SubjectivITA/datasets/articles/articles{}.csv".format(split.capitalize())
    article_ids = pd.read_csv(csv_path)["ID_ARTICOLO"].astype(np.int32)
    return article_ids.tolist()

In [None]:
# Article ids that belong to the train and test splits of the articles dataset.
articles_ids_train = load_split_ids("train")
articles_ids_test = load_split_ids("test")

In [None]:
def build_articles_embeddings_data(df, model, use_sentences_stats=True, aggr_fn = partial(np.mean, axis=0)):
    """Build article-level features and labels from a sentence-level dataframe.

    Args:
        df: one row per sentence, with the columns ID_ARTICOLO, FRASE, TAG_FRASE
            (numeric) and TAG_ARTICOLO. The dataframe is left untouched.
        model: sentence encoder exposing encode(sentences, convert_to_tensor=True)
            whose result supports .cpu().numpy().
        use_sentences_stats: when True the only feature of an article is the sum of its
            TAG_FRASE values divided by its number of sentences; when False the feature
            vector is the aggregated sentence embedding.
        aggr_fn: reduces the (n_sentences, dim) embedding matrix of one article to a
            single document vector (mean over sentences by default).

    Returns:
        (X, y) as float32 arrays with one row / entry per article, ordered by article
        id. y holds the TAG_ARTICOLO of the first sentence of each article.
    """
    # Work on a copy: callers pass slices produced by DataFrame.query, and adding a
    # column to those in place triggers SettingWithCopyWarning and leaks state back
    # into the caller's frame.
    df = df.copy()
    df["EMB_FRASE"] = model.encode(df["FRASE"].tolist(), convert_to_tensor=True).cpu().numpy().tolist()

    tag_ratios, document_embeddings, article_tags = [], [], []
    # Group by a scalar key (a one-element list key yields tuple group names on pandas >= 2).
    for _, df_by_article_id in df.groupby("ID_ARTICOLO"):
        sentence_embeddings = np.array(df_by_article_id["EMB_FRASE"].tolist())
        # Get embeddings for a single document (from sentence embeddings)
        document_embeddings.append(aggr_fn(sentence_embeddings))
        article_tags.append(df_by_article_id["TAG_ARTICOLO"].tolist()[0])
        if use_sentences_stats:
            sentences_with_tag_1 = sum(df_by_article_id["TAG_FRASE"].tolist())
            tag_ratios.append([sentences_with_tag_1 / len(df_by_article_id)])

    if use_sentences_stats:
        # Only the tag ratio is used as a feature here; the document embeddings are
        # computed but deliberately left out (concatenating them was disabled).
        X = np.array(tag_ratios, dtype=np.float32)
    else:
        X = np.array(document_embeddings, dtype=np.float32)
    y = np.array(article_tags, dtype=np.float32)
    return X, y

In [None]:
# Load the sentence-level dataframe of every split.
df_train = get_sentences("train")
df_val = get_sentences("val")
df_test = get_sentences("test")

In [None]:
# Pool the sentences of all splits into a single dataframe.
df_all = pd.concat([df_train, df_val, df_test])

In [None]:
# Re-split the pooled sentences by article id, following the train/test membership of the
# articles dataset. NOTE: this overwrites the df_train / df_test loaded above.
df_train = df_all.query('ID_ARTICOLO in @articles_ids_train')
df_test = df_all.query('ID_ARTICOLO in @articles_ids_test')

In [None]:
# The sentence encoder has to exist before this cell runs: it used to be created only in
# a later cell, so a fresh "Restart & Run All" failed here with a NameError on `model`.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

X_train, y_train = build_articles_embeddings_data(df_train, model)
#X_val, y_val = build_articles_embeddings_data(df_val, model)
X_test, y_test = build_articles_embeddings_data(df_test, model)

In [None]:
# Sanity check: number of articles and feature width of each split.
X_train.shape, X_test.shape

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced weights for the two article classes. `classes` must be a NumPy array: recent
# scikit-learn releases validate the argument type and reject a plain Python list.
weights = compute_class_weight(class_weight = 'balanced', classes = np.array([0.0, 1.0]), y = y_train)
class_weights = {0 : weights[0], 1: weights[1]}

In [None]:
# Display the per-class weights computed from the training labels.
class_weights

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

# Linear SVM baseline on the article-level features. The class weights computed earlier
# are not passed to it: both weighted variants are commented out.
basic_model = LinearSVC()#class_weight=class_weights)
#basic_model = LogisticRegression(class_weight=class_weights)
basic_model.fit(X_train, y_train)

In [None]:
# Classification reports of the baseline on the train and test articles
# (the validation report is disabled because no validation features are built).
print("TRAIN SET")
print(classification_report(y_train, basic_model.predict(X_train)))
#print("VAL SET")
#print(classification_report(y_val, basic_model.predict(X_val)))
print("TEST SET")
print(classification_report(y_test, basic_model.predict(X_test)))

In [None]:
import torch

class ArticleSentencesDataset(torch.utils.data.Dataset):
    """Torch dataset exposing the rows of a sentence-level dataframe as dictionaries."""

    def __init__(self, df):
        # Dataframe with (at least) the columns read in __getitem__.
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the idx-th row (positional) as a dict keyed by English field names."""
        row = self.df.iloc[idx]
        # Output key -> dataframe column, in the order the keys are emitted.
        field_columns = (
            ("article_id", "ID_ARTICOLO"),
            ("sentence_id", "ID_FRASE"),
            ("sentence", "FRASE"),
            ("sentence_tag", "TAG_FRASE"),
            ("article_tag", "TAG_ARTICOLO"),
        )
        return {field: row[column] for field, column in field_columns}

In [None]:
# No `df` exists at notebook level (it is only a local name inside the helper functions),
# so this cell raised a NameError on a fresh kernel. Wrap the training sentences instead.
ddd = ArticleSentencesDataset(df_train)

In [None]:
def article_sentences_collate_fn(sample_list):
    """Collate already-tokenized samples into a batch of tensors.

    Every sample must be a mapping with the keys "input_ids", "attention_mask",
    "out_span" (a tensor), "paragraph_id" and "question_id". These are not the keys
    produced by ArticleSentencesDataset, so the two cannot be combined as they stand.

    Returns:
        A dict with "input_ids" and "attention_mask" as long tensors, "y_gt" as the
        stacked "out_span" tensors, and "paragraph_id" / "question_id" as lists with
        one entry per sample.
    """
    # NOTE: assumes every sample is already padded to the same length (the original note
    # mentions a tokenizer padding to 384 in the dataloader) — TODO confirm.
    input_ids_padded = [sample["input_ids"] for sample in sample_list]
    attention_mask_padded = [sample["attention_mask"] for sample in sample_list]
    out = [sample["out_span"] for sample in sample_list]
    # These two names were returned without ever being defined, so every call ended in a
    # NameError; collect them from the samples like the other fields.
    paragraph_id = [sample["paragraph_id"] for sample in sample_list]
    question_id = [sample["question_id"] for sample in sample_list]
    # Convert inputs to Torch tensors
    input_ids_padded = torch.tensor(input_ids_padded, dtype=torch.long)
    attention_mask_padded = torch.tensor(attention_mask_padded, dtype=torch.long)
    # Each sample carries an extra leading dimension of size 1, so remove it
    input_ids_padded = input_ids_padded[:, 0, :]
    attention_mask_padded = attention_mask_padded[:, 0, :]
    return {"input_ids": input_ids_padded,
            "attention_mask": attention_mask_padded,
            "y_gt":torch.stack(out),
            "paragraph_id":paragraph_id,
            "question_id":question_id}

In [None]:
# Sentence-similarity demo: encode two aligned lists of sentences with a pretrained
# SentenceTransformer and print the cosine similarity of each aligned pair.
#model = SentenceTransformer('paraphrase-MiniLM-L12-v2')
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Two lists of sentences (English variant, active)
#"""
sentences1 = ['The cat sits outside',
             'A man is playing guitar',
             'The new movie is awesome']

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']
#"""
"""sentences1 = ['Il nostro prodotto è bello',
             'Il gatto attraversa la strada']

sentences2 = ['Questo prodotto è stupendo',
              'Un animale attraversa la strada']
"""


# Compute the embeddings of both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute the pairwise cosine similarities between the two lists
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Output the aligned pairs (same index in both lists) with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))

In [None]:
import torch

In [None]:
# Readout demo: average the sentence embeddings of each list into a single vector and
# compare the two averaged vectors with cosine similarity.
# Two lists of sentences
sentences1 = ['The animal is running in the woods',
             'A man is playing guitar',
             'The new movie is awesome']

sentences2 = ['A big cat is in the garden',
              'A person is using an instrument',
              'The new movie is so great']

# Compute the embeddings of both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# let's test a "readout" like operation: mean over the sentences of each list
embeddings_single_1 = torch.mean(embeddings1, axis=0)
embeddings_single_2 = torch.mean(embeddings2, axis=0)

cosine_single_scores = util.pytorch_cos_sim(embeddings_single_1, embeddings_single_2)

print(cosine_single_scores)

In [None]:
# Display the similarity between the two averaged embeddings.
cosine_single_scores

In [None]:
# Shape of the sentence-embedding tensor of the first list.
embeddings1.shape

Progress/Notes list:

- Italian BERT can still be used from PyTorch (PyT), for example.
- TODO: try PyTorch Lightning if switching to PyTorch
- Removed FONTE as a predictor feature. It makes no sense to treat such a categorical feature in a numeric way.
- TODO: try using FONTE embedded by a neural network (MLP from n_FONTE to an embedding dim)