# **Multi-News Highlights Extraction Example**

Example of using THExt to extract highlights from the Multi-News dataset.

In [None]:
# Mount Google Drive at /content/drive; every path used later in this notebook
# is built under "/content/drive/MyDrive/".
from google.colab import drive
drive.mount('/content/drive')

### INSTALL REQUIREMENTS

In [None]:
! pip install rouge_score
! pip install multiprocess
! pip install py-rouge
! pip install -U sentence-transformers
! pip install rouge_score
! python -m spacy download en_core_web_lg

### Import required libraries

In [None]:
import pandas as pd 
import sys
import os
import multiprocess
import rouge
import sentence_transformers
import csv 
import ast 
import nltk 
# Download NLTK's 'punkt' resource.
# NOTE(review): presumably required by the project code for sentence tokenization — confirm.
nltk.download('punkt')

### Import code


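Set ***your_path*** to the location of the code folder inside your Drive, relative to `MyDrive/`. Leave it empty if the `Multinews` folder sits directly in `MyDrive/`; otherwise end the value with a `/` (e.g. `"projects/"`), because it is concatenated directly in front of `Multinews`: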

In [None]:
# Location of the code folder relative to "/content/drive/MyDrive/".
# The value is concatenated directly in front of "Multinews", so a non-empty
# value must end with "/" (for example "projects/").
your_path = ""

In [None]:
# Add the project's "Multinews" folder on Drive to the module search path so
# that the project modules imported in the next cell can be found.
py_file_location = "/content/drive/MyDrive/" + your_path + "Multinews"
sys.path.append(os.path.abspath(py_file_location))

In [None]:
import DatasetAnalysis 
import DatasetPreparation
import HighlighterPreparation
import HighlighterEvaluation
import thext_extended 
from thext_extended import SentenceRankerPlus
from thext_extended import Highlighter
from thext_extended import DatasetPlus

### CALCULATION OF STATISTICS 

In [None]:

# Raw Multi-News source files, one per split.
path_train = "/content/drive/MyDrive/" + your_path + "Multinews/Multi-News_Dataset/train.src"
path_validation = "/content/drive/MyDrive/" + your_path + "Multinews/Multi-News_Dataset/val.src"
path_test = "/content/drive/MyDrive/" + your_path + "Multinews/Multi-News_Dataset/test.src"

# CSV paths handed to da.sample() for each split.
output_path_train = "/content/drive/MyDrive/" + your_path + "Multinews/output_dataset/train_.csv"
output_path_test = "/content/drive/MyDrive/" + your_path + "Multinews/output_dataset/test_.csv"
output_path_validation = "/content/drive/MyDrive/" + your_path + "Multinews/output_dataset/validation_.csv"

# Create the folder that will contain the CSV files. The CSV paths themselves
# must not be given to os.makedirs: that would create *directories* named
# "train_.csv", "test_.csv", ... and no file could then be written there.
for csv_path in (output_path_train, output_path_test, output_path_validation):
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

da = DatasetAnalysis.DatasetAnalysis()

# Load the three splits.
train = da.read_dataset(path_train)
test = da.read_dataset(path_test)
validation = da.read_dataset(path_validation)

# Sample each split; the numeric argument and the CSV path are forwarded to
# da.sample(). NOTE(review): presumably the sample is also written to the
# given CSV path — confirm against DatasetAnalysis.sample.
train = da.sample(train, 4, output_path_train)
test = da.sample(test, 2, output_path_test)
validation = da.sample(validation, 2, output_path_validation)

# Do not unpack into `max` / `min`: that would shadow the Python builtins for
# every cell executed afterwards in this kernel.
avg_articles, max_articles, min_articles = da.get_statistics(train, test, validation)
print("The max number of articles for one cluster is:", max_articles)
print("The average number of articles for each cluster is:", avg_articles)

In [None]:
### PREPARATION FOR DATASETPLUS ###
# Destination of the DatasetPlus output for each split.
# The validation and test paths used to be swapped (validation -> test_.csv,
# test -> validation_.csv). Because of that, the processed test set was saved
# over the sampled validation CSV *before* that CSV was read further down.
# NOTE(review): each path equals the corresponding sampled CSV path
# (output_path_*), so every split overwrites its own input after reading it;
# re-running this cell therefore requires re-running the sampling cell first.
path_train_processed = "/content/drive/MyDrive/" + your_path + "Multinews/output_dataset/train_.csv"
path_validation_processed = "/content/drive/MyDrive/" + your_path + "Multinews/output_dataset/validation_.csv"
path_test_processed = "/content/drive/MyDrive/" + your_path + "Multinews/output_dataset/test_.csv"

# Target files passed to preparation_train_for_DatasetPlus for train/validation.
path_label_train = "/content/drive/MyDrive/" + your_path + "Multinews/Multi-News_Dataset/train.tgt"
path_label_validation = "/content/drive/MyDrive/" + your_path + "Multinews/Multi-News_Dataset/val.tgt"

dp = DatasetPreparation.DatasetPreparation()

# --- Test split: built without highlights ---
list_text, list_abstract = dp.preparation_test_for_DatasetPlus(output_path_test)
test_processed = DatasetPlus(list_text = list_text, list_abstract = list_abstract, n_jobs = 1)

dp.save_output_of_DatasetPlus(path_test_processed, test_processed)

# --- Train split (n=4 matches the value passed to da.sample for train) ---
list_text, list_abstract, list_highlights = dp.preparation_train_for_DatasetPlus(output_path_train, path_label_train, n=4)
train_processed = DatasetPlus(list_text = list_text, list_abstract = list_abstract, list_highlights = list_highlights, n_jobs = 1)

dp.save_output_of_DatasetPlus(path_train_processed, train_processed)

# --- Validation split (n=2 matches the value passed to da.sample for validation) ---
list_text, list_abstract, list_highlights = dp.preparation_train_for_DatasetPlus(output_path_validation, path_label_validation, n=2)
validation_processed = DatasetPlus(list_text = list_text, list_abstract = list_abstract, list_highlights = list_highlights, n_jobs = 1)

dp.save_output_of_DatasetPlus(path_validation_processed, validation_processed)

In [None]:
### FINE-TUNING ###

### BERT ###
# Both names are passed to SentenceRankerPlus and to sr.load_model below.
base_model_name = "google/bert_uncased_L-8_H-512_A-8"
model_name_or_path = "google/bert_uncased_L-8_H-512_A-8"

# Folder handed to sr.fit() as checkpoint_dir.
checkpoint_dir = "/content/drive/MyDrive/" + your_path + "Multinews/checkpoint_directory"

# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(checkpoint_dir, exist_ok=True)

### LONGFORMER ###
# Alternative model: uncomment both lines and fill in model_name_or_path.
# The Hugging Face Hub id is "allenai/longformer-base-4096"
# ("allenai/longformer-4096" is not a published model id).
#base_model_name = "allenai/longformer-base-4096"
#model_name_or_path = ""


# Reload the processed train/validation splits saved by the preparation cell.
train_set = dp.readFileFromDatasetPlus(path_train_processed)
validation_set = dp.readFileFromDatasetPlus(path_validation_processed)

# Build the ranker, load the model, then train for 2 epochs.
sr = SentenceRankerPlus(train_set = train_set, eval_set = validation_set, epochs = 2, base_model_name = base_model_name, model_name_or_path = model_name_or_path)
sr.load_model(base_model_name=base_model_name, model_name_or_path=model_name_or_path)
sr.prepare_for_training()
sr.fit(checkpoint_dir = checkpoint_dir)

In [None]:
### PREPARATION FOR GENERATING HIGHLIGHTS ###

hp = HighlighterPreparation.HighlighterPreparation()
dataset = hp.prepare_data_MultiNews(path_test_processed)

### GENERATION OF HIGHLIGHTS ###

# Despite its name, output_dir is the path of the results CSV *file*.
output_dir = "/content/drive/MyDrive/" + your_path + "Multinews/Output_Dataset/results.csv"

# Create only the containing folder. Calling os.makedirs on output_dir itself
# would create a directory named "results.csv", and df_highlights.to_csv()
# below could then not write the file.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)


# Create the highlighter on top of the fine-tuned sentence ranker.
h = Highlighter(sr)
list_high = []
num_highlights = 5  # forwarded as NH to get_highlights_simple

### HIGHLIGHTS SIMPLE ###
for i in range(len(dataset)):
    # Each dataset entry provides a dict of cleaned sentences and the raw abstract.
    sentences = list(dataset[i]['clean_sentences'].values())
    abstract = dataset[i]['raw_abstract']
    highlights = h.get_highlights_simple(sentences, abstract, NH = num_highlights)
    list_high.append(highlights)

# Persist the generated highlights so the evaluation step can read them back.
df_highlights = hp.convert_highlights_in_dataframe(list_high)
df_highlights.to_csv(output_dir)


### EVALUATION OF THE HIGHLIGHTS ###
path_original_highlights = "/content/drive/MyDrive/" + your_path + "Multinews/Multi-News_Dataset/gold.txt"

he = HighlighterEvaluation.HighlighterEvaluation()
# Load the gold highlights and the generated ones, then score them.
list_original_highlights, list_highlights = he.open_files(path_original_highlights, output_dir)
scores = he.evaluate_highlights(list_highlights, list_original_highlights)
scores