In [1]:
# Mount Google Drive at /content/drive; the later cells read their repository
# checkout and data files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# GitHub coordinates of the repository; only referenced by the (commented-out)
# clone command and the directory change below.
username = 'feralvam'
repository = 'easse'
# Work from the Drive folder that holds the local checkouts.
%cd /content/drive/MyDrive/github_repos
# One-time step: uncomment to clone the repository when it is not on Drive yet.
# !git clone https://github.com/{username}/{repository}
%cd {repository}
# Editable install of the checked-out repository (run from inside its folder).
!pip install -e .
# pyyaml is pinned to 5.4.1; sentence_transformers is installed unpinned.
# NOTE(review): consider pinning sentence_transformers as well for reproducible runs.
!pip install pyyaml==5.4.1
!pip install sentence_transformers

In [3]:
from collections import Counter
from typing import List
from tseval.feature_extraction import (
    get_compression_ratio,
    count_sentence_splits,
    get_levenshtein_similarity,
    is_exact_match,
    get_additions_proportion,
    get_deletions_proportion,
    get_wordrank_score,
    wrap_single_sentence_vectorizer,
)
import numpy as np
from sacrebleu.metrics import BLEU

import easse
import easse.utils.preprocessing as utils_prep
from easse.sari import compute_ngram_stats, compute_macro_sari, compute_micro_sari, compute_precision_recall_f1
from easse.bleu import corpus_bleu, corpus_averaged_sentence_bleu
from easse.fkgl import FKGLScorer
from easse.bertscore import get_bertscore_sentence_scores
from bert_score import BERTScorer


def sentence_bleu_list(
    sys_sents: List[str],
    refs_sents: List[List[str]],
    smooth_method: str = "floor",
    smooth_value: float = None,
    lowercase: bool = False,
    tokenizer: str = "13a",
    effective_order: bool = True,
):
    """Score every system sentence with sentence-level BLEU.

    Args:
        sys_sents: system outputs, one string per sample.
        refs_sents: references with shape (n_references, n_samples). Every
            reference available for a sample is handed to the scorer (with a
            single reference list the result is the same as scoring against
            ``refs_sents[0]`` only).
        smooth_method, smooth_value, effective_order: forwarded unchanged to
            ``sacrebleu.metrics.BLEU``.
        lowercase, tokenizer: forwarded to ``utils_prep.normalize``, which is
            applied to every sentence before scoring.

    Returns:
        A list with one BLEU score per sample, rounded to 3 decimals.

    Raises:
        IndexError: if ``refs_sents`` contains no reference list at all.
    """
    if not refs_sents:
        raise IndexError("refs_sents must contain at least one list of reference sentences")

    # Sentences are normalised explicitly below, so the scorer itself is told
    # not to tokenise or lowercase again. It does not depend on the sentence
    # being scored, so it is built once instead of once per sample.
    bleu_scorer = BLEU(
        lowercase=False,
        force=True,
        tokenize="none",
        smooth_method=smooth_method,
        smooth_value=smooth_value,
        effective_order=effective_order,
    )

    all_bleu = []
    # zip(*refs_sents) regroups the references per sample: ref_sents holds the
    # i-th sentence of every reference list.
    for sys_sent, *ref_sents in zip(sys_sents, *refs_sents):
        hypothesis = utils_prep.normalize(sys_sent, lowercase, tokenizer)
        references = [utils_prep.normalize(ref_sent, lowercase, tokenizer) for ref_sent in ref_sents]
        all_bleu.append(round(bleu_scorer.sentence_score(hypothesis, references).score, 3))

    return all_bleu


def get_sentence_sari_operation_scores(
    orig_sents: List[str],
    sys_sents: List[str],
    refs_sents: List[List[str]],
    lowercase: bool = True,
    tokenizer: str = '13a',
    legacy=False,
    use_f1_for_deletion=True,
    use_paper_version=False,
):
    """
    Compute the SARI add / keep / delete scores separately for every sample.

    Inputs:
    orig_sents: list of original sentences (len = n_samples)
    sys_sents: list of system sentences (len = n_samples)
    refs_sents: list of list of reference sentences (shape = (n_references, n_samples))
    lowercase, tokenizer: forwarded to utils_prep.normalize for every sentence.
    legacy: Allows reproducing scores reported in previous work.
    It replicates a bug in the original JAVA implementation where only the system outputs and the reference sentences
    are further tokenized.
    In addition, it assumes that all sentences are already lowercased.
    use_f1_for_deletion: forwarded to the SARI aggregation function.
    use_paper_version: when True the scores come from compute_micro_sari, otherwise from compute_macro_sari.

    Returns:
    A list with one (add, keep, delete) tuple per sample, each score multiplied by 100.
    """
    if legacy:
        lowercase = False
    else:
        orig_sents = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in orig_sents]

    sys_sents = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in sys_sents]
    refs_sents = [[utils_prep.normalize(sent, lowercase, tokenizer) for sent in ref_sents] for ref_sents in refs_sents]

    # Both variants take the same arguments and return the same triple, so the
    # choice is made once instead of duplicating the loop body per variant.
    sari_from_stats = compute_micro_sari if use_paper_version else compute_macro_sari

    all_operations = []
    for orig_sent, sys_sent, *ref_sents in zip(orig_sents, sys_sents, *refs_sents):
        # Each sample is scored as a one-sentence corpus. The references are
        # passed in the (n_references, n_samples) layout documented above,
        # i.e. one single-sentence list per reference; a flat [ref_sents]
        # would only have that shape when there is exactly one reference.
        sent_stats = compute_ngram_stats(
            [orig_sent],
            [sys_sent],
            [[ref_sent] for ref_sent in ref_sents],
        )
        add_score, keep_score, del_score = sari_from_stats(*sent_stats, use_f1_for_deletion=use_f1_for_deletion)
        all_operations.append((100*add_score, 100*keep_score, 100*del_score))

    return all_operations

def sentence_sari(*args, **kwargs):
    """Sentence-level SARI.

    All arguments are forwarded to ``get_sentence_sari_operation_scores``;
    the score of a sentence is the mean of its three operation scores,
    rounded to 3 decimals. Returns one value per sentence.
    """
    per_sentence_operations = get_sentence_sari_operation_scores(*args, **kwargs)
    return [round(np.mean(operation_scores), 3) for operation_scores in per_sentence_operations]


def sentence_fkgl(sentences: List[str], tokenizer: str = "13a"):
    """Return one FKGL score per sentence, rounded to 3 decimals.

    Each sentence is normalised with ``utils_prep.normalize`` (using the given
    tokenizer) and scored with its own fresh ``FKGLScorer`` so that no state
    is shared between sentences.
    """

    def _score_single(sentence):
        single_sentence_scorer = FKGLScorer()
        single_sentence_scorer.add(utils_prep.normalize(sentence, tokenizer=tokenizer))
        return round(single_sentence_scorer.score(), 3)

    return [_score_single(sentence) for sentence in sentences]



def sentence_f1_token(sys_sents: List[str], refs_sents: List[List[str]], lowercase: bool = True, tokenizer: str = '13a'):
    """Token-level F1 of every system sentence against its best reference.

    Args:
        sys_sents: system outputs, one string per sample.
        refs_sents: references with shape (n_references, n_samples).
        lowercase, tokenizer: forwarded to ``utils_prep.normalize``.

    Returns:
        A list with one score per sample: 100 times the highest F1 obtained
        over that sample's references, rounded to 3 decimals. Each entry
        depends on its own sentence only (it is not a running average over
        the sentences processed so far).
    """
    def count_shared_tokens(sys_tokens, ref_tokens):
        # Multiset intersection: a token counts as many times as it occurs in
        # both sentences.
        return sum((Counter(sys_tokens) & Counter(ref_tokens)).values())

    sys_sents = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in sys_sents]
    refs_sents = [[utils_prep.normalize(sent, lowercase, tokenizer) for sent in ref_sents] for ref_sents in refs_sents]

    sent_scores = []
    for sys_sent, *ref_sents in zip(sys_sents, *refs_sents):
        sys_tokens = sys_sent.split()
        sys_total = len(sys_tokens)

        candidate_f1_token_scores = []
        for ref_sent in ref_sents:
            ref_tokens = ref_sent.split()
            ref_total = len(ref_tokens)

            correct_tokens = count_shared_tokens(sys_tokens, ref_tokens)
            _, _, f1 = compute_precision_recall_f1(correct_tokens, sys_total, ref_total)
            candidate_f1_token_scores.append(f1)

        # Keep the best-matching reference for this sentence only.
        sent_scores.append(round(100.0 * np.max(candidate_f1_token_scores), 3))

    return sent_scores


def sentence_bertscore(
    sys_sents: List[str],
    refs_sents: List[List[str]],
    lowercase: bool = False,
    tokenizer: str = "13a",
):
    """Per-sentence BERTScore as plain Python lists.

    Delegates to ``get_bertscore_sentence_scores`` and converts its three
    results with ``.tolist()``. Returns a ``(precision, recall, f1)`` tuple.
    """
    precision, recall, f1 = get_bertscore_sentence_scores(sys_sents, refs_sents, lowercase, tokenizer)
    return tuple(scores.tolist() for scores in (precision, recall, f1))


def sentence_cos_similarity(
    sys_sents: List[str],
    refs_sents: List[List[str]],
    lowercase: bool = False,
):
    """Cosine similarity between each system sentence and its first reference.

    Sentences are embedded with the 'all-MiniLM-L6-v2' SentenceTransformer
    model, one (system, reference) pair at a time.

    Notes:
        * Only ``refs_sents[0]`` is compared; further reference lists are not read.
        * ``lowercase`` is accepted but never read: both sentences are always
          lowercased before encoding.

    Returns:
        A list with one similarity per sample, rounded to 3 decimals.
    """
    # Imported here so the dependency is only needed when this metric is used.
    from sentence_transformers import SentenceTransformer, util

    encoder = SentenceTransformer('all-MiniLM-L6-v2')

    all_sims = []
    for sys_sent, ref_sent in zip(sys_sents, refs_sents[0]):
        sys_vector, ref_vector = encoder.encode([sys_sent.lower(), ref_sent.lower()])
        similarity = util.cos_sim(sys_vector, ref_vector)
        all_sims.append(round(similarity.item(), 3))

    return all_sims



def sentence_quality_estimation(
    orig_sentences: List[str], sys_sentences: List[str], lowercase: bool = False, tokenizer: str = '13a'):
    """Compute per-sentence quality-estimation features.

    Both sentence lists are normalised with ``utils_prep.normalize`` and then
    compared pair by pair with the ``tseval`` feature extractors.

    Returns:
        A dict mapping each feature name to a list with one value per sentence
        pair. Every value is rounded to 3 decimals except 'Exact copies',
        which stores the result of ``is_exact_match`` as returned.
    """
    def _normalize_all(sentences):
        return [utils_prep.normalize(sent, lowercase, tokenizer) for sent in sentences]

    orig_sentences = _normalize_all(orig_sentences)
    sys_sentences = _normalize_all(sys_sentences)

    feature_names = (
        'Compression ratio',
        'Sentence splits',
        'Levenshtein similarity',
        'Exact copies',
        'Additions proportion',
        'Deletions proportion',
        'Lexical complexity score',
    )
    corpus_quality = {name: [] for name in feature_names}

    for source, output in zip(orig_sentences, sys_sentences):
        # The wordrank scorer is wrapped so it can be called with the sentence pair.
        pairwise_wordrank = wrap_single_sentence_vectorizer(get_wordrank_score)
        pair_features = {
            'Compression ratio': round(get_compression_ratio(source, output), 3),
            'Sentence splits': round(count_sentence_splits(source, output), 3),
            'Levenshtein similarity': round(get_levenshtein_similarity(source, output), 3),
            'Exact copies': is_exact_match(source, output),
            'Additions proportion': round(get_additions_proportion(source, output), 3),
            'Deletions proportion': round(get_deletions_proportion(source, output), 3),
            'Lexical complexity score': round(pairwise_wordrank(source, output), 3),
        }
        for name, value in pair_features.items():
            corpus_quality[name].append(value)

    return corpus_quality


In [4]:
from pathlib import Path

import click

from easse.fkgl import corpus_fkgl
from easse.utils.helpers import read_lines
from easse.quality_estimation import corpus_quality_estimation
from easse.sari import corpus_sari, get_corpus_sari_operation_scores
from easse.bleu import corpus_bleu, corpus_averaged_sentence_bleu
from easse.compression import corpus_f1_token
from easse.utils.constants import (
    VALID_TEST_SETS,
    VALID_METRICS,
    DEFAULT_METRICS,
)
from easse.utils.resources import get_orig_sents, get_refs_sents
from easse.report import write_html_report, write_multiple_systems_html_report


def get_sys_sents(test_set, sys_sents_path=None):
    """Return the system sentences to evaluate.

    Lines are read from ``sys_sents_path`` when it is given; otherwise the
    system output is read from standard input. ``test_set`` is not used here.
    """
    if sys_sents_path is None:
        # No file given: take the system output from stdin.
        with click.get_text_stream("stdin", encoding="utf-8") as stdin_stream:
            return stdin_stream.read().splitlines()
    return read_lines(sys_sents_path)

def get_orig_and_refs_sents(test_set, orig_sents_path=None, refs_sents_paths=None):
    """Return ``(orig_sents, refs_sents)`` for a test set.

    For ``test_set == "custom"`` both paths are required and the sentences are
    read from disk; ``refs_sents_paths`` may be a list of paths or a single
    comma-separated string. For any other test set the sentences come from
    ``get_orig_sents`` / ``get_refs_sents``.

    Raises:
        AssertionError: if a custom path is missing, or if a reference list
            does not have as many lines as the original sentences.
    """
    if test_set != "custom":
        orig_sents = get_orig_sents(test_set)
        refs_sents = get_refs_sents(test_set)
    else:
        assert orig_sents_path is not None
        assert refs_sents_paths is not None
        if type(refs_sents_paths) is str:
            # A plain string holds the reference paths separated by commas.
            refs_sents_paths = refs_sents_paths.split(",")
        orig_sents = read_lines(orig_sents_path)
        refs_sents = [read_lines(path) for path in refs_sents_paths]

    # Final check: every reference list must line up with the originals.
    n_orig = len(orig_sents)
    line_counts_match = all(len(ref_sents) == n_orig for ref_sents in refs_sents)
    assert line_counts_match, f'Not same number of lines for test_set={test_set}, orig_sents_path={orig_sents_path}, refs_sents_paths={refs_sents_paths}'  # noqa: E501
    return orig_sents, refs_sents

def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer="13a",
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.

    Args:
        test_set: name of the test set; passed to get_sys_sents and
            get_orig_and_refs_sents together with the path arguments.
        sys_sents_path, orig_sents_path, refs_sents_paths: input locations,
            forwarded to those two loaders.
        tokenizer, lowercase: forwarded to the metric functions that accept them.
        metrics: names of the metrics to compute; every name must appear in the
            VALID_METRICS list defined inside this function.
        analysis: when True, adds a "word_level_analysis" entry produced by
            easse's WordOperationAnnotator.
        quality_estimation: when True, adds a "quality_estimation" entry
            produced by sentence_quality_estimation.

    Returns:
        dict keyed by result name. Depending on `metrics` it may contain:
        "bleu" (from sentence_bleu_list), "sent_bleu" (from
        corpus_averaged_sentence_bleu), "sari" (from sentence_sari),
        "sari_legacy" (from corpus_sari with legacy=True), "sari_add" /
        "sari_keep" / "sari_del" (from get_corpus_sari_operation_scores),
        "samsa" (from corpus_samsa), "fkgl" (from sentence_fkgl), "f1_token"
        (from sentence_f1_token), "bertscore_precision" / "bertscore_recall" /
        "bertscore_f1" (from sentence_bertscore) and "cos_sim" (from
        sentence_cos_similarity).

    Raises:
        AssertionError: if a requested metric is not in VALID_METRICS.
    """
    # Local list: inside this function it shadows the VALID_METRICS name
    # imported from easse.utils.constants.
    VALID_METRICS = [
    'bleu',
    'sari',
    'samsa',
    'fkgl',
    'sent_bleu',
    'f1_token',
    'sari_legacy',
    'sari_by_operation',
    'bertscore',
    'cos_sim'
]
    for metric in metrics:
        assert metric in VALID_METRICS, f'"{metric}" is not a valid metric. Choose among: {VALID_METRICS}'
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path, refs_sents_paths)

    # compute each metric
    metrics_scores = {}
    # "bleu" uses the notebook's sentence_bleu_list (one score per sentence).
    if "bleu" in metrics:
        metrics_scores["bleu"] = sentence_bleu_list(
            sys_sents,
            refs_sents,
            # force=True,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "sent_bleu" in metrics:
        metrics_scores["sent_bleu"] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase
        )

    if "sari" in metrics:
        metrics_scores["sari"] = sentence_sari(   # Edited
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    # The legacy and by-operation SARI variants still call the easse corpus functions.
    if "sari_legacy" in metrics:
        metrics_scores["sari_legacy"] = corpus_sari(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            legacy=True,
        )

    if "sari_by_operation" in metrics:
        (
            metrics_scores["sari_add"],
            metrics_scores["sari_keep"],
            metrics_scores["sari_del"],
        ) = get_corpus_sari_operation_scores(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "samsa" in metrics:
        # Imported only when the metric is requested.
        from easse.samsa import corpus_samsa

        metrics_scores["samsa"] = corpus_samsa(
            orig_sents,
            sys_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            verbose=True,
        )

    if "fkgl" in metrics:
        metrics_scores["fkgl"] = sentence_fkgl(sys_sents, tokenizer=tokenizer)

    if "f1_token" in metrics:
        metrics_scores["f1_token"] = sentence_f1_token(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if "bertscore" in metrics:
        # from easse.bertscore import corpus_bertscore  # Inline import to use EASSE without installing all dependencies

        # One request yields three result keys: precision, recall and F1.
        (
            metrics_scores["bertscore_precision"],
            metrics_scores["bertscore_recall"],
            metrics_scores["bertscore_f1"],
        ) = sentence_bertscore(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    
    if "cos_sim" in metrics:
        metrics_scores["cos_sim"] = sentence_cos_similarity(sys_sents, refs_sents, lowercase=lowercase)

    if analysis:
        from easse.annotation.word_level import (
            WordOperationAnnotator,
        )  # Inline import to use EASSE without installing all dependencies

        word_operation_annotator = WordOperationAnnotator(tokenizer=tokenizer, lowercase=lowercase, verbose=True)
        metrics_scores["word_level_analysis"] = word_operation_annotator.analyse_operations(
            orig_sents, sys_sents, refs_sents, as_str=True
        )

    if quality_estimation:
        metrics_scores["quality_estimation"] = sentence_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase
        )

    return metrics_scores


  defaults = yaml.load(f)


In [None]:
import os
# Every file in the raw-data folder whose name starts with
# "classic_lit_simplified" is scored as a system output. The human reference
# file used below carries the same prefix, so it is scored as well when it is
# listed in this folder.
data = os.listdir("/content/drive/MyDrive/apln552-project/data/raw_data")
print("Files to process:")
data_files = []
for data_file in data:
    if not data_file.startswith("classic_lit_simplified"):
        continue
    print(data_file)
    data_files.append(data_file)

# One accumulator per metric; each receives one entry (the per-sentence
# results) for every evaluated file, in the order of data_files.
scores_bleu, scores_sari, scores_fkgl = [], [], []
scores_bertscore_p, scores_bertscore_r, scores_bertscore_f = [], [], []
scores_f1_token, scores_cos_sim = [], []
scores_comp_ratio, scores_sent_split, scores_lev_sim, scores_exact = [], [], [], []
scores_adds, scores_dels, scores_lex_comp = [], [], []

# (accumulator, key in the result dict) for the top-level metrics ...
metric_targets = (
    (scores_bleu, 'bleu'),
    (scores_sari, 'sari'),
    (scores_fkgl, 'fkgl'),
    (scores_bertscore_p, 'bertscore_precision'),
    (scores_bertscore_r, 'bertscore_recall'),
    (scores_bertscore_f, 'bertscore_f1'),
    (scores_f1_token, 'f1_token'),
    (scores_cos_sim, 'cos_sim'),
)
# ... and for the features nested under 'quality_estimation'.
quality_targets = (
    (scores_comp_ratio, 'Compression ratio'),
    (scores_sent_split, 'Sentence splits'),
    (scores_lev_sim, 'Levenshtein similarity'),
    (scores_exact, 'Exact copies'),
    (scores_adds, 'Additions proportion'),
    (scores_dels, 'Deletions proportion'),
    (scores_lex_comp, 'Lexical complexity score'),
)

print("\nBeginning system evaluation:")
for data_file in data_files:
    print("Evaluating ", data_file)
    # tokenizer, lowercase and analysis are left at the function defaults.
    results = evaluate_system_output(
        "custom",
        sys_sents_path="/content/drive/MyDrive/apln552-project/data/raw_data/" + data_file,
        orig_sents_path="/content/drive/MyDrive/apln552-project/data/raw_data/classic_lit_complex.txt",
        refs_sents_paths="/content/drive/MyDrive/apln552-project/data/raw_data/classic_lit_simplified_human.txt",
        metrics=['bleu', 'fkgl', 'sari', 'bertscore', 'f1_token', 'cos_sim'],
        quality_estimation=True,
    )
    for bucket, key in metric_targets:
        bucket.append(results[key])
    quality = results['quality_estimation']
    for bucket, key in quality_targets:
        bucket.append(quality[key])

# Gather the accumulators under the short names used for the output files.
all_scores = {
    'bleu': scores_bleu,
    'sari': scores_sari,
    'fkgl': scores_fkgl,
    'bertscore_p': scores_bertscore_p,
    'bertscore_r': scores_bertscore_r,
    'bertscore_f': scores_bertscore_f,
    'f1_token': scores_f1_token,
    'cos_sim': scores_cos_sim,
    'comp_ratio': scores_comp_ratio,
    'sent_split': scores_sent_split,
    'lev_sim': scores_lev_sim,
    'exact': scores_exact,
    'adds': scores_adds,
    'dels': scores_dels,
    'lex_comp': scores_lex_comp,
}

In [27]:
import numpy as np
import pandas as pd

# Build one DataFrame per metric: the per-file score lists are transposed so
# that every evaluated file becomes a column.
all_df = {}
for metric, per_file_scores in all_scores.items():
    array = np.array(per_file_scores).transpose()
    all_df[metric] = pd.DataFrame(array, columns=list(data_files))

# File holding the source sentences that are added to the dataframes below.
filename = "/content/drive/MyDrive/apln552-project/data/raw_data/classic_lit_complex.txt"
def read_lines(filename):
    """Read a UTF-8 text file and return its lines stripped of surrounding whitespace.

    NOTE(review): defining this at notebook level rebinds the global name
    `read_lines` that an earlier cell imports from easse.utils.helpers, so
    functions in this notebook that call `read_lines` use this version once
    this cell has run.
    """
    with open(filename, encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]
sents = read_lines(filename)

for metric in all_df:
    all_df[metric].insert(0, "sentence", sents)

# Write to CSV
%cd /content/drive/MyDrive/apln552-project/data/eval_results/auto_eval

for metric in all_df:
    all_df[metric].to_csv("sent_level_"+metric+".csv")

/content/drive/.shortcut-targets-by-id/1fUU8cfGdJzyPvB9Zonuw3IOAJZqBxivM/apln552-project/data/eval_results/auto_eval
