# Part 1: Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os 

# Part 2: Process data

In [None]:
# Load the raw evaluation results. Every record describes one model and
# carries a dictionary with its metric scores.
evaluation_results_path = 'evaluation_results.json'
with open(evaluation_results_path, 'r') as f:
    data = pd.read_json(f)

# Directory the plotting helpers save their figures to.
images_dir = 'images'

# Keep only Model_ID, Extractive_model, Abstractive_model, Ratio_mode and Evaluation_metrics
data = data[['Model_ID', 'Extractive_model', 'Abstractive_model', 'Ratio_mode', 'Evaluation_metrics']]

# Unpack Evaluation_metrics into separate columns, one for each metric, currently they are in a dictionary
data = pd.concat([data.drop(['Evaluation_metrics'], axis=1), data['Evaluation_metrics'].apply(pd.Series)], axis=1)

# Models that are not part of the evaluation. BART_no_extraction_V2 is kept
# on purpose: it is selected as the BART baseline further down.
excluded_model_ids = [
    'BART_no_extraction_V1',
    'Llama3_no_extraction_V1',
    'RoBERTa_Llama3_dependent_V1',
]
# .copy() so the column assignments below operate on an independent frame.
data = data[~data['Model_ID'].isin(excluded_model_ids)].copy()

# Add a column labelling each model as 'Legal LM' or 'General LM', based on
# whether its Model_ID contains 'Legal' or 'Lex'.
data['Legal_model'] = data['Model_ID'].apply(lambda x: 'Legal LM' if 'Legal' in x or 'Lex' in x else 'General LM')

# Context length per model name; used for both the extractive and the
# abstractive model columns.
context_lengths = {
    'RoBERTa': 512,
    'LegalBERT': 512,
    'LexLM': 512,
    'Longformer': 4096,
    'LexLM_Longformer': 4096,
    'BART': 1024,
    'T5': 512,
    'LongT5': 16384,
    'Pegasus': 1024,
    'PegasusX': 16384,
    'Llama3': 8192,
    'Mixtral': 32768,
    'No extractive model': None
}

# A model name missing from the table raises KeyError here.
data['Context_length_extractive'] = data['Extractive_model'].apply(lambda x: context_lengths[x])
data['Context_length_abstractive'] = data['Abstractive_model'].apply(lambda x: context_lengths[x])

# Split data to be used for extractive research (i.e. chapter 4.1 - 4.3).
# NOTE(review): the split is positional - the first 16 remaining rows are
# taken as the extractive experiments; confirm this still matches the row
# order of the JSON file whenever models are added or removed.
extractive_research_data = data.iloc[:16]
bart_baseline = data[data['Model_ID'] == 'BART_no_extraction_V2']

# The remaining rows plus the BART baseline form the abstractive data set.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
abstractive_research_data = pd.concat([data.iloc[16:], bart_baseline]).reset_index(drop=True)

# Part 3: Data analysis of extractive models

In [None]:
#TODO: Create a function that takes in data (Ratio_data, Legal_data etc.) and metric and creates a plot for it.
def create_one_big_plot(data, comparison_type):
    """
    Draw a 2x2 grid of bar charts for the evaluation metrics, save it and show it.

    The top-left panel groups ROUGE-1, ROUGE-2 and ROUGE-L per index entry;
    the other panels show BERTScore, BARTScore and BLANC. The figure is
    written to ``<images_dir>/metrics_by_<comparison_type>.png`` (spaces in
    ``comparison_type`` replaced by underscores); the directory is created
    when it does not exist yet.

    Parameters:
    - data (pandas.DataFrame): One row per bar group. Must contain the columns
      'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore', 'BARTScore' and 'BLANC';
      the index supplies the x tick labels.
    - comparison_type (str): Used in the panel titles and in the file name.

    Returns:
    None
    """
    # Pull every metric up front so a missing column fails before any
    # figure is created.
    metric_names = ('ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore', 'BARTScore', 'BLANC')
    scores = {name: data[name].values for name in metric_names}

    bar_labels = data.index
    if bar_labels.dtype == 'object':
        # capitalize() lower-cases everything after the first letter, so the
        # 'LM' abbreviation has to be restored afterwards.
        bar_labels = [label.capitalize().replace('lm', 'LM') for label in bar_labels]
    x = np.arange(len(bar_labels))  # the label locations
    width = 0.25  # the width of the bars

    fig, axs = plt.subplots(2, 2, figsize=(12, 10))

    # Grouped ROUGE bars: one bar per metric, centred on the tick.
    rouge_ax = axs[0, 0]
    for offset, name in zip((-width, 0, width), ('ROUGE-1', 'ROUGE-2', 'ROUGE-L')):
        rouge_ax.bar(x + offset, scores[name], width, label=name)
    rouge_ax.set_ylabel('Scores')
    rouge_ax.set_title(f'ROUGE scores by {comparison_type}')
    rouge_ax.set_xticks(x)
    rouge_ax.set_xticklabels(bar_labels)
    rouge_ax.legend()
    rouge_ax.set_ylim(0, 1)

    # Single-metric panels: (axes, column, axis label / title prefix, y limits).
    single_metric_panels = (
        (axs[0, 1], 'BERTScore', 'BERTScore', (0, 1)),
        (axs[1, 0], 'BARTScore', 'BARTScore', (-5, 5)),
        (axs[1, 1], 'BLANC', 'BLANC score', (0, 1)),
    )
    for ax, name, label, y_limits in single_metric_panels:
        ax.bar(x, scores[name], width * 2)
        ax.set_ylabel(label)
        ax.set_title(f'{label} by {comparison_type}')
        ax.set_xticks(x)
        ax.set_xticklabels(bar_labels)
        ax.set_ylim(*y_limits)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    comparison_type = comparison_type.replace(' ', '_')
    # savefig does not create missing directories.
    os.makedirs(images_dir, exist_ok=True)
    plt.savefig(os.path.join(images_dir, f'metrics_by_{comparison_type}.png'))
    plt.show()


def create_latex_string(data, caption):
    """
    Create a LaTeX string representation of a pandas DataFrame.

    Args:
        data (pandas.DataFrame): The DataFrame to convert to LaTeX.
        caption (str): The caption to include in the LaTeX output.

    Returns:
        str: The LaTeX string representation of the DataFrame.

    Example:
        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> create_latex_string(df, 'Example DataFrame')
        '\\begin{tabular}{lcccccc}\n\\toprule\n A & B \\\\\n\\midrule\n 1 & 4 \\\\\n 2 & 5 \\\\\n 3 & 6 \\\\\n\\bottomrule\n\\end{tabular}\n\\caption{Example DataFrame}\n\\label{tab: Example DataFrame}'
    """
    column_format = 'l' + 'c' * (data.shape[1] - 1)
    if caption == 'extractive context length':
        column_format = 'c' + 'c' * (data.shape[1] - 1)
    
    latex_string = data.to_latex(index=False, float_format='%.4f', column_format= column_format, position='h')

    # Modify the LaTeX string to place the caption and label after the tabular environment
    latex_string = latex_string.replace('\\end{tabular}', '\\end{tabular}\n\\caption{' + caption + '}\n\\label{tab: ' + caption + '}')

    return latex_string


## 3.1: Ratio types

### 3.1.1: Further process dataset and print latex

In [None]:
# Average every metric over the models that share a ratio mode.
# numeric_only=True restricts the mean to the numeric columns; without it
# pandas 2.x raises a TypeError on the string columns (Model_ID, ...).
ratio_data = extractive_research_data.groupby(['Ratio_mode']).mean(numeric_only=True)
# The context lengths are not metrics. errors='ignore' covers the case where
# one of them was already left out of the mean for not being numeric.
ratio_data = ratio_data.drop(
    columns=['Context_length_extractive', 'Context_length_abstractive'],
    errors='ignore',
)

# Create a copy of ratio_data with the index renamed
ratio_data_copy = ratio_data.rename_axis('Ratio mode').reset_index()
ratio_data_copy['Ratio mode'] = ratio_data_copy['Ratio mode'].str.capitalize()

# Reorder the rows as Fixed, Dependent, Hybrid. Any other ratio mode is left
# out of the table; a mode missing from the data raises KeyError.
order = ['Fixed', 'Dependent', 'Hybrid']
ratio_data_copy = ratio_data_copy.set_index('Ratio mode').loc[order].reset_index()

# Convert the modified DataFrame to LaTeX
caption = 'Average evaluation results for models with the same ratio mode'
print(create_latex_string(ratio_data_copy, caption))

### 3.1.2: Create graphs

In [None]:
# Plot the ROUGE metrics per ratio mode as grouped bars.
rouge_metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']

figure, ax = plt.subplots()
bar_labels = [label.capitalize() for label in ratio_data.index]

x = np.arange(len(bar_labels))  # the label locations
width = 0.25  # the width of the bars

# One bar series per metric, each shifted by one bar width. Labelling the
# series by metric name keeps the legend correct for any number of ratio modes.
for multiplier, metric in enumerate(rouge_metrics):
    ax.bar(x + width * multiplier, ratio_data[metric].values, width, label=metric)

ax.set_ylabel('Scores')
ax.set_title('Scores by ratio mode')
# Ticks sit under the middle bar of each group of three.
ax.set_xticks(x + width, bar_labels)
ax.legend()
ax.set_ylim(0, 1)

plt.show()


In [None]:
# Bar chart of the average BERTScore per ratio mode.
bert_score = ratio_data['BERTScore'].values
bar_labels = [mode.capitalize() for mode in ratio_data.index]
bar_colors = ['r', 'g', 'b']
x = np.arange(len(bar_labels))  # the label locations

figure, ax = plt.subplots()
rects = ax.bar(x, bert_score, label='BERTScore')

ax.set(ylabel='BERTScore', title='BERTScore by ratio mode')
ax.set_xticks(x, bar_labels)
ax.set_ylim(0, 1)

plt.show()


In [None]:
# Draw and save the 2x2 metric overview for the per-ratio-mode averages.
create_one_big_plot(ratio_data, 'ratio mode')

## 3.2: Legal vs Non Legal LMs (extractive)

### 3.2.1: Further process dataset and print latex

In [None]:
# Average every metric separately for the 'Legal LM' and 'General LM' groups.
# numeric_only=True restricts the mean to the numeric columns; without it
# pandas 2.x raises a TypeError on the string columns (Model_ID, ...).
legal_data = extractive_research_data.groupby(['Legal_model']).mean(numeric_only=True)
# The context lengths are not metrics. errors='ignore' covers the case where
# one of them was already left out of the mean for not being numeric.
legal_data = legal_data.drop(
    columns=['Context_length_extractive', 'Context_length_abstractive'],
    errors='ignore',
)

# Create a copy of legal_data with the index renamed
legal_data_copy = legal_data.rename_axis('Model type').reset_index()

# Convert the modified DataFrame to LaTeX
caption = 'Average evaluation results for Legal LMs and General LMs'
print(create_latex_string(legal_data_copy, caption))

### 3.2.2: Create graphs

In [None]:
# Draw and save the 2x2 metric overview for the Legal LM / General LM averages.
create_one_big_plot(legal_data, 'model type')

## 3.3: Long vs regular context length extractive LMs 

### 3.3.1: Further process dataset and print latex

In [None]:
# Average every metric over the models whose extractive model has the same
# context length. Rows without an extractive context length (missing value)
# are left out by groupby. numeric_only=True restricts the mean to the
# numeric columns; without it pandas 2.x raises a TypeError on the string
# columns (Model_ID, ...).
extractive_context_length_data = extractive_research_data.groupby(['Context_length_extractive']).mean(numeric_only=True)
# The abstractive context length is not a metric; errors='ignore' covers the
# case where it was already left out of the mean for not being numeric.
extractive_context_length_data = extractive_context_length_data.drop(
    columns=['Context_length_abstractive'],
    errors='ignore',
)

# Create a copy of extractive_context_length_data with the index renamed
extractive_context_length_data_copy = extractive_context_length_data.rename_axis('Extractive context length').reset_index()

# Convert the modified DataFrame to LaTeX
caption = 'Average evaluation results for models with the same extractive context length'
print(create_latex_string(extractive_context_length_data_copy, caption))

### 3.3.2: Create graphs

In [None]:
# Draw and save the 2x2 metric overview for the per-context-length averages.
create_one_big_plot(extractive_context_length_data, 'extractive context length')

In [None]:
# Inspection aid: list the columns of the processed results frame.
print(data.columns)

## 3.4: Long vs regular context length abstractive LMs 

### 3.4.1: Further process dataset and print latex

### 3.4.2: Create graphs

## Top 5 ranking of extractive models

In [None]:

# Map every metric to the Model_IDs of its five highest-scoring extractive models.
# Columns 4 to 9 are taken to be the metric columns.
metric_columns = extractive_research_data.columns[4:10]
top_5_models = {
    metric: extractive_research_data.nlargest(5, metric)['Model_ID'].values
    for metric in metric_columns
}

# Report the ranking per metric
for metric, models in top_5_models.items():
    print(f'Top 5 models for {metric}: {models}')

In [None]:
# Build the per-model results table for the extractive experiments:
# sorted by extractive model, reduced to the presentation columns and
# with human-readable headers.
extractive_research_data_copy = (
    extractive_research_data
    .sort_values(by='Extractive_model')
    .drop(columns=['Context_length_extractive', 'Context_length_abstractive', 'Model_ID', 'Abstractive_model', 'Legal_model'])
    [['Extractive_model', 'Ratio_mode', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore', 'BARTScore', 'BLANC']]
    .rename(columns={'Extractive_model': 'Extractive model', 'Ratio_mode': 'Ratio type'})
)

# Display names for the ratio types as they should appear in the table.
ratio_type_names = {
    'No ratio': 'No extraction',
    'dependent': 'Dependent ',
    'fixed': 'Fixed ',
    'hybrid': 'Hybrid ',
}
extractive_research_data_copy['Ratio type'] = extractive_research_data_copy['Ratio type'].replace(ratio_type_names)

caption = "Evaluation results of all extractive models with all ratio types, fine-tuned on BART. When no extractive model is used, this is showcased with ’-’."
print(create_latex_string(extractive_research_data_copy, caption))

### Appending RoBERTa_BART_dependent_V1 to abstractive dataset as it's the best performing

In [None]:
# Add the RoBERTa_BART_dependent_V1 row to the abstractive data set.
best_performing_model = extractive_research_data[extractive_research_data['Model_ID'] == 'RoBERTa_BART_dependent_V1']
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# As before, the added row keeps its original index label.
abstractive_research_data = pd.concat([abstractive_research_data, best_performing_model])

# Part 4: Data analysis of abstractive models


In [None]:
# Map every metric to the Model_IDs of its five highest-scoring abstractive models.
# Columns 4 to 9 are taken to be the metric columns.
#TODO: Remove
metric_columns = abstractive_research_data.columns[4:10]
top_5_models = {
    metric: abstractive_research_data.nlargest(5, metric)['Model_ID'].values
    for metric in metric_columns
}

# Report the ranking per metric
for metric, models in top_5_models.items():
    print(f'Top 5 models for {metric}: {models}')

## 4.1: Effect of extractive step

In [None]:
# Inspection aid: show the rows used for the abstractive analysis.
print(abstractive_research_data)

In [None]:
# Build the per-model results table for the abstractive experiments:
# sorted by abstractive model, reduced to the presentation columns and
# with human-readable headers.
abstractive_research_data_copy = (
    abstractive_research_data
    .sort_values(by='Abstractive_model')
    .drop(columns=['Context_length_extractive', 'Model_ID', 'Extractive_model', 'Legal_model'])
    [['Abstractive_model', 'Ratio_mode', 'Context_length_abstractive', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore', 'BARTScore', 'BLANC']]
    .rename(columns={'Abstractive_model': 'Abstractive model', 'Ratio_mode': 'Ratio type', 'Context_length_abstractive': 'Context length'})
)

# Display names for the ratio types as they should appear in the table.
ratio_type_names = {'No ratio': 'No extraction', 'dependent': 'Dependent '}
abstractive_research_data_copy['Ratio type'] = abstractive_research_data_copy['Ratio type'].replace(ratio_type_names)

caption = 'Evaluation results of all abstractive models with and without an extractive step'
print(create_latex_string(abstractive_research_data_copy, caption))

## 4.2: Repetition analysis

In [49]:
import re
import os

# Function to detect heavy repetition in a summary and return repeated sentences
def detect_heavy_repetition(text, threshold=5):
    """
    Find sentences that are repeated heavily within one summary.

    The text is split on '.', every fragment is stripped of surrounding
    whitespace and identical fragments are counted.

    Args:
        text (str): The summary to inspect.
        threshold (int): Minimum number of occurrences for a fragment to
            count as heavily repeated.

    Returns:
        list[str] | None: The fragments of at least 10 characters that occur
        at least ``threshold`` times, in order of first appearance, or None
        when there are none.
    """
    occurrences = {}
    for fragment in text.split('.'):
        cleaned = fragment.strip()
        occurrences[cleaned] = occurrences.get(cleaned, 0) + 1

    heavy = [
        cleaned
        for cleaned, times_seen in occurrences.items()
        if not (times_seen < threshold or len(cleaned) < 10)
    ]
    return heavy or None

# Prediction files per abstractive model; "Baseline" holds the reference summaries.
model_files = {
    "BART": [
        "RoBERTa_BART_dependent_V1_predictions.txt",
        "BART_no_extraction_V2_predictions.txt"
    ],
    "T5": [
        "T5_no_extraction_V1_predictions.txt",
        "RoBERTa_T5_dependent_V1_predictions.txt"
    ],
    "LongT5": [
        "LongT5_no_extraction_V1_predictions.txt",
        "RoBERTa_LongT5_dependent_V1_predictions.txt"
    ],
    "Pegasus": [
        "Pegasus_no_extraction_V1_predictions.txt",
        "RoBERTa_Pegasus_dependent_V1_predictions.txt"
    ],
    "PegasusX": [
        "PegasusX_no_extraction_V1_predictions.txt",
        "RoBeRTa_PegasusX_dependent_V1_predictions.txt"
    ],
    "Llama3": [
        "Llama3_no_extraction_V2_predictions.txt",
        "RoBERTa_Llama3_dependent_V2_predictions.txt"
    ],
    "Baseline": [
        "actual_summaries.txt",
    ]
}

# Heavy-repetition count per file (in file order) and the mean per model.
model_variant_counts = {model: [] for model in model_files}
model_averages = {}

# Marker that precedes every summary inside a predictions file.
summary_marker = re.compile(r'Summary \d+:')

# The inputs are read as UTF-8, so the report is written as UTF-8 as well;
# the platform default encoding may not be able to encode the quoted sentences.
with open("repetition_analysis_results.txt", 'w', encoding='utf-8') as output_file:
    for model, files in model_files.items():
        model_total_heavy_repetition = 0

        for file_name in files:
            file_path = os.path.join('text_outputs', file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # Split content into individual summaries
            summaries = summary_marker.split(content)[1:]  # Skip the header part

            # Analyse every summary once and remember the flagged ones.
            # NOTE(review): idx counts from 0 within the file; confirm whether
            # it should match the number in the "Summary N:" marker.
            flagged = []
            for idx, summary in enumerate(summaries):
                repeated_sentences = detect_heavy_repetition(summary)
                if repeated_sentences:
                    flagged.append((idx, repeated_sentences))

            heavy_repetition_count = len(flagged)
            model_variant_counts[model].append(heavy_repetition_count)
            model_total_heavy_repetition += heavy_repetition_count

            output_file.write(f"Model: {model} Variant: {file_name}\n")
            output_file.write(f"  Total heavy repetition cases: {heavy_repetition_count}\n")
            for idx, repeated_sentences in flagged:
                output_file.write(f"    Summary {idx}: {repeated_sentences}\n")

        # Mean number of flagged summaries per file of this model.
        model_averages[model] = model_total_heavy_repetition / len(files)

    # Print results
    for model, counts in model_variant_counts.items():
        print(f"Model: {model}")
        output_file.write(f"Model: {model}\n")
        for i, count in enumerate(counts):
            print(f"  Variant {i + 1}: {count} heavy repetition cases")
            output_file.write(f"  Variant {i + 1}: {count} heavy repetition cases\n")
        print(f"  Average: {model_averages[model]}")
        output_file.write(f"  Average: {model_averages[model]}\n")

print("Analysis results saved to repetition_analysis_results.txt")

Model: BART
  Variant 1: 0 heavy repetition cases
  Variant 2: 0 heavy repetition cases
  Average: 0.0
Model: T5
  Variant 1: 73 heavy repetition cases
  Variant 2: 108 heavy repetition cases
  Average: 90.5
Model: LongT5
  Variant 1: 163 heavy repetition cases
  Variant 2: 160 heavy repetition cases
  Average: 161.5
Model: Pegasus
  Variant 1: 143 heavy repetition cases
  Variant 2: 150 heavy repetition cases
  Average: 146.5
Model: PegasusX
  Variant 1: 103 heavy repetition cases
  Variant 2: 114 heavy repetition cases
  Average: 108.5
Model: Llama3
  Variant 1: 29 heavy repetition cases
  Variant 2: 24 heavy repetition cases
  Average: 26.5
Model: Baseline
  Variant 1: 4 heavy repetition cases
  Average: 4.0
Analysis results saved to repetition_analysis_results.txt


## 4.3: SOTA models

In [34]:
import re
import os

# Function to detect heavy repetition in a summary and return repeated sentences
def detect_heavy_repetition(text, threshold=9, min_length=1):
    """
    Find sentences that are repeated heavily within one summary.

    The text is split on '.', every fragment is stripped of surrounding
    whitespace and identical fragments are counted. Fragments shorter than
    ``min_length`` are ignored; with the default of 1 this only drops the
    empty fragments produced by consecutive or trailing periods, which are
    not sentences and would otherwise be reported as the repeated
    sentence ''.

    Args:
        text (str): The summary to inspect.
        threshold (int): Minimum number of occurrences for a fragment to
            count as heavily repeated.
        min_length (int): Minimum number of characters a fragment needs to
            be considered at all.

    Returns:
        list[str] | None: The fragments that occur at least ``threshold``
        times, in order of first appearance, or None when there are none.
    """
    sentence_counts = {}
    for sentence in text.split('.'):
        sentence = sentence.strip()
        if len(sentence) < min_length:
            continue
        sentence_counts[sentence] = sentence_counts.get(sentence, 0) + 1

    repeated_sentences = [sentence for sentence, count in sentence_counts.items() if count >= threshold]

    return repeated_sentences or None

# Prediction files per abstractive model; "Baseline" holds the reference summaries.
model_files = {
    "BART": [
        "RoBERTa_BART_dependent_V1_predictions.txt",
        "BART_no_extraction_V2_predictions.txt"
    ],
    "T5": [
        "T5_no_extraction_V1_predictions.txt",
        "RoBERTa_T5_dependent_V1_predictions.txt"
    ],
    "LongT5": [
        "LongT5_no_extraction_V1_predictions.txt",
        "RoBERTa_LongT5_dependent_V1_predictions.txt"
    ],
    "Pegasus": [
        "Pegasus_no_extraction_V1_predictions.txt",
        "RoBERTa_Pegasus_dependent_V1_predictions.txt"
    ],
    "PegasusX": [
        "PegasusX_no_extraction_V1_predictions.txt",
        "RoBeRTa_PegasusX_dependent_V1_predictions.txt"
    ],
    "Llama3": [
        "Llama3_no_extraction_V2_predictions.txt",
        "RoBERTa_Llama3_dependent_V2_predictions.txt"
    ],
    "Baseline": [
        "actual_summaries.txt",
    ]
}

# Heavy-repetition count per file (in file order) and the mean per model.
model_variant_counts = {model: [] for model in model_files}
model_averages = {}

# Marker that precedes every summary inside a predictions file.
summary_marker = re.compile(r'Summary \d+:')

# The inputs are read as UTF-8, so the report is written as UTF-8 as well;
# the platform default encoding may not be able to encode the quoted sentences.
with open("repetition_analysis_results.txt", 'w', encoding='utf-8') as output_file:
    for model, files in model_files.items():
        model_total_heavy_repetition = 0

        for file_name in files:
            file_path = os.path.join('text_outputs', file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # Split content into individual summaries
            summaries = summary_marker.split(content)[1:]  # Skip the header part

            # Analyse every summary once and remember the flagged ones.
            # NOTE(review): idx counts from 0 within the file; confirm whether
            # it should match the number in the "Summary N:" marker.
            flagged = []
            for idx, summary in enumerate(summaries):
                repeated_sentences = detect_heavy_repetition(summary)
                if repeated_sentences:
                    flagged.append((idx, repeated_sentences))

            heavy_repetition_count = len(flagged)
            model_variant_counts[model].append(heavy_repetition_count)
            model_total_heavy_repetition += heavy_repetition_count

            # Every report line goes to the results file and to stdout.
            output_file.write(f"Model: {model} Variant: {file_name}\n")
            print(f"Model: {model} Variant: {file_name}\n")

            output_file.write(f"  Total heavy repetition cases: {heavy_repetition_count}\n")
            print(f"  Total heavy repetition cases: {heavy_repetition_count}\n")

            for idx, repeated_sentences in flagged:
                output_file.write(f"    Summary {idx}: {repeated_sentences}\n")
                print(f"    Summary {idx}: {repeated_sentences}\n")

        # Mean number of flagged summaries per file of this model.
        model_averages[model] = model_total_heavy_repetition / len(files)

    # Print results
    for model, counts in model_variant_counts.items():
        print(f"Model: {model}")
        output_file.write(f"Model: {model}\n")
        for i, count in enumerate(counts):
            print(f"  Variant {i + 1}: {count} heavy repetition cases")
            output_file.write(f"  Variant {i + 1}: {count} heavy repetition cases\n")
        print(f"  Average: {model_averages[model]}")
        output_file.write(f"  Average: {model_averages[model]}\n")

print("Analysis results saved to repetition_analysis_results.txt")

Model: BART Variant: RoBERTa_BART_dependent_V1_predictions.txt

  Total heavy repetition cases: 0

Model: BART Variant: BART_no_extraction_V2_predictions.txt

  Total heavy repetition cases: 0

Model: T5 Variant: T5_no_extraction_V1_predictions.txt

  Total heavy repetition cases: 52

    Summary 1: ['It also introduces a new system for monitoring the compliance of products']

    Summary 3: ['The']

    Summary 4: ['The instrument is a multiannual instrument designed to support projects in the transport, energy and digital sectors']

    Summary 8: ['The programme is funded by the European Commission']

    Summary 15: ['The regulation sets out a number of requirements for the use of the border and visa information systems']

    Summary 21: ['The regulation applies to all EU citizens, regardless of their nationality or residence']

    Summary 23: ['The regulation does not apply to imports from non-EU countries']

    Summary 25: ['The European Semester is the framework for economic 