# Load GitHub repo


In [None]:
# Destination folder for the COMET_Data files. Place the COMET_Data contents at
# this exact path and keep the folder names unchanged.
dest_folder = "/SICK_Summarization/src/data/COMET_Data"

'/content/SICK_Summarization/src/data/COMET_Data'

## Models and functions

In [None]:
!pip install transformers



In [None]:
from models.emotion_bert import EmotionBERT
from models.topic_model import TopicModel

# Emotion model whose predict() is called by emotion_injector.
# NOTE(review): path_save is left empty -- presumably nothing is saved from this
# notebook; confirm against EmotionBERT.
MODEL_EMOTION_EXTRACTOR = EmotionBERT(
    # Set path_load to the location of the DistilBERT weights.
    path_load="/content/drive/MyDrive/NLP_Project/distil-bert-out",
    path_save="",
)

# Topic model whose predict() is called by topic_injector.
# NOTE(review): presumably topics scoring below confidence_threshold are
# dropped -- confirm against TopicModel.
MODEL_TOPIC_EXTRACTOR = TopicModel(
    path_label_json="final_topic_labels.json",
    confidence_threshold=0.18,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()


In [None]:
def emotion_injector(sentence: str) -> str:
    """Append a line naming the emotions predicted for a SAMSum sentence.

    The sentence is passed to ``MODEL_EMOTION_EXTRACTOR.predict``. If nothing
    is predicted the sentence is returned unchanged. Otherwise a new line
    ``"<first token> expressed emotions of <e1>, <e2>."`` is appended, where
    the first space-separated token of the sentence is used as the person.
    """
    predicted = MODEL_EMOTION_EXTRACTOR.predict(sentence)
    if not predicted:
        return sentence

    # The first space-delimited token is taken as the speaker of the sentence.
    speaker = sentence.split(" ")[0]
    listed = ", ".join(predicted)
    return sentence + "\n" + f"{speaker} expressed emotions of " + listed + "."


def topic_injector(dialogue, sentence: str) -> str:
    """Append a line listing the topics predicted for a dialogue.

    ``dialogue`` is handed as-is to ``MODEL_TOPIC_EXTRACTOR.predict``. If no
    topics come back, ``sentence`` is returned unchanged. Otherwise a new line
    ``"Some topics related to this dialogue are <t1>, <t2>."`` is appended to
    ``sentence`` and the result is returned.
    """
    predicted = MODEL_TOPIC_EXTRACTOR.predict(dialogue)
    if not predicted:
        return sentence

    listed = ", ".join(predicted)
    return (
        sentence + "\n" + "Some topics related to this dialogue are " + listed + "."
    )

In [None]:
from tqdm import tqdm
import json


def create_new_samsum_json(
    split: str,
    is_emotion_injection: bool = False,
    is_topic_injection: bool = False,
    data_dir: str = "/content/drive/MyDrive/NLP_Project/COMET_Data/paracomet/dialogue/samsum",
):
    """Inject emotion and/or topic information into a collated SAMSum JSON split.

    Reads ``dialog_{split}_split5_collated.json`` from ``data_dir``, rewrites
    the ``"sentence"`` fields in memory and dumps the result to
    ``{injection_type}dialog_{split}_split5_collated.json`` in the current
    working directory, where ``injection_type`` is ``"emotion_"``,
    ``"topic_"``, ``"emotion_topic_"`` or empty depending on the flags.

    Args:
        split: Split name used to build the file names (the notebook calls
            this with ``"train"``, ``"validation"`` and ``"test"``).
        is_emotion_injection: When true, every ``"sentence"`` is replaced by
            ``emotion_injector(sentence)``.
        is_topic_injection: When true, the last sentence of every dialogue is
            replaced by ``topic_injector(all_sentences, last_sentence)``.
        data_dir: Directory holding the collated input file. The default is
            the Google Drive location used by the notebook.
            MAKE SURE TO PUT THE COMET_DATA IN THIS PATH. PAY ATTENTION TO
            THE PATH NAMES.

    Returns:
        None. The injected JSON is written to disk.
    """
    file_str = f"dialog_{split}_split5_collated.json"
    with open(f"{data_dir}/{file_str}", "r", encoding="utf-8") as file:
        samsum = json.load(file)

    # The context manager closes the progress bar even if an injector raises.
    with tqdm(total=len(samsum)) as pbar:
        for utterances in samsum.values():
            # Collected once per dialogue (not once per sentence) so the topic
            # model receives every sentence of the dialogue.
            dialogue_sentences = []
            last_entry = None
            for entry in utterances.values():
                if "sentence" not in entry:
                    continue
                original_sentence = entry["sentence"]
                # The topic model is fed the sentences as they were before any
                # emotion text was appended.
                dialogue_sentences.append(original_sentence)
                last_entry = entry
                if is_emotion_injection:
                    # Append the emotion information to the sentence.
                    entry["sentence"] = emotion_injector(original_sentence)
            # Dialogues without any sentence are skipped rather than reusing a
            # key left over from a previous dialogue.
            if is_topic_injection and last_entry is not None:
                # Append the topic information to the dialogue's last sentence.
                # NOTE(review): assumes TopicModel.predict accepts a list of
                # sentences -- confirm against its implementation.
                last_entry["sentence"] = topic_injector(
                    dialogue_sentences, last_entry["sentence"]
                )
            pbar.update(1)

    injection_type = ""
    if is_emotion_injection:
        injection_type += "emotion_"
    if is_topic_injection:
        injection_type += "topic_"
    injected_samsum = f"{injection_type}dialog_{split}_split5_collated.json"
    with open(injected_samsum, "w", encoding="utf-8") as new_file:
        json.dump(samsum, new_file, indent=2)

EMOTIONS


In [None]:
# create_new_samsum_json('validation', is_emotion_injection=True) #DONE

In [None]:
# create_new_samsum_json('test', is_emotion_injection=True) #DONE

In [None]:
# create_new_samsum_json('train', is_emotion_injection=True) #DONE

TOPICS

In [None]:
# create_new_samsum_json('test', is_topic_injection=True) #DONE

In [None]:
# create_new_samsum_json('validation', is_topic_injection=True) #DONE

In [None]:
# create_new_samsum_json('train', is_topic_injection=True) #DONE