## Setup

In [1]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under that mount point. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
!pip install emoji



## Pre-processing: converting emoticons and emojis to text

### Retrieve sentences from the JSON files

In [10]:
import json
def get_sentences_from_json(path):
    """Collect every ``"sentence"`` value from a collated JSON file.

    The file must contain a three-level mapping: each top-level value is a
    mapping whose values are themselves mappings. Every innermost mapping that
    has a ``"sentence"`` key contributes that key's value to the result.

    Args:
        path: Path of the JSON file to read.

    Returns:
        list: The ``"sentence"`` values, in the order they occur in the file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    return [
        record["sentence"]
        for group in data.values()
        for record in group.values()
        if "sentence" in record
    ]

In [11]:
# Locations of the three collated splits on the mounted Drive.
json_path_train = "/content/drive/MyDrive/COMET_data/paracomet/dialogue/samsum/dialog_train_split5_collated.json"
json_path_test = "/content/drive/MyDrive/COMET_data/paracomet/dialogue/samsum/dialog_test_split5_collated.json"
json_path_validation = "/content/drive/MyDrive/COMET_data/paracomet/dialogue/samsum/dialog_validation_split5_collated.json"

# Load the sentences of each split, in train / test / validation order.
sentence_train, sentence_test, sentence_validation = map(
    get_sentences_from_json,
    (json_path_train, json_path_test, json_path_validation),
)

### Mappings

In [12]:
# Mapping from ASCII emoticons (keys) to the emoji that replaces each one (values).
# Several keys are substrings of other keys (e.g. ':)' occurs inside '>:)' and
# 'O:)', ':(' inside '>:(', ':|' inside ':|)'), so the order in which a consumer
# applies these replacements affects the result.
emoji_mapping = {
    ':)': '😀',
    ':-)': '😀',
    ':(': '😞',
    ':-(': '😞',
    ':-/': '🫤',
    '<3': '❤️',
    'XD': '😆',
    'xD': '😆',
    'xd': '😆',
    ':D': '😄',
    ':O': '😲',
    ';)': '😉',
    ':P': '😜',
    '>:(': '😠',
    ':-*': '😘',
    ':*': '😘',
    'B)': '😎',
    '>:D': '😈',
    'O:)': '😇',
    ':|': '😐',
    ':S': '😖',
    ':X': '😶',
    '<(")': '🐦',
    '>:O': '😱',
    '\\o/': '🙌',
    '(^_^)/': '🌟',
    '(o_o)/': '🌜',
    '<>_<>': '🎮',
    '(>_<)': '😣',
    '(^_-)': '😄',
    '(^_^)b': '👍',
    '(~_^)': '😂',
    ':-D': '😁',
    ':|)': '😐',
    '>:)': '😏',
}

### Create the new mapped JSON file

In [15]:
import emoji

def extract_emojis(text):
    """Return every emoji found in ``text``, in order of appearance.

    Duplicates are kept: an emoji that occurs twice is listed twice.

    The scan uses ``emoji.emoji_list`` instead of testing one code point at a
    time. An emoji made of several code points (for example U+2764 followed by
    the variation selector U+FE0F, a flag, or a ZWJ sequence) is therefore
    returned as a single string rather than being split into fragments, which
    would leave stray code points behind when the emoji is later replaced.

    Args:
        text: The string to scan.

    Returns:
        list[str]: The emoji strings found; empty if there are none.
    """
    # NOTE(review): emoji_list is part of current releases of the `emoji`
    # package; confirm the installed version provides it.
    return [match["emoji"] for match in emoji.emoji_list(text)]

def substitute_emojis(text, emoji_mapping):
    """Replace every key of ``emoji_mapping`` found in ``text`` with its value.

    Keys are applied longest first. When one key is a substring of another
    (such as ':)' inside '>:)'), applying the shorter key first would consume
    part of the longer one, so the longer key could never match.

    Args:
        text: The string to transform.
        emoji_mapping: Mapping of substring to replacement string.

    Returns:
        str: ``text`` with all replacements applied.
    """
    # sorted() is stable, so keys of equal length keep their mapping order.
    for key in sorted(emoji_mapping, key=len, reverse=True):
        text = text.replace(key, emoji_mapping[key])
    return text

In [17]:
# Quick check with a one-entry mapping: ':*' should come back as 'NEW_KEY'.
substitute_emojis('Peter said "The we can do whatever you want :*"', {':*':'NEW_KEY'})

'Peter said "The we can do whatever you want NEW_KEY"'

In [52]:
# Rewrite each validation sentence, turning ASCII emoticons into emoji.
mapped_dialogues = [
    substitute_emojis(s, emoji_mapping) for s in sentence_validation
]

In [53]:
# Parsed emoji datasets keyed by file path, so each file is read and indexed
# only once per kernel session instead of once per lookup.
_EMOJI_INDEX_CACHE = {}


def _load_emoji_index(dataset_json_path):
    """Return an ``{emoji: entry}`` lookup for the dataset file, loading it at most once."""
    index = _EMOJI_INDEX_CACHE.get(dataset_json_path)
    if index is None:
        # The dataset contains emoji characters, so decode explicitly as UTF-8.
        with open(dataset_json_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        index = {}
        for emoji_data in data.get("emojis", []):
            # setdefault keeps the first entry for an emoji, which is the
            # entry a top-to-bottom scan of the list would return.
            index.setdefault(emoji_data.get("emoji", ""), emoji_data)
        _EMOJI_INDEX_CACHE[dataset_json_path] = index
    return index


def get_aliases_and_tags(emoji, dataset_json_path="/content/drive/MyDrive/datasets/emojis.json"):
    """Describe an emoji using the aliases and tags stored in a JSON dataset.

    The dataset must be a JSON object with an ``"emojis"`` list; each entry may
    carry ``"emoji"``, ``"aliases"`` and ``"tags"`` fields. The first entry
    whose ``"emoji"`` equals the argument is used.

    The parsed file is cached per path, so edits made to the file after the
    first lookup are not picked up until the kernel is restarted.

    Args:
        emoji: The emoji string to look up.
        dataset_json_path: Path of the dataset file. Defaults to the copy on
            the mounted Drive.

    Returns:
        str: Aliases followed by tags, comma-separated, with underscores
        replaced by spaces. Empty string if the emoji is not in the dataset or
        has neither aliases nor tags.
    """
    emoji_data = _load_emoji_index(dataset_json_path).get(emoji)
    if emoji_data is None:
        return ""

    aliases = ', '.join(emoji_data.get("aliases", [])).replace('_', ' ')
    tags = ', '.join(emoji_data.get("tags", [])).replace('_', ' ')

    # Join only the non-empty parts so no dangling separator is produced.
    return ', '.join(part for part in (aliases, tags) if part)

In [54]:
# Spot-check the dataset lookup for a single emoji.
get_aliases_and_tags("😙")

'kissing smiling eyes'

In [55]:
def substitute_emojis_with_aliases(dialogues):
    r"""Replace each emoji in every sentence with its textual description.

    Every occurrence of an emoji becomes ``<E>`` + the text returned by
    ``get_aliases_and_tags`` for that emoji + ``<\E>``.

    Args:
        dialogues: Iterable of sentence strings.

    Returns:
        list[str]: One rewritten sentence per input sentence, in input order.
    """
    modified_dialogues = []

    for sentence in dialogues:
        modified_sentence = sentence
        # dict.fromkeys drops repeated emoji while keeping first-appearance
        # order. One str.replace call already rewrites every occurrence, so
        # each distinct emoji needs handling (and one dataset lookup) only once.
        for emoji_char in dict.fromkeys(extract_emojis(sentence)):
            # "\\E" spells the same backslash + 'E' text without relying on an
            # invalid escape sequence, which Python warns about.
            replacement = "<E>" + get_aliases_and_tags(emoji_char) + "<\\E>"
            modified_sentence = modified_sentence.replace(emoji_char, replacement)
        modified_dialogues.append(modified_sentence)

    return modified_dialogues

In [56]:
# Turn the emoji in the emoticon-mapped sentences into <E>...<\E> text spans.
preprocessed_sentences = substitute_emojis_with_aliases(mapped_dialogues)

In [41]:
def remap_sentences_emoji_codified(sentence, split,
                                   data_dir="/content/drive/MyDrive/COMET_data/paracomet/dialogue/samsum",
                                   output_dir=""):
    """Write a copy of a collated split whose ``"sentence"`` values are replaced.

    Reads ``dialog_{split}_split5_collated.json`` from ``data_dir``, replaces
    each ``"sentence"`` value (in file order) with the next item of
    ``sentence``, and writes the result to
    ``preprocessed_dialog_{split}_split5_collated.json`` in ``output_dir``.
    All other fields are copied unchanged.

    Args:
        sentence: Sequence of replacement sentences, one per ``"sentence"``
            field in the source file, in file order.
        split: Split name used to build both file names.
        data_dir: Directory holding the source file. Defaults to the dataset
            folder on the mounted Drive.
        output_dir: Directory for the output file. The default (empty string)
            writes into the current working directory.

    Raises:
        IndexError: If ``sentence`` has fewer items than the file has
            ``"sentence"`` fields.
        ValueError: If ``sentence`` has more items than the file has
            ``"sentence"`` fields; silently dropping the surplus would hide a
            misalignment between sentences and records.
    """
    import os  # local import so this cell does not depend on another cell's imports

    source_path = os.path.join(data_dir, f'dialog_{split}_split5_collated.json')
    with open(source_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Check alignment up front so nothing is written when the counts disagree.
    expected = sum(
        1
        for value in data.values()
        for v in value.values()
        if "sentence" in v
    )
    if len(sentence) < expected:
        raise IndexError(
            f"{source_path} has {expected} sentence fields but only "
            f"{len(sentence)} replacement sentences were given"
        )
    if len(sentence) > expected:
        raise ValueError(
            f"{source_path} has {expected} sentence fields but "
            f"{len(sentence)} replacement sentences were given"
        )

    replacements = iter(sentence)
    new_data = {}
    for key, value in data.items():
        new_d = {}
        for k, v in value.items():
            # Copying first and then overwriting keeps the original key order.
            new_value = dict(v)
            if "sentence" in new_value:
                new_value["sentence"] = next(replacements)
            new_d[k] = new_value
        new_data[key] = new_d

    new_json_file_path = os.path.join(output_dir, f'preprocessed_dialog_{split}_split5_collated.json')
    with open(new_json_file_path, 'w', encoding='utf-8') as new_file:
        json.dump(new_data, new_file, indent=2)

In [57]:
# preprocessed_sentences was built from the validation sentences, so it must be
# written back into the validation split.
split = "validation"
remap_sentences_emoji_codified(preprocessed_sentences, split)

In [58]:
# Copy the generated file to Drive (shutil.copy leaves the original in place).
import shutil
dest_folder = "/content/drive/MyDrive/"
# Despite its name, source_folder is the path of the JSON file to copy, not a
# directory. Assumes the file was written to /content — TODO confirm that is
# the notebook's working directory.
source_folder = f"/content/preprocessed_dialog_{split}_split5_collated.json"
shutil.copy(source_folder, dest_folder)

'/content/drive/MyDrive/preprocessed_dialog_validation_split5_collated.json'