In [1]:
import numpy as np
from collections import defaultdict, Counter, OrderedDict
import pickle
import yaml, json
from schema import Schema, And, Use, Any, Optional, SchemaError

from tqdm.notebook import tqdm

In [7]:
file_path = 'emoji2LLMstr.pkl'

# Deserialize the pickled mapping into `emoji2desc` (keys are looked up by
# emoji further down; values are presumably LLM-written text, per the file
# name -- confirm against whatever produced this pickle).
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# point this at a file you created or otherwise trust.
with open(file_path, mode="rb") as pickled_input:
    emoji2desc = pickle.load(pickled_input)

In [9]:
# Spot-check one raw entry. The key is the zombie emoji (U+1F9DF); it had been
# stored here as the mis-decoded byte sequence, which would not match the
# real emoji key in the mapping.
print(emoji2desc['🧟'])

```yaml
Emoji: "🧟"
Description: "This emoji represents a zombie, a fictional undead creature often depicted in popular culture as a reanimated corpse with a hunger for human flesh, typically used in a humorous or scary context."
Semantic_Tags:
  - zombie
  - undead
  - horror
  - fiction
  - scary
  - reanimated
  - creature
```


In [60]:
# Expected shape of one parsed emoji record:
#   Emoji         -> non-empty string
#   Description   -> non-empty string
#   Semantic_Tags -> list whose items are all strings (an empty list passes,
#                    because all() of nothing is True)
schema = Schema({
    'Emoji': And(str, lambda text: len(text) > 0),
    'Description': And(str, lambda text: len(text) > 0),
    'Semantic_Tags': And(
        list,
        lambda items: all(isinstance(item, str) for item in items),
    ),
})


def is_valid_yaml(yaml_str):
    """Report whether ``yaml_str`` satisfies the module-level ``schema``.

    Despite the parameter name, the value is passed to ``schema.validate``
    unchanged, so it should be the already-parsed YAML document (the object
    returned by ``yaml.safe_load``), not raw YAML text.

    Args:
        yaml_str: The parsed object to check.

    Returns:
        bool: True when ``schema.validate`` accepts the object, False when it
        raises ``SchemaError``.
    """
    try:
        schema.validate(yaml_str)
    except SchemaError:
        # Only a validation failure means "invalid". Any other exception
        # (for example a NameError because `schema` was never defined in this
        # kernel) is a real bug and is allowed to surface instead of being
        # silently reported as an invalid record.
        return False
    return True

def _strip_code_fence(text):
    """Return the payload inside the first Markdown code fence of ``text``.

    The opening fence line (three backticks plus an optional language tag such
    as ``yaml``) and everything from the closing fence onwards are dropped.
    Text without any fence is returned as-is. The result is always stripped of
    surrounding whitespace.

    Unlike slicing by fixed prefix/suffix lengths, this does not eat payload
    characters when the value has extra trailing newlines, no language tag,
    or some text before the fence.
    """
    fence = '```'
    opening = text.find(fence)
    if opening == -1:
        # No fence at all: the whole string is the payload.
        return text.strip()

    # Skip the remainder of the opening fence line (the language tag).
    line_end = text.find('\n', opening)
    if line_end == -1:
        return ''

    closing = text.find(fence, line_end + 1)
    payload = text[line_end + 1:] if closing == -1 else text[line_end + 1:closing]
    return payload.strip()


# emoji -> object produced by yaml.safe_load for that emoji's description.
emoji2dict = {}
# emoji -> repr of the exception for every entry that could not be parsed,
# kept so failures can be inspected after the loop instead of being lost.
parse_errors = {}

for emoji, raw_desc in tqdm(emoji2desc.items()):
    try:
        emoji2dict[emoji] = yaml.safe_load(_strip_code_fence(raw_desc))
    except Exception as exc:
        # Deliberately broad: this is a best-effort pass over many entries, and
        # a single malformed one must not abort the whole run.
        parse_errors[emoji] = repr(exc)
        print(f"{emoji} did not parse!")

  0%|          | 0/5034 [00:00<?, ?it/s]

In [61]:
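# Optional sanity check (currently disabled): print every parsed entry that
# is_valid_yaml() rejects, together with its emoji key.
# for e, d in emoji2dict.items():
#     if not is_valid_yaml(d):
#         print(f"{e} is not valid!")
#         print(d)
#         print()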

In [62]:

file_path = 'emoji2dict.pkl'

# Serialize `emoji2dict` to disk with pickle (binary mode is required).
with open(file_path, mode='wb') as pickled_output:
    pickle.dump(emoji2dict, pickled_output)
