# Word Definitions (Dataset)
Here, we provide code to collect and process datasets of:
1.  *Word* and corresponding *Synonym*,
2.  *Word* and corresponding *Type*,
3.  *Word* and corresponding *Instance*.

In [None]:
import json
import random
import inflect
import spacy
import numpy as np
from utils import WordsInstances, WordsSynonyms, WordsTypes, is_plural
# imports
from pathlib import Path
# Parent directory
parent_dir = str(Path().resolve().parents[0])
import polars as pl
import re

## 1. Extract data from the `WordsAPI`
You can download `wordsapi_sample.json` from the [WordsAPI Portal](https://www.wordsapi.com/).

In [None]:
nlp = spacy.load("en_core_web_trf")
def check_if_pnoun(term, definition):
    """
    Check if the term is a proper noun.

    The sentence "<term> is <definition>" is run through the spaCy pipeline and
    only the tokens that belong to ``term`` are inspected.

    Args:
        term: The word or phrase being defined.
        definition: The textual definition of ``term``.

    Returns:
        The entity label of the first token of the term tagged ``PROPN``,
        the string "PROPN" when that token carries no entity label,
        or None when no token of the term is tagged ``PROPN``.
    """
    # The sentence starts with the term, so the term's tokens are exactly those
    # whose character offset lies before len(term). (len(term) counts characters,
    # so it must not be used as a token count.)
    term_end = len(term)
    doc = nlp(f"{term} is {definition}")
    for token in doc:
        if token.idx >= term_end:
            break
        if token.pos_ == "PROPN":
            return token.ent_type_ or "PROPN"
    return None

In [None]:
# check
# Manual sanity check of check_if_pnoun on a hand-written example.
# NOTE(review): `doc` is not used by the call below, and the call passes "madam"
# while the parsed sentence is about "Harrison" — confirm which term was meant.
doc = nlp("Harrison is defined as a 9th president of the united states")
check_if_pnoun("madam", "a 9th president of the united states.")

In [None]:
# Relation attributes of a definition entry; an entry is only usable when it
# carries at least one of them.
keywords = ["hasTypes", "typeOf",
                "partOf", "hasParts",
                "instanceOf", "hasInstances",
                "memberOf", "hasMembers",
                "substanceOf", "hasSubstances",
                "inCategory", "hasCategories",
                "regionOf", "inRegion"]


def _has_required_fields(entry):
    """Return True if a single definition entry has every field the pipeline needs."""
    # Malformed entries (e.g. null) are rejected instead of raising AttributeError.
    if not isinstance(entry, dict):
        return False
    return (
        "synonyms" in entry
        and "definition" in entry
        and "partOfSpeech" in entry
        and any(keyword in entry for keyword in keywords)
    )


def check_attributes(word: dict):
    """
    Decide whether a word record is complete enough to be used.

    A record is accepted when it has a "frequency" key and a "definitions" key,
    and every definition entry (either a single dict or each dict of a list)
    contains "synonyms", "definition", "partOfSpeech" and at least one of
    the relation attributes listed in ``keywords``.

    Args:
        word: The record of one word.

    Returns:
        True if the record passes all checks, False otherwise.
    """
    if "definitions" not in word or "frequency" not in word:
        return False
    definitions = word["definitions"]
    if isinstance(definitions, list):
        return all(_has_required_fields(entry) for entry in definitions)
    return _has_required_fields(definitions)


# Initialize the inflect engine
# `p.a(...)` is used by format_definitions to pick the indefinite article.
p = inflect.engine()
# Leading strings after which format_definitions does not prepend an article
# (matched with str.startswith on the lower-cased definition).
exceptions = ("any", "one", "once", 'something', 'someone', 'somebody', 'anything', 'anyone', 'anybody')
def format_definitions(definition, part_of_speech):
    """
    Normalise a raw definition into a short phrase.

    Bracketed remarks are removed, only the text before the first ';' is kept,
    nouns get an indefinite article (unless the text already starts with an
    article or with one of ``exceptions``), verbs get a leading "to ", and a
    trailing period is dropped.

    Args:
        definition: The raw definition text.
        part_of_speech: Part-of-speech label; only "noun" and "verb" trigger
            extra formatting, any other value leaves the phrase as is.

    Returns:
        The cleaned definition. An empty string is returned for a noun whose
        definition is empty after cleaning.
    """
    # Remove text within brackets
    definition = re.sub(r'\s*\(.*?\)\s*', ' ', definition).strip()
    # Use the first part of the definition; strip again so that a period that
    # sits right before the ';' is still recognised as trailing below.
    definition = definition.split(';')[0].strip()

    # An empty definition gets no article: inflect cannot produce one for "".
    if part_of_speech == "noun" and definition:
        lower_definition = definition.lower()
        has_article = lower_definition.startswith(('a ', 'an ', 'the '))
        # NOTE(review): `exceptions` is matched as a plain prefix, so e.g. "one"
        # also matches "onerous" — confirm whether whole-word matching was meant.
        if not has_article and not lower_definition.startswith(exceptions):
            # Use inflect to determine the correct article
            article = p.a(definition).split()[0]
            definition = f"{article} {definition}"
    elif part_of_speech == "verb":
        if not definition.startswith("to "):
            definition = f"to {definition}"

    if definition.endswith('.'):
        definition = definition[:-1]

    return definition.strip()

In [None]:
# Load the raw word records and keep only the words that pass check_attributes.
# The encoding is given explicitly so that the result does not depend on the
# platform's default text encoding.
with open(parent_dir + '/source/wordsapi_sample.json', encoding='utf-8') as f:
    data = json.load(f)
keys = data.keys()
valid_keys = [key for key in keys if check_attributes(data[key])]
print("Number of words:", len(valid_keys))

In [None]:
# Flatten the validated records into one row per (word, definition) pair.
word_list = []
extended_keywords = keywords + ["definition", "synonyms", "partOfSpeech"]
for key in valid_keys:
    word = data[key]

    # "frequency" is either a dict with three measures or a bare number,
    # which is then stored as the zipf value.
    freq = word["frequency"]
    if isinstance(freq, dict):
        zipf = freq["zipf"]
        perMillion = freq["perMillion"]
        diversity = freq["diversity"]
    else:
        zipf = freq
        perMillion = None
        diversity = None

    # Optional word-level attributes; None when absent.
    letters = word.get("letters")
    sounds = word.get("sounds")

    # "definitions" is either a list of entries or a single entry; treat the
    # single entry as a one-element list so both shapes share one code path.
    definitions = word["definitions"]
    if not isinstance(definitions, list):
        definitions = [definitions]

    for w in definitions:
        _word = {k: v for k, v in w.items() if k in extended_keywords}
        _word["word"] = key
        _word["zipf"] = zipf
        _word["perMillion"] = perMillion
        _word["diversity"] = diversity
        _word["letters"] = letters
        _word["sounds"] = sounds
        _word["definition"] = format_definitions(_word["definition"], _word["partOfSpeech"])
        _word["num_definitions"] = len(definitions)
        word_list.append(_word)
    

In [None]:
# Build the definitions table from the flattened rows, label each row with
# check_if_pnoun, and save the result to generators/definitions.json.
df = pl.from_dicts(word_list) 
## Check if PNOUN
# check_if_pnoun is called once per row on the (word, definition) pair.
df = df.with_columns(
    pl.struct(["word", "definition"]).map_elements(lambda x: check_if_pnoun(x["word"], x["definition"]), return_dtype=pl.String).alias("pnoun")
)
# Reorder columns
col_order = ["word", "definition", "partOfSpeech", "pnoun", "synonyms"] + keywords + ["zipf", "perMillion", "diversity", "letters", "sounds", "num_definitions"]
# Keep only the columns that actually exist in the frame.
col_order = [col for col in col_order if col in df.columns]
df = df.select(col_order)
## do some cleaning  (set to null if the PROPN or if adjective or verb)
# Step 1: clear `pnoun` for adjectives and verbs.
# Step 2: clear the generic "PROPN" label, so any remaining non-null `pnoun`
#         is one of the entity labels returned by check_if_pnoun.
df = df.with_columns(pl.when( (pl.col("partOfSpeech") == "adjective") | (pl.col("partOfSpeech") == "verb"))\
                .then(None) \
                .otherwise(pl.col("pnoun")) \
                .alias("pnoun")) \
    .with_columns(pl.when(pl.col("pnoun") == "PROPN") \
                .then(None) \
                .otherwise(pl.col("pnoun")) \
                .alias("pnoun")) 

df.write_json(parent_dir + "/generators/definitions.json")
# Preview expressions; their results are not assigned to anything.
df.sample(20)
df.filter(pl.col("pnoun").is_not_null())

## 2. Generate `true` and `false` statements

In [None]:
p = inflect.engine()

def is_plural(word):
    """
    Check if a word is plural.

    Only the first space-separated token of ``word`` is examined. The word is
    reported as plural when inflect's ``singular_noun`` yields a string for it.

    NOTE: this definition replaces the ``is_plural`` imported from ``utils``
    at the top of the notebook for all code that runs after this cell.

    Args:
        word: A word or phrase.

    Returns:
        True if the first token is recognised as plural, False otherwise.
    """
    first_token = word.split(' ')[0]
    return isinstance(p.singular_noun(first_token), str)

In [None]:
# Entity labels whose short words (fewer than 4 characters) are upper-cased.
to_upper = ['ORG', 'FAC', 'GPE', 'PRODUCT']
words2filter = pl.Series(['terrorist', 'terrorist group'])
# `instanceOf` categories whose rows are removed from the dataset.
blocked_categories = ['terrorist group', 'terrorist', 'weapon', 'ammunition', 'firearm', 'toxin']
df = (
    pl.read_json(parent_dir + "/generators/definitions.json")
    .filter(pl.col("partOfSpeech") == "noun")
    .with_columns(
        pl.when(pl.col("pnoun").is_in(to_upper) & (pl.col("word").str.len_chars() < 4))
          .then(pl.col("word").str.to_uppercase())
          .otherwise(pl.col("word"))
          .alias("word")
    )
)
# Rows without an `instanceOf` list must survive this filter (the type and
# synonym lookups are built from them later). A null predicate would make
# `filter` discard the row, so a missing list is treated as "not blocked".
is_blocked = pl.lit(False)
for category in blocked_categories:
    is_blocked = is_blocked | pl.col('instanceOf').list.contains(category).fill_null(False)
df = df.filter(~is_blocked)

df


In [None]:
# Plain-dict lookups keyed by word. When a word occurs in several rows, the
# value of the last row wins because later pairs overwrite earlier ones.
word_frequency = dict(df.select("word", "zipf").iter_rows())

rows_with_instances = df.filter(pl.col("instanceOf").is_not_null())
instanceOf = dict(rows_with_instances.select(['word', 'instanceOf']).iter_rows())

rows_with_types = df.filter(pl.col("typeOf").is_not_null())
typeOf = dict(rows_with_types.select(['word', 'typeOf']).iter_rows())

# Synonyms are only taken from rows whose `pnoun` is null.
rows_with_synonyms = df.filter(pl.col("synonyms").is_not_null() & pl.col('pnoun').is_null())
synonyms = dict(rows_with_synonyms.select(['word', 'synonyms']).iter_rows())

len(instanceOf), len(typeOf), len(synonyms)

In [None]:

# For each relation (instances, types, synonyms): wrap the lookup dict in its
# generator class, build the full dataset and save it under generators/.
# NOTE(review): the generator classes come from `utils`; whether they consume
# random state is not visible here, so the call order is left untouched.
db_inst = WordsInstances(instanceOf, category='instances')
data_inst = db_inst.generate_full_dataset()
data_inst.write_json(f"{parent_dir}/generators/word_instances.json")
db_type = WordsTypes(typeOf, category='types')
data_type = db_type.generate_full_dataset()
data_type.write_json(f"{parent_dir}/generators/word_types.json")
db_synonym = WordsSynonyms(synonyms, category='synonyms')
data_synonym = db_synonym.generate_full_dataset()
data_synonym.write_json(f"{parent_dir}/generators/word_synonyms.json")

In [None]:
from wordfreq import zipf_frequency


def get_zipf(word):
    """Return the value of ``zipf_frequency`` for ``word`` in English ('en')."""
    return zipf_frequency(word, 'en')


# Distinct keys across the three generators, and the subset of them for which
# get_zipf reports a value above zero.
objects = list(set(db_inst.keys + db_type.keys + db_synonym.keys))
objects_validated = []
for candidate in objects:
    if get_zipf(candidate) > 0:
        objects_validated.append(candidate)
len(objects), len(objects_validated)

In [None]:
# Draw the pool of 800 objects shared by the three subsamples.
# The keys are sorted and a seeded generator is used so that the pool is the
# same on every run; an unordered set combined with the unseeded global RNG
# made the pool differ between runs even though the subsamples are seeded.
# NOTE(review): `objects_validated` from the previous cell is not used here —
# confirm whether the pool was meant to be drawn from it.
rng = np.random.default_rng(42)
objects = sorted(set(db_inst.keys + db_type.keys + db_synonym.keys))
objects = rng.choice(objects, 800, replace=False)

# `correct_object_2` is a list column; it is joined into one string for CSV output.
subsample_inst = db_inst.generate_subsample(1000, 42, objects).with_columns(
                 pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
subsample_inst.write_csv(f"{parent_dir}/word_instances.csv")
subsample_types = db_type.generate_subsample(2000, 42, objects=objects).with_columns(
                 pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
subsample_types.write_csv(f"{parent_dir}/word_types.csv")

subsample_synonyms = db_synonym.generate_subsample(2000, 42, objects=objects).with_columns(
                    pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
subsample_synonyms.write_csv(f"{parent_dir}/word_synonyms.csv")


## 3. Generate Synthetic Entities
Here, we generate synthetic (non-existent) names for words and for their instances, types, and synonyms.
Generated names are stored in `datasets/generators/synthetic/*_raw.txt`.

In [None]:
from english_words import get_english_words_set
# Two lower-cased English word sets (the 'web2' and 'gcide' lists).
web_words = get_english_words_set(['web2'], lower=True)
gcide_words = get_english_words_set(['gcide'], lower=True)

In [None]:
import nltk
# Fetch the NLTK 'words' corpus and keep its entries in a set for fast lookup.
nltk.download('words')
from nltk.corpus import words
# Entries are stored exactly as provided by the corpus (no lower-casing here).
english_words = set(words.words())

def check_if_exists(word):
    """
    Tell whether ``word`` is found in any of the loaded English word sets.

    ``english_words`` is queried with the word as given; ``web_words`` and
    ``gcide_words`` are queried with the lower-cased word.
    """
    if word in english_words:
        return True
    lowered = word.lower()
    return lowered in web_words or lowered in gcide_words

def check_if_full_exists(phrase):
    """Return True when every whitespace-separated word of ``phrase`` passes check_if_exists."""
    for part in phrase.split():
        if not check_if_exists(part):
            return False
    return True


In [None]:
# Spot-check the helpers on a made-up word, a misspelled phrase and a correctly spelled phrase.
check_if_exists("owenster"), check_if_full_exists('boaok cover'), check_if_full_exists('book cover')

In [None]:
from namemaker import NameSet
import namemaker

# Seed both Python's global RNG and namemaker's RNG.
seed = 'udaxihhexdvxrcsnbacghqtargwuwr'
random.seed(seed)
namemaker_rng = namemaker.get_rng()
namemaker_rng.seed(seed)


def _synthesize_names(vocab, exists, file_stem, n_draws=1000):
    """
    Generate synthetic names from ``vocab`` and keep those that are not real.

    Args:
        vocab: Iterable of names used to build the NameSet.
        exists: Predicate returning True for names that must be discarded.
        file_stem: The kept names are written to
            ``generators/synthetic/<file_stem>_raw.txt``, one per line.
        n_draws: Number of names requested from the NameSet.

    Returns:
        Tuple ``(name_set, unique_names, kept_names)``.
    """
    # Sort the vocabulary so the NameSet always receives the names in the same
    # order; iterating a set of strings can differ from one run to the next.
    name_set = NameSet(names = sorted(vocab))
    drawn = [name_set.make_name(add_to_history=False) for _ in range(n_draws)]
    # Drop duplicates while keeping generation order (list(set(...)) would not).
    unique_names = list(dict.fromkeys(drawn))
    # Validate
    kept_names = [name for name in unique_names if not exists(name)]
    with open(f"{parent_dir}/generators/synthetic/{file_stem}_raw.txt", 'w', encoding='utf-8') as f:
        f.write("\n".join(map(str, kept_names)))
    return name_set, unique_names, kept_names


# The four categories are processed in a fixed order because they share one RNG.
our_vocab = sorted(df['word'].unique().to_list())
word_NS, word_synth, word_validated = _synthesize_names(our_vocab, check_if_exists, "words")

inst_vocab = set(db_inst.values).union(set(our_vocab))
inst_NS, inst_synth, inst_validated = _synthesize_names(inst_vocab, check_if_full_exists, "instances")

type_vocab = set(db_type.values).union(set(our_vocab))
type_NS, type_synth, type_validated = _synthesize_names(type_vocab, check_if_full_exists, "types")

synonyms_vocab = set(db_synonym.values).union(set(our_vocab))
synonyms_NS, synonyms_synth, synonyms_validated = _synthesize_names(synonyms_vocab, check_if_full_exists, "synonyms")

## 4. Create `unverifiable` statements
Here, we load the list of names that we manually checked (i.e., filtered raw files).

In [None]:
# Pair every synthetic word with two synthetic instance names drawn at random,
# then build, save and subsample the dataset.
synth_word2inst = {name: random.sample(inst_validated, 2) for name in word_validated}
db_syn_word2inst = WordsInstances(synth_word2inst, category='instances', is_fake=True)
data_syn_word2inst = db_syn_word2inst.generate_full_dataset()
data_syn_word2inst.write_json(f"{parent_dir}/generators/synthetic/word2inst.json")

# The list column is joined into a single string so it can be written as CSV.
joined_objects = pl.col("correct_object_2").list.join(", ").alias("correct_object_2")
data_syn_word2inst = db_syn_word2inst.generate_subsample(500, 42).with_columns(joined_objects)
data_syn_word2inst.write_csv(f"{parent_dir}/word_instances_synthetic.csv")

In [None]:
# Pair every synthetic word with two synthetic type names drawn at random,
# then build, save and subsample the dataset.
synth_word2type = {name: random.sample(type_validated, 2) for name in word_validated}
db_syn_word2type = WordsTypes(synth_word2type, category='types', is_fake=True)
data_syn_word2type = db_syn_word2type.generate_full_dataset()
data_syn_word2type.write_json(f"{parent_dir}/generators/synthetic/word2type.json")

# The list column is joined into a single string so it can be written as CSV.
joined_objects = pl.col("correct_object_2").list.join(", ").alias("correct_object_2")
data_syn_word2type = db_syn_word2type.generate_subsample(1500, 42).with_columns(joined_objects)
data_syn_word2type.write_csv(f"{parent_dir}/word_types_synthetic.csv")

In [None]:
# Pair every synthetic word with two synthetic synonym names drawn at random,
# then build, save and subsample the dataset.
synth_word2syn = {name: random.sample(synonyms_validated, 2) for name in word_validated}
db_syn_word2syn = WordsSynonyms(synth_word2syn, category='synonyms', is_fake=True)
data_syn_word2syn = db_syn_word2syn.generate_full_dataset()
data_syn_word2syn.write_json(f"{parent_dir}/generators/synthetic/word2syn.json")

# The list column is joined into a single string so it can be written as CSV.
joined_objects = pl.col("correct_object_2").list.join(", ").alias("correct_object_2")
data_syn_word2syn = db_syn_word2syn.generate_subsample(1500, 42).with_columns(joined_objects)
data_syn_word2syn.write_csv(f"{parent_dir}/word_synonyms_synthetic.csv")
