In [16]:
import json
import random
import inflect
import spacy
import numpy as np
from utils import DatasetGenerator
# imports
from pathlib import Path
# Parent directory
parent_dir = str(Path().resolve().parents[0])
import polars as pl
import re

In [17]:
# !python3 -m spacy download en_core_web_trf

# Data Preprocessing

In [18]:
nlp = spacy.load("en_core_web_trf")
def check_if_pnoun(term, definition):
    """
    Check if the term is a proper noun
    """
    term_len = len(term)
    doc = nlp(f"{term} is {definition}")
    for token in doc[:term_len]:
        if token.pos_ == "PROPN":
            ent_type = token.ent_type_
            if ent_type == "":
                return "PROPN"
            else:
                return ent_type
    return None

In [19]:
# check
doc = nlp("Harrison is defined as a 9th president of the united states")
check_if_pnoun("madam", "a 9th president of the united states.")

'PERSON'

In [20]:
keywords = ["hasTypes", "typeOf",
                "partOf", "hasParts",
                "instanceOf", "hasInstances",
                "memberOf", "hasMembers",
                "substanceOf", "hasSubstances",
                "inCategory", "hasCategories",
                "regionOf", "inRegion"]
def check_attributes(word: dict):
    keys = list(word.keys())
    if "definitions" in keys:
        if type(word["definitions"]) == list:
            for w in word["definitions"]:
                sub_keys = w.keys()
                if "synonyms" in sub_keys and "definition" in sub_keys and "partOfSpeech" in sub_keys and any(keyword in sub_keys for keyword in keywords):
                    pass
                else:
                    return False
        else:
            sub_keys = word["definitions"].keys()
            if "synonyms" in sub_keys and "definition" in sub_keys and "partOfSpeech" in sub_keys and any(keyword in sub_keys for keyword in keywords):
                pass
            else:
                return False
    else:
        return False
    if "frequency" not in keys:
        return False
    return True


# Initialize the inflect engine
p = inflect.engine()
exceptions = ("any", "one", "once", 'something', 'someone', 'somebody', 'anything', 'anyone', 'anybody')
def format_definitions(definition, part_of_speech):
    # Remove text within brackets
    definition = re.sub(r'\s*\(.*?\)\s*', ' ', definition).strip()
    
    # Check the first character of the definition to decide on the article
    definition = definition.strip()
    # Use the first part of the definition\
    definition = definition.split(';')[0]

    if part_of_speech == "noun":
        lower_definition = definition.lower()
        
        # Check specific cases where no article should be used
        if lower_definition.startswith(exceptions):
            article = ''
        else:
            # Use inflect to determine the correct article
            article = p.a(definition).split()[0]
        
        # Check if the definition already starts with an article or 'the'
        if not lower_definition.startswith(('a ', 'an ', 'the ')) and article:
            definition = f"{article} {definition}"
    elif part_of_speech == "verb":
        if not definition.startswith("to "):
            definition = f"to {definition}"

    if definition.endswith('.'):
        definition = definition[:-1]
    
    return definition.strip()

In [21]:
with open(parent_dir + '/source/wordsapi_sample.json') as f:
    data = json.load(f)
keys = data.keys()
valid_keys = [key for key in keys if check_attributes(data[key])]
print("Number of words:", len(valid_keys))

Number of words: 1641


In [22]:
word_list = []
extended_keywords = keywords + ["definition", "synonyms", "partOfSpeech"]
for key in valid_keys:
    word = data[key]
    freq = word["frequency"]
    if type(freq) == dict:
        zipf = freq["zipf"]
        perMillion = freq["perMillion"]
        diversity = freq["diversity"]
    else:
        zipf = freq
        perMillion = None
        diversity = None

    if "letters" in word.keys():
        letters = word["letters"]
    else:
        letters = None
    if "sounds" in word.keys():
        sounds = word["sounds"]
    else:
        sounds = None
    if type(word["definitions"]) == list:
        for w in word["definitions"]:
            _word = {k:v for k,v in w.items() if k in extended_keywords}
            _word["word"] = key
            _word["zipf"] = zipf
            _word["perMillion"] = perMillion
            _word["diversity"] = diversity
            _word["letters"] = letters
            _word["sounds"] = sounds
            _word["definition"] = format_definitions(_word["definition"], _word["partOfSpeech"])
            _word["num_definitions"] = len(word["definitions"])
            word_list.append(_word)
    else:
        _word = {k:v for k,v in word["definitions"].items() if k in extended_keywords}
        _word["word"] = key
        _word["zipf"] = zipf
        _word["perMillion"] = perMillion
        _word["diversity"] = diversity
        _word["letters"] = letters
        _word["sounds"] = sounds
        _word["definition"] = format_definitions(_word["definition"], _word["partOfSpeech"])
        _word["num_definitions"] = 1

        word_list.append(_word)
    

In [23]:
df = pl.from_dicts(word_list) 
## Check if PNOUN
df = df.with_columns(
    pl.struct(["word", "definition"]).map_elements(lambda x: check_if_pnoun(x["word"], x["definition"]), return_dtype=pl.String).alias("pnoun")
)
# Reorder columns
col_order = ["word", "definition", "partOfSpeech", "pnoun", "synonyms"] + keywords + ["zipf", "perMillion", "diversity", "letters", "sounds", "num_definitions"]
col_order = [col for col in col_order if col in df.columns]
df = df.select(col_order)
## do some cleaning  (set to null if the PROPN or if adjective or verb)
df = df.with_columns(pl.when( (pl.col("partOfSpeech") == "adjective") | (pl.col("partOfSpeech") == "verb"))\
                .then(None) \
                .otherwise(pl.col("pnoun")) \
                .alias("pnoun")) \
    .with_columns(pl.when(pl.col("pnoun") == "PROPN") \
                .then(None) \
                .otherwise(pl.col("pnoun")) \
                .alias("pnoun")) 

df.write_json(parent_dir + "/source/definitions.json")
df.sample(20)
df.filter(pl.col("pnoun").is_not_null())

  with torch.cuda.amp.autocast(self._mixed_precision):


word,definition,partOfSpeech,pnoun,synonyms,hasTypes,typeOf,partOf,hasParts,instanceOf,hasInstances,memberOf,hasMembers,substanceOf,inCategory,hasCategories,regionOf,zipf,perMillion,diversity,letters,sounds,num_definitions
str,str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],f64,f64,f64,i64,i64,i64
"""aarhus""","""a port city of Denmark in east…","""noun""","""GPE""","[""arhus""]",,,"[""kingdom of denmark"", ""danmark"", ""denmark""]",,"[""metropolis"", ""port"", … ""city""]",,,,,,,,2.3,0.19,0.0,6,,1
"""abelard""","""a French philosopher and theol…","""noun""","""PERSON""","[""peter abelard"", ""pierre abelard""]",,,,,"[""philosopher"", ""theologian"", … ""theologizer""]",,,,,,,,1.82,0.05,0.0,7,7,1
"""aborigine""","""a member of the people living …","""noun""","""PERSON""","[""australian aborigine"", ""native australian""]",,"[""aussie"", ""australian"", … ""ethnos""]",,,,,,,,,,,2.49,0.29,0.0,9,10,2
"""aborigine""","""an indigenous person who was b…","""noun""","""PERSON""","[""aboriginal"", ""indigen"", … ""native""]","[""russian"", ""levantine"", … ""filipino""]","[""soul"", ""individual"", … ""someone""]",,,,,,,,,,,2.49,0.29,0.0,9,10,2
"""acheson""","""a United States statesman who …","""noun""","""PERSON""","[""dean acheson"", ""dean gooderham acheson""]",,,,,"[""national leader"", ""solon"", ""statesman""]",,,,,,,,2.2,0.15,0.0,7,7,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""wtc""","""a twin skyscrapers 110 stories…","""noun""","""FAC""","[""twin towers"", ""world trade center""]",,,"[""new york city"", ""greater new york"", ""new york""]",,"[""skyscraper""]",,,,,"[""terrorism"", ""terrorist act"", ""act of terrorism""]",,,3.05,1.11,0.0,3,,1
"""xtc""","""a street names for methylenedi…","""noun""","""PRODUCT""","[""adam"", ""cristal"", … ""x""]",,"[""mdma"", ""methylenedioxymethamphetamine""]",,,,,,,,,,,2.03,0.09,0.0,3,,1
"""yisrael""","""a Jewish republic in southwest…","""noun""","""GPE""","[""israel"", ""sion"", … ""zion""]",,,"[""mideast"", ""middle east"", ""near east""]","[""tel aviv-yalo"", ""sodom"", … ""gomorrah""]","[""state"", ""land"", ""country""]",,,"[""israeli""]",,,,"[""15 may organization"", ""a'man"", … ""tanzim""]",2.57,0.36,0.0,7,,1
"""yue""","""the dialect of Chinese spoken …","""noun""","""LANGUAGE""","[""cantonese"", ""cantonese dialect"", ""yue dialect""]",,"[""chinese""]",,,,,,,,,,,3.23,1.69,0.0,3,,1


# Create "Statement Dataset"

In [24]:
df = pl.read_json(parent_dir + "/source/definitions.json")

In [25]:
df

word,definition,partOfSpeech,pnoun,synonyms,hasTypes,typeOf,partOf,hasParts,instanceOf,hasInstances,memberOf,hasMembers,substanceOf,inCategory,hasCategories,regionOf,zipf,perMillion,diversity,letters,sounds,num_definitions
str,str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],f64,f64,f64,i64,i64,i64
"""aarhus""","""a port city of Denmark in east…","""noun""","""GPE""","[""arhus""]",,,"[""kingdom of denmark"", ""danmark"", ""denmark""]",,"[""metropolis"", ""port"", … ""city""]",,,,,,,,2.3,0.19,0.0,6,,1
"""abbess""","""the superior of a group of nun…","""noun""",,"[""mother superior"", ""prioress""]","[""mother""]","[""superior""]",,,,"[""brigid"", ""heloise"", … ""bridget""]",,,,,,,2.47,0.28,0.0,6,4,1
"""abelard""","""a French philosopher and theol…","""noun""","""PERSON""","[""peter abelard"", ""pierre abelard""]",,,,,"[""philosopher"", ""theologian"", … ""theologizer""]",,,,,,,,1.82,0.05,0.0,7,7,1
"""aborigine""","""a member of the people living …","""noun""","""PERSON""","[""australian aborigine"", ""native australian""]",,"[""aussie"", ""australian"", … ""ethnos""]",,,,,,,,,,,2.49,0.29,0.0,9,10,2
"""aborigine""","""an indigenous person who was b…","""noun""","""PERSON""","[""aboriginal"", ""indigen"", … ""native""]","[""russian"", ""levantine"", … ""filipino""]","[""soul"", ""individual"", … ""someone""]",,,,,,,,,,,2.49,0.29,0.0,9,10,2
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""xtc""","""a street names for methylenedi…","""noun""","""PRODUCT""","[""adam"", ""cristal"", … ""x""]",,"[""mdma"", ""methylenedioxymethamphetamine""]",,,,,,,,,,,2.03,0.09,0.0,3,,1
"""yisrael""","""a Jewish republic in southwest…","""noun""","""GPE""","[""israel"", ""sion"", … ""zion""]",,,"[""mideast"", ""middle east"", ""near east""]","[""tel aviv-yalo"", ""sodom"", … ""gomorrah""]","[""state"", ""land"", ""country""]",,,"[""israeli""]",,,,"[""15 may organization"", ""a'man"", … ""tanzim""]",2.57,0.36,0.0,7,,1
"""yue""","""the dialect of Chinese spoken …","""noun""","""LANGUAGE""","[""cantonese"", ""cantonese dialect"", ""yue dialect""]",,"[""chinese""]",,,,,,,,,,,3.23,1.69,0.0,3,,1
"""zabaglione""","""a light foamy custard-like des…","""noun""",,"[""sabayon""]",,"[""afters"", ""dessert"", ""sweet""]",,,,,,,,,,,1.6,0.03,0.0,10,,1


In [26]:
p = inflect.engine()

def is_plural(word):
    """
    Check if a word is plural
    """
    word = word.split(' ')[0]
    check = p.singular_noun(word)
    if type(check) == str:
        return True
    return False

In [27]:
to_upper = ['ORG', 'FAC', 'GPE', 'PRODUCT']
words2filter = pl.Series(['terrorist', 'terrorist group'])
df = (
    pl.read_json(parent_dir + "/source/definitions.json")
    .filter(pl.col("partOfSpeech") == "noun")
    # .filter(~pl.col('typeOf').list.contains(words2filter))
    .with_columns(
        pl.when(pl.col("pnoun").is_in(to_upper) & (pl.col("word").str.len_chars() < 4))
          .then(pl.col("word").str.to_uppercase())
          .otherwise(pl.col("word"))
          .alias("word")
    )
)
df = df.filter(~pl.col('instanceOf').list.contains('terrorist group') & ~pl.col('instanceOf').list.contains('terrorist') \
          & ~pl.col('instanceOf').list.contains('weapon') & ~pl.col('instanceOf').list.contains('ammunition') \
          &  ~pl.col('instanceOf').list.contains('firearm') & ~pl.col('instanceOf').list.contains('toxin'))

df


word,definition,partOfSpeech,pnoun,synonyms,hasTypes,typeOf,partOf,hasParts,instanceOf,hasInstances,memberOf,hasMembers,substanceOf,inCategory,hasCategories,regionOf,zipf,perMillion,diversity,letters,sounds,num_definitions
str,str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],f64,f64,f64,i64,i64,i64
"""aarhus""","""a port city of Denmark in east…","""noun""","""GPE""","[""arhus""]",,,"[""kingdom of denmark"", ""danmark"", ""denmark""]",,"[""metropolis"", ""port"", … ""city""]",,,,,,,,2.3,0.19,0.0,6,,1
"""abbess""","""the superior of a group of nun…","""noun""",,"[""mother superior"", ""prioress""]","[""mother""]","[""superior""]",,,,"[""brigid"", ""heloise"", … ""bridget""]",,,,,,,2.47,0.28,0.0,6,4,1
"""abelard""","""a French philosopher and theol…","""noun""","""PERSON""","[""peter abelard"", ""pierre abelard""]",,,,,"[""philosopher"", ""theologian"", … ""theologizer""]",,,,,,,,1.82,0.05,0.0,7,7,1
"""aborigine""","""a member of the people living …","""noun""","""PERSON""","[""australian aborigine"", ""native australian""]",,"[""aussie"", ""australian"", … ""ethnos""]",,,,,,,,,,,2.49,0.29,0.0,9,10,2
"""aborigine""","""an indigenous person who was b…","""noun""","""PERSON""","[""aboriginal"", ""indigen"", … ""native""]","[""russian"", ""levantine"", … ""filipino""]","[""soul"", ""individual"", … ""someone""]",,,,,,,,,,,2.49,0.29,0.0,9,10,2
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""XTC""","""a street names for methylenedi…","""noun""","""PRODUCT""","[""adam"", ""cristal"", … ""x""]",,"[""mdma"", ""methylenedioxymethamphetamine""]",,,,,,,,,,,2.03,0.09,0.0,3,,1
"""yisrael""","""a Jewish republic in southwest…","""noun""","""GPE""","[""israel"", ""sion"", … ""zion""]",,,"[""mideast"", ""middle east"", ""near east""]","[""tel aviv-yalo"", ""sodom"", … ""gomorrah""]","[""state"", ""land"", ""country""]",,,"[""israeli""]",,,,"[""15 may organization"", ""a'man"", … ""tanzim""]",2.57,0.36,0.0,7,,1
"""yue""","""the dialect of Chinese spoken …","""noun""","""LANGUAGE""","[""cantonese"", ""cantonese dialect"", ""yue dialect""]",,"[""chinese""]",,,,,,,,,,,3.23,1.69,0.0,3,,1
"""zabaglione""","""a light foamy custard-like des…","""noun""",,"[""sabayon""]",,"[""afters"", ""dessert"", ""sweet""]",,,,,,,,,,,1.6,0.03,0.0,10,,1


In [28]:
word_frequency = dict(df.select("word", "zipf").iter_rows())

In [29]:
instanceOf = dict(df.filter(pl.col("instanceOf").is_not_null()).select(['word', 'instanceOf']).iter_rows())
typeOf = dict(df.filter(pl.col("typeOf").is_not_null()).select(['word', 'typeOf']).iter_rows())
synonyms = dict(df.filter(pl.col("synonyms").is_not_null() & pl.col('pnoun').is_null()).select(['word', 'synonyms']).iter_rows())
len(instanceOf), len(typeOf), len(synonyms)


(249, 1262, 1028)

In [30]:
class WordsInstances(DatasetGenerator):
    '''
    Class to handle the dataset from WordAPI
    '''    
    def apply_template(self, word: str, definition: str, negated: bool=False):
        if word.upper() == word:
            word = word
        else:
            word = word.capitalize()
        if not is_plural(definition):
            definition = p.a(definition)
        if negated:
            return f"{word} is not {definition}."
        else:
            return f"{word} is {definition}."
    

    def generate_sample(self, key, value, negated: bool):
        correct_values = self.source[key]
        correct = any([value.lower() in v.lower() for v in correct_values])
        if negated:
            correct = not correct
        if not self.is_fake:
            return {'statement': self.apply_template(key, value, negated),
                    'object_1': key,
                    'object_2': value,
                    'correct_object_2':  correct_values,
                    'correct': correct,
                    'negated': negated,
                    'real_object': True,
                    'fake_object': False,
                    'fictional_object': False,
                    'category': self.category,
                    }
        else:
            return {'statement': self.apply_template(key, value, negated),
                    'object_1': key,
                    'object_2': value,
                    # correct_values do notmean anything in this case
                    'correct_object_2':  correct_values,
                    'correct': False,
                    'negated': negated,
                    'real_object': False,
                    'fake_object': True,
                    'fictional_object': False,
                    'category': self.category
                    }

        
    def generate_subsample(self, n: int, seed: int, objects: list = None):
        np.random.seed(seed)
        if objects is not None:
            data = self.data.filter(pl.col("object_1").is_in(objects))
        else:
            data = self.data
        if data.height > n:
            print(f'Downsample from {data.height} to {n}')
            data = data.sample(n, seed=seed, shuffle=True)
        else:
            print(f'Size of the dataset is {data.height}')
        return data


class WordsTypes(WordsInstances):
    '''
    Class to handle the dataset from WordAPI
    '''    
    def apply_template(self, word: str, definition: str, negated: bool=False):
        if word.upper() == word:
            word = word
        else:
            word = word.capitalize()
        if not is_plural(definition):
            definition = p.a(definition)
        if negated:
            return f"{word.capitalize()} is not a type of {definition}."
        else:
            return f"{word.capitalize()} is a type of {definition}."

    
db_inst = WordsInstances(instanceOf, category='instances')
data_inst = db_inst.generate_full_dataset()
data_inst.write_json(f"{parent_dir}/source/word_instances.json")
db_type = WordsTypes(typeOf, category='types')
data_type = db_type.generate_full_dataset()
data_type.write_json(f"{parent_dir}/source/word_types.json")


In [31]:
class WordsSynonyms(WordsInstances):
    '''
    Class to handle the dataset from WordAPI
    '''    
    def apply_template(self, word: str, definition: str, negated: bool=False):
        if word.upper() == word:
            word = word
        else:
            word = word.capitalize()
        if not is_plural(definition):
            definition = p.a(definition)
        if negated:
            return f"{word.capitalize()} is not a synonym of {definition}."
        else:
            return f"{word.capitalize()} is a synonym of {definition}."

    
db_synonym = WordsSynonyms(synonyms, category='synonyms')
data_synonym = db_synonym.generate_full_dataset()
data_synonym.write_json(f"{parent_dir}/source/word_synonyms.json")

In [32]:
from wordfreq import zipf_frequency
def get_zipf(word):
    return zipf_frequency(word, 'en')

objects = list(set(db_inst.keys + db_type.keys + db_synonym.keys))
objects_validated = [word for word in objects if get_zipf(word) > 0]
len(objects), len(objects_validated)

(1492, 1447)

In [33]:
objects = set(db_inst.keys + db_type.keys + db_synonym.keys)
objects = np.random.choice(list(objects), 800, replace=False)
subsample_inst = db_inst.generate_subsample(1000, 42, objects).with_columns(
                 pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
subsample_inst.write_csv(f"{parent_dir}/word_instances_subsample.csv")
subsample_types = db_type.generate_subsample(2000, 42, objects=objects).with_columns(
                 pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
subsample_types.write_csv(f"{parent_dir}/word_types_subsample.csv")

subsample_synonyms = db_synonym.generate_subsample(2000, 42, objects=objects).with_columns(
                    pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
subsample_synonyms.write_csv(f"{parent_dir}/word_synonyms_subsample.csv")


Downsample from 1352 to 1000
Downsample from 5960 to 2000
Downsample from 5008 to 2000


In [34]:
subsample_inst.vstack(subsample_types).vstack(subsample_synonyms).group_by(['correct', 'negated']).count()

  subsample_inst.vstack(subsample_types).vstack(subsample_synonyms).group_by(['correct', 'negated']).count()


correct,negated,count
bool,bool,u32
True,False,1269
False,False,1223
True,True,1251
False,True,1257


# Synthetic

In [107]:
from english_words import get_english_words_set
web_words = get_english_words_set(['web2'], lower=True)
gcide_words = get_english_words_set(['gcide'], lower=True)

In [108]:
import nltk
nltk.download('words')
from nltk.corpus import words
english_words = set(words.words())

def check_if_exists(word):
    if word in english_words:
        return True
    if word.lower() in web_words:
        return True
    if word.lower() in gcide_words:
        return True
    return False

def check_if_full_exists(phrase):
    return all([check_if_exists(word) for word in phrase.split()])


[nltk_data] Downloading package words to /Users/carlomarx/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [109]:
check_if_exists("owenster"), check_if_full_exists('boaok cover'), check_if_full_exists('book cover')

(False, False, True)

In [110]:
from namemaker import NameSet
import namemaker

seed = 'udaxihhexdvxrcsnbacghqtargwuwr'
random.seed(seed)
namemaker_rng = namemaker.get_rng()
namemaker_rng.seed(seed)

our_vocab = sorted(df['word'].unique().to_list())
word_NS = NameSet(names = our_vocab)
word_synth = [word_NS.make_name(add_to_history=False) for _ in range(1000)]
word_synth = list(set(word_synth))
# Validate
word_validated = []
for item in word_synth:
    if not check_if_exists(item):
        word_validated.append(item)
    else:
        pass
with open(f"{parent_dir}/source/synth_words.txt", 'w') as f:
    f.write("\n".join(map(str, word_validated)))

inst_vocab = set(db_inst.values).union(set(our_vocab))
inst_NS = NameSet(names = inst_vocab)
inst_synth = [inst_NS.make_name(add_to_history=False) for _ in range(1000)]
inst_synth = list(set(inst_synth))
inst_validated = []
for item in inst_synth:
    if not check_if_full_exists(item):
        inst_validated.append(item)
    else:
        pass
with open(f"{parent_dir}/source/word_synth_inst.txt", 'w') as f:
    f.write("\n".join(map(str, inst_validated)))

type_vocab = set(db_type.values).union(set(our_vocab))
type_NS = NameSet(names = type_vocab)
type_synth = [type_NS.make_name(add_to_history=False) for _ in range(1000)]
type_synth = list(set(type_synth))
type_validated = []
for item in type_synth:
    if not check_if_full_exists(item):
        type_validated.append(item)
    else:
        pass
with open(f"{parent_dir}/source/word_synth_type.txt", 'w') as f:
    f.write("\n".join(map(str, type_validated)))


synonyms_vocab = set(db_synonym.values).union(set(our_vocab))
synonyms_NS = NameSet(names = synonyms_vocab)
synonyms_synth = [synonyms_NS.make_name(add_to_history=False) for _ in range(1000)]
synonyms_synth = list(set(synonyms_synth))
synonyms_validated = []
for item in synonyms_synth:
    if not check_if_full_exists(item):
        synonyms_validated.append(item)
    else:
        pass
with open(f"{parent_dir}/source/word_synth_synonyms.txt", 'w') as f:
    f.write("\n".join(map(str, synonyms_validated)))

In [111]:
synth_word2inst = {}
for item in word_validated:
    synth_word2inst[item] = random.sample(inst_validated, 2)
db_syn_word2inst = WordsInstances(synth_word2inst, category='instances', is_fake=True)
data_syn_word2inst = db_syn_word2inst.generate_full_dataset()
data_syn_word2inst.write_json(f"{parent_dir}/source/synth_word2inst.json")

data_syn_word2inst = db_syn_word2inst.generate_subsample(500, 42).with_columns(
                    pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
data_syn_word2inst.write_csv(f"{parent_dir}/synth_word2inst_subsample.csv")

Downsample from 6600 to 500


In [113]:
synth_word2type = {}
for item in word_validated:
    synth_word2type[item] = random.sample(type_validated, 2)
db_syn_word2type = WordsTypes(synth_word2type, category='types', is_fake=True)
data_syn_word2type = db_syn_word2type.generate_full_dataset()
data_syn_word2type.write_json(f"{parent_dir}/source/synth_word2type.json")

data_syn_word2type = db_syn_word2type.generate_subsample(1500, 42).with_columns(
                    pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
data_syn_word2type.write_csv(f"{parent_dir}/synth_word2type_subsample.csv")

Downsample from 6600 to 1500


In [115]:
synth_word2syn = {}
for item in word_validated:
    synth_word2syn[item] = random.sample(synonyms_validated, 2)
db_syn_word2syn = WordsSynonyms(synth_word2syn, category='synonyms', is_fake=True)
data_syn_word2syn = db_syn_word2syn.generate_full_dataset()
data_syn_word2syn.write_json(f"{parent_dir}/source/synth_word2syn.json")

data_syn_word2syn = db_syn_word2syn.generate_subsample(1500, 42).with_columns(
                    pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
data_syn_word2syn.write_csv(f"{parent_dir}/synth_word2syn_subsample.csv")


Downsample from 6600 to 1500


In [119]:
data_syn_word2inst.vstack(data_syn_word2type).vstack(data_syn_word2syn).group_by(['correct', 'negated']).count()

  data_syn_word2inst.vstack(data_syn_word2type).vstack(data_syn_word2syn).group_by(['correct', 'negated']).count()


correct,negated,count
bool,bool,u32
False,True,1753
False,False,1747


## Made Up


In [35]:
fictional_words = {
    "Zorfling": "the act of jumping between realities within the same multiverse.",
    "Plimble": "a small, whimsical creature known for its love of shiny objects.",
    "Glavish": "a melodic sound produced by the wind passing through a specific type of hollow tree.",
    "Crabunk": "a feeling of sudden joy experienced when solving a complex problem.",
    "Nexlore": "an ancient, forgotten language that was once spoken by a now-extinct civilization.",
    "Mirdlewomp": "a mysterious, enchanted forest said to exist in the northern reaches of the world.",
    "Swizzle": "a type of magical potion that changes color based on the drinker's mood.",
    "Jorflap": "an intricate dance performed during the Festival of Lights.",
    "Klibber": "a rare and precious stone that glows softly in the dark.",
    "Frozbloom": "a flower that only blooms during a lunar eclipse.",
    "Grimble": "a wise elder who serves as a mentor in a community.",
    "Sparvile": "to swiftly and elegantly maneuver through a crowded space.",
    "Dralic": "a legendary beast known for its strength and benevolence.",
    "Quinthor": "a game played by the ancients, involving strategy and skill, similar to chess.",
    "Wistlawn": "a tranquil meadow where magical creatures are said to gather.",
    "Jimboree": "a large, joyful gathering of friends and family, often involving music and dance.",
    "Klottish": "the state of being slightly disoriented or confused.",
    "Brindlequack": "a peculiar bird known for its colorful plumage and distinctive call.",
    "Thramble": "the sensation of tingling warmth experienced when holding hands with a loved one.",
    "Xyro": "a powerful spell used to summon protective spirits.",
    "Yafflem": "an ancient scroll containing knowledge lost to time.",
    "Blunderbussle": "a clumsy but endearing person who often finds themselves in amusing predicaments.",
    "Cringle": "the sound of laughter carried on a breeze.",
    "Frozzle": "to mix a tea with precise and delicate movements.",
    "Harrowheel": "a child known for their unwavering courage and determination.",
    "Inkwisp": "a mother figure that appears to guide lost travelers.",
    "Jubilark": "a state of overwhelming happiness and contentment.",
    "Lurvish": "the act of whispering sweet nothings to a romantic partner.",
    "Mizzletop": "the highest point of a mountain, often shrouded in mist.",
    "Nimbletree": "a tree known for its agility and ability to move slightly to avoid danger.",
    "Quizzik": "an ancient riddle that has never been solved.",
    "Rafflenook": "a cozy corner in a library filled with rare and old books.",
    "Snizzle": "to giggle quietly to oneself.",
    "Umbrafrost": "a frost that appears only under a full moon.",
    "Vespervine": "a vine that blooms only at dusk and only when looked at.",
    "Whimsywood": "a forest where anything is possible.",
    "Xylith": "a rare mineral that only exists under the schools.",
    "Yonderwisp": "a distant, flickering light that guides mice to their destination.",
    "Zephyrine": "a gentle breeze that carries a hint of garlic.",
    "Blithery": "to speak with great excitement and enthusiasm.",
    "Dromik": "a fast and agile creature known for its keen sense of direction.",
    "Jaxel": "a playful spirit that brings joy and laughter.",
    "Klimora": "the ancient, hidden city known for its advanced technology and wisdom.",
    "Flumplen": "a feeling of joyful confusion.",
    "Snurfle": "to engage in a friendly struggle.",
    "Jinkle": "a small, shiny object that brings good luck.",
    "Wuggle": "to walk with an unsteady gait.",
    "Klabloom": "a type of rare, exotic flower.",
    "Flimbul": "a type of delicate, sparkling fabric.",
    "Jinklewiff": "a playful, mischievous spirit.",
    "Wumplen": "to stumble or trip.",
    "Klabber": "a type of sticky, sweet substance.",
    "Flarglepunk": "the style of music characterized by lively rhythms.",
    "Snurflebug": "the small, insect-like creature.",
    "Flumplenux": "the complex problem or puzzle.",
    "Snazzlefrazz": "a stylish, fashionable outfit.",
    "Jinkleplack": "a type of rare, precious metal.",
    "Flargleplex": "the complicated, confusing situation.",
    "Jabberton": "a talkative, chatty person.",
    "Kabloinga": "a sudden, unexpected event.",
    "Flimbulux": "a delicate, intricate mechanism.",
    "Ignigen": "a creature born from fire.",
    "Jovialix": "a plant that laughs when touched.",
    "Kinetibar": "a bar that serves only energy drinks.",
    "Elysianth": "a flower that blooms in paradise.",
    "Hydrocera": "a wax that repels water, oil and air.",
}


In [37]:
columns = ["statement", "object_1", "object_2", "correct_object_2", "correct", "negation", "real_object", "fictional_object", "fake_object", "category", "freq"]
rows = []
fictional_definitions = list(fictional_words.values())
random.seed(42)
for k,v in fictional_words.items():
    word = k
    definition = v[:-1] # to remove dot
    statement = generate_statement(word, definition, templates)
    object_1 = word
    object_2 = definition
    correct_object_2 = definition
    rows.append([statement, object_1, object_2, correct_object_2, 0, 0, 0, 0, 1, "definition", 0])
    ## Add NEGATION
    word = k
    definition = v
    statement =generate_statement(word, definition, negated_templates)
    object_1 = word
    object_2 = definition
    correct_object_2 = definition
    rows.append([statement, object_1, object_2, correct_object_2, 0, 1, 0, 0, 1, "definition", 0])
results = pl.DataFrame(rows, schema=columns)
results.write_csv(parent_dir + "/definitions_fake.csv")
results.head(5)

statement,object_1,object_2,correct_object_2,correct,negation,real_object,fictional_object,fake_object,category,freq
str,str,str,str,i64,i64,i64,i64,i64,str,i64
"""Zorfling is de…","""Zorfling""","""the act of jum…","""the act of jum…",0,0,0,0,1,"""definition""",0
"""Zorfling is no…","""Zorfling""","""the act of jum…","""the act of jum…",0,1,0,0,1,"""definition""",0
"""Plimble can be…","""Plimble""","""a small, whims…","""a small, whims…",0,0,0,0,1,"""definition""",0
"""A small, whims…","""Plimble""","""a small, whims…","""a small, whims…",0,1,0,0,1,"""definition""",0
"""A melodic soun…","""Glavish""","""a melodic soun…","""a melodic soun…",0,0,0,0,1,"""definition""",0
