In [1]:
taxo_name = "food"
# taxo = "taxos/wn_food.taxo.txt"
taxo = f"taxos/{taxo_name}.taxo"
# taxo = "taxos/equipment.taxo.txt"
is_parent = {}
for line in open(taxo):
    #split by tab
    line = line.strip().split("\t")
    is_parent[line[1]] = line[2]

In [3]:
def get_ancestors(taxo, term):
    """
    Get all ancestors of a term in the taxonomy
    """
    ancestors = []
    while term in is_parent:
        term = is_parent[term]
        ancestors.append(term)
    return ancestors

def get_children(taxo, term):
    """
    Get all children of a term in the taxonomy
    """
    children = []
    for line in open(taxo):
        line = line.strip().split("\t")
        if line[2] == term:
            children.append(line[1])
    return children

#get all tree except the term and its children
#Format:
# root child 1 child 2
# child 1 child 3 child 4
#iterative version
def get_tree_it(taxo,root, term):
    """
    Get all terms in the taxonomy except the term and its children
    """
    tree = ""
    stack = [(root, 0)]
    while stack:
        node, depth = stack.pop()
        if depth > 4:
            continue
        tree += "\nParent: " + node 
        root_children = get_children(taxo, node)
        if len(root_children) != 0:
            tree = tree + "\nChildren: "
        for child in root_children:
            if type(term) == list:
                if child in term:
                    continue
            elif type(term) == str:
                if child == term:
                    continue
            tree += child + ", "
            stack.append((child, depth + 1))
    return tree

def get_leafs(taxo,root, term):
    """
    Get all terms in the taxonomy except the term and its children
    """
    parents_leaves_dict = {}
    stack = [(root, 0)]
    while stack:
        node, depth = stack.pop()
        if depth > 5:
            continue
        root_children = get_children(taxo, node)
        if len(root_children) == 0:
            parent = is_parent[node]
            if parent not in parents_leaves_dict:
                parents_leaves_dict[parent] = [node]
            else:
                parents_leaves_dict[parent].append(node)
        # if term in root_children:
        #     #get parent
        #     parent = is_parent[child]
        #     grand_parent = is_parent[parent]
        #     if grand_parent not in parents_leaves_dict:
        #         parents_leaves_dict[grand_parent] = [parent]
        #     else:
        #         parents_leaves_dict[grand_parent].append(parent)
        #     continue
        for child in root_children:
            if child == term:
                continue
            stack.append((child, depth + 1))
    return parents_leaves_dict

def get_tree_leaves(parents_leaves_dict):
    tree =""
    for parent in parents_leaves_dict:
        if parent == "food":
            continue
        granparent = is_parent[parent]
        tree += "\nGranparent: " + granparent
        tree += "\nParent: " + parent
        tree += "\nChildren: "
        for child in parents_leaves_dict[parent]:
            tree += child + ", "
        tree += "\n"
    return tree

In [4]:
parents_leaves_dict = get_leafs(taxo, "food", "almonds")
tree = get_tree_leaves(parents_leaves_dict)

In [5]:
print(tree)


Granparent: food
Parent: prepared food
Children: frozen dinners, frying pan dinners, sushi, hors d'oeuvres, 

Granparent: prepared food
Parent: appetiser
Children: fruit and nut snacks, snack cakes, breadsticks, puffed rice cakes, sesame sticks, crackers, cheese puffs, croutons, pork scratchings, cereal and granola bars, pretzels, jerky, fruit snacks, salad toppings, chips, sticky rice cakes, popcorn, fruit and nut snacks, snack cakes, breadsticks, puffed rice cakes, sesame sticks, crackers, cheese puffs, croutons, pork scratchings, cereal and granola bars, pretzels, jerky, fruit snacks, salad toppings, chips, sticky rice cakes, popcorn, 

Granparent: side dishes
Parent: pancake
Children: filoz, pan bati, eggette, malapua, thalipeeth, touton, oatcake, gyabrag, beghrir, bánh bèo, blintz, gamjajeon, arisa pitha, crempog, surnoli, okonomiyaki, pol pani, crêpe, serabi, syrniki, gundel pancake, bánh xèo, mofletta, boli, chinese pancake, æbleskiver, david eyre's pancake, galette, hotteok, r

In [6]:
# #choose 20% of the leaf terms
# import random
# terms = list(is_parent.keys())
# #remove terms that are not leafs
# terms = [term for term in terms if len(get_children(taxo, term)) == 0]
# terms = random.sample(terms, int(len(terms) * 0.17))
# len(terms),terms

In [7]:
#load fold
fold = 0
train_terms = []
with open(f"folds/{taxo_name}/train_{fold}.txt", "r") as f:
    for line in f:
        train_terms.append(line.strip())
test_terms = []
with open(f"folds/{taxo_name}/test_{fold}.txt", "r") as f:
    for line in f:
        test_terms.append(line.strip())

In [None]:
from openai import OpenAI

# client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', # required, but unused
)

# client = OpenAI(base_url="https://api.openai.com/v1", api_key=key)

In [9]:
import glob

system_prompt = f"""You are a system that evaluates taxonomies.
 Given a tree with all the parent nodes and their children, say if the term
 given by the user is a child of the given parent node.

Answer only with "yes" or "no".
 
    Example:
 
    Tree:
        Parent: cereals
        Children: breakfast, lunch

    Term: breakfast cereal
    Node father: breakfast
    Is child?: yes

    Exemplo:
    Tree:
        Parent: cereals
        Children: breakfast, lunch

    Term: wine
    Node father: breakfast
    Is child?: no
 """


# lets translate with llms
def ask(termo,pai,arvore):
    user_prompt = f"""
                Tree: {arvore}

                Term: {termo}
                Node father: {pai}
                Is {termo} child of {pai}? Return only yes or no:"""
    completion = client.chat.completions.create(
        # model="meta-llama-3.1-8b-instruct",
        model="gemma3:27b",
        # model="o3-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        # temperature=0.7,
    )
    return completion.choices[0].message.content


In [10]:
# root = "food"
# tree=get_tree_it(taxo,root, terms)

In [11]:
def filter_tree(tree):
    #remove from tree all lines where the next line contains 'Parent:'
    lines = tree.split("\n")
    new_lines = []
    for i in range(len(lines)):
        if lines[i].startswith("Parent:") and i+1 < len(lines) and lines[i+1].startswith("Parent:"):
            continue
        new_lines.append(lines[i])
    tree = "\n".join(new_lines)
    return tree

In [12]:
root = "food"
# term = "waffle crisp"
term = "beer"
parent = is_parent[term]
print(f"Termo: {term}")
print(f"Pai: {parent}")
tree = get_tree_it(taxo, root, term)
tree = filter_tree(tree)
print(len(tree))
# test
ask(term, parent, tree)

Termo: beer
Pai: beverage
25639


'yes'

In [13]:
test_data = []
test_labels = []
for term in test_terms:
    parent = is_parent[term]
    test_data.append((term, parent))
test_labels=[1]*len(test_data)

In [14]:
import random

for i in range(len(test_data)*2):
    r = random.randint(0, len(test_terms)-1)
    term = test_terms[r]
    #get parent
    parent = is_parent[term]
    #negative
    #get any term that is not the parent
    #get random term
    # if random.random() < 0.5:
    r = random.randint(0, len(test_terms)-1)
    term2 = test_terms[r]
    fake_parent = is_parent[term2]
    #check if fake parent is not the true parent
    while fake_parent == parent:
        r = random.randint(0, len(test_terms)-1)
        term2 = test_terms[r]
        fake_parent = is_parent[term2]
    test_data.append((term, fake_parent))
    test_labels.append(0)

In [16]:
import tqdm
#vamos tirar uma metrica
predictions = []
predictions_texts = []
for item in tqdm.tqdm(test_data):
    term, parent = item
    # ancestors = get_ancestors(taxo, term)
    tree = get_tree_it(taxo, root, term)
    # parent = is_parent[term]
    tree = filter_tree(tree)
    result = ask(term, parent, tree)
    predictions_texts.append(result)
    if "yes" in result:
        predictions.append(1)
    else:
        predictions.append(0)

100%|██████████| 1488/1488 [1:08:18<00:00,  2.75s/it]


In [17]:
from sklearn.metrics import classification_report
print(classification_report(test_labels[:len(predictions)], predictions))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       992
           1       0.82      0.78      0.80       496

    accuracy                           0.87      1488
   macro avg       0.86      0.85      0.85      1488
weighted avg       0.87      0.87      0.87      1488



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.79      0.88       252

    accuracy                           0.79       252
   macro avg       0.50      0.39      0.44       252
weighted avg       1.00      0.79      0.88       252



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels[:len(predictions)], predictions)
print("Accuracy: ", accuracy)

Accuracy:  0.7896825396825397


In [None]:
# gemma3:27b

#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00         0
#            1       1.00      0.79      0.88       252

#     accuracy                           0.79       252
#    macro avg       0.50      0.39      0.44       252
# weighted avg       1.00      0.79      0.88       252