In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 20 12:21:39 2021

@author: sveta
"""
import os

from amratom.space import AmrAtomspace
from amratom import atomese
from opencog.utilities import push_default_atomspace
from amratom.triples import (TripleProcessor, PatternInstanceDict, is_amr_set, is_const)

from amratom.atomese import (default_atomspace)
from amratom.types import *
import re

import pandas as pd
import logging
from datetime import datetime
import time
from opencog.bindlink import execute_atom
from opencog.type_constructors import *

log = logging.getLogger('root')
FORMAT = '%(asctime)s - %(message)s'
logging.basicConfig(format=FORMAT)
log.setLevel(logging.ERROR)


def process_triples(amr_space, triples):
    _single_word_pattern = re.compile(r'^\S+$')
    for triple in triples:
        if is_amr_set(triple) and is_const(triple[2]):
            source, role, target = triple
            no_quotes = target[1:-1]
            if _single_word_pattern.search(no_quotes) is not None:
                amr_space.add_triple(triple)
            # else:
            # top = .utterance_parser.parse_sentence(no_quotes)
            # amr_space.add_triple((source, role, top.name))
        else:
            amr_space.add_triple(triple)


# dataFrm.shape
def parse(amr_space, triple_proc, amr):
    with default_atomspace(amr_space.atomspace):
        parsed_amr = triple_proc.amr_to_triples(amr)
        process_triples(amr_space, parsed_amr)
        # return amr_value_atom(parsed_amr.top)


minTime = 10
maxTime = 0
sumTime = 0
countTime = 0


# count min, max and average execution time
def CountStats(timeDiff):
    global minTime
    global maxTime
    global sumTime
    global countTime

    if minTime > timeDiff:
        minTime = timeDiff
    if maxTime < timeDiff:
        maxTime = timeDiff
    sumTime = sumTime + timeDiff
    countTime = countTime + 1


def QueryListLinkWithAnyRole(atomspace, role, log):
    startTime = datetime.fromtimestamp(time.time())
    results = execute_atom(
        atomspace,
        GetLink(
            VariableList(
                TypedVariableLink(VariableNode("v1"), TypeNode("AmrValue")),
                TypedVariableLink(VariableNode("v2"), TypeNode("AmrValue")),
                TypedVariableLink(VariableNode("c1"), TypeNode("AmrConcept")),
                TypedVariableLink(VariableNode("c2"), TypeNode("AmrConcept")),
            ),
            AndLink(
                EvaluationLink(
                    AmrRole(role),
                    ListLink(
                        VariableNode("v1"),
                        VariableNode("v2")
                    )
                ),
                AmrInstanceLink(VariableNode("v1"), VariableNode('c1')),
                AmrInstanceLink(VariableNode('v2'), VariableNode('c2'))
            )
        )
    ).out

    endTime = datetime.fromtimestamp(time.time())
    timeDiff = (endTime - startTime).total_seconds()
    CountStats(timeDiff)
    log.info(f'Query for role {role}: found items {len(results)},  execution time: {timeDiff}')


def QueryNodeWithHasRole(atomspace, role, log):
    startTime = datetime.fromtimestamp(time.time())
    results = execute_atom(
        atomspace,
        GetLink(
            VariableList(
                TypedVariableLink(
                    VariableNode("v1"),
                    TypeChoice(
                        TypeNode("AmrValue"),
                        TypeNode("AmrConcept")
                    )
                )
            ),
            EvaluationLink(
                PredicateNode("has-role"),
                ListLink(
                    VariableNode("v1"),
                    AmrRole(role)
                )
            )
        )
    ).out

    endTime = datetime.fromtimestamp(time.time())
    timeDiff = (endTime - startTime).total_seconds()
    CountStats(timeDiff)
    log.info(f'Query for "has-role" and role {role}: found items {len(results)},  execution time: {timeDiff}')


def QueryNodeWithIsOptional(atomspace, role, log):
    startTime = datetime.fromtimestamp(time.time())
    results = execute_atom(
        atomspace,
        GetLink(
            VariableList(
                TypedVariableLink(
                    VariableNode("v1"),
                    TypeNode("AmrValue")
                ),
                TypedVariableLink(
                    VariableNode("v2"),
                    TypeNode("AmrValue")
                )
            ),
            EvaluationLink(
                PredicateNode('is-optional'),
                ListLink(
                    VariableNode("v1"),
                    AmrRole(role),
                    VariableNode("v2")
                ),
                tv=atomese.FALSE),
        )
    ).out

    endTime = datetime.fromtimestamp(time.time())
    timeDiff = (endTime - startTime).total_seconds()
    CountStats(timeDiff)
    log.info(f'Query for "is-optional" and role {role}: found items {len(results)},  execution time: {timeDiff}')


def FindAtoms(atomspace):
    roles = [
        ':name',
        ':time',
        ':quant',
        ':location',
        ':purpose',
        ':example',
        ':instrument',
        ':scale-of',
        ':concession',
        ':accompanier',
        ':ARG0',
        ':ARG1'
    ]

    for role in roles:
        QueryNodeWithHasRole(atomspace, role, log)
        QueryListLinkWithAnyRole(atomspace, role, log)
        QueryNodeWithIsOptional(atomspace, role, log)
    log.info(f'Minimum time {minTime}, Maximum time {maxTime}, Average time {sumTime / countTime}')


def process():
    curDir = os.path.abspath(os.getcwd())

    filename = "wiki2Amr10.json"
    dataFrm = pd.read_json(filename)
    dataFrm.drop_duplicates(subset=['origin'], inplace=True)
    print(dataFrm.shape)
    # dataFrm =  dataFrm.sample(90000)

    atomspace = AtomSpace()
    triple_proc = TripleProcessor(PatternInstanceDict)
    push_default_atomspace(atomspace)
    amr_space = AmrAtomspace(atomspace)
    tr = []
    startTime = datetime.fromtimestamp(time.time())
    log.info(f'start time {startTime}')
    for i, row in dataFrm.iterrows():
        try:
            for amr in row['amr']:
                parse(amr_space, triple_proc, amr)
        except Exception as e:
            print(e, "---row---", i)

    endTime = datetime.fromtimestamp(time.time())
    timeDiff = (endTime - startTime).total_seconds()
    log.info(f'Load time {timeDiff}')
    log.info(f'Atoms count {atomspace.size()}')
    FindAtoms(atomspace)
    return atomspace


In [2]:
atomspace = process()

(10, 3)
'amr' ---row--- 0
'amr' ---row--- 1
'amr' ---row--- 2
'amr' ---row--- 3
'amr' ---row--- 4
'amr' ---row--- 5
'amr' ---row--- 6
'amr' ---row--- 7
'amr' ---row--- 8
'amr' ---row--- 9


In [3]:
atomspace

<opencog.atomspace.AtomSpace at 0x7f5e92561e40>