# Requirements

In [60]:
# Standard library imports
import re
import pandas as pd
import spacy


# Named Entity Recognization and Relation Identification from questions

In [61]:
# List of texts to be processed， which are questions extracted from the Competency Questions of the Materials Cloud Ontology
texts = [
    'Who is working in the Computational Materials Science field?',
    'What are the research projects associated to EMMO?',
    'Who are the contributors of the data "datasets"?',
    'Who is working with Researcher "Ebrahim Norouzi" in the same group?',
    'Who is the email address of "ParaView"?',
    'What are the affiliations of Volker Hofmann?',
    'What is "Molecular Dynamics" Software?List the programming language, documentation page, repository, and license information.',
    'What are pre- and post-processing tools for MD simulations?',
    'What are some workflow environments for computational materials science?',
    'How should I cite pyiron?',
    'Where can I find a list of interatomic potentials?',
    'What are python libraries used for calculating local atomic structural environment?',
    'What are the electronic lab notebooks available?',
    'What are the software for Molecular Dynamics (MD)?',
    'What are the ontologies in nanomaterials domain?',
    'What is DAMASK?',
    'What are the data portals for materials science ontologies?',
    'What are the instruments for APT?',
    'In which institution can I find tomography equipment?',
    'What are the educational resources for Ontology?',
    'What is the API of Materials Project?',
    'Which simulation software have a python API?',
    'What is the documentation of the "MatDB Online"?',
    'What are the types of software licenses?',
    'What are the software used to produce the data in the Materials Cloud repository?',
    'What are datasets produced by the BAM organization?',
    'What are some available datasets of mechanical properties of steels?',
    'What are datasets related to "Transmission electron microscopy"?',
    'What is the license of the dataset "Elastic Constant Demo Data"?',
    'What is the repository for "BAM reference data"?',
    'What are the different data formats in the "BAM reference data"?',
    'What is the software version of "pacemaker"?',
    'What is the field of research "BAM reference data"?',
    'What is the description of the "BAM reference data"?',
    'What are the datasets produced in 2022?',
    'Who are the creators of the "BAM reference data"?',
    'What are the datasets published by "BAM"?'
]

In [62]:
import re
import spacy

# Regular expression to match content within quotes
pattern = r'"(.*?)"'

# Use the @Language.component decorator to define a pipeline component
@spacy.Language.component("quote_extractor")
def custom_component(doc):
    matches = re.finditer(pattern, doc.text)
    new_ents = []
    for match in matches:
        start, end = match.span()
        span = doc.char_span(start + 1, end - 1)  # +1 and -1 to remove the quotes
        if span is not None and not any(span.start < ent.end and span.end > ent.start for ent in doc.ents):
            new_ents.append(span)
    doc.ents = list(doc.ents) + new_ents
    return doc

# Load the English model
nlp = spacy.load("en_core_web_sm")  # small model is enough

# Add the custom component to the pipeline
nlp.add_pipe("quote_extractor", after="ner")

text = 'What is "Molecular Dynamics" Software?List the programming language, documentation page, repository, and license information.'
doc = nlp(text)

# Extract and print entities
for ent in doc.ents:
    print(ent.text, ent.label_)


Molecular Dynamics WORK_OF_ART


In [63]:
import spacy
import re

# Regular expression to match content within quotes, ensuring it is not greedy
pattern = r'"(.*?)"'

# Use the @Language.component decorator to define a pipeline component
@spacy.Language.component("quote_extractor")
def custom_component(doc):
    matches = re.finditer(pattern, doc.text)
    new_ents = []
    for match in matches:
        start, end = match.span()
        # Use start + 1 and end - 1 to remove the quotes
        span = doc.char_span(start + 1, end - 1, label="QUOTED_TEXT")
        if span is not None:
            # Check for overlap with existing entities
            overlap = any(span.start < ent.end and span.end > ent.start for ent in doc.ents)
            if not overlap:
                new_ents.append(span)
    doc.ents = list(doc.ents) + new_ents
    return doc

# Load the English model
nlp = spacy.load("en_core_web_sm")  # small model is enough

# Add the custom component to the pipeline
nlp.add_pipe("quote_extractor", after="ner")

# # Example usage
# text = 'What is "Molecular Dynamics" Software? List the programming language, documentation page, repository, and license information.'
# doc = nlp(text)

# # Extract and print entities
# for ent in doc.ents:
#     print(ent.text, ent.label_)


Part-of-speech analysis will help find the correct entity



In [64]:
import spacy

def find_adjacent_noun_phrases(doc):
    """Extracts individual phrases composed of adjacent nouns, including a preceding adjective if present."""
    noun_phrases = []
    current_phrase = []

    for i, token in enumerate(doc):
        if token.pos_ in ['NOUN', 'PROPN']:  # Check for nouns or proper nouns
            if current_phrase and doc[i-1].pos_ not in ['NOUN', 'PROPN', 'ADJ']:
                # If the current phrase is not empty and the previous token is not a noun, proper noun, or adjective, add the phrase to the list
                noun_phrases.append(' '.join(current_phrase))
                current_phrase = []  # Reset the current phrase

            if i > 0 and doc[i-1].pos_ == 'ADJ':  # Check if the previous word is an adjective
                current_phrase.append(doc[i-1].text + ' ' + token.text)  # Add adjective and noun together
            else:
                current_phrase.append(token.text)  # Add noun
        elif token.pos_ == 'ADP' and token.text == 'for':  # Handle preposition 'for'
            if current_phrase:
                noun_phrases.append(' '.join(current_phrase))  # Add the current phrase to the list
                current_phrase = []  # Reset the current phrase
        else:
            if current_phrase:
                noun_phrases.append(' '.join(current_phrase))  # If the current phrase is not empty, add it to the list
                current_phrase = []  # Reset the current phrase

    if current_phrase:  # Ensure the last phrase is added
        noun_phrases.append(' '.join(current_phrase))

    return noun_phrases

# Example usage
nlp = spacy.load("en_core_web_sm")
doc = nlp("What are the data portals for materials science ontologies?	")
print(find_adjacent_noun_phrases(doc))


['data portals', 'materials science ontologies']


In [65]:
# New function to find the part between the preposition and the question mark
def find_prep_to_question_phrase(doc):
    phrase = []
    verb_found = False
    for token in doc:
        if token.pos_ == "ADP":  # If the token is a preposition (ADP)
            for next_token in doc[token.i + 1:]:  # Iterate over following tokens
                if next_token.text == "?":  # Stop if a question mark is found
                    break
                if next_token.pos_ == "VERB":  # Check if the token is a verb
                    verb_found = True
                    break  # Break if a verb is found
                phrase.append(next_token.text)  # Append the token text to the phrase list
            break  # Stop after finding the first preposition

    return '' if verb_found else ' '.join(phrase)  # Return an empty string if a verb is found, else return the phrase


In [66]:
def extract_entities_and_verbs(text):
    doc = nlp(text)

    # Entity Extraction
    named_entities = [ent.text for ent in doc.ents]

    # If named_entities is empty, search for phrases between prepositions and question marks
    if not named_entities:
        phrase = find_prep_to_question_phrase(doc)
        named_entities = [phrase]
        adjacent_nouns = find_adjacent_noun_phrases(doc)
        if adjacent_nouns:
            # Flatten the list of adjacent noun phrases
            named_entities = [phrase for phrase in adjacent_nouns]

    # Remove duplicates
    named_entities = list(set(named_entities))

    # Predicate Verb Extraction
    predicate_verbs = process_text(text)

    return named_entities, predicate_verbs


In [67]:
# Function to process each text
def process_text(text):
    doc = nlp(text)

    # Attributes Extraction
    predicate_verbs = []

    # Function to convert verbs to their lemma form
    def convert_to_lemma(verb):
        return verb.lemma_

    # Function to convert nouns to their singular form
    def convert_to_singular_noun(noun):
        if noun.tag_ == 'NNS':  # Check if it's a plural noun
            return noun.lemma_
        else:
            return noun.text

    # First, look for phrases between 'be' and prepositions
    for token in doc:
        # Looking for phrases between 'be' and prepositions
        if token.lemma_ == "be":
            phrase = []
            preposition_found = False
            for next_token in doc[token.i + 1:]:
                if next_token.pos_ == "ADP":  # Stop at a preposition
                    preposition_found = True
                    break
                phrase.append(next_token)
            # Only add the phrase if it is not empty and preposition is found
            if phrase and preposition_found:
                predicate_verbs.extend([convert_to_lemma(word) if word.pos_ == "VERB" else convert_to_singular_noun(word) for word in phrase])
                break  # Exit the loop if a phrase is found

    # Check if there are lists and use the words after lists as predicate_verbs
    for token in doc:
        if token.text.lower() == "list":
            list_start_index = token.i
            for next_token in doc[token.i + 1:]:
                if next_token.text == ".":
                    predicate_verbs.extend([convert_to_lemma(word) if word.pos_ == "VERB" else convert_to_singular_noun(word) for word in doc[list_start_index + 1:next_token.i]])
                    list_start_index = next_token.i
                    if next_token.text == ".":
                        break
    # Remove 'the' from predicate_verbs
    predicate_verbs = [' '.join([word for word in phrase.split() if word.lower() != "the"]) for phrase in predicate_verbs]

    # Convert predicate_verbs to string format
    predicate_verbs_string = ' '.join(predicate_verbs)

    # Check for 'is' and 'are' and add 'type' and 'description'
    if not predicate_verbs:
        for token in doc:
            if token.lemma_ == "be" and (token.text.lower() == "is" or token.text.lower() == "are"):
                predicate_verbs.extend(["type"])
                break

    # If no phrases found, then look for other verbs
    if not predicate_verbs:
        for token in doc:
            if "VB" in token.tag_ and token.lemma_ not in ["have", "be", "find"]:
                predicate_verbs.append(token.lemma_)

     # If no phrases found, then look for description
    if not predicate_verbs:
       predicate_verbs.append("description")


    return predicate_verbs




In [68]:
# Create a list of dataframes to concatenate
dfs = []

# Process each text and store the results in a list of dataframes
for text in texts:
    named_entities, predicate_verbs = extract_entities_and_verbs(text)
    # Flatten the named_entities if it contains lists and then join
    flattened_entities = [item for sublist in named_entities for item in (sublist if isinstance(sublist, list) else [sublist])]
    df_row = pd.DataFrame({
        "Text": [text],
        "Named Entities": [','.join(flattened_entities)],
        "Predicate Verbs": [' '.join(predicate_verbs)]
    })
    dfs.append(df_row)

# Concatenate the dataframes in the list
df = pd.concat(dfs, ignore_index=True)

# # Save the DataFrame to an Excel file
df.to_excel("EntityandRelationfromQuestionUp.xlsx", index=False)
df

Unnamed: 0,Text,Named Entities,Predicate Verbs
0,Who is working in the Computational Materials ...,the Computational Materials Science,work
1,What are the research projects associated to E...,EMMO,research project associate
2,"Who are the contributors of the data ""datasets""?","data,datasets,contributors",contributor
3,"Who is working with Researcher ""Ebrahim Norouz...",Ebrahim Norouzi,work
4,"Who is the email address of ""ParaView""?","ParaView,email address",email address
5,What are the affiliations of Volker Hofmann?,Volker Hofmann,affiliation
6,"What is ""Molecular Dynamics"" Software?List the...",Molecular Dynamics,type
7,What are pre- and post-processing tools for MD...,MD,pre- and post - processing tool
8,What are some workflow environments for comput...,"workflow environments,computational materials ...",some workflow environment
9,How should I cite pyiron?,pyiron,cite
