# Verbalization

In [7]:
import pandas as pd
import ast  # For safely evaluating strings containing Python literals

# Load the questions together with the raw triples found for each of them.
df_verbalization = pd.read_excel('for_verbalisation.xlsx')

# Load the lookup tables used to turn URIs into readable labels.
entity_df = pd.read_excel('extracted_headentity_list.xlsx')
relation_df = pd.read_excel('extracted_relation_list.xlsx')

# Marker written to 'Processed Triples' when a row cannot be verbalized.
INVALID_FORMAT = 'Invalid format'


def _first_match_lookup(frame, key_column, value_column):
    """Build a ``{key: value}`` dict from two columns of ``frame``.

    When a key occurs in several rows, the value of the first row wins.
    Building the dict once replaces a full DataFrame scan per lookup.
    """
    unique_rows = frame.drop_duplicates(subset=key_column, keep='first')
    return dict(zip(unique_rows[key_column], unique_rows[value_column]))


def _replace_if_known(lookup, value):
    """Return ``lookup[value]`` when the key exists, otherwise ``value`` itself."""
    try:
        return lookup.get(value, value)
    except TypeError:
        # Unhashable values (e.g. a nested list) cannot be dict keys, so
        # they cannot match any URI; leave them untouched.
        return value


def verbalize_triples(raw_triples, entity_labels, relation_labels):
    """Turn the string form of a list of triples into one readable sentence string.

    Parameters
    ----------
    raw_triples : object
        Cell content of 'Found Triples'; expected to be a string holding a
        Python literal such as ``"[('s', 'p', 'o'), ...]"``.
    entity_labels : dict
        Maps entity URIs to labels; applied to the subject and the object.
    relation_labels : dict
        Maps predicate URIs to readable predicates.

    Returns
    -------
    str
        The triples joined as ``'s p o; s p o; ...'`` with known URIs replaced
        by their labels, or ``'Invalid format'`` when the cell cannot be parsed,
        is not iterable, or contains an element that is not a 3-tuple.
    """
    try:
        parsed = ast.literal_eval(raw_triples)
    except (ValueError, TypeError, SyntaxError):
        # literal_eval may raise any of these on malformed input
        # (TypeError e.g. for an unhashable dict key inside the literal).
        return INVALID_FORMAT

    try:
        triples = list(parsed)
    except TypeError:
        # A valid literal that is not a collection (None, a number, ...).
        return INVALID_FORMAT

    phrases = []
    for triple in triples:
        if not (isinstance(triple, tuple) and len(triple) == 3):
            # One malformed element invalidates the whole row.
            return INVALID_FORMAT

        subject, predicate, obj = triple
        subject = _replace_if_known(entity_labels, subject)
        predicate = _replace_if_known(relation_labels, predicate)
        obj = _replace_if_known(entity_labels, obj)
        phrases.append(' '.join(map(str, (subject, predicate, obj))))

    return '; '.join(phrases)


entity_labels = _first_match_lookup(entity_df, 'entity_uri', 'entity_label')
relation_labels = _first_match_lookup(
    relation_df, 'Predicate with Namespace', 'Predicate readable'
)

# Assign the whole column at once: this creates it when missing and replaces
# it when present, so no string is written cell-by-cell into a column that
# Excel may have loaded with a numeric dtype.
df_verbalization['Processed Triples'] = [
    verbalize_triples(raw, entity_labels, relation_labels)
    for raw in df_verbalization['Found Triples']
]

# Save the updated DataFrame to check the result
df_verbalization.to_excel('processed_verbalisation.xlsx', index=False)
df_verbalization


Unnamed: 0,Text,Found Triples,Processed Triples
0,Who is working in the Computational Materials ...,[('http://demo.fiz-karlsruhe.de/matwerk/E10317...,Invalid format
1,What are the research projects associated to E...,[('http://demo.fiz-karlsruhe.de/matwerk/E83757...,ruby has type programming language; Python has...
2,"Who are the contributors of the data ""datasets""?",[('http://demo.fiz-karlsruhe.de/matwerk/E11968...,Invalid format
3,"Who is working with Researcher ""Ebrahim Norouz...",[('http://demo.fiz-karlsruhe.de/matwerk/E10181...,Ahmad Zainul Ihsan has expertise in Materials ...
4,"Who is the email address of ""ParaView""?",[('http://demo.fiz-karlsruhe.de/matwerk/E12310...,Invalid format
5,What are the affiliations of Volker Hofmann?,[('http://demo.fiz-karlsruhe.de/matwerk/E33641...,Markus Schilling has affiliation Federal Insti...
6,"What is ""Molecular Dynamics"" Software? List th...",[('http://demo.fiz-karlsruhe.de/matwerk/E63482...,Invalid format
7,What are pre- and post-processing tools for MD...,[('http://demo.fiz-karlsruhe.de/matwerk/E46999...,Invalid format
8,What are some workflow environments for comput...,[('http://demo.fiz-karlsruhe.de/matwerk/E10660...,Invalid format
9,How should I cite pyiron?,[('http://demo.fiz-karlsruhe.de/matwerk/E59887...,Invalid format


In [5]:
# Display the current contents of df_verbalization.
# NOTE(review): this cell's execution count (5) is lower than that of the cell
# above it (7), so the stored output may come from an earlier run — re-run in order.
df_verbalization

Unnamed: 0,Text,Found Triples,Processed Triples
0,Who is working in the Computational Materials ...,[('http://demo.fiz-karlsruhe.de/matwerk/E10317...,Invalid format
1,What are the research projects associated to E...,[('http://demo.fiz-karlsruhe.de/matwerk/E83757...,ruby has type programming language; Python has...
2,"Who are the contributors of the data ""datasets""?",[('http://demo.fiz-karlsruhe.de/matwerk/E11968...,Invalid format
3,"Who is working with Researcher ""Ebrahim Norouz...",[('http://demo.fiz-karlsruhe.de/matwerk/E10181...,Ahmad Zainul Ihsan has expertise in Materials ...
4,"Who is the email address of ""ParaView""?",[('http://demo.fiz-karlsruhe.de/matwerk/E12310...,Invalid format
5,What are the affiliations of Volker Hofmann?,[('http://demo.fiz-karlsruhe.de/matwerk/E33641...,Markus Schilling has affiliation Federal Insti...
6,"What is ""Molecular Dynamics"" Software? List th...",[('http://demo.fiz-karlsruhe.de/matwerk/E63482...,Invalid format
7,What are pre- and post-processing tools for MD...,[('http://demo.fiz-karlsruhe.de/matwerk/E46999...,Invalid format
8,What are some workflow environments for comput...,[('http://demo.fiz-karlsruhe.de/matwerk/E10660...,Invalid format
9,How should I cite pyiron?,[('http://demo.fiz-karlsruhe.de/matwerk/E59887...,Invalid format


In [11]:
# Read the output template and discard its placeholder column
# (the header really does end with a space: 'Related Triples ').
df_template = pd.read_excel('output template.xlsx').drop(columns=['Related Triples '])

# Create the new "Related Triples" and "Context" columns at positions 2 and 3
# (per the original note, this is right after "Ground Truth" — confirm against the template).
# Values are taken from df_verbalization and aligned on the row index.
df_template.insert(loc=2, column='Related Triples', value=df_verbalization['Found Triples'])
df_template.insert(loc=3, column='Context', value=df_verbalization['Processed Triples'])

# Persist the assembled table.
df_template.to_excel('train_perfect.xlsx', index=False)

In [12]:
# Show the 'Context' value of the row labelled 0 as a quick sanity check.
df_template.loc[0, 'Context']

'Invalid format'