# Neo4j format converter

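This notebook converts the first version of the ontology-based annotation files (tab-separated `top50.*.txt` tables) into CSV files in the Neo4j import format: one file of nodes and one file of edges.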
### Imports

In [1]:
import pandas as pd
import datetime

# timestamp: today's date, interpolated into the names of the CSV files
# written at the end of the notebook ('./edges_v{}.csv' and './nodes_v{}.csv')
today = datetime.date.today()

### Workflow

In [2]:
# list files: show every entry in the working directory whose name ends in
# 'txt' (IPython shell escape); the cells below read these tables one by one
!ls *txt

top50.chebi-celltype_human.txt top50.chebi-uberon_human.txt
top50.chebi-celltype_plant.txt top50.human.chebi-pheno.txt
top50.chebi-doid_human.txt     top50.plant.chebi-flopo.txt
top50.chebi-envo_human.txt     top50.plant.chebi-to.txt
top50.chebi-envo_plant.txt


In [3]:
# Load the human ChEBI / Cell Ontology annotation table (tab-separated)
df = pd.read_csv('./top50.chebi-celltype_human.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# --- edges: one Neo4j relationship per annotation row ---
# Relationship type is RO:0002616 ("related via evidence or inference to").
# SENTENCE and PMCID are added as empty (None) properties.
clh = df.assign(
    TYPE='RO:0002616',
    PROPERTY_LABEL='related via evidence or inference to',
    SPECIES='Human',
    SENTENCE=None,
    PMCID=None,
)
# Map the source columns onto the Neo4j relationship header fields
clh = clh.rename(columns={
    'CHEBI_ID': ':START_ID',
    'TYPE': ':TYPE',
    'CL_ID': ':END_ID',
    '#PMCIDs': 'PMCIDs',
})
clh_edges = clh[[
    ':START_ID', ':TYPE', ':END_ID',
    'PROPERTY_LABEL', 'SPECIES', 'SENTENCE', 'PMCID',
    'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*clh_edges.shape))

# --- nodes ---
# Subject side: ChEBI terms, labelled METABOLITE, with ontology provenance
clh_sub = (
    df[['CHEBI_ID', 'CHEBI_Name']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'CHEBI_ID': 'ID:ID', 'LABEL': ':LABEL', 'CHEBI_Name': 'NAME'})
)
# Object side: Cell Ontology terms, labelled CELL_TYPE
clh_obj = (
    df[['CL_ID', 'CL_Name']]
    .assign(
        LABEL='CELL_TYPE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/cl.owl',
        ONTOLOGY_NAME='Cell Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/cl/releases/2020-01-06',
    )
    .rename(columns={'CL_ID': 'ID:ID', 'LABEL': ':LABEL', 'CL_Name': 'NAME'})
)
# A term appears once per annotation row, so stack both sides and keep
# only the first copy of each identical row, then fix the column order.
clh_nodes = (
    pd.concat([clh_sub, clh_obj])
    .drop_duplicates()
    .loc[:, ['ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI']]
    .copy()
)
print('data structure: {0} annotations and {1} data fields'.format(*clh_nodes.shape))

data structure: 81430 annotations and 6 data fields
data structure: 81430 annotations and 9 data fields
data structure: 3221 annotations and 6 data fields


In [4]:
# Load the plant ChEBI / Cell Ontology annotation table (tab-separated)
df = pd.read_csv('./top50.chebi-celltype_plant.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# --- edges: one Neo4j relationship per annotation row ---
# Relationship type is RO:0002616 ("related via evidence or inference to").
# SENTENCE and PMCID are added as empty (None) properties.
clp = df.assign(
    TYPE='RO:0002616',
    PROPERTY_LABEL='related via evidence or inference to',
    SPECIES='Plant',
    SENTENCE=None,
    PMCID=None,
)
# Map the source columns onto the Neo4j relationship header fields
clp = clp.rename(columns={
    'CHEBI_ID': ':START_ID',
    'TYPE': ':TYPE',
    'CL_ID': ':END_ID',
    '#PMCIDs': 'PMCIDs',
})
clp_edges = clp[[
    ':START_ID', ':TYPE', ':END_ID',
    'PROPERTY_LABEL', 'SPECIES', 'SENTENCE', 'PMCID',
    'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*clp_edges.shape))

# --- nodes ---
# Subject side: ChEBI terms, labelled METABOLITE, with ontology provenance
clp_sub = (
    df[['CHEBI_ID', 'CHEBI_Name']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'CHEBI_ID': 'ID:ID', 'LABEL': ':LABEL', 'CHEBI_Name': 'NAME'})
)
# Object side: Cell Ontology terms, labelled CELL_TYPE
clp_obj = (
    df[['CL_ID', 'CL_Name']]
    .assign(
        LABEL='CELL_TYPE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/cl.owl',
        ONTOLOGY_NAME='Cell Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/cl/releases/2020-01-06',
    )
    .rename(columns={'CL_ID': 'ID:ID', 'LABEL': ':LABEL', 'CL_Name': 'NAME'})
)
# A term appears once per annotation row, so stack both sides and keep
# only the first copy of each identical row, then fix the column order.
clp_nodes = (
    pd.concat([clp_sub, clp_obj])
    .drop_duplicates()
    .loc[:, ['ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI']]
    .copy()
)
print('data structure: {0} annotations and {1} data fields'.format(*clp_nodes.shape))

data structure: 61114 annotations and 6 data fields
data structure: 61114 annotations and 9 data fields
data structure: 3072 annotations and 6 data fields


In [5]:
# Load the human ChEBI / Disease Ontology annotation table (tab-separated)
df = pd.read_csv('./top50.chebi-doid_human.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# --- edges: one Neo4j relationship per annotation row ---
# Relationship type is RO:0002616 ("related via evidence or inference to").
# SENTENCE and PMCID are added as empty (None) properties.
do = df.assign(
    TYPE='RO:0002616',
    PROPERTY_LABEL='related via evidence or inference to',
    SPECIES='Human',
    SENTENCE=None,
    PMCID=None,
)
# Map the source columns onto the Neo4j relationship header fields
do = do.rename(columns={
    'CHEBI_ID': ':START_ID',
    'TYPE': ':TYPE',
    'DO_ID': ':END_ID',
    '#PMCIDs': 'PMCIDs',
})
do_edges = do[[
    ':START_ID', ':TYPE', ':END_ID',
    'PROPERTY_LABEL', 'SPECIES', 'SENTENCE', 'PMCID',
    'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*do_edges.shape))

# --- nodes ---
# Subject side: ChEBI terms, labelled METABOLITE, with ontology provenance
do_sub = (
    df[['CHEBI_ID', 'CHEBI_Name']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'CHEBI_ID': 'ID:ID', 'LABEL': ':LABEL', 'CHEBI_Name': 'NAME'})
)
# Object side: Disease Ontology terms, labelled DISEASE
do_obj = (
    df[['DO_ID', 'DO_Name']]
    .assign(
        LABEL='DISEASE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/doid.owl',
        ONTOLOGY_NAME='Human Disease Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/doid/releases/2020-01-15/doid.owl',
    )
    .rename(columns={'DO_ID': 'ID:ID', 'LABEL': ':LABEL', 'DO_Name': 'NAME'})
)
# A term appears once per annotation row, so stack both sides and keep
# only the first copy of each identical row, then fix the column order.
do_nodes = (
    pd.concat([do_sub, do_obj])
    .drop_duplicates()
    .loc[:, ['ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI']]
    .copy()
)
print('data structure: {0} annotations and {1} data fields'.format(*do_nodes.shape))

data structure: 94586 annotations and 6 data fields
data structure: 94586 annotations and 9 data fields
data structure: 7671 annotations and 6 data fields


In [6]:
# Load the human ChEBI / Environment Ontology annotation table (tab-separated)
df = pd.read_csv('./top50.chebi-envo_human.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# --- edges: one Neo4j relationship per annotation row ---
# Relationship type is RO:0002616 ("related via evidence or inference to").
# SENTENCE and PMCID are added as empty (None) properties.
envoh = df.assign(
    TYPE='RO:0002616',
    PROPERTY_LABEL='related via evidence or inference to',
    SPECIES='Human',
    SENTENCE=None,
    PMCID=None,
)
# Map the source columns onto the Neo4j relationship header fields
envoh = envoh.rename(columns={
    'CHEBI_ID': ':START_ID',
    'TYPE': ':TYPE',
    'ENVO_ID': ':END_ID',
    '#PMCIDs': 'PMCIDs',
})
envoh_edges = envoh[[
    ':START_ID', ':TYPE', ':END_ID',
    'PROPERTY_LABEL', 'SPECIES', 'SENTENCE', 'PMCID',
    'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*envoh_edges.shape))

# --- nodes ---
# Subject side: ChEBI terms, labelled METABOLITE, with ontology provenance
envoh_sub = (
    df[['CHEBI_ID', 'CHEBI_Name']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'CHEBI_ID': 'ID:ID', 'LABEL': ':LABEL', 'CHEBI_Name': 'NAME'})
)
# Object side: Environment Ontology terms, labelled ENVIRONMENTAL.
# The version field repeats the plain ontology IRI (no dated release IRI is given).
envoh_obj = (
    df[['ENVO_ID', 'ENVO_Name']]
    .assign(
        LABEL='ENVIRONMENTAL',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/envo.owl',
        ONTOLOGY_NAME='Environment Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/envo.owl',
    )
    .rename(columns={'ENVO_ID': 'ID:ID', 'LABEL': ':LABEL', 'ENVO_Name': 'NAME'})
)
# A term appears once per annotation row, so stack both sides and keep
# only the first copy of each identical row, then fix the column order.
envoh_nodes = (
    pd.concat([envoh_sub, envoh_obj])
    .drop_duplicates()
    .loc[:, ['ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI']]
    .copy()
)
print('data structure: {0} annotations and {1} data fields'.format(*envoh_nodes.shape))

data structure: 61886 annotations and 6 data fields
data structure: 61886 annotations and 9 data fields
data structure: 3805 annotations and 6 data fields


In [7]:
# Load the plant ChEBI / Environment Ontology annotation table (tab-separated)
df = pd.read_csv('./top50.chebi-envo_plant.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# --- edges: one Neo4j relationship per annotation row ---
# Relationship type is RO:0002616 ("related via evidence or inference to").
# SENTENCE and PMCID are added as empty (None) properties.
envop = df.assign(
    TYPE='RO:0002616',
    PROPERTY_LABEL='related via evidence or inference to',
    SPECIES='Plant',
    SENTENCE=None,
    PMCID=None,
)
# Map the source columns onto the Neo4j relationship header fields
envop = envop.rename(columns={
    'CHEBI_ID': ':START_ID',
    'TYPE': ':TYPE',
    'ENVO_ID': ':END_ID',
    '#PMCIDs': 'PMCIDs',
})
envop_edges = envop[[
    ':START_ID', ':TYPE', ':END_ID',
    'PROPERTY_LABEL', 'SPECIES', 'SENTENCE', 'PMCID',
    'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*envop_edges.shape))

# --- nodes ---
# Subject side: ChEBI terms, labelled METABOLITE, with ontology provenance.
# NOTE(review): 'CEHBI_Name' (sic) is the column name this cell selects; the
# recorded run completed with it, so the input header presumably carries the
# same misspelling -- confirm against the file before "correcting" it.
envop_sub = (
    df[['CHEBI_ID', 'CEHBI_Name']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'CHEBI_ID': 'ID:ID', 'LABEL': ':LABEL', 'CEHBI_Name': 'NAME'})
)
# Object side: Environment Ontology terms, labelled ENVIRONMENTAL.
# The version field repeats the plain ontology IRI (no dated release IRI is given).
envop_obj = (
    df[['ENVO_ID', 'ENVO_Name']]
    .assign(
        LABEL='ENVIRONMENTAL',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/envo.owl',
        ONTOLOGY_NAME='Environment Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/envo.owl',
    )
    .rename(columns={'ENVO_ID': 'ID:ID', 'LABEL': ':LABEL', 'ENVO_Name': 'NAME'})
)
# A term appears once per annotation row, so stack both sides and keep
# only the first copy of each identical row, then fix the column order.
envop_nodes = (
    pd.concat([envop_sub, envop_obj])
    .drop_duplicates()
    .loc[:, ['ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI']]
    .copy()
)
print('data structure: {0} annotations and {1} data fields'.format(*envop_nodes.shape))

data structure: 46201 annotations and 6 data fields
data structure: 46201 annotations and 9 data fields
data structure: 3827 annotations and 6 data fields


In [8]:
# Load the human ChEBI / Uberon annotation table (tab-separated)
df = pd.read_csv('./top50.chebi-uberon_human.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# --- edges: one Neo4j relationship per annotation row ---
# Relationship type is RO:0002616 ("related via evidence or inference to").
# SENTENCE and PMCID are added as empty (None) properties.
uber = df.assign(
    TYPE='RO:0002616',
    PROPERTY_LABEL='related via evidence or inference to',
    SPECIES='Human',
    SENTENCE=None,
    PMCID=None,
)
# Map the source columns onto the Neo4j relationship header fields.
# NOTE(review): 'CEHBI_ID' / 'CEHBI_Name' (sic) are the column names this cell
# uses; the recorded run completed with them, so the input header presumably
# carries the same misspelling -- confirm against the file before "correcting" it.
uber = uber.rename(columns={
    'CEHBI_ID': ':START_ID',
    'TYPE': ':TYPE',
    'UBERON_ID': ':END_ID',
    '#PMCIDs': 'PMCIDs',
})
uber_edges = uber[[
    ':START_ID', ':TYPE', ':END_ID',
    'PROPERTY_LABEL', 'SPECIES', 'SENTENCE', 'PMCID',
    'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*uber_edges.shape))

# --- nodes ---
# Subject side: ChEBI terms, labelled METABOLITE, with ontology provenance
uber_sub = (
    df[['CEHBI_ID', 'CEHBI_Name']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'CEHBI_ID': 'ID:ID', 'LABEL': ':LABEL', 'CEHBI_Name': 'NAME'})
)
# Object side: Uberon terms, labelled ANATOMY
uber_obj = (
    df[['UBERON_ID', 'UBERON_Name']]
    .assign(
        LABEL='ANATOMY',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/uberon.owl',
        ONTOLOGY_NAME='Uberon multi-species anatomy ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/uberon/releases/2019-11-22/uberon.owl',
    )
    .rename(columns={'UBERON_ID': 'ID:ID', 'LABEL': ':LABEL', 'UBERON_Name': 'NAME'})
)
# A term appears once per annotation row, so stack both sides and keep
# only the first copy of each identical row, then fix the column order.
uber_nodes = (
    pd.concat([uber_sub, uber_obj])
    .drop_duplicates()
    .loc[:, ['ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI']]
    .copy()
)
print('data structure: {0} annotations and {1} data fields'.format(*uber_nodes.shape))

data structure: 100594 annotations and 6 data fields
data structure: 100594 annotations and 9 data fields
data structure: 7995 annotations and 6 data fields


In [9]:
# Load the human/mammalian ChEBI / phenotype annotation table (tab-separated)
df = pd.read_csv('./top50.human.chebi-pheno.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# --- edges: one Neo4j relationship per annotation row ---
# Relationship type is RO:0002616 ("related via evidence or inference to").
# SENTENCE and PMCID are added as empty (None) properties; PMCIDs is also set
# to None here instead of being renamed from a '#PMCIDs' column.
pheno = df.assign(
    TYPE='RO:0002616',
    PROPERTY_LABEL='related via evidence or inference to',
    SPECIES='Human|Mammalian',
    SENTENCE=None,
    PMCID=None,
    PMCIDs=None,
)
# Map the source columns onto the Neo4j relationship header fields.
# NOTE(review): 'CEHBI_ID' / 'CEHBI_Name' (sic) are the column names this cell
# uses; the recorded run completed with them, so the input header presumably
# carries the same misspelling -- confirm against the file before "correcting" it.
pheno = pheno.rename(columns={
    'CEHBI_ID': ':START_ID',
    'TYPE': ':TYPE',
    'Phenotype_ID': ':END_ID',
})
pheno_edges = pheno[[
    ':START_ID', ':TYPE', ':END_ID',
    'PROPERTY_LABEL', 'SPECIES', 'SENTENCE', 'PMCID',
    'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*pheno_edges.shape))

# --- nodes ---
# Subject side: ChEBI terms, labelled METABOLITE, with ontology provenance
pheno_sub = (
    df[['CEHBI_ID', 'CEHBI_Name']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'CEHBI_ID': 'ID:ID', 'LABEL': ':LABEL', 'CEHBI_Name': 'NAME'})
)
# Object side: phenotype terms, labelled PHENOTYPE. The provenance fields hold
# a bracketed, quoted list naming both the Human (HP) and Mammalian (MP)
# phenotype ontologies; every row gets the same two-entry string.
pheno_obj = (
    df[['Phenotype_ID', 'Phenotype_Name']]
    .assign(
        LABEL='PHENOTYPE',
        ONTOLOGY_IRI='["http://purl.obolibrary.org/obo/hp.owl","http://purl.obolibrary.org/obo/mp.owl"]',
        ONTOLOGY_NAME='["Human Phenotype Ontology","Mammalian Phenotype Ontology"]',
        ONTOLOGY_VERSION_IRI='["http://purl.obolibrary.org/obo/hp/releases/2019-11-08/hp.owl","http://purl.obolibrary.org/obo/mp/releases/2019-11-07"]',
    )
    .rename(columns={'Phenotype_ID': 'ID:ID', 'LABEL': ':LABEL', 'Phenotype_Name': 'NAME'})
)
# A term appears once per annotation row, so stack both sides and keep
# only the first copy of each identical row, then fix the column order.
pheno_nodes = (
    pd.concat([pheno_sub, pheno_obj])
    .drop_duplicates()
    .loc[:, ['ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI']]
    .copy()
)
print('data structure: {0} annotations and {1} data fields'.format(*pheno_nodes.shape))

data structure: 143617 annotations and 5 data fields
data structure: 143617 annotations and 9 data fields
data structure: 12811 annotations and 6 data fields


In [10]:
# Load the plant ChEBI / Flora Phenotype Ontology annotation table (tab-separated)
df = pd.read_csv('./top50.plant.chebi-flopo.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# --- edges: one Neo4j relationship per annotation row ---
# Relationship type is RO:0002616 ("related via evidence or inference to").
# SENTENCE and PMCID are added as empty (None) properties; PMCIDs is also set
# to None here instead of being renamed from a '#PMCIDs' column.
flopo = df.assign(
    TYPE='RO:0002616',
    PROPERTY_LABEL='related via evidence or inference to',
    SPECIES='Plant',
    SENTENCE=None,
    PMCID=None,
    PMCIDs=None,
)
# Map the source columns onto the Neo4j relationship header fields.
# NOTE(review): 'CEHBI_ID' / 'CEHBI_Name' (sic) are the column names this cell
# uses; the recorded run completed with them, so the input header presumably
# carries the same misspelling -- confirm against the file before "correcting" it.
flopo = flopo.rename(columns={
    'CEHBI_ID': ':START_ID',
    'TYPE': ':TYPE',
    'Phenotype_ID': ':END_ID',
})
flopo_edges = flopo[[
    ':START_ID', ':TYPE', ':END_ID',
    'PROPERTY_LABEL', 'SPECIES', 'SENTENCE', 'PMCID',
    'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*flopo_edges.shape))

# --- nodes ---
# Subject side: ChEBI terms, labelled METABOLITE, with ontology provenance
flopo_sub = (
    df[['CEHBI_ID', 'CEHBI_Name']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'CEHBI_ID': 'ID:ID', 'LABEL': ':LABEL', 'CEHBI_Name': 'NAME'})
)
# Object side: Flora Phenotype Ontology terms, labelled PHENOTYPE.
# The version field repeats the plain ontology IRI (no dated release IRI is given).
flopo_obj = (
    df[['Phenotype_ID', 'Phenotype_Name']]
    .assign(
        LABEL='PHENOTYPE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/flopo.owl',
        ONTOLOGY_NAME='Flora Phenotype Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/flopo.owl',
    )
    .rename(columns={'Phenotype_ID': 'ID:ID', 'LABEL': ':LABEL', 'Phenotype_Name': 'NAME'})
)
# A term appears once per annotation row, so stack both sides and keep
# only the first copy of each identical row, then fix the column order.
flopo_nodes = (
    pd.concat([flopo_sub, flopo_obj])
    .drop_duplicates()
    .loc[:, ['ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI']]
    .copy()
)
print('data structure: {0} annotations and {1} data fields'.format(*flopo_nodes.shape))

data structure: 18571 annotations and 5 data fields
data structure: 18571 annotations and 9 data fields
data structure: 2340 annotations and 6 data fields


In [11]:
# Load the plant ChEBI / Plant Trait Ontology annotation table (tab-separated)
df = pd.read_csv('./top50.plant.chebi-to.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# --- edges: one Neo4j relationship per annotation row ---
# Relationship type is RO:0002616 ("related via evidence or inference to").
# SENTENCE and PMCID are added as empty (None) properties; PMCIDs is also set
# to None here instead of being renamed from a '#PMCIDs' column.
to = df.assign(
    TYPE='RO:0002616',
    PROPERTY_LABEL='related via evidence or inference to',
    SPECIES='Plant',
    SENTENCE=None,
    PMCID=None,
    PMCIDs=None,
)
# Map the source columns onto the Neo4j relationship header fields
to = to.rename(columns={
    'CHEBI_ID': ':START_ID',
    'TYPE': ':TYPE',
    'Phenotype_ID': ':END_ID',
})
to_edges = to[[
    ':START_ID', ':TYPE', ':END_ID',
    'PROPERTY_LABEL', 'SPECIES', 'SENTENCE', 'PMCID',
    'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*to_edges.shape))

# --- nodes ---
# Subject side: ChEBI terms, labelled METABOLITE, with ontology provenance
to_sub = (
    df[['CHEBI_ID', 'CHEBI_Name']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'CHEBI_ID': 'ID:ID', 'LABEL': ':LABEL', 'CHEBI_Name': 'NAME'})
)
# Object side: Plant Trait Ontology terms, labelled PHENOTYPE
to_obj = (
    df[['Phenotype_ID', 'Phenotype_Name']]
    .assign(
        LABEL='PHENOTYPE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/to.owl',
        ONTOLOGY_NAME='Plant Trait Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/to/releases/2019-05-21/to.owl',
    )
    .rename(columns={'Phenotype_ID': 'ID:ID', 'LABEL': ':LABEL', 'Phenotype_Name': 'NAME'})
)
# A term appears once per annotation row, so stack both sides and keep
# only the first copy of each identical row, then fix the column order.
to_nodes = (
    pd.concat([to_sub, to_obj])
    .drop_duplicates()
    .loc[:, ['ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI']]
    .copy()
)
print('data structure: {0} annotations and {1} data fields'.format(*to_nodes.shape))

data structure: 17059 annotations and 5 data fields
data structure: 17059 annotations and 9 data fields
data structure: 2009 annotations and 6 data fields


In [12]:
# edges
# Stack the nine per-dataset relationship tables, in the order they were built
edataframes = [
    clh_edges,
    clp_edges,
    do_edges,
    envoh_edges,
    envop_edges,
    uber_edges,
    pheno_edges,
    flopo_edges,
    to_edges,
]

# Rows that are identical in every column are collapsed to their first occurrence
edges = pd.concat(edataframes).drop_duplicates()
print('data structure: {0} annotations and {1} data fields'.format(*edges.shape))

# save: missing values (e.g. the None-filled SENTENCE / PMCID columns) are
# written as the literal 'NA'; the file name carries today's date
edges.fillna('NA').to_csv('./edges_v{}.csv'.format(today), index=False)

data structure: 625058 annotations and 9 data fields


In [13]:
# nodes
# Stack the nine per-dataset node tables, in the order they were built
ndataframes = [
    clh_nodes,
    clp_nodes,
    do_nodes,
    envoh_nodes,
    envop_nodes,
    uber_nodes,
    pheno_nodes,
    flopo_nodes,
    to_nodes,
]

# The same term can come from several datasets; rows identical in every column
# are collapsed to their first occurrence.
nodes = pd.concat(ndataframes).drop_duplicates()
print('data structure: {0} annotations and {1} data fields'.format(*nodes.shape))

# Row-level de-duplication does not guarantee unique node IDs: two rows with
# the same ID:ID but a different NAME, :LABEL or ontology field both survive.
# Neo4j's importer expects one row per node ID, so report (without failing)
# how many rows repeat an ID that already appeared. Nothing is printed when
# every ID is unique.
n_repeated_ids = int(nodes['ID:ID'].duplicated().sum())
if n_repeated_ids:
    print('WARNING: {0} node rows repeat an ID:ID that already appears in the table'.format(n_repeated_ids))

# save: missing values are written as the literal 'NA'; the file name carries
# today's date
nodes.fillna('NA').to_csv('./nodes_v{}.csv'.format(today), index=False)

data structure: 29891 annotations and 6 data fields


In [14]:
# Preview the first two rows of the combined edge table to check the column layout
edges.head(2)

Unnamed: 0,:START_ID,:TYPE,:END_ID,PROPERTY_LABEL,SPECIES,SENTENCE,PMCID,NPMI_score,PMCIDs
0,CHEBI:25830,RO:0002616,CL:0000508,related via evidence or inference to,Human,,,0.16056,2
1,CHEBI:25830,RO:0002616,CL:0002180,related via evidence or inference to,Human,,,0.160489,2


In [15]:
# Preview the first two rows of the combined node table to check the column layout
nodes.head(2)

Unnamed: 0,ID:ID,:LABEL,NAME,ONTOLOGY_IRI,ONTOLOGY_NAME,ONTOLOGY_VERSION_IRI
0,CHEBI:25830,METABOLITE,[p-quinones],http://purl.obolibrary.org/obo/chebi.owl,Chemical Entities of Biological Interest Ontology,http://purl.obolibrary.org/obo/chebi/183/chebi...
51,CHEBI:60980,METABOLITE,[beta-glucoside],http://purl.obolibrary.org/obo/chebi.owl,Chemical Entities of Biological Interest Ontology,http://purl.obolibrary.org/obo/chebi/183/chebi...


In [16]:
# Per-dataset summary: one line per input table showing the (rows, columns)
# shape of its edge frame followed by that of its node frame.
shape_report = [
    (clh_edges, clh_nodes),
    (clp_edges, clp_nodes),
    (do_edges, do_nodes),
    (envoh_edges, envoh_nodes),
    (envop_edges, envop_nodes),
    (uber_edges, uber_nodes),
    (pheno_edges, pheno_nodes),
    (flopo_edges, flopo_nodes),
    (to_edges, to_nodes),
]
for edge_table, node_table in shape_report:
    print(edge_table.shape, node_table.shape)

(81430, 9) (3221, 6)
(61114, 9) (3072, 6)
(94586, 9) (7671, 6)
(61886, 9) (3805, 6)
(46201, 9) (3827, 6)
(100594, 9) (7995, 6)
(143617, 9) (12811, 6)
(18571, 9) (2340, 6)
(17059, 9) (2009, 6)
