In [42]:
import csv
import io
import re

import numpy as np
import pandas as pd

In [43]:
# Load the tab-separated prevalence table into a DataFrame.
card_prevalence = pd.read_csv(
    "card_prevalence.txt",
    delimiter="\t",
)

In [44]:
# Preview the first ten rows to check the columns parsed from the file.
card_prevalence.head(10)

Unnamed: 0,ARO Accession,Name,Model ID,Model Type,Pathogen,NCBI Plasmid,NCBI WGS,NCBI Chromosome,NCBI Genomic Island,Criteria,ARO Categories
0,ARO:3002501,PDC-4,1,protein homolog model,Pseudomonas aeruginosa,0.0,0.04,0.0,0.0,perfect_strict,antibiotic inactivation; carbapenem; cephalosp...
1,ARO:3002999,CblA-1,2,protein homolog model,Phocaeicola dorei,0.0,1.92,0.0,0.0,perfect,antibiotic inactivation; cephalosporin
2,ARO:3002999,CblA-1,2,protein homolog model,Phocaeicola dorei,0.0,1.92,0.0,0.0,perfect_strict,antibiotic inactivation; cephalosporin
3,ARO:3001109,SHV-52,4,protein homolog model,Klebsiella pneumoniae,0.0,0.15,0.0,0.0,perfect,antibiotic inactivation; carbapenem; cephalosp...
4,ARO:3001109,SHV-52,4,protein homolog model,Klebsiella pneumoniae,0.0,0.16,0.0,0.0,perfect_strict,antibiotic inactivation; carbapenem; cephalosp...
5,ARO:3002867,dfrF,5,protein homolog model,Streptococcus suis,0.0,0.99,0.0,0.0,perfect_strict,antibiotic target replacement; diaminopyrimidi...
6,ARO:3002867,dfrF,5,protein homolog model,Enterococcus faecalis,0.0,4.72,0.0,0.0,perfect,antibiotic target replacement; diaminopyrimidi...
7,ARO:3002867,dfrF,5,protein homolog model,Streptococcus agalactiae,0.0,0.17,0.0,0.0,perfect_strict,antibiotic target replacement; diaminopyrimidi...
8,ARO:3002867,dfrF,5,protein homolog model,Streptococcus suis,0.0,0.15,0.0,0.0,perfect,antibiotic target replacement; diaminopyrimidi...
9,ARO:3002867,dfrF,5,protein homolog model,Staphylococcus aureus,0.0,0.11,0.0,0.0,perfect,antibiotic target replacement; diaminopyrimidi...


In [45]:
# List the distinct values found in the "Model Type" column.
card_prevalence["Model Type"].unique()

array(['protein homolog model', 'protein variant model',
       'protein overexpression model', 'rRNA gene variant model'],
      dtype=object)

In [46]:
aro_file = "aro.obo"

# Read the whole ontology file into memory. The context manager guarantees
# the handle is closed even if the read fails (the bare open().read() left
# the handle open until garbage collection).
with open(aro_file, 'r') as aro_handle:
    content = aro_handle.read()

In [47]:
# A stanza starts at a "[Term]" header line and runs (lazily) up to the next
# blank line. The "\Z" alternative also captures a final stanza that reaches
# end-of-file without a trailing blank line; a pattern that only accepts
# "\n\n" as terminator silently drops that last stanza.
rx_term = re.compile(r"(\[Term\]\n.+?(?:\n\n|\Z))", re.DOTALL)

# List of raw stanza strings (header line included), in file order.
term = rx_term.findall(content)

In [48]:
# Maps each stanza's "id" value to a dict of that stanza's tag/value pairs.
term_dictionary = {}

# One "tag: value" line; the tag is a run of word characters at line start.
rx_key = re.compile(r'(\w+): (.+)')

# Tags whose values are collected into a list. Any other tag that repeats
# within a stanza keeps only its last value.
multi_valued_tags = ('synonym', 'is_a', 'relationship')

for stanza in term:
    parsed = {}

    for line in stanza.strip().split("\n"):
        hit = rx_key.match(line)
        if hit is None:
            # Lines that are not "tag: value" (such as the "[Term]" header)
            # are ignored.
            continue

        tag, text = hit.groups()

        if tag in multi_valued_tags:
            parsed.setdefault(tag, []).append(text)
            continue

        if tag == "def":
            # Drop every double quote from the definition text.
            text = text.replace('"', "")
        parsed[tag] = text

    # A stanza without an "id" line raises KeyError here.
    term_dictionary[parsed["id"]] = parsed

In [49]:
term_dictionary["ARO:0000020"]

{'def': 'Carbapenems are a class of beta-lactam antibiotics with a broad spectrum of antibacterial activity, and have a structure which renders them highly resistant to beta-lactamases. Carbapenem antibiotics are bactericidal, and act by inhibiting the synthesis of the peptidoglycan layer of bacterial cell walls. The peptidoglycan layer is important for cell wall structural integrity, especially in Gram-positive organisms. [PMID:11585791, PMID:15673804]',
 'id': 'ARO:0000020',
 'is_a': ['ARO:3000007 ! beta-lactam antibiotic'],
 'name': 'carbapenem',
 'namespace': 'antibiotic_resistance'}

In [50]:
term_dictionary["ARO:3001109"]

{'def': 'SHV-52 is a beta-lactamase that has been found in clinical isolates. []',
 'id': 'ARO:3001109',
 'is_a': ['ARO:3000015 ! SHV beta-lactamase'],
 'name': 'SHV-52',
 'namespace': 'antibiotic_resistance'}

In [51]:
# Accumulators read by later cells: ARO accessions and pathogen names seen in
# the prevalence table, plus the drug-class ids (filled by the cell that
# exports the resistance -> drug relations).
resistances = set()
pathogens = set()
drugs = set()

# Rows are serialised by the csv module so that a double quote inside a value
# is escaped instead of corrupting the quoted field. The header stays unquoted
# and every data field stays quoted, matching the previous file layout.
buffer = io.StringIO()
buffer.write("pathogen,resistance,criteria,model_type\n")
writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")

# itertuples over just the needed columns avoids building a Series per row.
edge_columns = ["Pathogen", "ARO Accession", "Criteria", "Model Type"]
edge_rows = card_prevalence[edge_columns].itertuples(index=False, name=None)

for pathogen_name, aro_accession, criteria, model_type in edge_rows:
    pathogens.add(pathogen_name)
    resistances.add(aro_accession)

    # str() keeps the text that f-string interpolation used to produce
    # (for example "nan" for a missing value).
    writer.writerow(
        [str(pathogen_name), str(aro_accession), str(criteria), str(model_type)]
    )

# Full CSV text, kept under the same name as before.
pathogen_resistance = buffer.getvalue()

with open("pathogen_resistance.csv", 'w') as output:
    output.write(pathogen_resistance)


In [52]:
# Export one row per "confers_resistance_to_drug_class" relationship and
# record every target id in the shared `drugs` set.
#
# The loop variables are deliberately NOT called `id` / `term`: `id` would
# shadow the builtin, and rebinding `term` would replace the list of raw
# stanzas, so re-running the parsing cell afterwards would fail.
buffer = io.StringIO()
buffer.write("resistance,drug\n")
writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")

relation_prefix = "confers_resistance_to_drug_class"

for term_id, term_entry in term_dictionary.items():
    # Terms without any relationship simply contribute no rows.
    for relation in term_entry.get("relationship", []):
        if not relation.startswith(relation_prefix):
            continue

        # Strip the relation name; the text before the first "!" is taken as
        # the target ARO id.
        target = relation.replace(relation_prefix, "").strip()
        aro_id = target.split("!")[0].strip()

        writer.writerow([str(term_id), str(aro_id)])
        drugs.add(aro_id)

# Full CSV text, kept under the same name as before.
resistance_drug = buffer.getvalue()

with open("resistance_drug.csv", 'w') as output:
    output.write(resistance_drug)

In [53]:
# Export one node row per drug-class term referenced by a relationship.
#
# The is_a column is the text form of a Python list; that text contains
# double quotes whenever an element contains an apostrophe, so the fields are
# written through the csv module, which escapes embedded quotes. The header
# stays unquoted and every data field stays quoted, as before.
buffer = io.StringIO()
buffer.write("aro,name,definition,is_a\n")
writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")

# Sorted so the row order does not depend on set iteration order.
for aro_id in sorted(drugs):
    entry = term_dictionary[aro_id]  # unknown id -> KeyError, as before
    name = entry['name']
    # A term without these tags gets an empty definition / empty parent list
    # instead of raising KeyError.
    definition = entry.get('def', "")
    is_a = entry.get('is_a', [])

    writer.writerow([str(aro_id), str(name), str(definition), str(is_a)])

# Full CSV text, kept under the same name as before.
drug = buffer.getvalue()

with open("drug.csv", 'w') as output:
    output.write(drug)

In [54]:
# Export one node row per ARO accession seen in the prevalence table.
#
# Every field is looked up with the accession of the row being written.
# Looking up the definition and parents with a leftover loop variable from
# another cell would give every row the same, unrelated values.
#
# Fields go through the csv module so that double quotes inside a value (the
# text form of the is_a list uses them when an element contains an
# apostrophe) are escaped. The header stays unquoted, data fields quoted.
buffer = io.StringIO()
buffer.write("aro,name,definition,is_a\n")
writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")

# Sorted (by text form) so the row order does not depend on set iteration.
for res in sorted(resistances, key=str):
    entry = term_dictionary[res]  # accession missing from ontology -> KeyError
    name = entry['name']
    # A term without these tags gets an empty definition / empty parent list
    # instead of raising KeyError.
    definition = entry.get('def', "")
    is_a = entry.get('is_a', [])

    writer.writerow([str(res), str(name), str(definition), str(is_a)])

# Full CSV text, kept under the same name as before.
resistance = buffer.getvalue()

with open("resistance.csv", 'w') as output:
    output.write(resistance)


In [55]:
# Export the distinct pathogen names, one per line under a "name" header.
# Sorting by text form makes the row order independent of set iteration order
# and tolerates a non-string entry such as a missing value.
pathogen = "name\n" + "".join(f"{p}\n" for p in sorted(pathogens, key=str))

# The context manager closes the file even if the write fails.
with open("pathogen.csv", 'w') as output:
    output.write(pathogen)