In [52]:
import pandas as pd

In [53]:
# Load the MGI disease-ontology export. The file is tab-delimited, which is
# the default separator for read_table.
df_mouse_data = pd.read_table("MGI_DO.tsv")
df_mouse_data

Unnamed: 0,DO Disease ID,DO Disease Name,OMIM IDs,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID
0,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,human,9606,HSD17B3,3293.0,
1,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,"mouse, laboratory",10090,Hsd17b3,15487.0,MGI:107177
2,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,human,9606,DHTKD1,55526.0,
3,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,"mouse, laboratory",10090,Dhtkd1,209692.0,MGI:2445096
4,DOID:0050573,2-hydroxyglutaric aciduria,,human,9606,L2HGDH,79944.0,
...,...,...,...,...,...,...,...,...
18631,DOID:905,Zellweger syndrome,,human,9606,PHYH,5264.0,
18632,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex1,71382.0,MGI:1918632
18633,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex11b,18632.0,MGI:1338882
18634,DOID:0060478,Zika fever,,human,9606,STAT2,6773.0,


In [54]:
# Keep only the disease name and its OMIM ID string; the other columns are
# not used by the cells that follow.
df_mouse_disease_omims = df_mouse_data.loc[:, ["DO Disease Name", "OMIM IDs"]]
df_mouse_disease_omims

Unnamed: 0,DO Disease Name,OMIM IDs
0,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300
1,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300
2,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750
3,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750
4,2-hydroxyglutaric aciduria,
...,...,...
18631,Zellweger syndrome,
18632,Zellweger syndrome,
18633,Zellweger syndrome,
18634,Zika fever,


In [55]:
# Discard rows with a missing name or OMIM ID, then collapse repeated
# (name, OMIM IDs) pairs. The original row labels are kept.
df_mouse_disease_omims = (
    df_mouse_disease_omims
    .dropna()
    .drop_duplicates()
)
df_mouse_disease_omims

Unnamed: 0,DO Disease Name,OMIM IDs
0,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300
2,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750
6,3MC syndrome 1,OMIM:257920
7,3MC syndrome 2,OMIM:265050
8,3MC syndrome 3,OMIM:248340
...,...,...
18622,Y-linked deafness 2,OMIM:400047
18623,Y-linked spermatogenic failure 2,OMIM:415000
18624,Yoon-Bellen neurodevelopmental syndrome,OMIM:619701
18625,Yunis-Varon syndrome,OMIM:216340


In [56]:
# Persist the unique (disease name, OMIM IDs) pairs without the row index.
# The target has a .tsv extension, so write it tab-separated; the default
# separator of to_csv is a comma, which would not match the file name or the
# other TSV files this notebook reads and writes.
df_mouse_disease_omims.to_csv("unique_mouse_disease_omims.tsv", sep="\t", index=None)

In [57]:
# Expand the "OMIM IDs" field, which may hold several IDs joined by "|",
# so that every (disease name, single OMIM ID) pair gets its own row.
disease_names = []
omim_ids = []

name_column = df_mouse_disease_omims["DO Disease Name"]
omim_column = df_mouse_disease_omims["OMIM IDs"]

for d_name, omim_field in zip(name_column, omim_column):
    # A field without "|" splits into a one-element list, so a single loop
    # handles both the single-ID and the multi-ID case.
    for omim in omim_field.split("|"):
        disease_names.append(d_name)
        omim_ids.append(omim)

df_disease_names_omim = pd.DataFrame(
    {"DiseaseNames": disease_names, "OMIMIDs": omim_ids}
)
df_disease_names_omim

Unnamed: 0,DiseaseNames,OMIMIDs
0,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300
1,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750
2,3MC syndrome 1,OMIM:257920
3,3MC syndrome 2,OMIM:265050
4,3MC syndrome 3,OMIM:248340
...,...,...
4895,Y-linked deafness 2,OMIM:400047
4896,Y-linked spermatogenic failure 2,OMIM:415000
4897,Yoon-Bellen neurodevelopmental syndrome,OMIM:619701
4898,Yunis-Varon syndrome,OMIM:216340


In [58]:
# Splitting the OMIM field can produce the same (name, ID) pair more than
# once; keep only the first occurrence of each pair, with its row label.
is_repeated_pair = df_disease_names_omim.duplicated()
df_disease_names_omim = df_disease_names_omim.loc[~is_repeated_pair]
df_disease_names_omim

Unnamed: 0,DiseaseNames,OMIMIDs
0,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300
1,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750
2,3MC syndrome 1,OMIM:257920
3,3MC syndrome 2,OMIM:265050
4,3MC syndrome 3,OMIM:248340
...,...,...
4895,Y-linked deafness 2,OMIM:400047
4896,Y-linked spermatogenic failure 2,OMIM:415000
4897,Yoon-Bellen neurodevelopmental syndrome,OMIM:619701
4898,Yunis-Varon syndrome,OMIM:216340


In [None]:
import shlex
import subprocess
import xmltodict
import json

# Parallel lists: one entry per non-empty output line returned by the
# EDirect pipeline for a disease.
OMIM_ids = list()
OMIM_names = list()
OMIM_pubmed_text = list()

# Disease names whose pipeline exited with a non-zero status.
failed_queries = list()

# Development cap: the loop stops after the first row whose index label
# exceeds this value.
MAX_ROW_LABEL = 50

# EDirect sample code this pipeline is based on:
# https://www.nlm.nih.gov/dataguide/classes/edirect-for-pubmed/samplecode3.html
# A field-restricted variant of the query looks like:
#   esearch -db pubmed -query "ACROMICRIC DYSPLASIA[Title/Abstract]"

for i, row in df_disease_names_omim.iterrows():
    r_values = row.values
    disease_name = str(r_values[0])
    # The name is passed through a shell, so quote it as one literal word.
    # Plain double quotes would let characters such as ", $, ` or \ in a
    # disease name break or change the command.
    bash_command = (
        "esearch -db pubmed -query " + shlex.quote(disease_name)
        + " | efetch -format xml"
        + " | xtract -pattern PubmedArticle -element ArticleTitle AbstractText"
    )
    result = subprocess.run(bash_command, shell=True, capture_output=True, text=True)
    if result.returncode == 0:
        # Every non-empty stdout line becomes one row of the result table.
        for item in result.stdout.split("\n"):
            if len(item) > 0:
                OMIM_ids.append(r_values[1])
                # NOTE(review): dropping the tabs joins the extracted
                # elements without any separator - confirm this is intended.
                OMIM_pubmed_text.append(item.replace("\t", ""))
                OMIM_names.append(r_values[0])
    else:
        # Record the failure instead of dropping the disease silently.
        # NOTE(review): by default the shell reports only the exit status of
        # the last pipeline stage (xtract), so earlier-stage failures may not
        # be detected here.
        failed_queries.append(disease_name)
        print("Query failed for {!r} (exit code {})".format(disease_name, result.returncode))
    # `i` is the DataFrame index label, not a running counter, so labels
    # removed by the earlier drop_duplicates are skipped here.
    if i % 10 == 0 and i > 0:
        print("{} OMIM ids processed".format(i))
    if i > MAX_ROW_LABEL:
        break

df_mouse_disease_pubmed_data = pd.DataFrame(zip(OMIM_ids, OMIM_names, OMIM_pubmed_text), columns=["OMIM", "DiseaseNames", "PubmedTitleAbstract"])
df_mouse_disease_pubmed_data

0 OMIM ids processed
10 OMIM ids processed
20 OMIM ids processed


In [94]:
# Save the collected PubMed titles/abstracts as a tab-separated file without
# the row index. The output folder is created first, because to_csv fails
# when the parent directory does not exist.
from pathlib import Path

pubmed_output_path = Path("mouse_pubmed_text") / "mouse_disease_pubmed_data.tsv"
pubmed_output_path.parent.mkdir(parents=True, exist_ok=True)
df_mouse_disease_pubmed_data.to_csv(pubmed_output_path, sep="\t", index=None)