In [1]:
import bio2bel_hmdd
import bio2bel_mirbase
from collections import defaultdict

In [2]:
# Show the installed bio2bel_hmdd version so the outputs below can be tied to it.
bio2bel_hmdd.get_version()

'0.0.3-dev'

In [3]:
# Show the installed bio2bel_mirbase version so the outputs below can be tied to it.
bio2bel_mirbase.get_version()

'0.1.1-dev'

In [4]:
# Load the HMDD table through bio2bel_hmdd's parser and preview its first rows.
# The `mir` column holds the miRNA names that later cells look up in miRBase.
hmdd_df = bio2bel_hmdd.parser.get_hmdd_df()
hmdd_df.head()

Unnamed: 0,category,mir,disease,pmid,root_name,doid,icd10cm,mesh,omim,hpo,description
0,circulation_biomarker_diagnosis_down,hsa-mir-210,Acute Cerebral Infarction,25476086,cardiovascular system disease,DOID:3526,I63,D002544,,,The serum level of miR-210 in ACI was signific...
1,circulation_biomarker_diagnosis_down,hsa-mir-126,Acute Heart Failure,26580972,,,I50,,,,Levels of miR-126 and miR-423-5p were lower i...
2,circulation_biomarker_diagnosis_down,hsa-mir-27a,Acute Heart Failure,26569364,,,I50,,,,The increase in creatinine during the first 3 ...
3,circulation_biomarker_diagnosis_down,hsa-mir-335,Acute Ischemic Stroke,27856935,cardiovascular system disease,DOID:224,I63.9,D002546,,HP:0002140,Decreased plasma miR-335 expression in patient...
4,circulation_biomarker_diagnosis_down,hsa-mir-214,Acute Myocardial Infarction,25931214,cardiovascular system disease,DOID:9408,I21,D056989,608446.0,HP:0001658,The circulating level of miR-214 was significa...


In [5]:
# Lookup provided by bio2bel_mirbase; presumably miRBase name -> miRBase
# identifier, going by the function name (verify against the library).
mirbase_name_to_id = bio2bel_mirbase.get_mirbase_name_to_id()
# Reverse lookup built by swapping keys and values. If two names shared the
# same value, only the one iterated last would survive in this dictionary.
mirbase_id_to_name = {v: k for k, v in mirbase_name_to_id.items()}

<_io.TextIOWrapper name='/Users/cthoyt/.bio2bel/mirbase/miRNA.dat.gz' encoding='UTF-8'>


parsing: 100%|██████████| 38589/38589 [00:00<00:00, 56347.90it/s]


rules:

- if it is missing, try adding "a" to the end
- if it ends in a -number, try adding "a" before the number

In [6]:
# Distinct miRNA names used by HMDD, computed once and reused in the report below.
hmdd_mir_names = hmdd_df.mir.unique()

# Names that HMDD uses but that are not keys of the miRBase name lookup.
unmapped = set(hmdd_mir_names).difference(mirbase_name_to_id)

print(f'There were {len(unmapped)} unmapped miRNA names of {len(hmdd_mir_names)} miRNAs')

There were 250 unmapped miRNA names of 1104 miRNAs


In [7]:
# Heuristic: append one of the letters a/b/c to each unmapped name and record
# every resulting candidate that is a known miRBase name.
mapped_by_letter = defaultdict(list)
for name in sorted(unmapped):
    letter_candidates = [f'{name}{suffix}' for suffix in 'abc']
    for candidate_name in letter_candidates:
        if candidate_name not in mirbase_name_to_id:
            continue
        mapped_by_letter[name].append(candidate_name)
        print(f'{name:14} => {candidate_name}')

hsa-let-7     => hsa-let-7b
hsa-let-7     => hsa-let-7c
hsa-mir-10    => hsa-mir-10a
hsa-mir-10    => hsa-mir-10b
hsa-mir-106   => hsa-mir-106a
hsa-mir-106   => hsa-mir-106b
hsa-mir-125   => hsa-mir-125a
hsa-mir-1260  => hsa-mir-1260a
hsa-mir-1260  => hsa-mir-1260b
hsa-mir-1268  => hsa-mir-1268a
hsa-mir-1268  => hsa-mir-1268b
hsa-mir-1269  => hsa-mir-1269a
hsa-mir-1269  => hsa-mir-1269b
hsa-mir-1273  => hsa-mir-1273c
hsa-mir-1295  => hsa-mir-1295a
hsa-mir-1295  => hsa-mir-1295b
hsa-mir-130   => hsa-mir-130a
hsa-mir-130   => hsa-mir-130b
hsa-mir-133   => hsa-mir-133b
hsa-mir-135   => hsa-mir-135b
hsa-mir-146   => hsa-mir-146a
hsa-mir-146   => hsa-mir-146b
hsa-mir-147   => hsa-mir-147a
hsa-mir-147   => hsa-mir-147b
hsa-mir-148   => hsa-mir-148a
hsa-mir-148   => hsa-mir-148b
hsa-mir-15    => hsa-mir-15a
hsa-mir-15    => hsa-mir-15b
hsa-mir-151   => hsa-mir-151a
hsa-mir-151   => hsa-mir-151b
hsa-mir-18    => hsa-mir-18a
hsa-mir-18    => hsa-mir-18b
hsa-mir-181   => hsa-mir-181c
hsa-mir-19 

In [8]:
# mapped_by_letter is keyed by the original HMDD name, so this counts names
# with at least one hit rather than individual (name, candidate) pairs.
print(f'There were {len(mapped_by_letter)} mappings when adding letter suffixes')

There were 67 mappings when adding letter suffixes


In [9]:
# Heuristic for unresolved miRNA families: append "-0" .. "-3" to each unmapped
# name and record every resulting candidate that is a known miRBase name.
mapped_by_number_suffix = defaultdict(list)
for name in sorted(unmapped):
    numbered_candidates = [f'{name}-{suffix}' for suffix in range(4)]
    for candidate_name in numbered_candidates:
        if candidate_name not in mirbase_name_to_id:
            continue
        mapped_by_number_suffix[name].append(candidate_name)
        print(f'{name:14} => {candidate_name}')

hsa-let-7a    => hsa-let-7a-1
hsa-let-7a    => hsa-let-7a-2
hsa-let-7a    => hsa-let-7a-3
hsa-let-7f    => hsa-let-7f-1
hsa-let-7f    => hsa-let-7f-2
hsa-mir-1     => hsa-mir-1-1
hsa-mir-1     => hsa-mir-1-2
hsa-mir-101   => hsa-mir-101-1
hsa-mir-101   => hsa-mir-101-2
hsa-mir-103a  => hsa-mir-103a-1
hsa-mir-103a  => hsa-mir-103a-2
hsa-mir-103b  => hsa-mir-103b-1
hsa-mir-103b  => hsa-mir-103b-2
hsa-mir-105   => hsa-mir-105-1
hsa-mir-105   => hsa-mir-105-2
hsa-mir-1184  => hsa-mir-1184-1
hsa-mir-1184  => hsa-mir-1184-2
hsa-mir-1184  => hsa-mir-1184-3
hsa-mir-1185  => hsa-mir-1185-1
hsa-mir-1185  => hsa-mir-1185-2
hsa-mir-1233  => hsa-mir-1233-1
hsa-mir-1233  => hsa-mir-1233-2
hsa-mir-124   => hsa-mir-124-1
hsa-mir-124   => hsa-mir-124-2
hsa-mir-124   => hsa-mir-124-3
hsa-mir-1244  => hsa-mir-1244-1
hsa-mir-1244  => hsa-mir-1244-2
hsa-mir-1244  => hsa-mir-1244-3
hsa-mir-125b  => hsa-mir-125b-1
hsa-mir-125b  => hsa-mir-125b-2
hsa-mir-128   => hsa-mir-128-1
hsa-mir-128   => hsa-mir-128-2
h

In [10]:
# mapped_by_number_suffix is keyed by the original HMDD name, so this counts
# names with at least one hit rather than individual (name, candidate) pairs.
print(f'There were {len(mapped_by_number_suffix)} mappings when adding number suffixes')

There were 70 mappings when adding number suffixes


In [11]:
# Heuristic: for unmapped names ending in "-0" .. "-4", insert one of the
# letters a/b/c/d directly before that numeric tail (e.g. x-1 -> xa-1) and
# record every resulting candidate that is a known miRBase name.
mapped_by_letter_pre_number = defaultdict(list)

numeric_tails = [f'-{number}' for number in range(5)]
for name in sorted(unmapped):
    for tail in numeric_tails:
        if name.endswith(tail):
            stem = name[:-len(tail)]
            for letter in 'abcd':
                candidate_name = f'{stem}{letter}{tail}'
                if candidate_name in mirbase_name_to_id:
                    mapped_by_letter_pre_number[name].append(candidate_name)
                    print(f'{name:14} => {candidate_name}')

hsa-mir-103-1 => hsa-mir-103a-1
hsa-mir-103-1 => hsa-mir-103b-1
hsa-mir-103-2 => hsa-mir-103a-2
hsa-mir-103-2 => hsa-mir-103b-2
hsa-mir-181-2 => hsa-mir-181a-2
hsa-mir-181-2 => hsa-mir-181b-2
hsa-mir-219-1 => hsa-mir-219a-1
hsa-mir-219-2 => hsa-mir-219a-2
hsa-mir-92-1  => hsa-mir-92a-1


In [12]:
# mapped_by_letter_pre_number is keyed by the original HMDD name, so this counts
# names with at least one hit rather than individual (name, candidate) pairs.
print(f'There were {len(mapped_by_letter_pre_number)} mappings when adding letter suffixes before the final dash')

There were 6 mappings when adding letter suffixes before the final dash


In [13]:
# Fetch the miRBase alias lookup plus a second structure of duplicates.
# NOTE(review): `dups` is later iterated with .items() and its values are tested
# with `in`, so it presumably maps identifier -> collection of names; confirm
# against bio2bel_mirbase.
mirbase_alias_to_id, dups = bio2bel_mirbase.download.get_mirbase_alias_to_id()

In [14]:
# Heuristic: resolve unmapped names through the miRBase alias table. An alias
# whose identifier has no entry in the reverse lookup is reported as DEAD and
# is not recorded as a mapping.
mapped_by_alias = {}
for name in sorted(unmapped):
    if name not in mirbase_alias_to_id:
        continue
    mapped_id = mirbase_alias_to_id[name]
    if mapped_id in mirbase_id_to_name:
        current_name = mirbase_id_to_name[mapped_id]
        print(f'{name:14} => {current_name}')
        mapped_by_alias[name] = current_name
    else:
        print(f'{name:14} DEAD {mapped_id}')

hsa-let-7a    DEAD MIMAT0000062
hsa-let-7f    DEAD MIMAT0000067
hsa-mir-101   => hsa-mir-101-1
hsa-mir-103-1 => hsa-mir-103a-1
hsa-mir-103-2 => hsa-mir-103a-2
hsa-mir-106   => hsa-mir-106a
hsa-mir-1184  => hsa-mir-1184-1
hsa-mir-1201  DEAD MI0006333
hsa-mir-122a  => hsa-mir-122
hsa-mir-1233  => hsa-mir-1233-1
hsa-mir-1244  => hsa-mir-1244-1
hsa-mir-124a-1 => hsa-mir-124-1
hsa-mir-124a-2 => hsa-mir-124-2
hsa-mir-124a-3 => hsa-mir-124-3
hsa-mir-1254  DEAD MI0006388
hsa-mir-1254-1 DEAD MI0006388
hsa-mir-1260  => hsa-mir-1260a
hsa-mir-1268  => hsa-mir-1268a
hsa-mir-1269  => hsa-mir-1269a
hsa-mir-1273  DEAD MI0006409
hsa-mir-1273a DEAD MI0006409
hsa-mir-1273d DEAD MI0014254
hsa-mir-1273g DEAD MI0018003
hsa-mir-1274a DEAD MI0006410
hsa-mir-1274b DEAD MI0006427
hsa-mir-1280  DEAD MI0006437
hsa-mir-128a  => hsa-mir-128-1
hsa-mir-128b  => hsa-mir-128-2
hsa-mir-129   => hsa-mir-129-1
hsa-mir-1295  => hsa-mir-1295a
hsa-mir-129b  => hsa-mir-129-2
hsa-mir-1300  DEAD MI0006360
hsa-mir-1308  DEAD MI0

In [19]:
# Counts only names whose alias resolved to an identifier present in the
# reverse lookup; aliases reported as DEAD were skipped and are not included.
print(f'Mapped {len(mapped_by_alias)} with aliases dictionary')

Mapped 77 with aliases dictionary


In [16]:
# Report every unmapped name that appears among the values of `dups`,
# together with the identifier it is listed under.
for name in sorted(unmapped):
    matching_identifiers = [
        identifier
        for identifier, names in dups.items()
        if name in names
    ]
    for identifier in matching_identifiers:
        print(f'{name:14} to {identifier}')

In [21]:
# Remove every name that at least one heuristic resolved. Passing the
# dictionaries directly makes set.difference use their keys (the HMDD names).
still_unmapped = unmapped.difference(
    mapped_by_letter_pre_number,
    mapped_by_letter,
    mapped_by_number_suffix,
    mapped_by_alias,
)
print(f'There are still {len(still_unmapped)} still unmapped')

There are still 97 still unmapped


In [18]:
# List, in sorted order, every HMDD miRNA name that no heuristic resolved.
remaining_names = sorted(still_unmapped)
for remaining_name in remaining_names:
    print(remaining_name)

Hsa-mir-93
hsa-mir-103
hsa-mir-112
hsa-mir-1201
hsa-mir-124a
hsa-mir-1254
hsa-mir-1254-1
hsa-mir-126a
hsa-mir-1273a
hsa-mir-1273d
hsa-mir-1273g
hsa-mir-1274a
hsa-mir-1274b
hsa-mir-1280
hsa-mir-1300
hsa-mir-1308
hsa-mir-145a
hsa-mir-156a
hsa-mir-157
hsa-mir-160
hsa-mir-161
hsa-mir-1826
hsa-mir-189
hsa-mir-1897
hsa-mir-191a
hsa-mir-192-2
hsa-mir-196a2
hsa-mir-1974
hsa-mir-200C
hsa-mir-220a
hsa-mir-294
hsa-mir-299a
hsa-mir-3
hsa-mir-3007a
hsa-mir-3098
hsa-mir-3172
hsa-mir-322
hsa-mir-326b
hsa-mir-350
hsa-mir-352
hsa-mir-355
hsa-mir-355p
hsa-mir-3560
hsa-mir-3588
hsa-mir-3607
hsa-mir-3653
hsa-mir-3656
hsa-mir-3673
hsa-mir-3676
hsa-mir-3687
hsa-mir-422b
hsa-mir-423-5p
hsa-mir-43c
hsa-mir-4417
hsa-mir-4419b
hsa-mir-453
hsa-mir-4532
hsa-mir-463
hsa-mir-466b
hsa-mir-467
hsa-mir-467a
hsa-mir-467d
hsa-mir-4792
hsa-mir-5096
hsa-mir-516
hsa-mir-528a
hsa-mir-5338
hsa-mir-5481
hsa-mir-550
hsa-mir-565
hsa-mir-566
hsa-mir-6315
hsa-mir-633b
hsa-mir-648a
hsa-mir-672
hsa-mir-674
hsa-mir-687
hsa-mir-690
h