In [199]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import json
from tqdm import tqdm, tqdm_notebook
from Identifiers_converter import Identifiers_converter
%matplotlib inline

In [200]:
# Load the drug-name -> drug-ID mapping stored as a JSON object in cache.json.
# The encoding is pinned to UTF-8 (the encoding JSON text is exchanged in) because the
# names contain non-ASCII characters such as "μ" and "®"; the platform default
# encoding is not guaranteed to decode them correctly.
with open('cache.json', encoding='utf-8') as f:
    cache = json.load(f)

# One row per cached entry: the JSON key goes to 'drugName', its value to 'drugID'.
cache_df = pd.DataFrame(list(cache.items()), columns=['drugName', 'drugID'])
cache_df

Unnamed: 0,drugName,drugID
0,Ganciclovir,DB01004
1,Foscarnet,DB00529
2,Clozapine,DB00363
3,Glycine,DB00145
4,D-cycloserine,DB00260
...,...,...
30513,Heparin Infusion,
30514,Heparin SC,
30515,Enoxaparin/Lovenox Intermediate Dose,
30516,Tirofiban Injection,


In [201]:
def regex_or_x(x, regexes):
    for regex in regexes:
        res = regex.findall(x)
        if res != []:
            x = res[0]
    return x

In [202]:
# Keeps the text that follows a "Comparator: " label.
comparator_regex = re.compile(r'Comparator: (.*)')
# Keeps the text that precedes the first dose-like fragment ("/<digit>", " <digit>" or
# ",...<digit>"), provided one of the units mg / kg / μg occurs later in the string.
# Written as a raw string: '\/' and '\d' are invalid escape sequences in a plain string
# literal and trigger a warning on recent Python versions. The compiled pattern is unchanged.
# Known limitation (visible in the printed fixes): a name that starts with a ratio is cut
# at the slash, e.g. "80/2.25 μg Symbicort pMDI" becomes "80".
remove_mg_kg = re.compile(r'^(.*?)(?:(?:\/\d)|(?: \d)|(?:,(?:.*)\d)).*?(?:mg|kg|μg)(?:.*?)$')

# regex_or_x applies these in list order, each one to the result of the previous one.
regs = [comparator_regex, remove_mg_kg]
cache_df['cleanedDrugName'] = cache_df['drugName'].apply(lambda name: regex_or_x(name, regs))
# Disabled experiment, kept for reference: split the cleaned name on "/".
# cache_df['cleanedDrugName'] = cache_df['cleanedDrugName'].apply(lambda x: x.split("/"))

In [203]:
# Spot-check one entry: the row whose raw name is exactly 'Tositumomab 450 mg'.
cache_df.loc[cache_df['drugName'].eq('Tositumomab 450 mg')]

Unnamed: 0,drugName,drugID,cleanedDrugName
2003,Tositumomab 450 mg,,Tositumomab


In [222]:
# Show what the cleaning regexes do to a few hand-picked raw names.
print("Some examples for fixes:")
for raw_name in (
    "Tositumomab 450 mg",
    "Valsartan 160 mg capsules",
    "Comparator: timolol maleate",
    "Prochlorperazine 0.5 mg IV over 5 sec",
):
    print(f'({raw_name}) --> ({regex_or_x(raw_name, regs)})')



Some examples for fixes:
(Tositumomab 450 mg) --> (Tositumomab)
(Valsartan 160 mg capsules) --> (Valsartan)
(Comparator: timolol maleate) --> (timolol maleate)
(Prochlorperazine 0.5 mg IV over 5 sec) --> (Prochlorperazine)


In [223]:
# Share of names altered by the cleaning step, truncated (not rounded) to 3 decimals.
changed_count = (cache_df['drugName'] != cache_df['cleanedDrugName']).sum()
cleaned_count = cache_df['cleanedDrugName'].count()
fixed_fraction = int((changed_count / cleaned_count) * 1000) / 1000
print(f"Here we fixed {fixed_fraction} of the drug names")

Here we fixed 0.097 of the drug names


In [224]:
# Flag every row whose name was altered by the cleaning regexes and keep only those rows.
cache_df['changed'] = cache_df['drugName'] != cache_df['cleanedDrugName']
fixed_drugs_df = cache_df[cache_df['changed']]
# Print each raw name next to its cleaned form. The columns are read by label: integer
# keys on a label-indexed row (row[0], row[2]) rely on a positional fallback that pandas
# has deprecated.
for _idx, row in fixed_drugs_df.iterrows():
    print(f"({row['drugName']}) --> ({row['cleanedDrugName']})")

(BCNU 200mg/m2) --> (BCNU)
(BCNU 150mg/m2) --> (BCNU)
(BCNU 80mg/m2) --> (BCNU)
(TMZ 150mg/m2 six 6-week cycles) --> (TMZ)
(TMZ 150mg/m2 six 8-week cycles) --> (TMZ)
(temozolomide 150 mg/m^2) --> (temozolomide)
(temozolomide 100 mg/m^2) --> (temozolomide)
(temozolomide 200 mg/m^2) --> (temozolomide)
(Bevacizumab 7.5 mg/kg) --> (Bevacizumab)
(Oxaliplatin 130 mg/m^2) --> (Oxaliplatin)
(Leucovorin 200 mg/m^2) --> (Leucovorin)
(Capecitabine 1000 mg/m^2) --> (Capecitabine)
(Bevacizumab 5 mg/kg) --> (Bevacizumab)
(Fluorouracil 400 mg/m^2) --> (Fluorouracil)
(Oxaliplatin 85 mg/m^2) --> (Oxaliplatin)
(Palifermin 2 x 180 μg/kg/day) --> (Palifermin)
(Palifermin 6 x 60 μg/kg/day) --> (Palifermin)
(teduglutide 0.2 mg) --> (teduglutide)
(Bosentan 125 mg) --> (Bosentan)
(Bosentan 62.5 mg) --> (Bosentan)
(Capecitabine 1650 mg/m^2/day) --> (Capecitabine)
(Capecitabine 1200 mg/m^2/day) --> (Capecitabine)
(Comparator: Pioglitazone) --> (Pioglitazone)
(Comparator: Sitagliptin) --> (Sitagliptin)
(dutaster

(Foradil Aerolizer 12 μg) --> (Foradil Aerolizer)
(80/2.25 μg Symbicort pMDI) --> (80)
(lanreotide ATG 120mg) --> (lanreotide ATG)
(octreotide LAR 30mg) --> (octreotide LAR)
(Comparator: insulin glargine) --> (insulin glargine)
(ezetimibe 10 mg) --> (ezetimibe)
(AZD2516, 40 mg) --> (AZD2516)
(AZD2516, 16 mg) --> (AZD2516)
(AZD2516, 5 mg) --> (AZD2516)
(lopinavir 400 mg/ritonavir 100mg) --> (lopinavir)
(emtricitabine 200mg /tenofovir 300mg) --> (emtricitabine)
(Clopidogrel 600 mg post cangrelor) --> (Clopidogrel)
(5-FU 2600 mg/m², 24-hour-Infusion, day 1) --> (5-FU)
(Folinic acid 200 mg/m², 1-2-hour-Infusion, day 1) --> (Folinic acid)
(Oxaliplatin 85 mg/m², 2-hour-Infusion, day 1) --> (Oxaliplatin)
(Docetaxel 50 mg/m2, 1-hour-Infusion, day 1) --> (Docetaxel)
(Ipilimumab, 10 mg) --> (Ipilimumab)
(Ipilimumab, 3 mg) --> (Ipilimumab)
(Naprosyn 250 mg tablets) --> (Naprosyn)
(levonorgestrel patch with BMI 32 kg/m2 to <40 kg/m2) --> (levonorgestrel patch with BMI)
(Solifenacin 5mg) --> (Solif

(ACT-129968 500 mg tablet) --> (ACT-129968)
(Lorazepam 1 mg IM) --> (Lorazepam)
(Ativan 1 mg IM) --> (Ativan)
(Inhaled loxapine 10 mg) --> (Inhaled loxapine)
(ADASUVE 10 mg) --> (ADASUVE)
(Lamotrigine Compressed tablet 25mg) --> (Lamotrigine Compressed tablet)
(Lamotrigine Dispersible/Chewable tablets 5mg*5) --> (Lamotrigine Dispersible/Chewable tablets)
(RDEA3170 10 mg) --> (RDEA3170)
(Febuxostat 40 mg) --> (Febuxostat)
(naproxen 250 mg) --> (naproxen)
(indomethacin 25 mg) --> (indomethacin)
(lesinurad 400 mg) --> (lesinurad)
('TOPROL-XL®' ER Tablets 50 mg) --> ('TOPROL-XL®' ER Tablets)
(Metoprolol Succinate ER Tablets 50 mg) --> (Metoprolol Succinate ER Tablets)
(Metoprolol Succinate ER Tablet 200 mg) --> (Metoprolol Succinate ER Tablet)
('TOPROL-XL®' ER Tablets 200 mg) --> ('TOPROL-XL®' ER Tablets)
(Doxycycline 100mg) --> (Doxycycline)
(Risperidone Long-acting Injectable (LAI) 25 mg) --> (Risperidone Long-acting Injectable (LAI))
(Risperidone LAI 37.5 mg) --> (Risperidone LAI)
(Risp

(Metoclopramide 0.1 mg/kg, max 10mg) --> (Metoclopramide)
(Ketorolac 0.5mg/kg, max 30mg) --> (Ketorolac)
(Normal saline fluid bolus 20mL/kg, max 1000mL) --> (Normal saline fluid bolus)
(Esketamine 112 mg) --> (Esketamine)
(Duvie Tab. 0.5mg, Glucophage XR Tab. 500mg) --> (Duvie Tab.)
(Glucophage XR Tab. 500mg) --> (Glucophage XR Tab.)
(Duvie Tab. 0.5mg) --> (Duvie Tab.)
(CKD-395 0.25/500mg) --> (CKD-395)
(Reference (ambrisentan 10 mg + tadalafil 40 mg given concurrently)) --> (Reference (ambrisentan)
(FDC (ambrisentan 10 mg-tadalafil 40 mg) single dose) --> (FDC (ambrisentan)
(Ustekinumab 0.75 mg/kg) --> (Ustekinumab)
(Bupropion HCl XL tablet 300mg) --> (Bupropion HCl XL tablet)
(Bupropion HCl XL tablet 150mg) --> (Bupropion HCl XL tablet)
(Calcium folinate 200 mg/m2 iv) --> (Calcium folinate)
(Irinotecan 165 mg/m2 iv) --> (Irinotecan)
(5-fluorouracil 3200 mg/m2) --> (5-fluorouracil)
(Oxaliplatin 85 mg/m2 iv) --> (Oxaliplatin)
(Oral Contraceptive (1mg norethindrone, 0.035mg ethinyl estr

(Decapeptyl ( GnRH Agonist 0.1 mg*2 )) --> (Decapeptyl ( GnRH Agonist)
(Telmisartan 80mg + Amlodipine 5mg + Chlorthalidone 25mg) --> (Telmisartan)
(Amosartan Tab. 5/50 mg, Amosartan Tab. 5/100 mg) --> (Amosartan Tab.)
(Betamethasone 4 mg/ml) --> (Betamethasone)
(Phentermine Pill 37.5mg) --> (Phentermine Pill)
(Decapeptyl 0.1mg amp) --> (Decapeptyl)
(Vildagliptin 50 mg Oral Tablet) --> (Vildagliptin)
(Galvus 50 mg) --> (Galvus)
(Glimepiride upto 4 mg Oral Tablet) --> (Glimepiride upto)
(Amaryl 4 mg) --> (Amaryl)
(Metformin 1000 mg Oral Tablet) --> (Metformin)
(Glucophage 1000 mg) --> (Glucophage)
(JNJ-53718678, 4500 mg or Dose to be decided) --> (JNJ-53718678)
(JNJ-53718678 500 mg) --> (JNJ-53718678)
(Aspirin 81mg tablet) --> (Aspirin)
(celecoxib 200mg capsule) --> (celecoxib)
(naproxen sodium 550mg tablet) --> (naproxen sodium)
(PvP001 300 mg) --> (PvP001)
(PvP001 900 mg) --> (PvP001)
(PvP001 100 mg) --> (PvP001)
(PvP001 600 mg) --> (PvP001)
(anlotinib 8mg + AP/PC) --> (anlotinib)
(anl

In [225]:
# Among the changed names, list those whose cleaned form still contains a comma.
# regex=False: the comma is a literal substring, no pattern matching is needed.
has_comma = fixed_drugs_df['cleanedDrugName'].str.contains(',', regex=False)
# Columns are read by label rather than by integer position (row[0], row[2]), which
# relies on a positional fallback that pandas has deprecated.
for _idx, row in fixed_drugs_df[has_comma].iterrows():
    print(f"({row['drugName']}) --> ({row['cleanedDrugName']})")

(Comparator: Treatment B (Zofran, ondansetron)) --> (Treatment B (Zofran, ondansetron))
(Comparator: Treatment C (Zofran, ondansetron)) --> (Treatment C (Zofran, ondansetron))
(Comparator: Treatment A (Zofran, ondansetron)) --> (Treatment A (Zofran, ondansetron))
