In [1]:
%%writefile Drugbank.py
"""Get needed info about approved by FDA ligands and targets from Drugbank, 
tailor and save it. Possible to load info into correspondent variables and download/process the database
"""
import os
import json
import xml.etree.ElementTree as etree
#import xml
import datetime  # For debug
from pathlib import Path
import subprocess
import zipfile
import pubchempy

import Auxiliary as aux


def download_from_drugbank(key, USERNAME, PASSWORD, root, release='5-1-3', overwrite=False, verbose=False):
    """Download needed data from Drugbank (https://www.drugbank.ca/releases/latest),
    it usually updates from every 2 weeks to 4 months.
    !! Carefully use it: the whole database is 130 MB packed, 1.3 GB unpacked, SDF is 40 MB, ids if 1 MB
    INPUT:
        USERNAME of the user and PASSWORD
        root - store data in root/'Drugbank_extracted'. root -- root of the whole protocol.
        key - what to download(key='f' - for whole database
                               key='s' - for file with sdf structures of ligands
                               key='i' - for downloading ids of targets)
        release - release version (e.g. '5-1-3')
    OUTPUT:
        path to downloaded file, -1 if key is inappropriate
    """ 
    key_list = ['f', 's', 'i']
    if key not in key_list:
        print("Inappropriate key, should be \
              'f' for full database, \
              's' for structures of ligands, \
              'i' for targets ids")
        return -1
    else:
        # Names with which to download
        filename_list = ['all-full-database', 'all-structures', 'target-all-polypeptide-ids']
        key_filename = dict(zip(key_list, filename_list))
        filename = key_filename[key]
        url = 'https://www.drugbank.ca/releases/'+ release + '/downloads/' + filename
        # Names of extracted files
        result_name_list = ['full database.xml', 'structures.sdf', 'all.csv']
        name_dict = dict(zip(filename_list, result_name_list))
        result_name = name_dict[filename]
        # Directory where to extract
        drugbank_path = str(Path(root) / 'Drugbank_extracted')
        make_dir(drugbank_path)
        # Resulting file name
        path_to_file = str(Path(drugbank_path) / result_name)
        print(path_to_file)
        # Check if existed or needed to be overwrited
        if overwrite or not (Path(root) / 'Drugbank_extracted' / result_name).is_file():
            out_path = str(Path(drugbank_path) / (filename +'.zip'))
            # Get file
            subprocess.check_output(['curl', '-Lfv', '-o', out_path, 
                                     '-u', USERNAME + ':' + PASSWORD, url])
            print('Downloading Drugbank')
            # Unpack file to the same directory
            with zipfile.ZipFile(out_path, 'r') as zip_ref:
                print("Extracting " + out_path)
                zip_ref.extractall(drugbank_path)
            # Delete downloaded .zip
            subprocess.check_output(['rm', out_path])
        return path_to_file


def make_dir(dir_path):
    """Make directory with absolute path dir_name recursively"""
    if not os.path.exists(dir_path):
        os.makedirs(dir_path, exist_ok=True)
        print("Directory " , dir_path ,  " Created ")
    else:
        pass
    
    
def make_dir_from_list(dirList):
    for dirName in dirList:
        if not os.path.exists(dirName):
            os.mkdir(dirName)
            print("Directory ", dirName,  " created ")
        else:
            pass
        

def get_name_seq_from_fasta_lines(fasta):
    """Returns name and a/a sequence from fasta as a string
    INPUT - fasta as a string (with \n)
    OUTPUT - list (name, seq)
    """
    fasta_splitted = fasta.split('\n')
    name = fasta_splitted[0].split('|')[-1]
    seq = ''.join(fasta_splitted[1:])
    return name, seq
        

def dump_info_db(root):
    """Save all collected from Drugbank data as json files to root/Drugbank_exracted"""
    # All names of files to be dumped to root/Drugbank_extracted with name.txt, where name is from names
    names = ['ligands_unii', 'ligands_drugbank_ids', 'ligands_names', 'ligands_ids', 'ligands_resources',
             'ligands_ids_by_names', 'ligands_resources_by_names',
             'ligands_names_and_their_targets_ids', 'ligands_names_and_their_targets_resources',
             'targets_ids', 'targets_resources', 'targets_names',
             'ligands_smiles', 'ligands_names_and_smiles',
             'approved_flags', 'ligands_and_approved_flags',
             'targets_fastas', 'targets_names_and_fastas',
            ]
    name_full = str(Path(root) / 'Drugbank_extracted')
    make_dir_from_list([name_full])    
    for name in names:
        exec('global ' + name)
    for name in names:
        with open(str(Path(name_full) / (name + ".txt")), 'w') as f:
            exec('json.dump(' + name + ', f, ensure_ascii=False)')
            

def load_info_db(root):
    """Load all collected from Drugbank data as json files from root/Drugbank_exracted"""
    # All names of files to be loaded from root/Drugbank_extracted with name.txt, where name is from names
    names = ['ligands_unii', 'ligands_drugbank_ids', 'ligands_names', 'ligands_ids', 'ligands_resources',
             'ligands_ids_by_names', 'ligands_resources_by_names',
             'ligands_names_and_their_targets_ids', 'ligands_names_and_their_targets_resources',
             'targets_ids', 'targets_resources', 'targets_names',
             'ligands_smiles', 'ligands_names_and_smiles',
             'approved_flags', 'ligands_and_approved_flags',
             'targets_fastas', 'targets_names_and_fastas',
            ]
    name_full = str(Path(root) / 'Drugbank_extracted')
    aux.load_info_db_from_namelist(names, root)


def db_tag(element, string):
    """Check that tag of element in Drugbank == needed string"""
    return element.tag.split("{http://www.drugbank.ca}")[1] == string


def process_drugbank(root, name='full database.xml'):
    """Get needed info from the full Drugbank database, placed in root with name
    https://www.drugbank.ca/docs/drugbank.xsd -- scheme of the base
    """
    # Location of the database
    source = str(Path(root) / 'Drugbank_extracted' / name)
    # Get an iterable
    context = etree.iterparse(source, events=("start", "end"))

    # Turn it into an iterator
    context = iter(context)

    # Get the root element, for Python 2 here should be: event, root_tree = context.next()
    event, root_tree = next(context)

    # Here go lists with collected information about approved by FDA ligands in Drugbank
    # !!! Maybe use some OOP instead of list of lists
    
    # Initialize variables for ligands
    global ligands_unii, ligands_drugbank_ids, ligands_names, ligands_ids, ligands_resources
    global ligands_ids_by_names, ligands_resources_by_names
    global approved_flags, ligands_and_approved_flags
    # Forligands and their targets
    global ligands_names_and_their_targets_ids, ligands_names_and_their_targets_resources
    global ligands_smiles, ligands_names_and_smiles
    # For targets
    global targets_ids, targets_resources, targets_names
    global targets_fastas, targets_names_and_fastas
    
    
    ligands_unii = []  # List of ligands' UNII ids
    ligands_drugbank_ids = []  # List of lists of Drugbank ids (one ligand could have several)
    ligands_names = []  # List of usual names
    ligands_ids = []  # List of lists of ids in different DBs
    ligands_resources = []  # List of lists of resources in different DBs
    ligands_smiles = []  # List of all SMILES of ligands
    approved_flags = []  # List of True/False of approvance of drug by FDA
    smiles = None
    
    targets_ids = []  # List of lists of lists of ids in different DBs
    targets_resources = []  # List of lists of lists of resources in different DBs
    targets_names = []  # List of lists of names of all targets
    targets_fastas = []  # List of lists of fastas
    unii = ""
    
    # Database iteration (DB is too big to parse it directly)
    for event, elem in context:
        # Iterate over drugs
        if event == "end" and db_tag(elem,"drug"):
            # Will be true in the end if one of products is approved
            # The ligand will be regarded as approved
            fda_approved = False
            # Flag check that
            f_drug_entry = False
            # Flag of having SMILES
            f_smiles = False
            smiles = None
            
            # Refresh all temporal lists of ids and resources
            # for ligands
            l_ids = []
            l_resources = []
            l_db_ids = []
            # for targets 
            t_id = []  # Id of one target in one database
            t_ids = []  # List of ids of one target in all databases
            t_resource = []  # Resource of one target (name of one database) 
            t_resources = []  # List of resources of one target (names of all databases)
            t_name = []  # Name of one target 
            t_names = []  # List of names of all targets
            t_fasta = []  # Fasta of one target
            t_fastas = []  # List of fasta strings of targets

            # Iteration over all fields of one drug
            for item in list(elem):
                # Get basic info of drug
                if db_tag(item, "unii"):
                    unii = item.text
                    # Check that it's not part of other drug entry (mb name 'drug' too as an interacting drug)
                    f_drug_entry = True
                if db_tag(item, "drugbank-id"):
                    l_db_ids.append(item.text)
                if db_tag(item, "name"):
                    name = item.text                    
                    
                # Get SMILES
                if db_tag(item, 'calculated-properties'):
                    # Iterate over properties
                    for item1 in list(item):
                        # Iterate over info about properties
                        for item2 in list(item1):
                            if db_tag(item2, 'kind'):
                                if item2.text == 'SMILES':
                                    f_smiles = True
                            if db_tag(item2, 'value') and f_smiles:
                                f_smiles = False
                                smiles = item2.text
                            
                # Checking if drug is approved by FDA
                if db_tag(item, "products"):
                    # Iterate over products
                    for it1 in list(item):
                        for it in list(it1):
                            # Check whether approved product exists
                            if db_tag(it, "approved"):
                                if it.text == "true":
                                    fda_approved = True

                # Get identifiers of drug and their databases
                if db_tag(item, "external-identifiers"):   
                    for it1 in list(item):
                        for it in list(it1):
                            if db_tag(it, "resource"):
                                l_resources.append(it.text)
                            if db_tag(it, "identifier"):
                                l_ids.append(it.text)

                # Get identifiers of targets and their databases
                if db_tag(item, "targets"):
                    for it1 in list(item):
                        f_polypeptide = False
                        # Iterating over properties of one target
                        for it2 in list(it1):
                            # Get name of the target
                            if db_tag(it2, 'name'):
                                t_name = it2.text
                            # Get info only about polypeptide target
                            if db_tag(it2, "polypeptide"):
                                f_polypeptide = True
                                for it3 in list(it2):
                                    if db_tag(it3, "external-identifiers"):
                                        for it4 in list(it3):
                                            for it5 in list(it4):
                                                if db_tag(it5, "resource"):
                                                    t_resource.append(it5.text)
                                                if db_tag(it5, "identifier"):
                                                    t_id.append(it5.text)
                                    # Get a/a sequence
                                    if db_tag(it3, "amino-acid-sequence"):
                                        t_fasta = it3.text
                                            
                                    # Gather all ids and resources of one polypeptide target
                                if f_polypeptide:
                                    t_ids.append(t_id)
                                    t_id = []
                                    t_resources.append(t_resource)
                                    t_resource = []
                                    t_names.append(t_name)
                                    t_name = []
                                    t_fastas.append(t_fasta)
                                    t_fasta = []


            # Clear in order not to store the whole database in memory    
            root_tree.clear()

            # If it was really drug entry => add information
            
            if f_drug_entry and fda_approved:  # If needed to store all drugs, then delete fda_approved
                approved_flags.append(fda_approved)
                ligands_names.append(name)
                ligands_unii.append(unii)
                ligands_ids.append(l_ids)
                ligands_drugbank_ids.append(l_db_ids)
                ligands_resources.append(l_resources)
                ligands_smiles.append(smiles)

                targets_ids.append(t_ids)
                targets_resources.append(t_resources)
                targets_names.append(t_names)
                targets_fastas.append(t_fastas)
    
    # Create some useful dictionaries
    ligands_ids_by_names = dict(zip(ligands_names, ligands_ids))
    ligands_resources_by_names = dict(zip(ligands_names, ligands_resources))
    ligands_names_and_their_targets_ids = dict(zip(ligands_names, targets_ids))
    ligands_names_and_their_targets_resources = dict(zip(ligands_names, targets_resources))
    ligands_names_and_smiles = dict(zip(ligands_names, ligands_smiles))
    ligands_and_approved_flags = dict(zip(ligands_names, approved_flags))
    # Make dictionary {name of target:a/a sequence} and write to file sequences
    list_names = []
    list_fastas = []
    # Create file where to save fastas
    with open(str(Path(root) / 'Drugbank_extracted' / 'Drugbank_targets.fasta'), "w+") as myfile:
        pass
    for l_targets in targets_fastas:
        for fasta in l_targets:
            name, seq = get_name_seq_from_fasta_lines(fasta)
            if name not in list_names:
                list_names.append(name)
                list_fastas.append(seq)
                with open(str(Path(root) / 'Drugbank_extracted' / 'Drugbank_targets.fasta'), "a+") as myfile:
                    myfile.write(fasta)
                    myfile.write('\n')
    targets_names_and_fastas = dict(zip(list_names, list_fastas))
    
    # Save obtained data        
    dump_info_db(root)
    

def add_smiles_from_pubchem(root, ligands_names_and_smiles):
    """Add SMILES of ligands which don't have SMILES in Drugbank, but have it in PubChem (~10 ligands)"""
    # Load needed lists with data
    aux.load_info_db_from_namelist(['ligands_names', 'ligands_smiles'], root)
    # Iterating over names, finding ones without SMILES and trying to get SMILES from PubChem    
    for name in ligands_names:
        if not ligands_names_and_smiles[name]:
            try:
                ind_compound = ligands_resources_by_names[name].index('PubChem Compound')
                # Find SMILES
                pubchem = ligands_ids_by_names[name][ind_compound]
                # Get smiles from PubCHEM
                c = pubchempy.Compound.from_cid(pubchem)
                smiles = c.isomeric_smiles
                ligands_smiles[ligands_names.index(name)] = smiles
            except ValueError:
                pass
    # Save corrected data
    ligands_names_and_smiles = dict(zip(ligands_names, ligands_smiles))
    name_full = str(Path(root) / 'Drugbank_extracted')
    with open(str(Path(name_full) / ('ligands_smiles' + '.txt')), 'w') as f:
            json.dump(ligands_smiles, f, ensure_ascii=False)
    with open(str(Path(name_full) / ('ligands_names_and_smiles' + ".txt")), 'w') as f:
            json.dump(ligands_names_and_smiles, f, ensure_ascii=False)


if __name__ == "__main__":
    
    # Directory where all data placed
    root = '/home/anton_maximov/BACHELOR'
    root = os.getcwd()
    print(root)
    
    
    # If needed to download new version of Drugbank
    # key='f' - for whole database
    # key='s' - for file with sdf structures of ligands
    # key='i' - for targets ids
    download_from_drugbank('i', 'maksimov.as@phystech.edu', 'drugsandbanks', root, release='5-1-3')
    
    # Processing if new information needed or wasn't dumped before
    #process_drugbank(root)
    # Add SMILES of ligands which don't have one in Drugbank but have it in PubChem
    #add_smiles_from_pubchem(root, ligands_names_and_smiles)
    print(datetime.datetime.now())
    # Start of user actions
    # Load data from .txts from root/Drugbank_extracted to program
    #load_info_db(root)
    
    # Some checks of work
    #print(ligands_ids_by_names)
    #name_lig = 'Acetazolamide'#'Methazolamide'  #'Acetaminophen' 'Acetazolamide' 
    #uniprot = 'P00918'
    #a = get_all_smiles_of_ligands(ligands_ids_by_names, ligands_resources_by_names)
    #print(aux.get_pdbs_from_smiles(sm, 0.5))
    #print(aux.get_common_pdbs_from_ligand_name_and_target_uniprot(name_lig, uniprot, -0.1,
    #                                                  ligands_ids_by_names, ligands_resources_by_names,
    #                       ligands_names_and_their_targets_ids, ligands_names_and_their_targets_resources))
    #print(aux.get_common_pdbs_with_all_targets_of_ligand(name_lig, -0.1,
    #                                                  ligands_ids_by_names, ligands_resources_by_names,
    #                       ligands_names_and_their_targets_ids, ligands_names_and_their_targets_resources))

Overwriting Drugbank.py


In [8]:
print(len(ligands_names_and_smiles))
k = 0
for el in ligands_names_and_smiles.keys():
    if ligands_names_and_smiles[el]:
        k += 1
print(k)

3220
1939


In [17]:
print(targets_names_and_fastas['Low affinity immunoglobulin gamma Fc region receptor III-B'])
#print(targets_fastas[1][1].split('\n'))
#print(get_name_seq_from_fasta_lines(targets_fastas[1][1]))

MWQLLLPTALLLLVSAGMRTEDLPKAVVFLEPQWYSVLEKDSVTLKCQGAYSPEDNSTQWFHNESLISSQASSYFIDAATVNDSGEYRCQTNLSTLSDPVQLEVHIGWLLLQAPRWVFKEEDPIHLRCHSWKNTALHKVTYLQNGKDRKYFHHNSDFHIPKATLKDSGSYFCRGLVGSKNVSSETVNITITQGLAVSTISSFSPPGYQVSFCLVMVLLFAVDTGLYFSVKTNI


In [6]:
#print(list(ligands_and_approved_flags.values()).count(True))
#print(targets_fastas[100])
#print(ligands_smiles)
import RDkit as rd
df = rd.get_closest_smiles_name('ClC1=CC=CC=C1CN1CCC2=C(C1)C=CS2', ligands_names_and_smiles, 3)
#print(ligands_and_approved_flags)
#print(ligands_names_and_their_targets_ids)
#print(ligands_names)
#for name, flag in ligands_and_approved_flags.items():    # for name, age in dictionary.iteritems():  (for Python 2.x)
#    if not flag:
#        print(name)

ModuleNotFoundError: No module named 'pandas'

In [None]:
download_drugbank

In [21]:
#load_info_db(root)
#add_smiles_from_pubchem(root)
#dump_info_db(root)
name = 'Dalbavancin'
print(ligands_smiles[ligands_names.index(name)])
print(ligands_names_and_smiles[name])
#add_smiles_from_pubchem(root)
#print(len(ligands_names))
#print(len(ligands_smiles))

CC(C)CCCCCCCCC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1OC2=C3C=C4C=C2OC5=C(C=C(C=C5)[C@H]([C@H]6C(=O)NC(C7=CC(=CC(=C7C8=C(C=CC(=C8)[C@H](C(=O)N6)NC(=O)[C@@H]4NC(=O)[C@@H]9C1=CC(=CC(=C1Cl)O)OC1=C(C=CC(=C1)[C@H](C(=O)N[C@H](CC1=CC=C(O3)C=C1)C(=O)N9)NC)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)O)C(=O)NCCCN(C)C)O)Cl)C(=O)O)O)O
CC(C)CCCCCCCCC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1OC2=C3C=C4C=C2OC5=C(C=C(C=C5)[C@H]([C@H]6C(=O)NC(C7=CC(=CC(=C7C8=C(C=CC(=C8)[C@H](C(=O)N6)NC(=O)[C@@H]4NC(=O)[C@@H]9C1=CC(=CC(=C1Cl)O)OC1=C(C=CC(=C1)[C@H](C(=O)N[C@H](CC1=CC=C(O3)C=C1)C(=O)N9)NC)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)O)C(=O)NCCCN(C)C)O)Cl)C(=O)O)O)O


In [3]:
print(aux.get_smiles_from_name_from_pubchem('Dalbavancin', ligands_ids_by_names, ligands_resources_by_names))

CC(C)CCCCCCCCC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1OC2=C3C=C4C=C2OC5=C(C=C(C=C5)[C@H]([C@H]6C(=O)NC(C7=CC(=CC(=C7C8=C(C=CC(=C8)[C@H](C(=O)N6)NC(=O)[C@@H]4NC(=O)[C@@H]9C1=CC(=CC(=C1Cl)O)OC1=C(C=CC(=C1)[C@H](C(=O)N[C@H](CC1=CC=C(O3)C=C1)C(=O)N9)NC)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)O)C(=O)NCCCN(C)C)O)Cl)C(=O)O)O)O


In [17]:
#print(a)
#print(ligands_smiles)
#load_info_db(root)
#print(ligands_names_and_smiles)
for name in ligands_names:
    try:
        if not ligands_names_and_smiles[name]:
            ind_compound = ligands_resources_by_names[name].index('PubChem Compound')   
            print(name)
    except:
        pass
#dump_info_db(root)

Colestipol
Benzylpenicilloyl Polylysine
Dalbavancin
Butriptyline
Phenoxyethanol
Carbon monoxide
Labetuzumab govitecan


In [18]:
ligands_names_and_smiles['Naronapride']
ligands_resources_by_names['Naronapride']
print(ligands_names_and_their_targets_ids['Carbon monoxide'])

[['HGNC:6915', 'P02144', 'MYG_HUMAN']]


In [5]:
d = dict(zip(ligands_names, targets_names))
print(len(d.keys()))
k = 0
for name in d.keys():
    if not d[name]:
        k += 1
        print(name)

11048
Dornase alfa
Asparaginase Escherichia coli
Gramicidin D
Rasburicase
Imiglucerase
Indium In-111 satumomab pendetide
Pegaspargase
Human Serum Albumin
Digoxin Immune Fab (Ovine)
Daptomycin
Pancrelipase
Alglucerase
Laronidase
Agalsidase beta
Gadodiamide
Pipobroman
Calcium acetate
Crotamiton
Diatrizoate
Chlorambucil
Mitomycin
Capreomycin
Calcium glucoheptonate
Pyrazinamide
Aminohippuric acid
Tioguanine
Colestipol
Gentian violet cation
Entecavir
Verteporfin
Altretamine
Vancomycin
Bentoquatam
Oxaliplatin
Gadoversetamide
Methoxsalen
Perflutren
Succimer
Carbenicillin
Gadoteridol
Linezolid
Furazolidone
Butoconazole
Nystatin
Metaxalone
Trimethobenzamide
Benzyl Benzoate
Amphotericin B
Icodextrin
Mannitol
Gadobenic acid
Nalidixic acid
Uracil mustard
Haloprogin
Colistin
Natamycin
Phensuximide
Hydroxypropyl cellulose
Temozolomide
Chlorphenesin
Penicillamine
Pemirolast
Mechlorethamine
Tinidazole
Colesevelam
Metacycline
Dirithromycin
Carboplatin
Ethiodized oil
Edetic Acid
Telithromycin
Busulfan
H

Pinus elliottii pollen
Pinus virginiana pollen
Pinus monticola pollen
Populus nigra pollen
Elaeagnus angustifolia pollen
Salix lasiolepis pollen
Salix discolor pollen
Populus balsamifera subsp. trichocarpa pollen
Taraxacum officinale pollen
Atriplex polycarpa pollen
Baccharis halimifolia pollen
Hymenoclea salsola pollen
Allenrolfea occidentalis pollen
Atriplex lentiformis pollen
Artemisia douglasiana pollen
Artemisia ludoviciana pollen
Urtica dioica pollen
Iva axillaris pollen
Ambrosia deltoidea pollen
Sarcobatus vermiculatus pollen
Atriplex canescens pollen
Artemisia absinthium pollen
Bordetella pertussis filamentous hemagglutinin antigen (formaldehyde inactivated)
Bordetella pertussis pertactin antigen
Bordetella pertussis fimbriae 2/3 antigen
Poliovirus type 1 antigen (formaldehyde inactivated)
Poliovirus type 2 antigen (formaldehyde inactivated)
Poliovirus type 3 antigen (formaldehyde inactivated)
Neisseria meningitidis group a capsular polysaccharide diphtheria toxoid conjugate an

OSI-027
Evocalcet
Zamicastat
MBX-8025
Sagopilone
Resminostat
Fanapanel
Eleclazine
Esreboxetine
Fresolimumab
DSM-265
Naproxen etemesil
Polmacoxib
Voxtalisib
Bromperidol
Pumosetrag
Samarium
Remimazolam
Triciribine
Lisofylline
Iobitridol
PF-03635659
Vatreptacog alfa
Sabarubicin
R-428
Gemigliptin
Indusatumab vedotin
Usistapide
Galeterone
VTP-27999
Anagliptin
HSD-016
Diazepinomicin
Somavaratan
Sulforaphane
Fozivudine Tidoxil
MK-3207
Liothyronine I-131
Bitopertin
Orvepitant
PAC-14028
CI-1040
BC-3781
Muplestim
CC-401
SCH-900271
Steviolbioside
Tipelukast
Filociclovir
Ioforminol
Verdiperstat
Tavilermide
Sonedenoson
Theanine
Derenofylline
Nadifloxacin
Lifibrol
Tempol
SCY-635
Caprylic alcohol
Chlorine Dioxide
Zalypsis
Bococizumab
Rimegepant
Belotecan
Conbercept
GLPG-0492
Ralinepag
Semagacestat
Bevenopran
Favipiravir
TU-100
Pf-04531083
Dexelvucitabine
MK-3118
Levodropropizine
Taurolidine
Lynestrenol
Biphenyl dimethyl dicarboxylate
CPG-52852
Perfluoro tert-butylcyclohexane
Zabofloxacin
Betulinic Ac

In [11]:
#print(k)
ligands_smiles

[None,
 None,
 None,
 None,
 'CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CC(O)=O)NC(=O)CNC(=O)[C@H](CC(N)=O)NC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)CC1=CC=CC=C1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(O)=O)C(=O)N[C@@H](CCC(O)=O)C(=O)N[C@@H](CC1=CC=C(O)C=C1)C(=O)N[C@@H](CC(C)C)C(O)=O',
 None,
 None,
 None,
 None,
 None,
 None,
 'CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H](CC1=CC=C(O)C=C1)NC(=O)[C@H](CO)NC(=O)[C@H](CC1=CNC2=CC=CC=C12)NC(=O)[C@H](CC1=CN=CN1)NC(=O)[C@@H]1CCC(=O)N1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N1CCC[C@H]1C(=O)NNC(N)=O',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(=O)N[C@@H](C)C(=O)N[C@H](C(C)C)C(=O)N[C@@H](C(C)C)C(=O)N[C@H](C(C)C)C(=O)N[C@@H](CC1=CNC2=C1C=CC=C2)C(=O)N[C@H](CC(C)C)C(=O)N[C@@H](CC1=CNC2=C1C=CC=C2)C(=O)N[C@H](CC(C)C)C(=O)N[C@@H](CC1=CNC2=C1C=CC=C2)C(=O)N[C@H](CC(C)C)C(=O)N[C

In [7]:
ligands_names_and_smiles = []
dump_info_db(root)

In [1]:
root = '/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR'
#dump_info_db(root)
pubchem = 46506142# 1983
c = pubchempy.Substance.from_sid(pubchem)
#d = c.to_dict(properties=['atoms', 'bonds', 'inchi'])
#print(d['atoms'])
print(c.smiles)

NameError: name 'pubchempy' is not defined

In [19]:
#ligands_resources
c = []
s = []
for ind, i in enumerate(ligands_resources):
    if 'PubChem Substance' in i:
        s.append(ind)
    if 'PubChem Compound' in i:
        c.append(ind)

In [39]:
name='full database.xml'
root = '/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR'
source = str(Path(root) / name)
# Get an iterable
context = etree.iterparse(source, events=("start", "end"))

# Turn it into an iterator
context = iter(context)

# Get the root element, for Python 2 here should be: event, root_tree = context.next()
event, root_tree = next(context)
sm = []
names = []
f_smiles = False
for event, elem in context:
    # Find drugs storage
    if event == "end" and db_tag(elem,"drug"):
        for item in list(elem):
            if db_tag(item, "targets"):
                for it1 in list(item):
                    for it2 in list(it1):
                        #print('{' + it2.tag.split('}')[1] + '}')
                        if db_tag(it2, 'name'):
                            print(it2.text)
                        #if db_tag(it2, "polypeptide"):
                        #    for it3 in list(it2):
                         #       print('|' + it3.tag.split('}')[1] + '|')
                         #       if it3.text:
                          #          print(item.text)
                                #if db_tag(it3, "external-identifiers"):
                                #    for it4 in list(it3):
                                #        for it5 in list(it4):
                                 #           print(it5.tag, it5.text)
            if db_tag(item, "name"):
                name = item.text
                #print(name)
                names.append(name)
    root_tree.clear()
            #for it1 in list(item):
            #    print(it1.tag)
                #    print(it1.text)#print(it1.tag, it1.text)
                #if db_tag(item, "experimental-properties"):
                #for it1 in list(item):
                 #   print(it1)
                    #for it2 in it1:
                    #   
#d = dict(zip(names, sm))            

Prothrombin
Epidermal growth factor receptor
Low affinity immunoglobulin gamma Fc region receptor III-B
Complement C1r subcomponent
Complement C1q subcomponent subunit A
Complement C1q subcomponent subunit B
Complement C1q subcomponent subunit C
Low affinity immunoglobulin gamma Fc region receptor III-A
Complement C1s subcomponent
High affinity immunoglobulin gamma Fc receptor I
Low affinity immunoglobulin gamma Fc region receptor II-a
Low affinity immunoglobulin gamma Fc region receptor II-b
Low affinity immunoglobulin gamma Fc region receptor II-c
DNA
Interleukin-2 receptor subunit alpha
Interleukin-2 receptor subunit beta
Cytokine receptor common subunit gamma
Tumor necrosis factor
Tumor necrosis factor receptor superfamily member 1B
High affinity immunoglobulin gamma Fc receptor I
Low affinity immunoglobulin gamma Fc region receptor III-A
Low affinity immunoglobulin gamma Fc region receptor II-a
Low affinity immunoglobulin gamma Fc region receptor II-b
Low affinity immunoglobulin g

KeyboardInterrupt: 

In [56]:
print(d['Omalizumab'])

COC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@@H](N)CC(O)=O


In [23]:
### GARBAGE
def iterative_processing(root, name='full database.xml', *args):
    """Get needed info from the full Drugbank database, placed in root with name"""
    # !!! Check the first doubled entry
    source = str(Path(root) / name)
    # Get an iterable
    context = etree.iterparse(source, events=("start", "end"))

    # Turn it into an iterator
    context = iter(context)

    # Get the root element, for Python 2 here should be: event, root_tree = context.next()
    event, root_tree = next(context)
    for event, elem in context:
        # Find drugs storage
        if event == "end" and db_tag(elem,"drug"):
            item_pre = 'elem'
            for ind, arg in enumerate(args):
                item_cur = 'item_' + str(ind)
                exec(item)
                exec('for ' + item_cur + ' in list(' + item_cur + '):'
                     
                    ind = 1
                    exec('')
                    for item
                if db_tag(item, "targets"):
                        for it1 in list(item):
                            for it2 in list(it1):
                                if db_tag(it2, "polypeptide"):
                                    for it3 in list(it2):
                                        if db_tag(it3, "external-identifiers"):
                                            for it4 in list(it3):
                                                for it5 in list(it4):
                                                    if db_tag(it5, "resource")
        root_tree.clear()

5310
5323
5328
5331
5332
5336
5337
5378
5384
5392
5434
5435
5459
5460
5469
5471
5502
5515
5525
5536
5571
10693
10694
10696
10761
10892
10893
10894
10895
10899
10934
10935
10961
10963
10974
10983
10991
10992
10993


In [28]:
#ligands_names_and_their_targets_resources = dict(zip(ligands_names, targets_resources))
#print(ligands_names_and_their_targets_resources)
#print(targets_ids)
print(ligands_names)

['Cetuximab', 'Dornase alfa', 'Denileukin diftitox', 'Etanercept', 'Bivalirudin', 'Leuprolide', 'Peginterferon alfa-2a', 'Alteplase', 'Sermorelin', 'Interferon alfa-n1', 'Urokinase', 'Goserelin', 'Reteplase', 'Erythropoietin', 'Salmon Calcitonin', 'Interferon alfa-n3', 'Pegfilgrastim', 'Sargramostim', 'Peginterferon alfa-2b', 'Asparaginase Escherichia coli', 'Thyrotropin alfa', 'Antihemophilic factor, human recombinant', 'Anakinra', 'Gramicidin D', 'Immune Globulin Human', 'Anistreplase', 'Tenecteplase', 'Menotropins', 'Interferon gamma-1b', 'Interferon Alfa-2a, Recombinant', 'Desmopressin', 'Coagulation factor VIIa Recombinant Human', 'Oprelvekin', 'Palifermin', 'Glucagon', 'Aldesleukin', 'Botulinum Toxin Type B', 'Omalizumab', 'Lutropin alfa', 'Lyme disease vaccine (recombinant OspA)', 'Insulin Lispro', 'Insulin Glargine', 'Collagenase clostridium histolyticum', 'Rasburicase', 'Cetrorelix', 'Adalimumab', 'Somatotropin', 'Imiglucerase', 'Abciximab', 'Drotrecogin alfa', 'Gemtuzumab ozo

0.3


ParseError: syntax error: line 1, column 0 (<string>)

In [29]:
aux.get_pdbs_from_uniprot('P00915')

['1AZM',
 '1BZM',
 '1CRM',
 '1CZM',
 '1HCB',
 '1HUG',
 '1HUH',
 '1J9W',
 '1JV0',
 '2CAB',
 '2FOY',
 '2FW4',
 '2IT4',
 '2NMX',
 '2NN1',
 '2NN7',
 '3LXE',
 '3W6H',
 '3W6I',
 '4WR7',
 '4WUP',
 '4WUQ',
 '5E2M',
 '5GMM',
 '6EVR',
 '6EX1',
 '6F3B',
 '6FAF',
 '6FAG',
 '6G3V',
 '6HWZ']

In [9]:
print(resources_by_names['Cetuximab'])
print(resources_by_names[name].index('PubChem Substance'))

NameError: name 'resources_by_names' is not defined

In [21]:
#dump_info_db(root)
#print(ligands_names)

In [34]:
import requests
url = 'https://www.drugbank.ca/releases/latest'

#print(r.content)

# Create an ElementTree instance   
import xml.etree.ElementTree as ET

response = requests.get(url, allow_redirects=True)
with open(str(DRUGBANK_PATH / 'all.xml'), 'wb') as foutput:
    foutput.write(response.content)
#open(str(DRUGBANK_PATH / 'all.xml'), 'wb').write(response.content)
#tree = ElementTree.fromstring(response.content)

#tree = ET.parse(str(DRUGBANK_PATH / 'all.xml'))
#root = tree.getroot()
# Get all 'book' elements that have a 'name' child with a string value of 'abc'
#books = tree.xpath('//*[@id="full"]/div[2]/table/tbody/tr/td[2]')
#print(books)
#print(a)
 # '//*[@id="full"]/div[2]/table/tbody/tr/td[2]'

In [32]:
import urllib3
url = 'https://www.drugbank.ca/releases/latest'
s = urllib3.urlopen(url)
contents = s.read()
file = open("export.xml", 'w')
file.write(contents)
file.close()

AttributeError: module 'urllib3' has no attribute 'urlopen'

Checking databases of ligands

In [None]:
print(len(all_resources_of_ligands))
k = 0
b = []
for i in all_resources_of_ligands:
    f = False
    for j in i:
        if j == 'PubChem Substance':
            f = True
    if f:
        k += 1
    else:
        for el in i:
            b.append(el)
print(set(b))
from collections import Counter

print(Counter(b).values())
types = dict(zip(b, Counter(b).values()))
print(types)
print(sum(Counter(b).values()))
print(k, 'from ', len(all_resources_of_ligands), 'have PubCHEM ID')

# Retrieve PubCHEM ids of ligands and get their SMILES

https://pubchempy.readthedocs.io/en/latest/guide/install.html

additional manual check of id name smiles

In [39]:
for ind, i in enumerate(pubchem_numbers):
    print(ind)
    print(pubchem_ids[ind], all_ligands_name[pubchem_numbers[ind]], pubchem_smiles[ind])

0
46507011 Lepirudin COC1=CC=CC(=C1)CN2C(=O)C3=C(C(=CN3)C4=CC=CC=C4)NC2=S
1
46507042 Cetuximab CCOC1=CC=C(C=C1)NS(=O)(=O)C2=CC(=NN2)C(=O)NC3=CC(=CC=C3)SC
2
46507792 Dornase alfa CCOC(=O)C1=C(N(N=C1)C2=NC=C(C=C2)C(=O)NC3=CC=C(C=C3)C(F)(F)F)N
3
46506950 Denileukin diftitox C1=CC=C(C=C1)C2=CNC3=C2NC(=S)N(C3=O)CCC(=O)O
4
46506732 Etanercept CC1=C(C=C(C=C1)NC(=O)CN2C(=CC(=O)N3C2=CC(=N3)C)C)C


get smiles by pubchem for manual check

In [None]:
pubchem =
c = pcp.Compound.from_cid(pubchem)
print(c.isomeric_smiles)

# Get pdbs with similar substructures as input SMILES

In [110]:
uniprot = 'P00533'
path = ROOT / 'SMILES'
make_dir_from_list([str(path)])
List = open(str(ROOT / 'hive' / 'pdb' / uniprot / (uniprot + "_pdbs.txt"))).readlines()
for el in List:
    if el.rstrip() in pdbs_from_smiles:
        print(el)

5HIB



In [None]:
#print(all_resources_of_ligands[ligands_unii.index("EFY6W0M8TG")+1:ligands_unii.index("EFY6W0M8TG")+3])
#print(all_ids_of_ligands[ligands_unii.index("EFY6W0M8TG")+1:ligands_unii.index("EFY6W0M8TG")+3])

How many different targets?

In [60]:
a = []
#print(all_ids_of_targets)
for i in all_ids_of_targets:
    for j in i:
        if j not in a:
            a.append(j)
len(a)

2513

# Further goes more or less obsolete code

## Check ligands and targets from table Drugbank-PDB

In [8]:
import pandas as pd
db_pdb = pd.read_csv("/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR/Drugbank/drugTable.csv", sep=',', lineterminator='\n', header=0)
#print(db_pdb["DrugBank ID"].values.tolist())

In [20]:
print(len(set(db_pdb["Generic Name"].values.tolist())))

508


In [13]:
found = []
not_found = []
flat_ids = [item for sublist in all_ligands_db_id for item in sublist]
for i in db_pdb["DrugBank ID"].values.tolist():
    if i in flat_ids:
        found.append(i)
    else:
        not_found.append(i)

In [27]:
#print(flat_ids)

In [15]:
print(len(set(not_found)))
print(len(set(found)))
print(set(not_found))

58
450
{'DB04786', 'DB01244', 'DB04552', 'DB01361', 'DB00914', 'DB01094', 'DB01051', 'DB04942', 'DB11699', 'DB01412', 'DB00147', 'DB00274', 'DB00336', 'DB04729', 'DB04794', 'DB00311', 'DB01628', 'DB00170', 'DB00212', 'DB01021', 'DB00179', 'DB01055', 'DB08800', 'DB03313', 'DB00778', 'DB00154', 'DB03575', 'DB01092', 'DB00197', 'DB08954', 'DB00786', 'DB00127', 'DB00148', 'DB01245', 'DB04787', 'DB03849', 'DB01283', 'DB04133', 'DB01103', 'DB08915', 'DB06693', 'DB01422', 'DB00866', 'DB01603', 'DB01296', 'DB04570', 'DB01454', 'DB00234', 'DB05266', 'DB04743', 'DB01614', 'DB01336', 'DB01536', 'DB01034', 'DB08838', 'DB00821', 'DB00606', 'DB04573'}


In [16]:
print(db_pdb["DrugBank ID"].values.tolist())
#DB01245

['DB01454', 'DB01454', 'DB01454', 'DB01536', 'DB01536', 'DB01048', 'DB05812', 'DB00284', 'DB00284', 'DB00284', 'DB00284', 'DB01614', 'DB01614', 'DB01614', 'DB01614', 'DB00316', 'DB00316', 'DB00819', 'DB00819', 'DB00819', 'DB00819', 'DB00819', 'DB00819', 'DB00551', 'DB06151', 'DB06151', 'DB00945', 'DB00945', 'DB00787', 'DB00787', 'DB00787', 'DB00640', 'DB00640', 'DB00640', 'DB00640', 'DB08916', 'DB08916', 'DB08916', 'DB08838', 'DB08838', 'DB08838', 'DB08838', 'DB08838', 'DB08838', 'DB08838', 'DB11363', 'DB08915', 'DB08915', 'DB00630', 'DB09026', 'DB00523', 'DB00523', 'DB00523', 'DB00523', 'DB00523', 'DB00523', 'DB06203', 'DB00132', 'DB00132', 'DB00404', 'DB00866', 'DB00866', 'DB00866', 'DB00770', 'DB00770', 'DB00915', 'DB00915', 'DB00915', 'DB00479', 'DB00594', 'DB00594', 'DB00594', 'DB00594', 'DB00513', 'DB00513', 'DB01118', 'DB01118', 'DB01118', 'DB00321', 'DB00321', 'DB00321', 'DB00381', 'DB00381', 'DB00381', 'DB00381', 'DB00381', 'DB00182', 'DB00182', 'DB00182', 'DB00182', 'DB00182'

# Another table

In [27]:
import pandas as pd
db_uniprot = pd.read_csv("/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR/Drugbank/uniprot links.csv", sep=',', lineterminator='\n', header=0)

In [35]:
#print(db_uniprot["UniProt ID"].values.tolist())
found = []
not_found = []
flat_ids = [item  for sublist in all_ids_of_targets for subsublist in sublist for item in subsublist]
for i in db_uniprot["UniProt ID"].values.tolist():
    if i in flat_ids:
        found.append(i)
    else:
        not_found.append(i)

In [36]:
print(len(set(not_found)))
print(len(set(found)))
print(set(not_found))

192
2502
{'Q16775', 'O15440', 'Q9L5C8', 'Q8N0U8', 'O75715', 'P11759', 'Q9P2R7', 'Q07820', 'P16050', 'P18283', 'P06435', 'P36894', 'P05181', 'Q9BZV3', 'P19440', 'P54802', 'P0A3R9', 'Q96I59', 'Q8VP84', 'P77390', 'P10635', 'P12532', 'Q9NUB1', 'Q8TE23', 'O60513', 'Q08257', 'Q82122', 'P36275', 'P04936', 'Q27796', 'Q16678', 'P20815', 'Q99643', 'P16070', 'Q07817', 'P08870', 'Q9ZEU2', 'P02929', 'A1L3X4', 'Q9Y3I0', 'Q9H4Y5', 'Q9BUP3', 'P06434', 'O14880', 'Q96I99', 'Q14872', 'Q8WWT9', 'O43776', 'Q9NR19', 'P35754', 'P02943', 'Q9BZW2', 'P09467', 'P51606', 'Q9UBX3', 'Q16873', 'P68133', 'O95865', 'P0A953', 'Q59976', 'Q92945', 'P0A094', 'P31153', 'P61889', 'Q86VB7', 'O60512', 'Q00266', 'Q9HAB3', 'A0A1Q8GFY7', 'P55055', 'Q9UJ70', 'P53597', 'P0A0K8', 'Q13183', 'O95881', 'P10620', 'P04732', 'P22891', 'P59796', 'Q9I194', 'P98066', 'Q6TGC4', 'P49228', 'P17540', 'Q9RVD6', 'P48029', 'P22340', 'P55263', 'P98155', 'Q13133', 'P83689', 'P11413', 'P46439', 'P10915', 'Q8N339', 'O60909', 'Q7L5Y9', 'P11712', 'Q9NS1

In [5]:
#for ID in all_ligands_db_ids:
import numpy as np
import requests
import os
home = "/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR"
ID = 'DB00509'
ligand_pdbs = []
percent = []
#target_pdbs
for i in range(3):
    frame = db_pdb.loc[db_pdb['DrugBank ID'] == ID]["PDB ID " + str(i+1)].dropna()
    for el in frame:
        ligand_pdbs.append(el)
print(ligand_pdbs)
ligand_pdbs = set(ligand_pdbs)
path_id = os.path.join(home, "Drugbank", ID)
try:  
    os.mkdir(path_id)
except OSError:  
    pass#print ("Directory %s failed (maybe exists)" % fda_folder)
for pdb in ligand_pdbs:
    url_file = "https://files.rcsb.org/download/" + pdb + ".pdb"

    r = requests.get(url_file, allow_redirects=True)
    path =  os.path.join(path_id, url_file.split("/")[-1])
    print("Downloading " + url_file.split("/")[-1])
    open(path, 'wb').write(r.content)

['1Y0X', '4LNX', '4LNX', '1Y0X']
Downloading 4LNX.pdb
Downloading 1Y0X.pdb


# some old tests (garbage)

In [17]:
#exec("%s = %d" % (x,2))
def get_json(name):
    with open('/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR/' + name + ".txt", 'r') as f: 
        a = json.load(f)
    return a

#print(get_json("all_ids_of_targets"))
a = get_json("all_resources_of_targets")
k = 0
uniprot = 0
for el in a:
    for el1 in el:
        k += 1
        for el2 in el1:
            if el2 == u'UniProtKB':
                uniprot += 1

In [18]:
print(k, uniprot)

(9048, 9048)


In [7]:
print(len(I))
kol = 0
for sp in I:
    kol += sp.count("ChEMBL")
print(kol)

11922
6754


In [9]:
import pandas as pd
a = pd.read_csv("/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/FDA/Data/unii_inchikey.csv", sep=',', lineterminator='\n', header=0)

In [28]:
c = 0
c1 = 0
All = []
processed = []
for elem in drugs1["unii"].values.tolist():
    if elem in targets:
        c += 1
        ind = targets.index(elem)
        if elem not in processed:
            processed.append(elem)
            for el in I[ind]:
                if "ChEMBL" not in I[ind]:
                    All.append(el)
        if "ChEMBL" in I[ind]:
            c1 += 1
from collections import Counter
print(All.count("ChEMBL"))
print(Counter(All).values())
types = dict(zip(All, Counter(All).values()))
print(types)
print(sum(Counter(All).values()))
print(c)
print(c1)
#print(len(processed))

0
[16, 1, 868, 40, 514, 2, 9, 11, 29, 345, 61, 105, 53, 23, 2, 44]
{'UniProtKB': 2, 'PubChem Substance': 105, 'PharmGKB': 23, 'Wikipedia': 29, 'GenBank': 53, 'KEGG Compound': 868, 'Drugs Product Database (DPD)': 44, 'Therapeutic Targets Database': 11, 'KEGG Drug': 40}
2123
3090
1986


In [30]:
len(a["unii"].values.tolist())

1793

In [21]:
a["unii"].values.tolist()

1793

In [1]:
import xml.etree.ElementTree as etree
import xml

def db_tag(element, string):
    return element.tag.split("{http://www.drugbank.ca}")[1] == string

source = "/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR/full database.xml"
# get an iterable
context = etree.iterparse(source, events=("start", "end"))

# turn it into an iterator
context = iter(context)

# get the root element
event, root = context.next()
drugs1 = []
k = 0
for event, elem in context:
    if event == "end" and db_tag(elem,"drug"): #and k == 1:
        f = True
        for item in list(elem):
            if db_tag(item, "unii"):
                unii = item.text
                if item.text in drugs1:
                    f = False
            if db_tag(item, "products"):
                for it1 in list(item):
                    for it in list(it1):
                        if db_tag(it, "approved"):
                            #print(it.tag + "|" + it.text)
                            if it.text == "true" and f:
                                k += 1
                                drugs1.append(unii)
                                f = False
                           # numbers.append(it)
        root.clear()

In [13]:
len(a)

1793

In [26]:
#print(drugs1)

In [25]:
len(drugs1)

3091

In [8]:
len(set(drugs))

7612

In [7]:
import pandas as pd
import os
df = pd.DataFrame({"unii" : drugs1})
df.to_csv(os.path.join("/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR/Drugbank", "drugbank_unii_fda.csv"), index=False)

In [3]:
import pandas as pd
a = pd.read_csv("/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/FDA/Data/unii_inchikey.csv", sep=',', lineterminator='\n', header=0)

In [4]:
drugs1 = pd.read_csv("/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR/Drugbank/drugbank_unii_fda.csv", sep=',', lineterminator='\n', header=0)

In [38]:
b = []
for el in drugs1['unii'].to_list():
    if el not in a["unii"].to_list():
        b.append(el)

In [19]:
len(b)

2252

In [25]:
#print(b)

In [39]:
c = []
for el in a["unii"].to_list():
    if el not in drugs1['unii'].to_list():
        c.append(el)

In [22]:
len(c)

954

In [24]:
#print(c)

In [22]:
import pandas as pd
import os
#df = pd.Series(c)
#print(c)
#df.to_csv(os.path.join("/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR/Drugbank", "drugbank_not_in_fda.csv"), index=False)

In [23]:
#print(drugs1)