In [2]:
%%writefile Search.py

""" Here are the functions: 
    to calculate similarity between: 
        two fasta sequences (using BioPython)
        one fasta and the whole database targets (using BioPython)
        SMILES and list of SMILES (using RDkit)
        SMILES and the whole database ligands (using RDkit)
        ligand and the whole database (using pybel fingerprints)
        two protein structures (TM-score and RMSD, using TM-align) # in process
        two complex structures using TM-align

RDkit: installation https://github.com/rdkit/rdkit/blob/master/Docs/Book/Install.md
       how to use https://www.rdkit.org/docs/Cookbook.html
TM-align: https://zhanglab.ccmb.med.umich.edu/TM-align/
Open Babel: http://openbabel.org/docs/current/UseTheLibrary/Python_Pybel.html
Biopython: https://biopython.org/
           about alignments http://biopython.org/DIST/docs/api/Bio.pairwise2-module.html
"""

import os
import pandas as pd
import subprocess  # To execute like as from cmd
import json
from pathlib import Path  # To process paths
import ntpath
import pickle
import argparse
import datetime  # For debug

#from rdkit import Chem
#from rdkit import DataStructs
#from rdkit.Chem.Fingerprints import FingerprintMols

import openbabel
import pybel

from Bio import SeqIO  # To process sequences
from Bio import pairwise2  # To make sequence alignments
import Bio.SubsMat.MatrixInfo  # To get info about available distance matrices
from Bio.SubsMat.MatrixInfo import *
from Bio.PDB.PDBParser import PDBParser   # To parse PDB files
from Bio.PDB import Select, PDBIO, Dice

import Auxiliary as aux  # Needed for work auxiliary functions
import Drugbank as db  # Needed to process raw data from Drugbank



# The same function as in DATABASES_SMILES.py
def load_info_db_from_namelist(namelist, root):
    """Load listed in namelist names.txt collected from Drugbank data as json files from root/Drugbank_extracted"""
    # All names of files to be loaded from root/Drugbank_extracted with name.txt, where name is from names
    name_full = str(Path(root) / 'Drugbank_extracted')
    for name in namelist:
        with open(str(Path(name_full) / (name + ".txt")), 'r') as f:
            exec('global ' + name + '\n' + name + ' = json.load(f)')
            
##################   SEQUENCE SIMILARITY (for targets)   ####################################

# Useful if Bio.SeqIO doesn't work somehow, otherwise useless    
class get_seq_from_fasta_file:
    """ Get name (first line of .fasta) and string from .fasta 
    (Biopython's SeqIO sometimes doesn't work due to improper installation:( ))"""
    def __init__(self, path):
        with open(path, 'r') as f:
            seq = ''
            for line in f.readlines():
                if line[0] == '>':
                    name = line.rstrip()
                else:
                    seq += line.rstrip()
            self.name = name
            self.seq = seq

            
def get_seq_from_fasta_uniprot_or_seq(input1):
    """ Return sequence from input as sequnce, path to fasta or Uniprot ID."""
    # If input is fasta
    if input1.split('.')[-1] == 'fasta':
        # If import from Bio import SeqIO works
        seq1 = Bio.SeqIO.read(input1, "fasta")
        # If Bio.SeqIO doesn't work
        #seq1 = get_seq_from_fasta_file(input1)
        str1 = seq1.seq        
    # If input is seq or uniprot
    else:
        # Uniprot ID have length of 6, so checking if it is possibly ID or not
        if len(input1) <= 10:
            # Flag of being Uniprot ID
            f_uniprot = False
            for i in range(len(input1)):
                if input1[i].isdigit():
                    f_uniprot = True
            if f_uniprot:
                str1 = aux.get_seq_from_uniprot(input1)
            else:
                print(f"Is it really such a short protein sequence or invalid Uniprot ID {input1}")
                str1 = input1
        else:
            str1 = input1
    return str1


def get_sequences_similarity(input1, input2, align_matrix='blosum62', verbose=False):
    """ Calculates similarity of two inputs (could be raw seq, path to fasta or uniprot ID) 
    using align_matrix from Biopython (blosum62 by default, list of all by Bio.SubsMat.MatrixInfo.available_matrices)
    Input -  sequences, paths to single fastas or Uniprot IDs of proteins
    Output - float similiarity and integer identity
    """
    # Draft for using different substitution matrices
    #print('Available matrices:', Bio.SubsMat.MatrixInfo.available_matrices)
    #print('Which one would you like to use? Type [Enter] to use blosum62')
    #align_matr = input()
    # Process input data
    seq1 = get_seq_from_fasta_uniprot_or_seq(input1)
    seq2 = get_seq_from_fasta_uniprot_or_seq(input2)
    # Get needed matrix
    exec('matr_bio = Bio.SubsMat.MatrixInfo.' + align_matrix, globals())
    # Make an alignment
    try:
        alignments = pairwise2.align.globalds(seq1, seq2, matr_bio, -10, -0.5)  
        alignments_id = pairwise2.align.globalms(seq1, seq2, 1, 0, 0, 0) 
        # Print info
        sim = float(alignments[0][2])
        ident = int(str(alignments_id[0][2]).split('.')[0])
        if verbose:
            print(f'Similarity={sim}, identity={ident}')
            print("Matrix " + align_matrix + ", number of alignments = " + str(len(alignments)))
            print(pairwise2.format_alignment(*alignments[0]))
        return sim, ident
    except:
        print('Smth went wrong with comparison to ', input2)
        return -1000, 0


def get_closest_fastas_in_fasta_file_from_fasta_uniprot_or_seq(input1, path_to_data_in_fasta, 
                                                       k=0, align_matrix='blosum62', sort_by='s'):
    """ Returns k or all (if k == 0) of closest to input fasta molecules from path_to_data_in_fasta multi-fasta.
    OUTPUT -- dataframe: 'query':repeated input fasta, 
                    'position_in_fasta': position in input file (to find later needed info)
                     'similarity':similarity, 'identity': identity, 
                     'sequence': sequence of compared target, 'name':name of compared target
            Also writes the best alignment
    INPUT -- input1 -- input a/a sequence, path to single fasta file or Uniprot ID of protein,
            path_to_data_in_fasta -- fasta file to compare with, 
            k -- number of the best to find (k == 0 if want to get all), 
            sort_by == 's' => sort descending by seimilarity. == 'i' => by identity
    """
    # Load fastas to compare with
    records = list(SeqIO.parse(path_to_data_in_fasta, "fasta"))
    # Process when input is path to fasta file
    seq = get_sequence_from_fasta_uniprot_or_sequence(input1)
    # Get similarities and identities for all targets in Drugbank
    similarity_list = [] 
    identity_list = []
    seq_list = []
    name_list = []
    for ind, element in enumerate(records):
        if element.seq == seq and k == 1:
            sim, ident = get_sequences_similarity(seq, element.seq, align_matrix)
            d =  {'query':fasta, 'position_in_fasta':ind,  
                  'similarity':sim, 'identity':ident,
                  'sequence':seq, 'name':element.name,
                 }
            return pd.DataFrame(data=d)
        sim, ident = get_sequences_similarity(seq, element.seq, align_matrix)
        similarity_list.append(sim)
        identity_list.append(ident)
        seq_list.append(element.seq)
        name_list.append(element.description)
    # Create correspondent dictionary and then dataframe
    d = {'query':[fasta]*len(similarity_list), 'position_in_fasta':range(len(similarity_list)), 
         'similarity':similarity_list, 'identity':identity_list,
         'sequence':seq_list, 'name':name_list,
        }
    res = pd.DataFrame(data=d)
    if sort_by == 's':
        res = res.sort_values('similarity', ascending=False)
    else:
        res = res.sort_values('identity', ascending=False)
        if sort_by != 'i':
            print('Sorted descending by identity. If you need by similarity, corresp. key should be "s"')
    if k < res.size and k != 0:
        result = res[0:k]
    else:
        result = res
    return result

                   
def get_element_of_fasta_by_number(path_to_data_in_fasta, n):
    """ Returns SeqIO record of n-th order from multi-fasta file.
    OUTPUT -- SeqIO fasta sequence element
    INPUT -- path to fasta file with compared fastas, number of needed element in this fasta."""
    # Load records           
    records = list(SeqIO.parse(path_to_data_in_fasta, "fasta"))
    elem = records[n]
    print('Name = ', elem.description)
    print('Seq = ', elem.seq)
    return elem                          


def print_closest_fastas_data(input1, path_to_data_in_fasta, k=0, align_matrix='blosum62', sim_or_ident=True):
    """ Get k or all (if k == 0) closest proteins in file by fasta from seq/uniprot/path to fasta.
    Print alignments of k or 5 (if  k > 5) with input
    """
    # Get seq of input
    seq1 = get_seq_from_fasta_uniprot_or_seq(input1)
    # Get dataframe with sorted by similarity or identity fastas
    df = get_closest_fastas_in_fasta_file_from_fasta_uniprot_or_seq(seq1, path_to_data_in_fasta, 
                                                       k=0, align_matrix='blosum62', sim_or_ident=True)
    print(df)
    # How many to align, from 1 to 5
    if k:
        k1 = min(k, 5)
    else:
        k1 = 5
    res = df[0:k1]
    # Load records
    records = list(SeqIO.parse(path_to_data_in_fasta, "fasta"))
    for row in res.iterrows():
        # Position of this target in whole fasta file
        n = row['position_in_fasta']
        seq2 = row['sequence']
        print('Name = ', records[n].description)
        # Print alignment, sim and indent coeffs
        get_sequences_similarity(seq1, seq2, align_matrix='blosum62', verbose=True)
    return df


##################   SMILES SIMILARITY (for ligands)   ####################################

def get_smiles_similiarity(smiles, list_smiles):
    """ Get dataframe 'query':input SMILES, 
                    'target_smiles':list_smiles_cleaned, 
                    'similarity':similarities correspondently
    INPUT -- SMILES (smiles) and list of smiles to compare with (list_smiles)
    """
    # Proof and make a list of SMILES
    c_smiles = []
    # Delete Nones
    list_smiles_cleaned = [i for i in list_smiles if i]
    # List of indices to delete because SMILES are invalid
    del_indices = []
    for ind, ds in enumerate(list_smiles_cleaned):
        try:
            cs = Chem.CanonSmiles(ds)
            c_smiles.append(cs)
        except:
            # Delete smiles if it's invalid
            del_indices.append(ind)
            print('Invalid SMILES, deleted from list to compare with:', ds)
    # Delete elements starting from end
    for ind in del_indices[::-1]:
        del list_smiles_cleaned[ind]
    try:
        smiles = Chem.CanonSmiles(smiles)
    except:
        print('Invalid Input SMILES:', ds)
        return -1

    # Make a list of mols
    ms = [Chem.MolFromSmiles(x) for x in c_smiles]

    # Make a list of fingerprints (fp)
    fps = [FingerprintMols.FingerprintMol(x) for x in ms]
    # Input fingerprint
    fp_in = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles))

    # Compare all fps with fp_in
    sim = (DataStructs.BulkTanimotoSimilarity(fp_in, fps[:]))
    #print()

    # Build the dataframe and sort it
    print(len([smiles]*len(sim)), len(list_smiles_cleaned), len(sim))
    d = {'query':[smiles]*len(sim), 'smiles':list_smiles_cleaned, 'similarity':sim}
    df_final = pd.DataFrame(data=d)
    df_final = df_final.sort_values('similarity', ascending=False)
    return df_final#dict(zip(df_final['Similarity'], df_final['target']))


def get_closest_smiles_names(smiles, root, k=1):
    """ Get k names and smiles of the closest to input smiles, k=1 by default
    INPUT -- SMILES (smiles), 
            k -- number of the best smiles to find (k == 0 if want to get all)
            root - where all protocol is located (ligands_names_and_smiles.txt in root/Drugbank_extracted)
    OUTPUT -- dataframe of similar by smiles ligands: 
            'name' -- names of ligands
            'smiles' -- SMILES of ligands
            'query' -- input SMILES (same for all)
            'similarity' -- level of similarity (1 - identical, 0 - abs. different)
    """
    # Load needed dictionary
    load_info_db_from_namelist(['ligands_names_and_smiles'], root)
    # Delete ligands with None smiles
    dict_cleaned = {k: v for k, v in ligands_names_and_smiles.items() if v is not None}
    # Get dataframe of sorted by descending similarity smiles
    try:
        res = get_smiles_similiarity(smiles, list(dict_cleaned.values()))
    except:
        if res == -1:
            print('Input SMILES is invalid, abort')
            return -1
    # Take only needed amount of smiles
    if k < res.size and k != 0:
        result = res[0:k]
    else:
        result = res
    # Get names of correspondent ligands
    names = []
    for sm in result['smiles']:
        for name in ligands_names_and_smiles.keys():
            if ligands_names_and_smiles[name] == sm:
                names.append(name)
    result['name'] = names
    return result


##################   Structure-Fingerprints SIMILARITY (for ligands)   ####################################

def extract_approved_sdf(path_to_sdf_from_drugbank, root, overwrite=False):
    """ Extract approved ligands taken from 'ligands_drugbank_ids' to new multi-sdf file
    with changed name as added by _approved. Overwrite - flag of overwriting this file
    """
    # Set path of file with approved sdfs
    path_to_approved_sdf = path_to_sdf_from_drugbank.split('.sdf')[0] + '_approved.sdf'
    # If file with approved structured doesn't exist or should be overwrited
    if not Path(path_to_approved_sdf).is_file() or overwrite:
        # Load list of ids of approved ligands
        load_info_db_from_namelist(['ligands_drugbank_ids'], root)
        sdf_approved = pybel.Outputfile("sdf", path_to_approved_sdf, overwrite=True)
        for mol in pybel.readfile('sdf', path_to_sdf_from_drugbank):
            mol_id = mol.data['DATABASE_ID']
            # Check if ligand is in approved list
            f_approved = False
            for lig in ligands_drugbank_ids:
                if mol_id in lig:
                    f_approved = True
                    break
            if f_approved:
                sdf_approved.write(mol)
        sdf_approved.close()
    return path_to_approved_sdf


def get_closest_ligands_from_3d_structure(path_to_structure, path_to_sdf_approved, root, 
                                          fptype='fp2', number_to_print=1):
    """ Get sorted descending list of tanimoto coeff from fingerprints and correspondent DB IDs list
    More about fingerprints http://openbabel.org/docs/current/UseTheLibrary/Python_Pybel.html
    Their formats: http://openbabel.org/docs/current/Fingerprints/fingerprints.html#fingerprint-format-details
    INPUT:
        path_to_structure -- path to single .sdf, .pdb or .mol2 structure of molecule
        path_to_sdf_approved -- path to multi-sdf file to compare with
        fptype - type of fingerptint, such as 'ftp2', 'maccs', 'ecfp0' etc.
        list of all available can be taken by 'pybel.fps'
        number_to_print - how many to print
    OUTPUT:
        dataframe: 'Name' : name of compared ligand,'Tanimoto coeff' : corresp similarity,
        'Drugbank ID' : Drugbank ID of compared ligand, 'Fingerprint type' : name of used fingerprint      
    """
    # Fingerprints of ligands
    fps = []
    # Drugbank IDs of ligands
    ids = []
    # Tanimoto coefficients between fingerprints
    tanim = []
    for mymol in pybel.readfile('sdf', path_to_structure):
        fp_mol = mymol.calcfp(fptype)
    for mol in pybel.readfile('sdf', path_to_sdf_approved):
        # Get correspondent fingerprint
        fp = mol.calcfp(fptype)
        fps.append(fp)
        tanim.append(fp_mol | fp)
        # Get ID in Drugbank
        ids.append(mol.data['DATABASE_ID'])
    tanim, ids= zip(*sorted(zip(tanim, ids)))
    # Make them descending
    tanim = tanim[::-1]
    ids = ids[::-1]
    # Get names of ligands
    load_info_db_from_namelist(['ligands_names', 'ligands_drugbank_ids'], root)
    ligands_db_ids_by_names = dict(zip(ligands_names, ligands_drugbank_ids))
    names = []
    for lig_id in ids:
        for name in ligands_db_ids_by_names.keys():
            if lig_id in ligands_db_ids_by_names[name]:
                names.append(name)
    # Make dataframe from obtained data
    data_tuples = list(zip(names, tanim, ids, [fptype]*len(ids)))
    df = pd.DataFrame(data_tuples, columns=['Name','Tanimoto coeff', 'Drugbank ID', 'Fingerprint type'])
    # Print the best k
    print(df[0:number_to_print])
    return df


##################   Structure TM-score and RMSD SIMILARITY (for targets and complexes)   ####################################
def get_TMscore_and_RMSD_of_proteins_or_complexes(input1, input2, root,
                                                  compare_type='p', verbose=False):
    """ Get TM-score and RMSD of two protein or complex structure files
    INPUT -- paths of two protein structure files (.sdf, mol2, pdb)
            root - save to root/'pdb', if PDB ID is in input
            compare_type == 'p' => comparing proteins
            compare_type == 'c' => comparing complexes, else also try as complexes, but with warning
            
    OUTPUT -- (TM-score, RMSD) of the files
        if 0.0 < TM-score < 0.17, then random structural similarity 
        if 0.5 < TM-score < 1.00, then in about the same fold 
        Returns 0.0 if no common residues were found
    """
    # Convert files to .pdb if needed, saving in the same directory and changing extension
    # Or download PDB file, if PDB ID is as input
    pdb1_path = aux.get_path_to_pdb_from_pdb_id_or_path_to_structure(input1, root)
    pdb2_path = aux.get_path_to_pdb_from_pdb_id_or_path_to_structure(input2, root)
    # ?? Delete new structures after calculation or not??
    
    # Get result of TM-align work in protein and complex comparison types
    if compare_type == 'p':
        res = subprocess.check_output(['TMscore', pdb1_path, pdb2_path])
    else:
        res = subprocess.check_output(['TMscore', '-c', pdb1_path, pdb2_path])
        # Process invalid key
        if compare_type != 'c':
            print(f"Calculated as for complexes, but key was '{compare_type}' and in ['p', 'c']")
            compare_type = 'c'
    text = res.decode('utf-8')        
    # Find needed results in the output of TM-align
    if text.find('TM-score') == -1:
        print(f'Something went wrong when TM-align compared {pdb1} and {pdb2}')
        return(0.0)
    else:
        if text.find('Warning') != -1:
            print(f'When comparing {input1} and {input2}:')
            print(text.split('*')[0])
        if compare_type == 'p':
            tm_score = float(text.split('TM-score')[3].split()[1])
            rmsd = float(text.split('RMSD')[1].split()[4])
            common_res = int(text.split('common=')[1].split()[0])
            
        if compare_type == 'c':
            tm_score = float(text.split('TM-score')[3].split()[1])
            rmsd = float(text.split('RMSD')[1].split()[4])
            common_res = int(text.split('common=')[1].split()[0])
        if verbose:
            print(f'TM-score = {tm_score}, RMSD = {rmsd}, number of common residues in alignment = {common_res}')
            print(text)
        return tm_score, rmsd, common_res

    
def download_proteome_of_one_species(name, root, overwrite=False):
    """Download reviewed proteomes of name (name in ['human', 'rat', 'mouse']) 
    from Uniprot and save it to root/'Uniprot_proteomes'/name + '_proteome.fasta'
    INPUT:
        name in ['human', 'rat', 'mouse']
        root of the protocol
    OUTPUT:
        paths to proteome. If incorrect name, -1
    """
    # Create directory (if needed) where to save 
    uniprot_dir = str(Path(root) / 'Uniprot_proteomes')
    aux.make_dir(uniprot_dir)
    # Names of species
    list_names = ['human', 'rat', 'mouse']
    # Correspondent Uniprot IDs
    list_ids = ['9606', '10116', '10090']
    if name not in list_names:
        print("Inappropriate species name, should be in ['human', 'rat', 'mouse']")
        return -1
    else:
        organism_dict = dict(zip(list_names, list_ids))
        # Download reviewed proteomes with ids from organism_dict.keys
        organism_id = organism_dict[name] #human, rat, mouse
        url = 'https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:' + organism_id + '&format=fasta'
        path = name + '_proteome.fasta'
        aux.download_url(url, uniprot_dir, path, overwrite)
        return path


def download_all_proteomes(root, overwrite=False):
    """Download reviewed proteomes of species from species_list=['human', 'rat', 'mouse'] 
    from Uniprot and save them to root/'Uniprot_proteomes' with species_proteome.fasta
    INPUT:
        root of the protocol
    OUTPUT:
        paths to proteomes of species from species_list
    """
    species_list = ['human', 'rat', 'mouse']
    paths = []
    for name in species_list:
        path = download_proteome_of_one_species(name, root, overwrite)
        paths.append(path)
    return paths


def get_fasta_from_pdb(pdb, directory_to_save=None):
    """ Download fasta file for PDB structure from PDB ID and save it to directory, creating it if not existed"""
    url = 'https://www.rcsb.org/pdb/download/downloadFastaFiles.do?structureIdList=' \
                + pdb + '&compressionType=uncompressed'
    # If needed to save
    if directory_to_save:
        aux.make_dir(directory_to_save)
        r = aux.download_url(url, str(Path(directory_to_save)), (pdb + '.fasta'))
    else:
        r = aux.download_url(url)
    return r


def get_best_pdb_of_target(uniprot, root, verbose=False):
    """ Get pdb of target which has the most biggest similarity to its sequence.
    Input:
        uniprot -- Uniprot ID of target
        root -- root of the protocol
    Output:
        info about downloaded file: path_new, pdb, chain_to_save, sim 
        if something went wrong => path_new = None
    How to work with PDB in Biopython http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc157
    """
    # Get list of pdbs where this uniprot is mentioned
    pdbs = aux.get_pdbs_from_uniprot(uniprot)
    # Get sequence of target
    seq = aux.get_seq_from_uniprot(uniprot)
    # Info about every chain in every pdb                   
    sims = []
    chains = []
    for pdb in pdbs:
        if verbose:
            print(pdb)
        # Get fastas attached to pdbs
        pdb_dir = str(Path(root) / 'pdb')
        get_fasta_from_pdb(pdb, pdb_dir)
        for record in SeqIO.parse(str(Path(pdb_dir) / (pdb + '.fasta')), 'fasta'):
            # Get chain id of current chain
            chain = record.description.split('|')[0].split(':')[1]
            chains.append(chain)
            # Compare current chain with the input pdb
            sim, ident = get_sequences_similarity(seq, record.seq, verbose=False)
            sims.append(sim)
    try:
        sims, chains, pdbs = zip(*sorted(zip(sims, chains, pdbs), reverse=True))
    except:
        print('Smth WRONG')
        print(sims)
        print(chains)
        print(pdbs)
        return None, 'bla', 'bla', 0
    if verbose:
        print(chain[0], sims[0], pdbs[0])
    # Process file with the best chain
    sim = sims[0]
    pdb = pdbs[-1]
    chain_to_save = chains[-1]
    pdb_dir = str(Path(root) / 'pdbs')
    pdb_path = aux.download_pdb(pdb, pdb_dir)
    # Parse and get needed chain
    parser = PDBParser(PERMISSIVE=1)
    structure = parser.get_structure(pdb, pdb_path)
    # Get resolution
    #resolution = structure.header["resolution"]
    path_new = None
    for chain in structure.get_chains():
        if chain.id == chain_to_save:
            io = PDBIO()
            io.set_structure(chain)
            path_new = str(Path(pdb_dir) / (pdb + '_' + chain_to_save + '.pdb'))
            try:
                io.save(path_new)
            except:
                print(f'Something went wrong with processing {pdb} and chain {chain_to_save}')
    return path_new, pdb, chain_to_save, sim          


def save_df_with_best_pdbs_for_list_of_uniprots(uniprots, path_to_save, overwrite=False):
    """ Save dataframe with info about best pdbs for all uniprots.
    INPUT: 
        list of Uniprots
    OUTPUT:
        dataframe with columns ['uniprot', 'pdb', 'chain', 'similarity', 'path']  
    """
    if overwrite or not Path(path_to_save).is_file():
        paths = []
        pdbs = []
        chains = []
        sims = []
        correct_uniprots = []
        for uniprot in uniprots:
            path, pdb, chain, sim = get_best_pdb_of_target(uniprot, root, verbose=False)
            # If something was wrong before, path will be None
            if path:
                correct_uniprots.append(uniprot)
                paths.append(path)
                pdbs.append(pdb)
                chains.append(chain)
                sims.append(sim)
        data_tuples = list(zip(correct_uniprots, pdbs, chains, sims, paths))
        df = pd.DataFrame(data_tuples, columns=['uniprot','pdb', 'chain', 'similarity', 'path'])
        path_to_save = path_to_save
        df.to_pickle(path_to_save)
    return path_to_save


def save_best_protein_structures_of_targets(root, overwrite=False):
    """ Save dataframe with info about best pdbs for all targets from Drugbank.
    OUTPUT:
        path to dataframe with columns ['uniprot', 'pdb', 'chain', 'similarity', 'path']
    """
    # Get list of uniprots (or produce)
    path_to_uniprots = aux.save_uniprots_lignames_dict_from_db(root, overwrite)
    with open(path_to_uniprots, 'rb') as f:
        uniprots_dict = pickle.load(f)
    path_to_save = str(Path(root) / 'Drugbank_extracted' / 'targets_best_pdbs.df')
    # Get resulting dataframe
    path_to_df = save_df_with_best_pdbs_for_list_of_uniprots(uniprots_dict.keys(), path_to_save, overwrite)
    return path_to_df


def save_best_protein_structures_of_proteome(input1, root, NAME=None, PASSWORD=None, overwrite=False, verbose=False):
    """ Save dataframe with info about best pdbs for all proteins from proteome
    INPUT
        input1 - path to proteome fasta file or name of species from ['human', 'rat', 'mouse']
    OUTPUT
        dataframe with columns ['uniprot', 'pdb', 'chain', 'similarity', 'path']   
    """
    # Check if input is just name and not a path
    if input1 in ['human', 'mouse', 'rat']:
        path_to_proteome = str(Path(root) / 'Uniprot_proteomes' / (input1 + '_proteome.fasta'))
        path_to_save = str(Path(root) / 'Drugbank_extracted' / input1) + '_best_pdbs.df'
        species = input1
    else:
        path_to_proteome = input1
        # Path in the same dir with name species_best_pdbs.df
        # Assume that input name is species_proteome.fasta
        species = ntpath.basename(input1).split('_')[0]
        path_to_save = str(Path(ntpath.dirname(input1)) / (species + '_best_pdbs.df'))
    
    path_to_ids_csv = download_from_drugbank('i', NAME, PASSWORD, root, release='5-1-3')
    df = pd.read_csv(path_to_ids_csv)
    # Produce list of uniprots
    uniprots = []
    records = list(SeqIO.parse(path_to_data_in_fasta, "fasta"))
    for ind, rec in enumerate(records):
        name = rec.description.split('|')[2]
        uniprot = df.loc[df['Name'] == name]['UniProt ID']
        if verbose:
            print(rec.description)
        uniprots.append(uniprot)
    # Get resulting dataframe
    path_to_df = save_df_with_best_pdbs_for_list_of_uniprots(uniprots, path_to_save, overwrite)
    return path_to_df


def get_closest_pdb_for_target_in_drugbank(input1, path_to_df, root):
    """ Get data about input's closest target in Drugbank approved targets 
    INPUT
        input1 - path to pdb, sdf, mol2 or Uniprot ID of species from ['human', 'rat', 'mouse']
    """
    path_to_df = str(Path(root) / 'Drugbank_extracted' / 'targets_best_pdbs')
    # dataframe with columns ['uniprot', 'pdb', 'chain', 'similarity', 'path']
    df = pd.read_pickle(path_to_df)
    for path in df:
        tm_score, rmsd, common_res = get_TMscore_and_RMSD_of_proteins_or_complexes(input1, path)
        all_tm_score.append(tm_score)
        all_rmsd.append(rmsd) 
        all_common_res.append(common_res)
    sims, chains, pdbs = zip(*sorted(zip(sims, chains, pdbs), reverse=True))
    return 0
                

def get_closest_pdb_for_target_in_proteome(input1, species, path_to_df, root):
    """ Get data about input's closest target in proteome of species
    INPUT
        input1 - path to pdb, sdf, mol2 or Uniprot ID of species from ['human', 'rat', 'mouse']
    """
    path_to_df = str(Path(root) / 'Drugbank_extracted' / 'targets_best_pdbs')
    # dataframe with columns ['uniprot', 'pdb', 'chain', 'similarity', 'path']
    df = pd.read_pickle(path_to_df)
    str(Path(root) / 'Uniprot_proteomes' / (input1 + '_proteome.fasta'))

    
##################   Search by TM-score SIMILARITY (for complexes)   ####################################

def get_closest_complexes(input1, sim, sim_min, root, k_print=1):
    """ Get info about closest complexes to input pdb, print about k_print
    Input:
        input1 - path to structure in .pdb, .sdf or .mol2, or PDB ID
        sim - level of similarity of SMILES of ligands to search for their pdbs
        sim_min -- max level of sim of SMILES (see get_pdbs_from_smiles from Auxiliary.py)
        root -- root of the protocol
    Output:
        dictionary with keys (ligand_name, target_uniprot, sim, sim_min, input1) 
                        values (TM-score, RMSD, number of residues in alignment)
    """
    # Load dict of pdbs by name of ligand
    filename, path = produce_name_and_path_of_file_with_sims('all_pdbs_of_all_connections_', sim, sim_min, root)
    # Load dict (name of ligand, uniprot, sim_of_SMILES, sim_min) : [pdbs where is ligand and target]
    with open(path, 'rb') as f:
        connect_dict = pickle.load(f)
    sim_values = []
    connection_keys = []
    # Produce dictionary
    # keys --  (ligand_name, target_uniprot, sim, sim_min, input1) and
    # values -- (TM-score, RMSD, number of residues in alignment)
    # Iterating over pairs (ligand, target)
    for name_uniprot_sim, pdbs in connect_dict.items():     
        # Iteration over pdbs correspondent to one (ligand, target)
        for pdb in pdbs:
            # Get similarity measures for this pdb with input
            tm, rmsd, common_res = get_TMscore_and_RMSD_of_proteins_or_complexes(input1, pdb, root, compare_type='c')
            sim_value = (tm, rmsd, common_res)
            sim_values.append(sim_value)
            connection_key = (name_uniprot_sim[0], name_uniprot_sim[1], 
                              name_uniprot_sim[2], name_uniprot_sim[3],
                             input1)
            connection_keys.append(connection_key)
    res = dict(zip(connection_keys, sim_values))
    # Sort by tm-score
    res_sort = sorted(res.items(), key=lambda e: e[1][1])
    path = str(Path(root) / 'pdb' / '1.txt')
    with open(path, 'wb') as f:
        pickle.dump(res_sorted, f, pickle.HIGHEST_PROTOCOL)
    print(res_sorted[0:k_print])
    return path


####################    TESTS   ################# 
#root = '/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR'
#root = '/home/anton_maximov/BACHELOR'
root = os.getcwd()


# Test of SMILES search
#df = get_closest_smiles_names('ClCCNC(=O)N(CCCl)N=O', root)
#print(df)

# Test of seq search
uniprot = 'P00533'
uniprot = 'O43451'
uniprot = 'P30556'
#print(aux.get_seq_from_uniprot(uniprot))
fasta = '/home/anton_maximov/BACHELOR/P08069.fasta'
path_to_data_in_fasta = '/home/anton_maximov/BACHELOR/Drugbank_extracted/Drugbank_targets.fasta'
df = get_closest_fastas_in_fasta_file_from_fasta_uniprot_or_seq(fasta, path_to_data_in_fasta, k=3, sim_or_ident=True)
print(df['position_in_fasta'], df['similarity'])
print(df)

# Test of fingerprint search
path_to_structure = str(Path(root) / 'Drugbank_extracted' / 'SDF_ideal.sdf')
path_to_sdf_from_drugbank =  str(Path(root) / 'Drugbank_extracted' / 'structures.sdf') #str(Path(root) / 'Drugbank_extracted' / 'structures_approved_by_db.sdf')#
#path_to_sdf_approved = extract_approved_sdf(path_to_sdf_from_drugbank, root, overwrite=True)

#get_closest_ligands_from_3d_structure(path_to_structure, path_to_sdf_approved, root,
#                                                          fptype='maccs', number_to_print=5)

pdb_dir = Path(root) / 'pdb'
#pdb2 = '1AZM'
pdb1 = '3W6H'
#pdb1 = '3L4Y'
pdb2 = '3L4Z'
aux.download_pdb(pdb1, pdb_dir)
aux.download_pdb(pdb2, pdb_dir)
struct1_path = str(pdb_dir / (pdb1 + '.pdb'))
struct2_path = str(pdb_dir / (pdb2 + '.pdb'))
pdb1 = '2JIT'
# Test of target structure search 
#extract_pdb(pdb1, struct1_path)
#get_best_pdb_of_target(uniprot, root,)
#get_best_pdb_of_target(uniprot, root)
# Test of complex structure search by TM-align
#a = get_TMscore_and_RMSD_of_proteins_or_complexes('/home/anton_maximov/BACHELOR/pdb/3W2O.pdb', 
#                                                 struct2_path, compare_type='c', verbose=True)
#print(a)

# Other auxiliary functions
#get_seq_from_fasta_uniprot_or_seq('P08100')
#get_sequences_similarity('P08100', 'P32238', align_matrix='blosum62', verbose=True)
#get_element_of_fasta_by_number(path_to_data_in_fasta, 1)

#args = parser.parse_args()
#print(args.accumulate(args.integers))

Overwriting Search.py


In [21]:
print(datetime.datetime.now())
df = save_best_protein_structures_of_targets(root)
print(datetime.datetime.now())
print(df)

2019-06-25 00:30:07.887520
Is it really such a short protein sequence or invalid Uniprot ID XLXR
Is it really such a short protein sequence or invalid Uniprot ID DFEEIPEEYL
Is it really such a short protein sequence or invalid Uniprot ID DFEEIPEEYL
Is it really such a short protein sequence or invalid Uniprot ID DFEEIPEEYL
Is it really such a short protein sequence or invalid Uniprot ID DFEEIPEEYL
Is it really such a short protein sequence or invalid Uniprot ID FPR
Is it really such a short protein sequence or invalid Uniprot ID DFEEI
Is it really such a short protein sequence or invalid Uniprot ID DFEEIPEEYL
Is it really such a short protein sequence or invalid Uniprot ID DFEEIPEEYL
Is it really such a short protein sequence or invalid Uniprot ID DFEEIPEEYL
Is it really such a short protein sequence or invalid Uniprot ID DFEEIPEEYL
Is it really such a short protein sequence or invalid Uniprot ID DYEPIPEEAF
Is it really such a short protein sequence or invalid Uniprot ID APR
Is it real



Is it really such a short protein sequence or invalid Uniprot ID THYYLLP
Is it really such a short protein sequence or invalid Uniprot ID THYYLLP
Is it really such a short protein sequence or invalid Uniprot ID DEEDYYEIP
Is it really such a short protein sequence or invalid Uniprot ID PDHQYYNDF




Something went wrong with processing 2WNU and chain A




Something went wrong with processing 1NZI and chain A




Something went wrong with processing 4W4O and chain A




Is it really such a short protein sequence or invalid Uniprot ID LPKXTGG
Is it really such a short protein sequence or invalid Uniprot ID LPKXTGG




Is it really such a short protein sequence or invalid Uniprot ID GQVPFSKEEC
Is it really such a short protein sequence or invalid Uniprot ID GQVPFSKEEC




No pdbs found, probably invalid uniprot P30968 or no structures
Smth WRONG
[]
[]
[]




Is it really such a short protein sequence or invalid Uniprot ID GPRP
Is it really such a short protein sequence or invalid Uniprot ID GPRP
Is it really such a short protein sequence or invalid Uniprot ID GPRP
Is it really such a short protein sequence or invalid Uniprot ID GPRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a short protein sequence or invalid Uniprot ID GHRP
Is it really such a 



Is it really such a short protein sequence or invalid Uniprot ID XTVASSX
Is it really such a short protein sequence or invalid Uniprot ID XTVASSX
Is it really such a short protein sequence or invalid Uniprot ID TCRQSMCTAR
Is it really such a short protein sequence or invalid Uniprot ID TCPX
Is it really such a short protein sequence or invalid Uniprot ID CPAYSAYLDC
Is it really such a short protein sequence or invalid Uniprot ID CPAYSAYLDC
Is it really such a short protein sequence or invalid Uniprot ID CPAYSAYLDC
Is it really such a short protein sequence or invalid Uniprot ID CPAYSRYLDC
Is it really such a short protein sequence or invalid Uniprot ID CPAYSAYLDC
Is it really such a short protein sequence or invalid Uniprot ID CPAYSAYLAC
Is it really such a short protein sequence or invalid Uniprot ID CPAYSRYIGC
Is it really such a short protein sequence or invalid Uniprot ID CPAYSAYIGC
Is it really such a short protein sequence or invalid Uniprot ID CSAR
Is it really such a short prot



Is it really such a short protein sequence or invalid Uniprot ID TCGLRQY
Is it really such a short protein sequence or invalid Uniprot ID TCGLRQY




Is it really such a short protein sequence or invalid Uniprot ID CGLR
Is it really such a short protein sequence or invalid Uniprot ID CGLR
Is it really such a short protein sequence or invalid Uniprot ID CGLR




Smth WRONG
[]
[]
['1NDX']
Smth WRONG
[]
[]
['1LUT', '1XUL']




Something went wrong with processing 5II0 and chain A
Is it really such a short protein sequence or invalid Uniprot ID XAAPAX
Is it really such a short protein sequence or invalid Uniprot ID XAAPXX
Is it really such a short protein sequence or invalid Uniprot ID XXXMXX




Is it really such a short protein sequence or invalid Uniprot ID GNLVS




Is it really such a short protein sequence or invalid Uniprot ID YIRLP
Smth went wrong with comparison to  GAGAGCCCCAGCGAGAUAAUACUUGGCCCCGCUCUU
Smth went wrong with comparison to  GAGAGCCCCAGCGAGAUAAUACUUGGCCCCGCUCUU




Is it really such a short protein sequence or invalid Uniprot ID FPR
Is it really such a short protein sequence or invalid Uniprot ID FPR
Is it really such a short protein sequence or invalid Uniprot ID FPR
Is it really such a short protein sequence or invalid Uniprot ID FPR
Is it really such a short protein sequence or invalid Uniprot ID FPR
Is it really such a short protein sequence or invalid Uniprot ID EGR




No pdbs found, probably invalid uniprot P07307 or no structures
Smth WRONG
[]
[]
[]




Is it really such a short protein sequence or invalid Uniprot ID MLLSVPLLLG
Is it really such a short protein sequence or invalid Uniprot ID MLLSVPLLLG




Something went wrong with processing 3POS and chain C
No pdbs found, probably invalid uniprot P27824 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 3A4U and chain A
No pdbs found, probably invalid uniprot Q92637 or no structures
Smth WRONG
[]
[]
[]




Is it really such a short protein sequence or invalid Uniprot ID CLGR
Is it really such a short protein sequence or invalid Uniprot ID DYMNMS
Is it really such a short protein sequence or invalid Uniprot ID XLYASSNPAY




Is it really such a short protein sequence or invalid Uniprot ID DLYCYEQLN
Is it really such a short protein sequence or invalid Uniprot ID PPTLHELYDL
Is it really such a short protein sequence or invalid Uniprot ID PPTLHELYDL
Is it really such a short protein sequence or invalid Uniprot ID PPTLHELYDL




No pdbs found, probably invalid uniprot P05787 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID KLPSTTL




No pdbs found, probably invalid uniprot P47901 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID XKQLRX




No pdbs found, probably invalid uniprot P38435 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID WYTRX




Is it really such a short protein sequence or invalid Uniprot ID WYTRX




No pdbs found, probably invalid uniprot Q14626 or no structures
Smth WRONG
[]
[]
[]




Is it really such a short protein sequence or invalid Uniprot ID RRRWHRWRL
Is it really such a short protein sequence or invalid Uniprot ID RRRWHRWRL




No pdbs found, probably invalid uniprot O95838 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID XXGTXTSDXX




No pdbs found, probably invalid uniprot P23763 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q8N9I0 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q01362 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 5CTD and chain A




Is it really such a short protein sequence or invalid Uniprot ID RGDFV
Is it really such a short protein sequence or invalid Uniprot ID GSHAWDTAN
Is it really such a short protein sequence or invalid Uniprot ID XRGDWPCX
Is it really such a short protein sequence or invalid Uniprot ID LGGAKQAGDV
Is it really such a short protein sequence or invalid Uniprot ID LGGAKQRGDV
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invali



Is it really such a short protein sequence or invalid Uniprot ID XRGDWPCX
Is it really such a short protein sequence or invalid Uniprot ID LGGAKQAGDV
Is it really such a short protein sequence or invalid Uniprot ID LGGAKQRGDV
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid 



No pdbs found, probably invalid uniprot P35237 or no structures
Smth WRONG
[]
[]
[]




Is it really such a short protein sequence or invalid Uniprot ID VPPPVPPRRR
Is it really such a short protein sequence or invalid Uniprot ID XKYVNVP
Is it really such a short protein sequence or invalid Uniprot ID KPFYVNVEF
Is it really such a short protein sequence or invalid Uniprot ID XYVNV
Is it really such a short protein sequence or invalid Uniprot ID XYVNV
Is it really such a short protein sequence or invalid Uniprot ID XYVNV
Is it really such a short protein sequence or invalid Uniprot ID XYVNV
Is it really such a short protein sequence or invalid Uniprot ID RHYRPLPPLP
Is it really such a short protein sequence or invalid Uniprot ID XYYN
Is it really such a short protein sequence or invalid Uniprot ID XYYN
Is it really such a short protein sequence or invalid Uniprot ID APSYVNVQN
Is it really such a short protein sequence or invalid Uniprot ID KPFYVNVX
Is it really such a short protein sequence or invalid Uniprot ID XEYINQX
Is it really such a short protein sequence or invalid 



Is it really such a short protein sequence or invalid Uniprot ID APPA
Is it really such a short protein sequence or invalid Uniprot ID APPA
Is it really such a short protein sequence or invalid Uniprot ID AAA
Is it really such a short protein sequence or invalid Uniprot ID AAA
Is it really such a short protein sequence or invalid Uniprot ID QFXKX
Is it really such a short protein sequence or invalid Uniprot ID QFXKX
Is it really such a short protein sequence or invalid Uniprot ID AAA
Is it really such a short protein sequence or invalid Uniprot ID AAA
Is it really such a short protein sequence or invalid Uniprot ID AAA
Is it really such a short protein sequence or invalid Uniprot ID AAA




No pdbs found, probably invalid uniprot P01906 or no structures
Smth WRONG
[]
[]
[]




Is it really such a short protein sequence or invalid Uniprot ID XVVXAX
Is it really such a short protein sequence or invalid Uniprot ID XVVXAX




No pdbs found, probably invalid uniprot P16870 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P16519 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P29120 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P48745 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q16270 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID EYLGLDVPV
Is it really such a short protein sequence or invalid Uniprot ID EYLGLDVPV
Is it really such a short protein sequence or invalid Uniprot ID PQPEYVNQPD
Is it really such a short protein sequence or invalid Uniprot ID IISAVVGIL
Is it really such a short protein sequence or invalid Uniprot ID IISAVVGIL
Is it really such a short protein sequence or invalid Uniprot ID PQPEYVNQPD
Is it really such a short protein sequence or invalid Uniprot ID PEYLGLD




Is it really such a short protein sequence or invalid Uniprot ID REEYDV
Is it really such a short protein sequence or invalid Uniprot ID REEYDV
Is it really such a short protein sequence or invalid Uniprot ID QRATKMX
Is it really such a short protein sequence or invalid Uniprot ID RRATKMX
Is it really such a short protein sequence or invalid Uniprot ID GVVASQPARV
Is it really such a short protein sequence or invalid Uniprot ID GVVASQPARV
Is it really such a short protein sequence or invalid Uniprot ID GVVASQPARV
Is it really such a short protein sequence or invalid Uniprot ID GVVASQPARV




Something went wrong with processing 3RK2 and chain A
Is it really such a short protein sequence or invalid Uniprot ID NDKYEPFWE
Is it really such a short protein sequence or invalid Uniprot ID NDKYEPFWE




No pdbs found, probably invalid uniprot P31358 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID XDE
No pdbs found, probably invalid uniprot P49069 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q96LZ3 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID HAGPIA
Is it really such a short protein sequence or invalid Uniprot ID HAGPIA
Is it really such a short protein sequence or invalid Uniprot ID HAGPIA
Is it really such a short protein sequence or invalid Uniprot ID HAGPIA
Is it really such a short protein sequence or invalid Uniprot ID HAGPIA
Is it really such a short protein sequence or invalid Uniprot ID HAGPIA
Is it really such a short protein sequence or invalid Uniprot ID HAGPIA
Is it really such a short protein sequence or invalid Uniprot ID HAGPIA
Is it really such a short protein sequence or invalid Uniprot ID HAGPIA
Is it really such a short prote



Something went wrong with processing 4PJ5 and chain D




Is it really such a short protein sequence or invalid Uniprot ID RGDNP
Is it really such a short protein sequence or invalid Uniprot ID RGDNP
Is it really such a short protein sequence or invalid Uniprot ID RGD
Is it really such a short protein sequence or invalid Uniprot ID GRGDSP
Is it really such a short protein sequence or invalid Uniprot ID ACRGDGWC




Is it really such a short protein sequence or invalid Uniprot ID RGDFV




Is it really such a short protein sequence or invalid Uniprot ID EFPDFP
Is it really such a short protein sequence or invalid Uniprot ID HIYPDFPTD
Is it really such a short protein sequence or invalid Uniprot ID NPISDFPD
Is it really such a short protein sequence or invalid Uniprot ID YVPML
Is it really such a short protein sequence or invalid Uniprot ID YVPML




No pdbs found, probably invalid uniprot P30872 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P35346 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P30874 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P30559 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P01178 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID QYKSILQE
Is it really such a short protein sequence or invalid Uniprot ID QYKSILQE




Is it really such a short protein sequence or invalid Uniprot ID LIGRTQ
No pdbs found, probably invalid uniprot Q53I07 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P13843 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q9BYV1 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2HZP and chain A




Something went wrong with processing 3II0 and chain A




No pdbs found, probably invalid uniprot P80404 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 3DYD and chain A
No pdbs found, probably invalid uniprot Q8IYQ7 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID GPYY




Something went wrong with processing 3DD1 and chain A
No pdbs found, probably invalid uniprot O15270 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2JIS and chain A




Something went wrong with processing 4E1O and chain A
No pdbs found, probably invalid uniprot Q96A70 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O75600 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P23378 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P24298 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot P13196 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 2OYC and chain A
No pdbs found, probably invalid uniprot O15269 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 5BWT and chain A
No pdbs found, probably invalid uniprot O94903 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O95954 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 5IKO and chain A




No pdbs found, probably invalid uniprot Q2TU84 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q53ET4 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q59FK2 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q59GM9 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q59HE2 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q5JAM2 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q5VZ30 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q6IBS8 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q6P996 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q6YP21 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q6ZQY3 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q6WRI0 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q8IUZ5 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q8IVA8 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 6HRH and chain A
Something went wrong with processing 3IHJ and chain A
No pdbs found, probably invalid uniprot Q96EN8 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q96JQ3 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9BXA1 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 3L6B and chain A
Smth went wrong with comparison to  GCCCGGAUGAUCCUCAGUGGUCUGGGGUGCAGGCUUCAAACCUGUAGCUGUCUAGCGACAGAGUGGUUCAAUUCCACCUUUCGGGCGCCA
Smth went wrong with comparison to  GCCCGGAUGAUCCUCAGUGGUCUGGGGUGCAGGCUUCAAACCUGUAGCUGUCUAGCGACAGAGUGGUUCAAUUCCACCUUUCGGGCG
Smth went wrong with comparison to  GCCCGGAUGAUCCUCAGUGGUCUGGGGUGCAGGCUUCAAACCUGUAGCUGUCUAGCGACAGAGUGGUUCAAUUCCACCUUUCGGGCG




Something went wrong with processing 3HL2 and chain A
No pdbs found, probably invalid uniprot Q9NUV7 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9UGI5 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9UJX1 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2O2K and chain A
Something went wrong with processing 2XIJ and chain A
Something went wrong with processing 2QTL and chain A




Something went wrong with processing 2WWW and chain A




No pdbs found, probably invalid uniprot Q99624 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P42357 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O15427 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P36021 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O15375 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 4IMA and chain A
No pdbs found, probably invalid uniprot O15403 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O60669 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot O15374 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O95907 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P53985 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q9UHI5 or no structures
Smth WRONG
[]
[]
[]




Smth went wrong with comparison to  GCCGAGGUAGCUCAGUUGGUAGAGCAUGCGACUGAAAAUCGCAGUGUCGGCGGUUCGAUUCUGCUCCUCGGCACCA




Something went wrong with processing 2XSN and chain A
No pdbs found, probably invalid uniprot P05166 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P50747 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9Y289 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9HCC0 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 2JKU and chain A




Something went wrong with processing 4ASI and chain A
No pdbs found, probably invalid uniprot Q9Y5K3 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot P49585 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O14939 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 6EZ2 and chain A
No pdbs found, probably invalid uniprot Q13393 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q8TCT1 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P30825 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O43246 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q8WY07 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P52569 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 3BJU and chain A
Is it really such a short protein sequence or invalid Uniprot ID GPATPAP
Is it really such a short protein sequence or invalid Uniprot ID GPATPAP
Is it really such a short protein sequence or invalid Uniprot ID GPATPAP
Is it really such a short protein sequence or invalid Uniprot ID GPATPAP




Is it really such a short protein sequence or invalid Uniprot ID RGDINNNV




No pdbs found, probably invalid uniprot P09848 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O00469 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 3N6W and chain A




Something went wrong with processing 4ZEL and chain A
No pdbs found, probably invalid uniprot P19021 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID HHHHHH
Is it really such a short protein sequence or invalid Uniprot ID PPGPPGPPG
Is it really such a short protein sequence or invalid Uniprot ID PPGPPGPPG
Is it really such a short protein sequence or invalid Uniprot ID PPPPPPPPP




No pdbs found, probably invalid uniprot Q32P28 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q6N063 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q8IVL5 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q8IVL6 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 4NHX and chain A
Something went wrong with processing 2IUW and chain A




No pdbs found, probably invalid uniprot Q9UHI7 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q02809 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9H6Z9 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9NVH6 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9NXG6 or no structures
Smth WRONG
[]
[]
[]




Is it really such a short protein sequence or invalid Uniprot ID IFQINS




Something went wrong with processing 2I3C and chain A
No pdbs found, probably invalid uniprot P08243 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 1Q7L and chain A




Something went wrong with processing 4J15 and chain A




Something went wrong with processing 2V40 and chain A
No pdbs found, probably invalid uniprot P43005 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q5T6L4 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 4O0C and chain A
Smth WRONG
[]
[]
['2GJO']
No pdbs found, probably invalid uniprot Q96HD9 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 5BWA and chain A
No pdbs found, probably invalid uniprot Q9BXI2 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9Y619 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O95190 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9UMX2 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2VO1 and chain A
No pdbs found, probably invalid uniprot Q06203 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2OJW and chain A
No pdbs found, probably invalid uniprot O60427 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O95864 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P32418 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q8NER1 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9GZR5 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID NALLRYLLDK
Is it really such a short protein sequence or invalid Uniprot ID NALLRYLLDK
Is it really such a short protein sequence or invalid Uniprot ID NALLRYLLDK
Is it really such a short protein sequence or invalid Uniprot ID ALLRYLLD
Is it really such a short protein sequence or invalid Uniprot ID ALLRYLLD
Is it really such a short protein sequence or invalid Uniprot ID HKILHRLLQ
Is it really such a short protein sequence or invalid Uniprot ID HKILHR
Is 



Is it really such a short protein sequence or invalid Uniprot ID HKLVQLLTTT
Is it really such a short protein sequence or invalid Uniprot ID SLLKKLLLA
Is it really such a short protein sequence or invalid Uniprot ID KSLLQQLLTE




Smth went wrong with comparison to  CAGGTCATTUCAGGTCAG
Is it really such a short protein sequence or invalid Uniprot ID HKILHRLLQD
Is it really such a short protein sequence or invalid Uniprot ID HKILHRLLQD
Is it really such a short protein sequence or invalid Uniprot ID HKILHRLLQD
Is it really such a short protein sequence or invalid Uniprot ID KHKILHRLL
Is it really such a short protein sequence or invalid Uniprot ID KILHRLLQD
Is it really such a short protein sequence or invalid Uniprot ID KILHRLLQ
Is it really such a short protein sequence or invalid Uniprot ID HKILHRLLQD
Is it really such a short protein sequence or invalid Uniprot ID HKILHRLLQD
Is it really such a short protein sequence or invalid Uniprot ID HKILHRLLQ
Is it really such a short protein sequence or invalid Uniprot ID HKILHR
Is it really such a short protein sequence or invalid Uniprot ID HKILHRLLQ
Is it really such a short protein sequence or invalid Uniprot ID HKILHR




Smth went wrong with comparison to  GGCCGGAUGAUCCUCAGUGGUCUGGGGUGCAGGCUUCAAACCUGUAGCUGUCUAGCGACAGAGUGGUUCAAUUCCACCUUUCGGGCGCCA
Smth went wrong with comparison to  GGCCGGAUGAUCCUCAGUGGUCUGGGGUGCAGGCUUCAAACCUGUAGCUGUCUAGCGACAGAGUGGUUCAAUUCCACCUUUCGGGCGCCA
Smth went wrong with comparison to  GGCCGGAUGAUCCUCAGUGGUCUGGGGUGCAGGCUUCAAACCUGUAGCUGUCUAGCGACAGAGUGGUUCAAUUCCACCUUUCGGGCGCCA




Something went wrong with processing 3VBB and chain A




No pdbs found, probably invalid uniprot Q9H2M3 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot P31260 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9UPY5 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O60931 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P82251 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q07837 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 1NB0 and chain A




Something went wrong with processing 1HDO and chain A




Something went wrong with processing 3KS9 and chain A
No pdbs found, probably invalid uniprot Q14833 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q16099 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q16478 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 6BSZ and chain A
No pdbs found, probably invalid uniprot O00341 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O14841 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O15067 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O15399 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 5KCA and chain A
No pdbs found, probably invalid uniprot O60391 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O75879 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot P43004 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P48506 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P48664 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2VPI and chain A
No pdbs found, probably invalid uniprot Q05932 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID XVVD




No pdbs found, probably invalid uniprot Q14957 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q5JPH6 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q5TDP6 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q6IA69 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q8TCU5 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9H1K4 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9H936 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 4DYO and chain A
No pdbs found, probably invalid uniprot Q9ULK0 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9Y646 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 2ZNS and chain A
No pdbs found, probably invalid uniprot P42261 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P48507 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 3RN8 and chain A
No pdbs found, probably invalid uniprot P42263 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P48058 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2H5G and chain A




Something went wrong with processing 3QXM and chain A
No pdbs found, probably invalid uniprot Q13003 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P49619 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9BVG9 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9NXE4 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 5UVG and chain A
No pdbs found, probably invalid uniprot Q9Y2Q0 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P48651 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9UG56 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q8WTV0 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P23434 or no structures
Smth WRONG
[]
[]
[]
Smth went wrong with comparison to  GCGCCGCUGGUGUAGUGGUAUCAUGCAAGAUUCCCAUUCUUGCGACCCGGGUUCGAUUCCCGGGCGGCGCACCA
Smth went wrong with comp



No pdbs found, probably invalid uniprot Q14032 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q14330 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q6IB77 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q7Z2H8 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q8WU03 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q969I3 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9H598 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9P0Z9 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9Y345 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P48067 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P48167 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot P23416 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q15031 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O60294 or no structures
Smth WRONG
[]
[]
[]




Smth went wrong with comparison to  GACCUCGUGGCGCAAUGGUAGCGCGUCUGACUCCAGAUCAGAAGGUUGCGUGUUCGAAUCACGUCGGGGUCA
Smth went wrong with comparison to  GACCUCGUGGCGCAAUGGUAGCGCGUCUGACUCCAGAUCAGAAGGUUGCGUGUUCGAAUCACGUCGGGGUCACCA
Smth went wrong with comparison to  GACCUCGUGGCGCAAUGGUAGCGCGUCUGACUCCAGAUCAGAAGGUUGCGUGUUCGAAUCACGUCGGGGUCACCA
Smth went wrong with comparison to  GACCUCGUGGCGCAAUGGUAGCGCGUCUGACUCCAGAUCAGAAGGUUGCGUGUUCGAAUCACGUCGGGGUCACCA




No pdbs found, probably invalid uniprot P49589 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q9BZV2 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9HA77 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 6BPR and chain A
No pdbs found, probably invalid uniprot O60779 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9BW92 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2Q3E and chain A




Something went wrong with processing 1PL6 and chain A




No pdbs found, probably invalid uniprot Q6ZMR3 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 4LAU and chain A




Something went wrong with processing 2DFD and chain A




Something went wrong with processing 5IXS and chain A




No pdbs found, probably invalid uniprot Q9BYZ2 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P07864 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q08426 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q16798 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q02338 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P40925 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2GF2 and chain A
No pdbs found, probably invalid uniprot P37058 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 5N6C and chain A
No pdbs found, probably invalid uniprot P56937 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot O43837 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot P37059 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q92781 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O95479 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2PD6 and chain A




Something went wrong with processing 2FVL and chain A




Something went wrong with processing 4HMN and chain A
Something went wrong with processing 2GDZ and chain A




Something went wrong with processing 4B8W and chain A
Something went wrong with processing 1MRQ and chain A




Something went wrong with processing 3CH6 and chain A
No pdbs found, probably invalid uniprot P26439 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P14060 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot P80365 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 4JQ4 and chain A
No pdbs found, probably invalid uniprot Q15738 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P48448 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q02252 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 3SZA and chain A
Something went wrong with processing 4WJ9 and chain A




Something went wrong with processing 3H9E and chain O




No pdbs found, probably invalid uniprot P30837 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P43353 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 4WNC and chain O




Something went wrong with processing 4QGK and chain A




Something went wrong with processing 2J6L and chain A
No pdbs found, probably invalid uniprot Q96C36 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 5UAU and chain A
No pdbs found, probably invalid uniprot P29803 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 3NXX and chain A
Is it really such a short protein sequence or invalid Uniprot ID QLSPFPFDL
Is it really such a short protein sequence or invalid Uniprot ID QLSPFPFDL




No pdbs found, probably invalid uniprot Q9UBM7 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2QWX and chain A






Something went wrong with processing 1DJL and chain A






Smth went wrong with comparison to  GCUAAACCUAGCCCCAAACCCACUCCACCUUACUACCAGACAACCUUAGCCAAACCAUUUACCCAAAUAAAGUAUAGGCGAUAGAAAUUGAAACCUGGCGCAAUAGAUAUAGUACCGCAAGGGAAAGAUGAAAAAUUAUAACCAAGCAUAAUAUAGCAAGGACUAACCCCUAUACCUUCUGCAUAAUGAAUUAACUAGAAAUAACUUUGCAAGGAGAGCCAAAGCUAAGACCCCCGAAACCAGACGAGCUACCUAAGAACAGCUAAAAGAGCACACCCGUCUAUGUAGCAAAAUAGUGGGAAGAUUUAUAGGUAGAGGCGACAAACCUACCGAGCCUGGUGAUAGCUGGUUGUCCAAGAUAGAAUCUUAGUUCAACUUUAAAUUUGCCCACAGAACCCUCUAAAUCCCCUUGUAAAUUUAACUGUUAGUCCAAAGAGGAACAGCUCUUUGGACACUAGGAAAAAACCUUGUAGAGAGAGUAAAAAAUUUAACACCCAUAGUAGGCCUAAAAGCAGCCACCAAUUAAGAAAGCGUUCAAGCUCAACACCCACUACCUAAAAAAUCCCAAACAUAUAACUGAACUCCUCACACCCAAUUGGACCAAUCUAUCACCCUAUAGAAGAACUAAUGUUAGUAUAAGUAACAUGAAAACAUUCUCCUCCGCAUAAGCCUGCGUCAGAUUAAAACACUGAACUGACAAUUAACAGCCCAAUAUCUACAAUCAACCAACAAGUCAUUAUUACCCUCACUGUCAACCCAACACAGGCAUGCUCAUAAGGAAAGGUUAAAAAAAGUAAAAGGAACUCGGCAAAUCUUACCCCGCCUGUUUACCAAAAACAUCACCUCUAGCAUCACCAGUAUUAGAGGCACCGCCUGCCCAGUGACACAUGUUUAACGGCCGCGGUACCCUAACCGUGCAAAGGUAGCAUAAUCACUUGUUCCUUAAAUAGGGACCUGUAUGAAU







Something went wrong with processing 1ZMC and chain A




No pdbs found, probably invalid uniprot Q9NRX3 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q02928 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 1Y8N and chain A




No pdbs found, probably invalid uniprot Q15800 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot P14679 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P41439 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P23219 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O60488 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O95573 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 4PHU and chain A
No pdbs found, probably invalid uniprot Q5JTZ9 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 5T5S and chain A
No pdbs found, probably invalid uniprot P43007 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P26640 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q96NR8 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O75911 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q6NUM9 or no



Something went wrong with processing 3O4R and chain A
No pdbs found, probably invalid uniprot Q9HBH5 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q9NYR8 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot O95237 or no structures
Smth WRONG
[]
[]
[]
Something went wrong with processing 5H8T and chain A
No pdbs found, probably invalid uniprot Q8NBN7 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P10745 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 4J5W and chain C




Something went wrong with processing 3V99 and chain A
Something went wrong with processing 6IIE and chain A
No pdbs found, probably invalid uniprot P62714 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it really such a short protein sequence or invalid Uniprot ID ALDRXEX
Is it



Something went wrong with processing 4TLG and chain A
Something went wrong with processing 4UYB and chain A




No pdbs found, probably invalid uniprot Q9NSE4 or no structures
Smth WRONG
[]
[]
[]




Something went wrong with processing 2JIF and chain A
No pdbs found, probably invalid uniprot P41252 or no structures
Smth WRONG
[]
[]
[]
Is it really such a short protein sequence or invalid Uniprot ID PPPPPPPPP
Is it really such a short protein sequence or invalid Uniprot ID PPGPAGPPG
Is it really such a short protein sequence or invalid Uniprot ID PPGPRGPPG
Is it really such a short protein sequence or invalid Uniprot ID PPGPEGPPG




No pdbs found, probably invalid uniprot O43272 or no structures
Smth WRONG
[]
[]
[]
Smth went wrong with comparison to  AUCGCUUCUCGGCCUUUUGGCUAAGAUCAAGUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUACGUCCUCUAUCCGAGGACAAUAUAUUAAAUGGAUUUUUGGAGCAGGGAGAUGGAAUAGGAGCUUGCUCCGUCCACUCCACGCAUCGACCUGGUAUUGCAGUACCUCCAGGAACGGUGCACCC
Smth went wrong with comparison to  AGCUUUGCGCAGUGGCAGUAUCGUAGCCAAUGAGGUUUAUCCGAGGCGCGAUUAUUGCUAAUUGAAAACUUUUCCCAAUACCCCGCCGUGACGACUUGCAAUAUAGUCGGCAUUGGCAAUUUUUGACAGUCUCUACGGAGACUGG
Smth went wrong with comparison to  AUACUCUGGUUUCUCUUCAGAUCGCAUAAAUCUUUCGCCUUUUACUAAAGAUUUCCGUGGAGAGGAACAACUCUGAGUCUUAACCCAAUUUUUUGAGCCUUGCCUUGGCAAGGCUA
Smth went wrong with comparison to  GUGCUCGCUUCGGCAGCACAUAUACUAAAAUUGGAACGAUACAGAGAAGAUUAGCAUGGCCCCUGCGCAAGGAUGACACGCAAAUUCGUGAAGCGUUCCAUAUUUU
Smth went wrong with comparison to  GGGAGACGGAAUUCGAGCUCGCCCACUCUUGGAUCGGAAACCCGUCGGCCUCCGAACGGUAAGAGCCUAGCAUGUAGAACUGGUUACCUGCAGCCCAAGCUUGCUGCACGUCUAGGGCGCAGUAGUCCAGGGUUUCCUUGAUGAUGUCAUACUUAUCCUGUCCCUUUUUUUUCCACA



Smth went wrong with comparison to  AUACUCUGGUUUCUCUUCAGAUCGCAUAAAUCUUUCGCCUUUUACUAAAGAUUUCCGUGGAGAGGAACAACUCUGAGUCUUAACCCAAUUUUUUGAGGCCUUGCUUUGGCAAGGCUA
Smth went wrong with comparison to  GUGCUCGCUUCGGCAGCACAUAUACUAAAAUUGGAACGAUACAGAGAAGAUUAGCAUGGCCCCUGCGCAAGGAUGACACGCAAAUUCGUGAAGCGUUCCAUAUUUUU
Smth went wrong with comparison to  GGGAAUACACGGAAUUCGAGCUCGCCCACUCUUGGAUCGGAAACCCGUCGGCCUCCGAACGNGUAAGAGCCUAGCAUGUAGAACUGGUUACCUGCAGCCCAAGCUUGCGUACACCAUCAGGGUACGUACUAGUACGUACACCAUCAGGGUACGGCUGCACGUCUAGGGCGCAGUAGUCCAGGGUUUCCUUGAUGAUGUCAUACUUAUCCUGUCCCUUUUUUUUCCACGGCUCGCGGUUGAGGACAAACUCUUCGCGGUCUUUCCAGUGGGGAUCC
Smth went wrong with comparison to  AUCGCUUCUCGGCCUUUUGGCUAAGAUCAAGUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUACGUCCUCUAUCCGAGGACAAUAUAUUAAAUGGAUUUUUGGAGCAGGGAGAUGGAAUAGGAGCUUGCUCCGUCCACUCCACGCAUCGACCUGGUAUUGCAGUACCUCCAGGAACGGUGCACCC
No pdbs found, probably invalid uniprot Q4W8W1 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q7L3T8 or no structures
Smth WRONG
[]
[]




Something went wrong with processing 6FD4 and chain A
Something went wrong with processing 4YML and chain A
Something went wrong with processing 1XWW and chain A




Something went wrong with processing 1YXM and chain A




Something went wrong with processing 4KQJ and chain A




Is it really such a short protein sequence or invalid Uniprot ID GRVYIHPI
Is it really such a short protein sequence or invalid Uniprot ID GRVYIHPI
Is it really such a short protein sequence or invalid Uniprot ID DRVYIHPF
Is it really such a short protein sequence or invalid Uniprot ID FRHDSGY
Is it really such a short protein sequence or invalid Uniprot ID FRHDSGY
Is it really such a short protein sequence or invalid Uniprot ID FRHDSGY
Is it really such a short protein sequence or invalid Uniprot ID FRHDSGY
Is it really such a short protein sequence or invalid Uniprot ID MVGGVVIA
Is it really such a short protein sequence or invalid Uniprot ID MVGGVVIA
No pdbs found, probably invalid uniprot P46663 or no structures
Smth WRONG
[]
[]
[]




No pdbs found, probably invalid uniprot Q05940 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q01959 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot Q96RJ0 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P23975 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P35348 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P35368 or no structures
Smth WRONG
[]
[]
[]
No pdbs found, probably invalid uniprot P25100 or no structures
Smth WRONG
[]
[]
[]
Smth WRONG
[]
[]
['2CVA']
No pdbs found, probably invalid uniprot P18825 or no structures
Smth WRONG
[]
[]
[]




Smth WRONG
[]
[]
['2CDW']




Something went wrong with processing 2VZ2 and chain A
No pdbs found, probably invalid uniprot Q99870 or no structures
Smth WRONG
[]
[]
[]


SSLError: HTTPSConnectionPool(host='www.rcsb.org', port=443): Max retries exceeded with url: /pdb/download/downloadFastaFiles.do?structureIdList=2K59&compressionType=uncompressed (Caused by SSLError(SSLError("bad handshake: SysCallError(104, 'ECONNRESET')")))

In [27]:
get_uniprots_lignames_dict_from_db(root)

NameError: name 'path' is not defined

In [7]:
path_to_uniprots = aux.get_uniprots_lignames_dict_from_db(root)
with open(path_to_uniprots, 'rb') as f:
    uniprots = pickle.load(f)
print(uniprots)

NameError: name 'path' is not defined

In [20]:
uniprot = 'P00533'
path, pdb, chain = get_best_pdb_of_target(uniprot, root, verbose=True)

1DNQ
1DNR
1IVO
Similarity=3136.0, identity=622
Similarity=3136.0, identity=622
Similarity=-511.5, identity=52
Similarity=-511.5, identity=52
1M14
Similarity=1270.0, identity=333
1M17
Similarity=1270.0, identity=333
1MOX
Similarity=2354.5, identity=501
Similarity=2354.5, identity=501
Similarity=-510.0, identity=49
Similarity=-510.0, identity=49
1NQL
Similarity=3117.5, identity=623
Similarity=-511.5, identity=52
1XKK
Similarity=1310.5, identity=351
1YY9
Similarity=3108.5, identity=621
Similarity=-358.5, identity=165
Similarity=-350.5, identity=166
1Z9I
Similarity=-337.5, identity=53
2EB2
Similarity=1268.0, identity=334
2EB3
Similarity=1268.0, identity=333
2EXP
2EXQ
2GS2
Similarity=1264.5, identity=330
2GS6
Similarity=1264.5, identity=330
Similarity=-595.0, identity=13
2GS7
Similarity=1257.5, identity=329
Similarity=1257.5, identity=329
2ITN
Similarity=1254.5, identity=327
2ITO
Similarity=1254.5, identity=327
2ITP
Similarity=1254.5, identity=327
2ITQ
Similarity=1254.5, identity=327
2ITT
S

Similarity=1268.0, identity=330
4WKQ
Similarity=1264.5, identity=330
4WRG
Similarity=1264.5, identity=330
4ZAU
Similarity=1264.5, identity=330
4ZJV
Similarity=1269.0, identity=331
Similarity=1269.0, identity=331
Similarity=-500.5, identity=68
Similarity=-500.5, identity=68
4ZSE
Similarity=1256.0, identity=329
Similarity=1256.0, identity=329
Similarity=1256.0, identity=329
Similarity=1256.0, identity=329
5C8K
Similarity=1235.5, identity=326
5C8M
Similarity=1235.5, identity=326
5C8N
Similarity=1235.5, identity=326
5CAL
Similarity=1235.5, identity=326
5CAN
Similarity=1235.5, identity=326
5CAO
Similarity=1235.5, identity=326
5CAP
Similarity=1235.5, identity=326
5CAQ
Similarity=1235.5, identity=326
5CAS
Similarity=1235.5, identity=326
5CAU
Similarity=1235.5, identity=326
5CAV
Similarity=1265.5, identity=331
5CNN
Similarity=1361.5, identity=350
Similarity=1361.5, identity=350
5CNO
Similarity=1257.5, identity=329
Similarity=1257.5, identity=329
Similarity=1257.5, identity=329
5CZH
Similarity=

In [17]:
load_info_db_from_namelist(['ligands_names_and_their_targets_resources'], root)
print(ligands_names_and_their_targets_resources)

{'Lepirudin': [['HUGO Gene Nomenclature Committee (HGNC)', 'GenAtlas', 'GenBank Gene Database', 'GenBank Protein Database', 'Guide to Pharmacology', 'UniProtKB', 'UniProt Accession']], 'Cetuximab': [['HUGO Gene Nomenclature Committee (HGNC)', 'GenAtlas', 'GenBank Gene Database', 'GenBank Protein Database', 'Guide to Pharmacology', 'UniProtKB', 'UniProt Accession'], ['HUGO Gene Nomenclature Committee (HGNC)', 'GenAtlas', 'GenBank Gene Database', 'GenBank Protein Database', 'UniProtKB', 'UniProt Accession'], ['HUGO Gene Nomenclature Committee (HGNC)', 'GenAtlas', 'GenBank Gene Database', 'GenBank Protein Database', 'Guide to Pharmacology', 'UniProtKB', 'UniProt Accession'], ['HUGO Gene Nomenclature Committee (HGNC)', 'GenAtlas', 'GenBank Gene Database', 'GenBank Protein Database', 'UniProtKB', 'UniProt Accession'], ['HUGO Gene Nomenclature Committee (HGNC)', 'GenAtlas', 'GenBank Gene Database', 'GenBank Protein Database', 'UniProtKB', 'UniProt Accession'], ['HUGO Gene Nomenclature Commit

In [11]:
get_closest_complexes('3W6H', -0.05, 0.90, root, 10)
print(datetime.datetime.now())

NameError: name 'connection' is not defined

In [6]:
name_uniprot_sim = ('1', '2', 3, 5)
input1 = 'pdb'
connection_keys = []
#connection_key = (name_uniprot_sim[0], name_uniprot_sim[1], name_uniprot_sim[2], name_uniprot_sim[3], input1)
connection_key = (name_uniprot_sim[:], input1)
connection_keys.append(connection_key)
name_uniprot_sim = ('3', '4', 7, 4)
connection_key = (name_uniprot_sim[0], name_uniprot_sim[1], name_uniprot_sim[2], name_uniprot_sim[3], input1)
connection_keys.append(connection_key)
print(connection_keys)

[(('1', '2', 3, 5), 'pdb'), ('3', '4', 7, 4, 'pdb')]


In [6]:
pdb_dir = Path(root) / 'pdb'
pdb1 = '3W2O'
aux.download_pdb(pdb1, pdb_dir)
pdb2 = '1IVO'
aux.download_pdb(pdb2, pdb_dir)
struct1_path = str(pdb_dir / (pdb1 + '.pdb'))
struct2_path = str(pdb_dir / (pdb2 + '.pdb'))
get_TMscore_and_RMSD(struct1_path, struct2_path, verbose=True)


 *****************************************************************************
 *                                 TM-SCORE                                  *
 * A scoring function to assess the similarity of protein structures         *
 * Based on statistics:                                                      *
 *       0.0 < TM-score < 0.17, random structural similarity                 *
 *       0.5 < TM-score < 1.00, in about the same fold                       *
 * Reference: Yang Zhang and Jeffrey Skolnick, Proteins 2004 57: 702-710     *
 * For comments, please email to: zhng@umich.edu                             *
 *****************************************************************************

Structure1: /home/anto  Length=  306
Structure2: /home/anto  Length=  306 (by which all scores are normalized)
Number of residues in common=  306
RMSD of  the common residues=    0.005

TM-score    = 1.0000  (d0= 6.42)
MaxSub-score= 1.0000  (d0= 3.50)
GDT-TS-score= 1.0000 %(d<1)=1.0000 

(1.0, 0.005)

In [19]:
uniprot1 = 'P08100'
aux.get_pdbs_from_uniprot(uniprot1)

['4ZWJ', '5DGY', '5W0P', '6CMO']

In [5]:
extract_approved_sdf(path_to_sdf_from_drugbank, root)

'/home/anton_maximov/BACHELOR/Drugbank_extracted/structures_approved.sdf'

In [18]:
download_proteomes(root, overwrite=True)

In [13]:
%%time
print(get_closest_fastas_from_uniprot('Q14416', path_to_data_in_fasta, k=0, align_matrix='blosum62', sim_or_ident=True))

MGSLLALLALLLLWGAVAEGPAKKVLTLEGDLVLGGLFPVHQKGGPAEDCGPVNEHRGIQRLEAMLFALDRINRDPHLLPGVRLGAHILDSCSKDTHALEQALDFVRASLSRGADGSRHICPDGSYATHGDAPTAITGVIGGSYSDVSIQVANLLRLFQIPQISYASTSAKLSDKSRYDYFARTVPPDFFQAKAMAEILRFFNWTYVSTVASEGDYGETGIEAFELEARARNICVATSEKVGRAMSRAAFEGVVRALLQKPSARVAVLFTRSEDARELLAASQRLNASFTWVASDGWGALESVVAGSEGAAEGAITIELASYPISDFASYFQSLDPWNNSRNPWFREFWEQRFRCSFRQRDCAAHSLRAVPFEQESKIMFVVNAVYAMAHALHNMHRALCPNTTRLCDAMRPVNGRRLYKDFVLNVKFDAPFRPADTHNEVRFDRFGDGIGRYNIFTYLRAGSGRYRYQKVGYWAEGLTLDTSLIPWASPSAGPLPASRCSEPCLQNEVKSVQPGEVCCWLCIPCQPYEYRLDEFTCADCGLGYWPNASLTGCFELPQEYIRWGDAWAVGPVTIACLGALATLFVLGVFVRHNATPVVKASGRELCYILLGGVFLCYCMTFIFIAKPSTAVCTLRRLGLGTAFSVCYSALLTKTNRIARIFGGAREGAQRPRFISPASQVAICLALISGQLLIVVAWLVVEAPGTGKETAPERREVVTLRCNHRDASMLGSLAYNVLLIALCTLYAFKTRKCPENFNEAKFIGFTMYTTCIIWLAFLPIFYVTSSDYRVQTTTMCVSVSLSGSVVLGCLFAPKLHIILFQPQKNVVSHRAPTSRFGSAAARASSSLGQGSGSQFVPTVCNGREVVDSTTSSL
-43 263
-74 379
-171 158
-57 279
-205 156
-159 154
-190 156
-152 164
-33 281
-105 205
-165 182
-164 180
-147 185
-134 161
-86 2

-96 228
-130 222
-185 163
-49 267
-77 268
-241 134
-229 467
-238 466
-207 445
-141 216
-228 459
-63 230
-13 348
-197 453
-144 193
-139 195
-44 328
-50 342
-159 178
-25 338
-258 114
-84 221
-17 323
-63 275
-24 279
-53 269
-66 237
-47 314
-116 217
-198 141
-118 189
-82 193
-79 336
-91 369
-98 221
-58 333
-188 444
-264 465
-225 428
-260 462
-202 142
-208 144
-191 150
-62 263
-234 141
-104 215
-39 284
-30 295
-41 310
-25 314
-73 302
-115 314
-117 198
-145 207
-206 149
-55 322
-148 169
-5 365
-252 110
-149 187
-229 127
-97 221
-46 368
-92 367
-43 378
-170 163
-134 197
-475 518
-319 88
-61 304
-424 502
-134 204
-95 206
-120 202
-74 374
-96 249
-263 107
-166 184
-84 217
-97 221
-214 467
-189 143
-155 176
-73 373
-120 225
-112 219
-136 198
-181 164
-61 225
-251 121
-83 214
-171 415
-404 499
-38 360
-79 270
-467 499
-327 484
-53 267
-61 276
-29 265
-71 272
-56 353
-43 334
0 302
-71 269
-100 405
-87 407
-28 343
2 335
-20 370
-77 394
-181 170
-99 221
-126 236
-65 327
-67 276
-72 294
-27 361
-68 2

-23 307
-191 151
-120 201
-247 123
-136 199
-215 147
-288 102
-234 133
-204 140
-184 148
-140 428
-223 138
-270 105
-221 149
-128 204
-172 180
-125 201
-213 147
-157 190
-126 209
-259 119
-217 141
-91 221
-167 177
-300 100
-245 137
-104 211
-272 124
-94 218
-120 205
-223 140
-323 77
-77 358
-74 232
-63 350
-164 173
-218 143
-130 205
-167 172
-287 117
-186 157
-302 95
-151 191
-135 190
-199 144
-115 193
-68 373
-192 163
-40 299
-164 177
-168 177
-84 326
-344 69
-253 110
-210 163
-225 138
-162 177
-222 151
-303 96
-53 268
-249 127
-147 202
-200 171
-259 124
-185 169
-169 173
-279 98
-268 104
-63 380
-50 301
-150 188
-69 222
-219 148
-20 295
-176 179
-243 128
-85 235
-107 266
-172 177
-243 142
-288 99
-198 149
-223 133
-93 229
-296 91
-176 162
-116 212
-168 175
-58 287
-180 188
-42 290
-42 339
-219 140
-90 238
-223 137
-204 147
-91 216
-65 329
-116 212
-233 145
-119 204
-145 201
-331 71
-123 208
-137 190
-150 178
-202 142
-278 106
-167 184
-119 204
-174 168
-114 216
-176 177
-208 160
-279

In [None]:
from multiprocessing import Pool

def process_image(name):
    sci=fits.open('{}.fits'.format(name))
    <process>

if __name__ == '__main__':
    pool = Pool()                         # Create a multiprocessing Pool
    pool.map(process_image, data_inputs)


In [9]:
from multiprocessing import Pool
import os

if __name__ == '__main__':
    pool = Pool(os.cpu_count() - 1)                         # Create a multiprocessing Pool
    pool.map(get_closest_fastas_from_uniprot, 'P08100', path_to_data_in_fasta) 


TypeError: '<=' not supported between instances of 'str' and 'int'

Process ForkPoolWorker-79:
Process ForkPoolWorker-82:
Process ForkPoolWorker-77:
Process ForkPoolWorker-59:
Process ForkPoolWorker-60:
Process ForkPoolWorker-72:
Process ForkPoolWorker-62:
Process ForkPoolWorker-65:
Process ForkPoolWorker-83:
Process ForkPoolWorker-70:
Process ForkPoolWorker-64:
Process ForkPoolWorker-63:
Process ForkPoolWorker-61:
Process ForkPoolWorker-73:
Process ForkPoolWorker-57:
Process ForkPoolWorker-78:
Process ForkPoolWorker-81:
Process ForkPoolWorker-74:
Process ForkPoolWorker-80:
Process ForkPoolWorker-58:
Traceback (most recent call last):
Process ForkPoolWorker-76:
Process ForkPoolWorker-75:
Process ForkPoolWorker-71:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call l

  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/anton_maximov/anaconda3/lib/python3.7/

KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
KeyboardInterrupt
KeyboardInterrupt
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
KeyboardInterrupt
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/queues.py", line 351, in get
    with self._rlock:
  File "/home/anton_maximov/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
Traceback (most recent call last):
  File "/home/anton_maximov/anaconda3/lib/python3

In [68]:
print(len(mol_id_list))
print(len(tanim))
print(len(ligands_drugbank_ids))
print(tanim)

9680
1940
3220
(0.34328358208955223, 0.3176470588235294, 0.3088235294117647, 0.2905982905982906, 0.2682926829268293, 0.26582278481012656, 0.2631578947368421, 0.25, 0.25, 0.24, 0.23655913978494625, 0.23595505617977527, 0.22857142857142856, 0.22580645161290322, 0.2222222222222222, 0.22105263157894736, 0.22105263157894736, 0.2169811320754717, 0.21649484536082475, 0.21138211382113822, 0.21138211382113822, 0.2111111111111111, 0.21052631578947367, 0.208955223880597, 0.2079207920792079, 0.2079207920792079, 0.20754716981132076, 0.20754716981132076, 0.20754716981132076, 0.20388349514563106, 0.2037037037037037, 0.2, 0.19696969696969696, 0.19658119658119658, 0.19658119658119658, 0.19469026548672566, 0.1941747572815534, 0.19327731092436976, 0.1926605504587156, 0.1926605504587156, 0.19130434782608696, 0.19090909090909092, 0.19047619047619047, 0.1891891891891892, 0.18867924528301888, 0.18840579710144928, 0.18803418803418803, 0.18796992481203006, 0.18705035971223022, 0.18518518518518517, 0.1846153846

In [8]:
def get_element_of_fasta_by_number(path_to_data_in_fasta, n):
    """OUTPUT -- SeqIO fasta sequence element
    INPUT -- path to fasta file with compared fastas, number of needed element"""
    # Load fastas with thich compared           
    records = list(SeqIO.parse(path_to_data_in_fasta, "fasta"))
    return records[n]  

#print(get_element_of_fasta_by_number(path_to_data_in_fasta, 63).seq)
#print()
#print(get_element_of_fasta_by_number(path_to_data_in_fasta, 62).seq)
#print()
#print(Bio.SeqIO.read(fasta, "fasta").seq)
print_closest_fastas('/home/anton_maximov/BACHELOR/Drugbank_extracted/Drugbank_targets.fasta', [0])
#print(df.iloc[0].loc['position_in_fasta'])

Name =  lcl|BSEQ0016004|Prothrombin
Seq =  MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANTFLEEVRKGNLERECVEETCSYEEAFEALESSTATDVFWAKYTACETARTPRDKLAACLEGNCAEGLGTNYRGHVNITRSGIECQLWRSRYPHKPEINSTTHPGADLQENFCRNPDSSTTGPWCYTTDPTVRRQECSIPVCGQDQVTVAMTPRSEGSSVNLSPPLEQCVPDRGQQYQGRLAVTTHGLPCLAWASAQAKALSKHQDFNSAVQLVENFCRNPDGDEEGVWCYVAGKPGDFGYCDLNYCEEAVEEETGDGLDEDSDRAIEGRTATSEYQTFFNPRTFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGRIVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLLYPPWDKNFTENDLLVRIGKHSRTRYERNIEKISMLEKIYIHPRYNWRENLDRDIALMKLKKPVAFSDYIHPVCLPDRETAASLLQAGYKGRVTGWGNLKETWTANVGKGQPSVLQVVNLPIVERPVCKDSTRIRITDNMFCAGYKPDEGKRGDACEGDSGGPFVMKSPFNNRWYQMGIVSWGEGCDRDGKYGFYTHVFRLKKWIQKVIDQFGE


In [10]:
struct1_path = '/home/anton_maximov/BACHELOR/Drugbank_extracted/SDF_ideal.sdf'
struct2_path = '/home/anton_maximov/BACHELOR/Drugbank_extracted/SDF_ideal.pdb'
#aux.convert_single_structure(struct1_path, struct2_path)
get_TMscore_and_RMSD(struct1_path, struct2_path)

 There is no common residues in the input structures



0.0

In [34]:
pybel.fps

['ecfp0',
 'ecfp10',
 'ecfp2',
 'ecfp4',
 'ecfp6',
 'ecfp8',
 'fp2',
 'fp3',
 'fp4',
 'maccs']

In [None]:
https://www.rcsb.org/pdb/download/downloadFastaFiles.do?structureIdList=1IVO&compressionType=uncompressed

In [15]:
pdb_dir = '/home/anton_maximov/BACHELOR/pdbs'
pdb_list = aux.get_pdbs_from_uniprot('P00533')
for pdb in pdb_list:
    aux.download_pdb(pdb, pdb_dir)
struct1_path = '/home/anton_maximov/BACHELOR/pdbs/' + pdb_list[0] + '.pdb'
for pdb in pdb_list[1:]:
    struct1_path = '/home/anton_maximov/BACHELOR/pdbs/' + pdb + '.pdb'
    print(f'{pdb} {get_TMscore_and_RMSD(struct1_path, struct2_path)}')

1DNR (1.0, 0.0)
1IVO (0.5054, 37.432)
1M14 (1.0, 0.0)
1M17 (1.0, 0.006)
1MOX (0.5017, 37.343)
1NQL (0.9408, 5.826)
1XKK (1.0, 0.0)
1YY9 (0.5928, 51.98)
1Z9I (1.0, 0.0)
2EB2 (1.0, 0.0)
2EB3 (1.0, 0.0)
2EXP (0.5759, 17.215)
2EXQ (0.7328, 23.519)
2GS2 (1.0, 0.0)
2GS6 (1.0, 0.0)
2GS7 (0.5328, 29.906)
2ITN (1.0, 0.01)
2ITO (1.0, 0.0)
2ITP (1.0, 0.002)
2ITQ (1.0, 0.009)
2ITT (1.0, 0.004)
2ITU (1.0, 0.006)
2ITV (1.0, 0.008)
2ITW (1.0, 0.002)
2ITX (1.0, 0.0)
2ITY (1.0, 0.0)
2ITZ (1.0, 0.0)
2J5E (1.0, 0.0)
2J5F (1.0, 0.0)
2J6M (1.0, 0.0)
2JIT (0.5302, 25.941)
2JIU (0.5293, 26.045)
2JIV (0.536, 27.775)
2KS1 (1.0, 0.0)
2M0B (0.5482, 8.478)
2M20 (1.0, 0.0)
2N5S (1.0, 0.0)
2RF9 (0.5358, 35.777)
2RFD (0.5324, 24.959)
2RFE (0.3279, 38.944)
2RGP (1.0, 0.0)
3B2U (0.1789, 67.92)
3B2V (0.7824, 13.137)
3BEL (1.0, 0.0)
3BUO (0.5347, 26.075)
3C09 (0.4056, 42.898)
3G5V (0.5776, 17.267)
3G5Y (0.5747, 17.291)
3GOP (1.0, 0.003)
3GT8 (0.287, 53.554)
3IKA (0.5317, 26.0)
3LZB (0.2998, 38.188)
3NJP (0.5211, 36.494)

In [13]:
#input1 = '/home/anton_maximov/BACHELOR/Drugbank_extracted/uniprot-reviewed%3Ayes+AND+proteome%3Aup000005640.fasta'
input1 = '/home/anton_maximov/BACHELOR/Drugbank_extracted/Drugbank_targets.fasta'
records = list(SeqIO.parse(input1, "fasta"))
print(records.seq)
print(len(records))

AttributeError: 'list' object has no attribute 'seq'

In [14]:
for i in range(1,5):

range(1, 5)

In [4]:
input1 = '/home/anton_maximov/BACHELOR/P00533.fasta'
input2 = '/home/anton_maximov/BACHELOR/P08069.fasta'
get_sequences_similarity(input1, input2)

Using blosum62
Number of alignments = 1000
MRPSGTAGAA------LLALLAALC--PASRALEEKKVCQGTSNKLTQLGT-FEDHFLSLQRMFNNCEVVLGNLEITYVQ-----RNYDLSFLKTIQEVAGYVLI----ALNTVERIPLENLQIIRG-NMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWR---DIVSSDFLSNMSMDFQNHLGS--CQKCDPSCP------------------NGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGP-RESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKK--C----------PRNYVVTDHGSCVRAC-------GADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINA-TNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELD-ILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVV-SLNITSL---GLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTK---IISNRGEN-SCKATGQVCHALCSPEG------CW----GPEPRDCVS---------CRNVSR--GRECVDKCN---------------------LLEG-EP------------REFVENSE----------------------------------CIQCHPECLPQ---AMNITCTGRGPDN--------C----IQCAHYID---------------------GPHCVKTCP------------------------------------AGVMGENNTLV------WKYADAGHV-------------------------------------CHLC-HPNCTYGCTG-----------------PG---LEGCPTNG-----PKIPSIATGMV----

(666, 497)

In [24]:
df = get_closest_smiles_names('ClC1=CC=CC=C1CN1CCCC2=C(C1)C=CS2', 3)
print(df, df['name'], df['smiles'],df['query'], df['similarity'])

NameError: name 'df' is not defined

In [None]:
print(len(targets_names_and_fastas.keys()))

In [13]:
%%writefile RDkit1.py

from pathlib import Path
from rdkit.Chem import AllChem, rdMolAlign
from rdkit import Chem, DataStructs, RDConfig
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D, Generate
import DATABASES_SMILES as db

pdb1 = '1AZM'
pdb2 = '2KI5'
root = '/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR'
path = str((Path(root) / 'RDkit' / 'pdbs'))
aux.make_dir(path)
# acetazolamide
smiles1 = 'CC(=O)NC1=NN=C(S1)S(N)(=O)=O'
aux.download_pdb(pdb1, path)
pdb1_path = str(Path(path) / (pdb1 + '.pdb'))
# acyclovir
smiles2 = 'NC1=NC(=O)C2=C(N1)N(COCCO)C=N2'
aux.download_pdb(pdb2, path)
pdb2_path = str(Path(path) / (pdb2 + '.pdb'))
# The reference molecule
ref1 = Chem.MolFromSmiles(smiles1)
ref2 = Chem.MolFromSmiles(smiles2)
# The PDB conformations
mol1 = Chem.MolFromPDBFile(pdb1_path)
mol1 = AllChem.AssignBondOrdersFromTemplate(ref1, mol1)
mol2 = Chem.MolFromPDBFile(pdb2_path)
mol2 = AllChem.AssignBondOrdersFromTemplate(ref1, mol2)

# pharmacophore fingerprint
factory = Gobbi_Pharm2D.factory
fp1 = Generate.Gen2DFingerprint(mol1, factory, dMat=Chem.Get3DDistanceMatrix(mol1))
fp2 = Generate.Gen2DFingerprint(mol2, factory, dMat=Chem.Get3DDistanceMatrix(mol2))
# Tanimoto similarity
tani = DataStructs.TanimotoSimilarity(fp1, fp2)
print(tani)
# Align them
#rms = rdMolAlign.AlignMol(mol1, mol2)
#print(rms)
# Align them with OPEN3DAlign
#pyO3A = rdMolAlign.GetO3A(mol1, mol2)
#score = pyO3A.Align()
#print(score)

Overwriting RDkit1.py


In [33]:
#%%writefile RDkit1.py

from rdkit import Chem
from rdkit.Chem import AllChem
mol = Chem.MolFromSmiles('NC(=[NH2+])c1ccc(C[C@@H](NC(=O)CNS(=O)(=O)c2ccc3ccccc3c2)C(=O)N2CCCCC2)cc1')
cids = AllChem.EmbedMultipleConfs(mol, numConfs=50, maxAttempts=1000, pruneRmsThresh=0.1)
print(len(cids))
# align the conformers
rmslist = []
AllChem.AlignMolConformers(mol, RMSlist=rmslist)
print(len(rmslist))
# calculate RMS of confomers 1 and 9 separately
rms = AllChem.GetConformerRMS(mol, 1, 9, prealigned=True)

Overwriting RDkit1.py


In [2]:
#%%writefile Openbabel.py
import pybel
mymol = pybel.readstring("smi", "CCCC")
print(mymol.molwt)

58.1222


In [14]:
#%%writefile Openbabel.py
import openbabel
from pathlib import Path

name = 'A3551'
root = '/media/anton/b8150e49-6ff0-467b-ad66-40347e8bb188/anton/BACHELOR'
path = str(Path(root) / 'openbabel')
convert_structure('pdb', path, 'sdf', path, name)