In [7]:
import re
from pathlib import Path

import pandas as pd
import wget

In [3]:
## Download the data
# All ligand files live under one directory on the server and are named
# "<LIGAND>.txt", so the URL table is derived from the ligand list.
DSET_BASE_URL = "https://webs.iiitd.edu.in/raghava/ccpdb/datasets"
LIGANDS = ["ATP", "ADP", "GTP", "GDP", "NAD", "FAD", "FMN", "UDP"]
DSET_URL = {ligand: f"{DSET_BASE_URL}/{ligand}.txt" for ligand in LIGANDS}

for name, url in DSET_URL.items():
    target = Path(f"{name}.txt")
    if target.exists():
        # Re-running the cell must not fetch again: wget does not overwrite an
        # existing file but stores a renamed copy next to it, so later cells
        # would keep reading the stale "<name>.txt".
        print(f"Already downloaded: {name}")
        continue
    print(f"Downloading: {name}")
    wget.download(url, out=str(target))
    print()  # wget's progress bar does not terminate its own line

Downloading: ATP
100% [............................................................................] 258824 / 258824Downloading: ADP
100% [............................................................................] 309267 / 309267Downloading: GTP
100% [..............................................................................] 82648 / 82648Downloading: GDP
100% [..............................................................................] 84786 / 84786Downloading: NAD
100% [............................................................................] 108965 / 108965Downloading: FAD
100% [............................................................................] 150034 / 150034Downloading: FMN
100% [..............................................................................] 69101 / 69101Downloading: UDP
100% [..............................................................................] 51890 / 51890

In [8]:
from Bio import SeqIO

def parse_dset(filename: str, write: bool = False) -> pd.DataFrame:
    """Parse one FASTA-formatted dataset file into a DataFrame.

    Each FASTA record is expected to hold the residue letters immediately
    followed by a string of ``+``/``-`` flags; the record is cut at the first
    ``+`` or ``-`` character.

    Args:
        filename: Path of the FASTA file to read. Its stem (file name without
            directory and extension) is stored in the ``interactor`` column.
        write: When true, also save the frame as ``<stem>.csv`` next to the
            input file.

    Returns:
        A DataFrame with one row per record and the columns ``PDB`` (record id
        without its last character), ``chain`` (last character of the record
        id), ``sequence``, ``interacting_residues``, ``length`` and
        ``interactor``. An empty input file yields an empty frame that still
        carries these columns.

    Raises:
        ValueError: If a record contains no ``+``/``-`` flag at all.
    """
    columns = ["PDB", "chain", "sequence", "interacting_residues", "length", "interactor"]
    source = Path(filename)
    # Path.stem is safe for names such as "./ATP.txt", where splitting on the
    # first "." would return an empty string.
    interactor = source.stem
    first_flag = re.compile(r"[+-]")

    rows = []
    with open(filename) as fasta_file:  # handle is closed on exit
        for seq_record in SeqIO.parse(fasta_file, "fasta"):  # lazy generator
            raw = str(seq_record.seq)
            flag = first_flag.search(raw)
            if flag is None:
                raise ValueError(
                    f"Record {seq_record.id!r} in {filename!r} has no '+'/'-' interaction flags"
                )
            cut = flag.start()
            rows.append(
                (
                    seq_record.id[:-1],
                    seq_record.id[-1],
                    raw[:cut],
                    raw[cut:],
                    # Half the raw record length, kept as a float as before;
                    # presumably residues and flags are equally long -- TODO confirm.
                    len(raw) / 2,
                    interactor,
                )
            )

    # Passing `columns` explicitly keeps the schema even when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)

    if write:
        df.to_csv(source.with_suffix(".csv"))
    return df

In [9]:
## Iterate over datasets
# Parse every "<ligand>.txt" file named in DSET_URL; write=True also saves a
# per-ligand CSV as a side effect of each call.
dataset_list = [
    parse_dset(f"{ligand}.txt", write=True)
    for ligand in DSET_URL
]

In [11]:
# Stack the per-ligand frames under a single fresh 0..n-1 index, then save
# the combined table.
df = pd.concat(dataset_list, ignore_index=True)
df.to_csv("PROTEINS_NUCLEOTIDES.csv")

In [12]:
# Show the combined table as the cell's output.
df

Unnamed: 0,PDB,chain,sequence,interacting_residues,length,interactor
0,1a0i,A,VNIKTNPFKAVSFVESAIKKALDNAGYLIAEIKYDGVRGNICVDNT...,------------------------------+-++---+--------...,332.0,ATP
1,1a49,A,IQTQQLHAAMADTFLEHMCRLDIDSAPITARNTGIICTIGPASRSV...,-------------------------------------+--+-----...,519.0,ATP
2,1a82,A,SKRYFVTGTDTEVGKTVASCALLQAAKAAGYRTAGYKPVASGSEKT...,----------+++++++-------------------+---------...,224.0,ATP
3,1asz,A,EDTAKDNYGKLPLIQSRDSDRTGQKRVKFVDLDEAKDSDKEVLFRA...,----------------------------------------------...,490.0,ATP
4,1b76,A,AASSLDELVALCKRRGFIFQSSEIYGGLQGVYDYGPLGVELKNNLK...,----------------------------------------------...,442.0,ATP
...,...,...,...,...,...,...
1361,5huu,A,GKVLVVSNRIPVTIKRLDNGSYDYSMSSGGLVTALQGLKKTTEFQW...,----------------------------------------------...,469.0,UDP
1362,5lvv,A,HHVPYSSAQSTSKTSVTLSLGGGTHADSLNNLANIKREQGNIEEAV...,--++++----------------------------------------...,720.0,UDP
1363,5n80,A,AMKIAFIGEAVSGFGGMETVISNVIHTFENSSPKINCEMFFFCRND...,----------------------------------------------...,358.0,UDP
1364,5ndf,A,KVRWPDFNQEAYVGGTMVRSGQDPYARNKFNQVESDKLRMDRAIPD...,----------------------------------------------...,495.0,UDP
