In [5]:
import re
from pathlib import Path

import pandas as pd
import wget

In [36]:
## Download the data
# Remote location of each dataset, keyed by the short name that is also used
# as the local file stem ("<name>.txt").
DSET_URL = {
    "SO4": "https://webs.iiitd.edu.in/raghava/ccpdb/datasets/SO4.txt",
    "PO4": "https://webs.iiitd.edu.in/raghava/ccpdb/datasets/PO4.txt",
    "NAG": "https://webs.iiitd.edu.in/raghava/ccpdb/datasets/NAG.txt",
    "HEM": "https://webs.iiitd.edu.in/raghava/ccpdb/datasets/HEM.txt",
    "BME": "https://webs.iiitd.edu.in/raghava/ccpdb/datasets/BME.txt",
    "EDO": "https://webs.iiitd.edu.in/raghava/ccpdb/datasets/EDO.txt",
    "PLP": "https://webs.iiitd.edu.in/raghava/ccpdb/datasets/PLP.txt",
}

# Fetch every dataset into the working directory. Files that already exist
# are skipped so that re-running this cell is idempotent: wget.download does
# not overwrite an existing file but saves a numbered copy next to it, which
# would leave the parsing cells reading the stale original.
for dset_name, dset_url in DSET_URL.items():
    target = Path(f"{dset_name}.txt")
    if target.exists():
        print(f"Already downloaded: {dset_name}")
        continue
    print(f"Downloading: {dset_name}")
    # Pin the output name so it always matches what the parsing cells open.
    wget.download(dset_url, out=str(target))

Downloading: SO4
Downloading: PO4
Downloading: NAG
Downloading: HEM
Downloading: BME
Downloading: EDO
Downloading: PLP


In [34]:
from Bio import SeqIO

def parse_dset(filename: str, write: bool = False) -> pd.DataFrame:
    """Parse one FASTA-formatted dataset file into a DataFrame.

    Every record's sequence field is treated as two concatenated parts: the
    residue string, followed by a mask that starts at the first ``+`` or ``-``
    character. (NOTE(review): presumably ``+`` marks a residue that interacts
    with the ligand and ``-`` one that does not -- confirm against the dataset
    documentation.)

    Parameters
    ----------
    filename : str
        Path of the FASTA file to read. Its stem (file name without directory
        and final extension) is stored in the ``interactor`` column.
    write : bool, default False
        When True, also save the frame as CSV next to the input file, with the
        extension replaced by ``.csv``.

    Returns
    -------
    pd.DataFrame
        One row per record with the columns ``PDB`` (record id without its
        last character), ``chain`` (last character of the record id),
        ``sequence``, ``interacting_residues``, ``length`` and ``interactor``.
        A file without records yields an empty frame with the same columns.

    Notes
    -----
    ``length`` is the length of the *whole* record string, i.e. residues plus
    mask, not of ``sequence`` alone. It is kept that way for compatibility
    with CSV files produced earlier.
    A record that contains no ``+``/``-`` at all gets an empty
    ``interacting_residues`` string instead of aborting the whole parse.
    """
    columns = ["PDB", "chain", "sequence", "interacting_residues", "length"]
    rows = []

    with open(filename) as fasta_file:  # handle is closed when the block exits
        for seq_record in SeqIO.parse(fasta_file, "fasta"):
            raw = str(seq_record.seq)

            # The mask begins at the first '+' or '-'; everything before it is
            # the residue string. Without a mask the whole string is sequence.
            mask_start = re.search(r"[+-]", raw)
            split_at = mask_start.start() if mask_start else len(raw)

            rows.append(
                (
                    seq_record.id[:-1],   # PDB identifier
                    seq_record.id[-1],    # chain letter
                    raw[:split_at],       # residues
                    raw[split_at:],       # interaction mask
                    len(seq_record.seq),  # residues + mask (see Notes)
                )
            )

    # Passing the column names to the constructor keeps the schema intact even
    # when the file contained no records.
    df = pd.DataFrame(rows, columns=columns)

    # Path.stem / with_suffix cope with directory components and dots in the
    # path (e.g. "./BME.txt"), which a plain split on "." does not.
    source = Path(filename)
    df["interactor"] = source.stem

    if write:
        df.to_csv(source.with_suffix(".csv"))
    return df
    
# Parse a single dataset as a quick check of the parser. A dedicated name is
# used so this preview frame is not mistaken for the combined `df` that a
# later cell builds from all datasets.
bme_df = parse_dset("BME.txt", write=True)
bme_df.head()

Unnamed: 0,PDB,chain,sequence,interacting_residues,length,interactor
0,1914,A,MVLLESEQFLTELTRLFQKCRSSGSVFITLKKYDEGLEPAENKCLL...,----------------------------------------------...,342,BME
1,1a6j,B,MTNNDTTLQLSSVLNRECTRSRVHCQSKKRALEIISELAAKQLSLP...,----------------------------------------------...,314,BME
2,1bff,A,KDPKRLYCKNGGFFLRIHPDGRVDGVREKSDPHIKLQLQAEERGVV...,----------------------------------------------...,258,BME
3,1btc,A,SNMLLNYVPVYVMLPLGVVNVDNVFEDPDGLKEQLLQLRAAGVDGV...,--+-------------------------------------------...,982,BME
4,1cws,A,SDHRELIGDYSKAFLLQTVDGKHQDLKYISPETMVALLTGKFSNIV...,------------------------+---------------------...,356,BME


In [37]:
## Iterate over datasets
# One parsed frame per entry of DSET_URL, in dictionary order; each frame is
# also written to its own CSV file by parse_dset.
dataset_list = [
    parse_dset(f"{dset_name}.txt", write=True)
    for dset_name in DSET_URL
]

In [44]:
# Stack the per-dataset frames into one table with a fresh 0..n-1 index and
# save the combined result.
df = pd.concat(dataset_list, ignore_index=True)
df.to_csv("PROTEINS_LIGANDS.csv")

Unnamed: 0,PDB,chain,sequence,interacting_residues,length,interactor
0,2olg,A,RNRRPELLPNDCGYQVEADKILNGDDTVPEEFPWTAMIGYKNSSNF...,----------------------------------------------...,552,SO4
1,4fi1,A,MKCRVWSEARVYTNINKQRTEEYWDYENTVIDWSTNTKDYEIENKV...,----------------------------------------------...,742,SO4
2,4g9f,E,NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLI...,----------------------------------------------...,500,SO4
3,101m,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,----------------------------------------------...,308,SO4
4,4ah9,A,SSPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFL...,---------++---++------------+-+---++-------+--...,308,SO4
...,...,...,...,...,...,...
7272,5hsj,A,SLKDLDLNALFIGDKAENGQLYKDLLNKLVDEHLGWRKNYIPSDPN...,----------------------------------------------...,1212,PLP
7273,5hxx,A,VSLQDFDAERIGLFHEDIKRKFDELKSKNLKLDLTRGKPSSEQLDF...,----------------------------------------------...,848,PLP
7274,5ijg,A,LLHPETQMLNSEIVEDRLAVYEGAESAALFSSGMSAIATTLFAFVR...,------------------------------+++--+----------...,706,PLP
7275,5w71,A,IPFDHWPEWPQHSDRTRRKIEEVFQSNRWAISGYWTGEESMERKFA...,----------------------------------------------...,826,PLP
