In [2]:
import re
from pathlib import Path

import pandas as pd
import wget

In [33]:
## Download the data
# Every dataset lives under the same directory, one "<METAL>.txt" file per ion.
CCPDB_DATASETS_URL = "https://webs.iiitd.edu.in/raghava/ccpdb/datasets"

# Metal symbol -> URL of its dataset file (insertion order is the processing order).
DSET_URL = {
    metal: f"{CCPDB_DATASETS_URL}/{metal}.txt"
    for metal in ("FE", "MG", "CA", "MN", "ZN", "CO", "NI")
}

for k, v in DSET_URL.items():
    target = Path(k + ".txt")
    if target.exists():
        # wget.download never overwrites an existing file: it would save the
        # new copy as "FE (1).txt" while later cells keep reading "FE.txt".
        # Skipping keeps the cell idempotent under Restart & Run All.
        print(f"Already downloaded: {k}")
        continue
    print(f"Downloading: {k}")
    wget.download(v, out=str(target))
    print()  # the wget progress bar does not terminate its line

Downloading: FE
100% [............................................................................] 142509 / 142509Downloading: MG
100% [..........................................................................] 1329558 / 1329558Downloading: CA
100% [............................................................................] 949658 / 949658Downloading: MN
100% [............................................................................] 364159 / 364159Downloading: ZN
100% [............................................................................] 988449 / 988449Downloading: CO
100% [............................................................................] 114983 / 114983Downloading: NI
100% [............................................................................] 196016 / 196016

In [94]:
from Bio import SeqIO

def parse_dset(filename: str, write: bool = False) -> pd.DataFrame:
    """Parse one downloaded dataset file into a DataFrame, one row per record.

    The file is read with ``SeqIO.parse(..., 'fasta')``. For every record:

    * the last character of the record id is taken as the chain and the
      remainder as the PDB identifier;
    * the record body is cut at its first ``+`` or ``-``; the text before the
      cut is stored as ``sequence`` and the text from the cut onwards as
      ``interacting_residues`` (presumably one ``+``/``-`` flag per residue --
      TODO confirm against the dataset description).

    Parameters
    ----------
    filename : str
        Path of the dataset file, e.g. ``"FE.txt"``. Its stem (``"FE"``) is
        stored in the ``interactor`` column.
    write : bool, default False
        When true, also save the frame as CSV next to the input file, with
        the extension replaced by ``.csv``.

    Returns
    -------
    pd.DataFrame
        Columns ``PDB``, ``chain``, ``sequence``, ``interacting_residues``,
        ``length`` and ``interactor``. ``length`` is the integer number of
        characters in ``sequence``. An input without records yields an empty
        frame that still carries all columns.

    Raises
    ------
    ValueError
        If a record body contains neither ``+`` nor ``-``, so that no
        interaction string can be separated from the sequence.
    """
    columns = ["PDB", "chain", "sequence", "interacting_residues", "length"]
    source = Path(filename)

    records = []
    with open(source) as fasta_file:  # context manager closes the handle
        for seq_record in SeqIO.parse(fasta_file, "fasta"):  # lazy generator
            body = str(seq_record.seq)
            # The interaction string starts at the first '+' or '-'.
            mask_start = re.search(r"[+-]", body)
            if mask_start is None:
                raise ValueError(
                    f"{filename}: record {seq_record.id!r} has no '+'/'-' "
                    "interaction string"
                )
            cut = mask_start.start()
            sequence = body[:cut]
            records.append(
                (
                    seq_record.id[:-1],  # PDB identifier
                    seq_record.id[-1],   # chain letter
                    sequence,
                    body[cut:],
                    len(sequence),       # integer count instead of len(body) / 2
                )
            )

    # Passing the column names here keeps an empty input from failing on a
    # column-count mismatch.
    df = pd.DataFrame(records, columns=columns)
    df["interactor"] = source.stem

    if write:
        df.to_csv(source.with_suffix(".csv"))
    return df

In [95]:
## Iterate over datasets
# Parse each "<metal>.txt" file in DSET_URL order; write=True also saves a
# per-metal CSV as a side effect.
dataset_list = [
    parse_dset(f"{metal}.txt", write=True)
    for metal in DSET_URL
]

In [96]:
# Stack the per-metal frames into one table with a fresh 0..n-1 index,
# then persist the combined dataset.
df = pd.concat(dataset_list, ignore_index=True)
df.to_csv("PROTEINS_METAL.csv")

In [99]:
# Show the combined table; the rendered output below is a truncated
# preview (first and last rows) of all metals stacked together.
df

Unnamed: 0,PDB,chain,sequence,interacting_residues,length,interactor
0,1z6o,M,TQCNVNPVQIPKDWITMHRSCRNSMRQQIQMEVGASLQYLAMGAHF...,-------------------------------+--------------...,191.0,FE
1,1eo2,B,IIWGAYAQRNTEDHPPAYAPGYKTSVLRSPKNALISIAETLSEVTA...,----------------------------------------------...,238.0,FE
2,1vei,A,MTSFTIPGLSDKKASDVADLLQKQLSTYNDLHLTLKHVHWNVVGPN...,----------------------------------------------...,175.0,FE
3,5cry,A,YTRVVWCAVGPEEQKKCQQWSQQSGQNVTCATASTTDDCIVLVLKG...,----------------------------------------------...,348.0,FE
4,1y67,D,AYTLPQLPYAYDALEPHIDARTMEIHHTKHHQTYVDNANKALEGTE...,-------------------------+--------------------...,214.0,FE
...,...,...,...,...,...,...
6257,2eaq,A,QFSDMRISINQTPGKSLDFGFTIKWDIPGIFVASVEAGSPAEFSQL...,------------+----------+----------------------...,89.0,NI
6258,3mgq,F,KRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVF...,--------+--+----------------------------------...,87.0,NI
6259,5ht8,A,TKAVTFYEDINYGGAHVHLQPGNYTLSQLNTAKIPNDWMTSLKVPS...,---------------+-+----------------------------...,87.0,NI
6260,5ht9,A,NAAEVIVYEHVNFGGKSFDATSDQPGAGDNLNDKISSIKVKSGTWR...,---------+------------------------------------...,84.0,NI
