In [4]:
#%pip install BioPython

In [5]:
def get_sequence_from_PDB(PDB_id):
    """Return the amino-acid sequence of every chain in a protein structure.

    The entry is first requested from the PDB into ``PDB_URV_Database/Protein/``,
    then the local file ``PDB_URV_Database/Protein/<PDB_id>_protein.pdb`` is
    parsed and one sequence per chain is collected.

    Parameters
    ----------
    PDB_id : str
        PDB identifier, e.g. ``'6M2N'``. It is used verbatim in the file name
        and lower-cased for the structure id.

    Returns
    -------
    list
        One ``Seq`` per chain (across all models in the file), in file order.
        Chains for which no polypeptide can be built are skipped, so the list
        may be empty.

    Notes
    -----
    Only the first polypeptide fragment of each chain is used; if a chain is
    split by gaps, later fragments are not included.
    """
    from Bio.PDB import PDBList
    from Bio.PDB import PDBParser
    from Bio.PDB import PPBuilder

    protein_dir = 'PDB_URV_Database/Protein/'

    # Make sure the PDB entry has been fetched into the local database folder.
    PDBList().retrieve_pdb_file(PDB_id, pdir=protein_dir, file_format='pdb')

    # NOTE(review): the file parsed here ('<PDB_id>_protein.pdb') is not the one
    # written by retrieve_pdb_file (the notebook output shows 'pdb<id>.ent').
    # Presumably the '_protein.pdb' files are prepared separately - confirm.
    parser = PDBParser(QUIET=True)
    file = protein_dir + PDB_id + '_protein' + '.pdb'
    structure = parser.get_structure(PDB_id.lower(), file)

    builder = PPBuilder()
    seq = []
    for model in structure:
        for chain in model:
            peptides = builder.build_peptides(chain)
            if not peptides:
                # No polypeptide in this chain (nothing to build a sequence
                # from); indexing [0] here would raise IndexError.
                continue
            seq.append(peptides[0].get_sequence())
    print(seq)
    return seq  # The output is a list with the sequences of each chain

In [6]:
import pandas as pd
import csv

# Read the whitespace-delimited affinity table. It has no header row, so the
# columns are numbered; column 0 is forced to str so that identifiers such as
# '7E18' can never be inferred as floating-point numbers.
dataframe = pd.read_csv('PDB_URV_Database/Affinity.txt', sep=r'\s+', header=None, dtype={0: str})

# The first column holds the identifiers passed to get_sequence_from_PDB
protiens = dataframe.iloc[:, 0]

sequences = []  # never populated in this cell; NOTE(review): drop if no later cell uses it

protien_columns = ['protien', 'sequence_length', 'first_sequence', 'first_sequence_length', 'second_sequence', 'second_sequence_length', 'selected_sequence']

# Collect one record per protein and build the DataFrame once at the end;
# concatenating inside the loop re-copies the whole frame on every iteration.
protien_rows = []
for protien in protiens:

    sequence = get_sequence_from_PDB(protien)  # list with one sequence per chain

    # Keep the longest chain. max() returns the first of equally long chains,
    # which matches the previous `>=` comparison for two chains, and it also
    # covers structures with more than two chains. An empty list yields ''.
    selected_sequence = max(sequence, key=len) if sequence else ''

    first_sequence = sequence[0] if sequence else ''
    has_second = len(sequence) >= 2

    protien_rows.append({
        'protien': protien,
        'sequence_length': len(sequence),  # number of chains found
        'first_sequence': first_sequence,
        'first_sequence_length': len(first_sequence),
        'second_sequence': sequence[1] if has_second else '',
        'second_sequence_length': len(sequence[1]) if has_second else '',
        'selected_sequence': selected_sequence,
    })

# Passing `columns` keeps the column order (and the columns themselves when the
# input is empty); the identifier column is cast to str once, after the loop.
protien_df = pd.DataFrame(protien_rows, columns=protien_columns).astype({'protien': str})

protien_df.to_csv('PDB_URV_Database/logs/protiens_file.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)  # Set index=False to avoid writing row indices
protien_df.head(53)

<Bio.PDB.PDBList.PDBList object at 0x000002A1052F3340>
Structure exists: 'PDB_URV_Database/Protein/pdb6m2n.ent' 
[Seq('SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNY...TFQ'), Seq('SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNY...VTF')]
<Bio.PDB.PDBList.PDBList object at 0x000002A105742E60>
Structure exists: 'PDB_URV_Database/Protein/pdb6w63.ent' 
[Seq('SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNY...VTF'), Seq('SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNY...VTF')]
<Bio.PDB.PDBList.PDBList object at 0x000002A1052F3550>
Structure exists: 'PDB_URV_Database/Protein/pdb7au4.ent' 
[Seq('SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNY...VTF'), Seq('SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNY...VTF')]
<Bio.PDB.PDBList.PDBList object at 0x000002A105BC8880>
Structure exists: 'PDB_URV_Database/Protein/pdb7b2j.ent' 
[Seq('SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNY...VTF'), Seq('SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNY..

Unnamed: 0,protien,sequence_length,first_sequence,first_sequence_length,second_sequence,second_sequence_length,selected_sequence
0,6M2N,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",306,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
1,6W63,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
2,7AU4,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
3,7B2J,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
4,7B2U,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
5,7B5Z,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
6,7B77,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
7,7E18,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",303,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",303.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
8,7E19,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",305.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
9,7KX5,2,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",303,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ...",303.0,"(S, G, F, R, K, M, A, F, P, S, G, K, V, E, G, ..."
