In [16]:
import pandas as pd
import numpy as np
from Bio.PDB.MMCIF2Dict import MMCIF2Dict

In [17]:
def string2range(x):
    
    """
    This function takes in a `string` representing a region of interest in a
    protein. The region of interest can be a single region or multiple regions
    of a protein. Returns a range for single regions or a list of ranges for
    multiple regions.
    
    Parameters:
    
        x (string): String containing a region or several regions of interest in a 
            protein.
            Format of x: single region -> 'start-end'
                         multiple regions -> 'start1-end1,start2-end2'
                     
    Returns:
    
        range or list of ranges: For single region proteins a range is returned. For 
            multiple region proteins a list of ranges is returned

            Format: single region -> range(start, end+1)
                    multiple region -> [range(start1, end1+1), range(start2, end2+1)]
    """
    # Handle instances with more than one range
    if ',' in x:
        list_temp = x.split(sep = ',') #list_temp = ['123-456,' '789-1111']
        for y in range(len(list_temp)): 
            list_temp[y] = list_temp[y].split(sep = '-') #list_temp[y] = [['123', '456'], ['789', '1111']]
        for y in range(len(list_temp)): 
            for x in range(len(list_temp[y])):
                list_temp[y][x] = int(list_temp[y][x]) #turns each list item into an integer

        # Make a range object with the bounds of the range. Note to the 
        # end a 1 has to be added in order to include the last position in the range
        for y in range(len(list_temp)): #[1, 2] where 1=[123, 456] and 2=[789, 1111]
            for x in range(len(list_temp[y])): #[123, 456]       
                list_temp[y] = list(range(list_temp[y][x], list_temp[y][x+1]+1)) #list_temp[0][0] = [123], list_temp[0][0+1]+1 or [456] + 1 = [457]
                break

        return list(set([item for sublist in list_temp for item in sublist]))

    # Handle instances with only one range
    else:
        list_temp = x.split(sep = '-')
        for y in range(len(list_temp)):
            list_temp[y] = int(list_temp[y]) #

        # Make a range object with the bounds of the region. Note to the 
        # end a 1 has to be added in order to include the last position in the range
        return list(range(list_temp[0], list_temp[1]+1))

Get the interfaces for proteins with simple annotated regions.

In [18]:
cf = pd.read_csv('./project_pipeline/data/classified_files.tsv', sep='\t')
intf = pd.read_csv('./project_pipeline/data/alphafold_interface.tsv', sep='\t')

proteins = cf['uniprot'].unique().tolist()
intf = intf[intf['uniprot'].isin(proteins)]

In [19]:
intf.shape

(128, 9)

In [20]:
# Select only proteins with a single range of residues in region_1 and region_2
intf = intf[intf['region_1'].str.contains(',') == False]
intf = intf[intf['region_2'].str.contains(',') == False]

intf['region_1 search'] = intf['region_1'].apply(string2range)
intf['region_2 search'] = intf['region_2'].apply(string2range)

# Make a list of these proteins
prots_one_range = intf['uniprot'].unique().tolist()

# Now we have to get rid of any na's.
intf = intf.dropna().reset_index(drop=True)

In [21]:
intf.head()

Unnamed: 0,uniprot,region_1,region_2,af_filename,interacting_residue_pairs,interface_residues,number_interface_residues,region_1 search,region_2 search
0,P62826,179-216,10-169,F-P62826-F1-model_v3.cif,"[(53, 179), (53, 180)]","{179, 180, 53}",3.0,"[179, 180, 181, 182, 183, 184, 185, 186, 187, ...","[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2..."
1,Q07889,1-198,198-565,F-Q07889-F1-model_v3.cif,"[(183, 435), (198, 203), (183, 497), (170, 545...","{519, 140, 143, 144, 147, 148, 150, 151, 153, ...",61.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[198, 199, 200, 201, 202, 203, 204, 205, 206, ..."
2,Q06124,3-104,247-517,F-Q06124-F1-model_v3.cif,"[(5, 256), (58, 506), (71, 282), (75, 262), (7...","{256, 258, 259, 4, 5, 262, 6, 7, 265, 10, 3, 9...",53.0,"[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[247, 248, 249, 250, 251, 252, 253, 254, 255, ..."
3,Q96FI4,244-245,220-280,F-Q96FI4-F1-model_v3.cif,"[(245, 247), (243, 245), (240, 244), (244, 245...","{240, 242, 243, 244, 245, 246, 247, 252}",8.0,"[244, 245]","[220, 221, 222, 223, 224, 225, 226, 227, 228, ..."
4,O76074,403-539,156-331,F-O76074-F1-model_v3.cif,"[(196, 486), (194, 486), (329, 490), (329, 491...","{194, 196, 485, 486, 487, 329, 490, 491}",8.0,"[403, 404, 405, 406, 407, 408, 409, 410, 411, ...","[156, 157, 158, 159, 160, 161, 162, 163, 164, ..."


In [22]:
intf['interface_residues'] = intf['interface_residues'].apply(lambda x: x.strip('{}')).apply(lambda x: x.split(','))
intf['interface_residues'] = intf['interface_residues'].apply(lambda x: [int(i) for i in x])

In [23]:
intf['region_1_interacting'] = intf.apply(lambda x: list(set(x['region_1 search']).intersection(set(x['interface_residues']))), axis=1)
intf['region_2_interacting'] = intf.apply(lambda x: list(set(x['region_2 search']).intersection(set(x['interface_residues']))), axis=1)

In [24]:
# Now we find the lengths between the interacting residues of each region.
for index, row in intf.iterrows():
    region_1 = row['region_1_interacting']
    region_2 = row['region_2_interacting']

    if region_2[0] > region_1[-1]:
        linker_range = str(list(range(region_1[-1], region_2[0]+1)))
    else:
        linker_range = str(list(range(region_2[-1], region_1[0]+1)))
    
    intf.loc[index, 'linker_range'] = linker_range

In [38]:
intf = intf[intf['linker_range'] != '[]'].reset_index(drop=True)

In [39]:
path = './project_pipeline/data/input/Alphafold_cif'

for index, row in intf.iterrows():
    fn = row['af_filename']
    fp = f'{path}/{fn}'
    link_range = row['linker_range'].strip('[]').split(',')
    link_range = [int(i) for i in link_range]

    mmcif = MMCIF2Dict(fp)
    seq = mmcif['_atom_site.label_seq_id']
    b_iso = mmcif['_atom_site.B_iso_or_equiv']

    df = pd.DataFrame({'seq': seq, 'plddt': b_iso}).drop_duplicates().reset_index(drop=True)
    df['seq'] = df['seq'].astype(int)
    df['plddt'] = df['plddt'].astype(float)
    df = df[df['seq'].isin(link_range)].reset_index(drop=True)

    structured = df[df['plddt'] > 50]
    disordered = df[df['plddt'] <= 50]

    intf.loc[index, 'structured_residues'] = str(structured['seq'].tolist())
    intf.loc[index, 'disordered_residues'] = str(disordered['seq'].tolist())


In [40]:
intf.head()

Unnamed: 0,uniprot,region_1,region_2,af_filename,interacting_residue_pairs,interface_residues,number_interface_residues,region_1 search,region_2 search,region_1_interacting,region_2_interacting,linker_range,structured_residues,disordered_residues
0,P62826,179-216,10-169,F-P62826-F1-model_v3.cif,"[(53, 179), (53, 180)]","[179, 180, 53]",3.0,"[179, 180, 181, 182, 183, 184, 185, 186, 187, ...","[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2...","[179, 180]",[53],"[53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...",[]
1,Q07889,1-198,198-565,F-Q07889-F1-model_v3.cif,"[(183, 435), (198, 203), (183, 497), (170, 545...","[519, 140, 143, 144, 147, 148, 150, 151, 153, ...",61.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[198, 199, 200, 201, 202, 203, 204, 205, 206, ...","[140, 143, 144, 147, 148, 150, 151, 153, 155, ...","[519, 411, 412, 415, 544, 545, 548, 549, 552, ...","[100, 101, 102, 103, 104, 105, 106, 107, 108, ...","[100, 101, 102, 103, 104, 105, 106, 107, 108, ...","[179, 180, 181, 182, 183, 184, 185, 186, 187, ..."
2,Q06124,3-104,247-517,F-Q06124-F1-model_v3.cif,"[(5, 256), (58, 506), (71, 282), (75, 262), (7...","[256, 258, 259, 4, 5, 262, 6, 7, 265, 10, 3, 9...",53.0,"[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[247, 248, 249, 250, 251, 252, 253, 254, 255, ...","[3, 4, 5, 6, 7, 9, 10, 33, 41, 57, 58, 59, 60,...","[256, 258, 259, 262, 265, 279, 280, 281, 282, ...","[80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 9...","[80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 9...","[158, 159, 160, 161, 238, 239, 240, 241]"
3,O76074,403-539,156-331,F-O76074-F1-model_v3.cif,"[(196, 486), (194, 486), (329, 490), (329, 491...","[194, 196, 485, 486, 487, 329, 490, 491]",8.0,"[403, 404, 405, 406, 407, 408, 409, 410, 411, ...","[156, 157, 158, 159, 160, 161, 162, 163, 164, ...","[485, 486, 487, 490, 491]","[329, 194, 196]","[196, 197, 198, 199, 200, 201, 202, 203, 204, ...","[196, 197, 198, 199, 200, 201, 202, 203, 204, ...","[394, 396, 397, 398, 399, 400, 401, 439, 440, ..."
4,P11142,263-287,225-262,F-P11142-F1-model_v3.cif,"[(245, 287), (231, 271), (241, 263), (262, 263...","[259, 260, 261, 262, 263, 264, 265, 266, 267, ...",29.0,"[263, 264, 265, 266, 267, 268, 269, 270, 271, ...","[225, 226, 227, 228, 229, 230, 231, 232, 233, ...","[263, 264, 265, 266, 267, 268, 270, 271, 274, ...","[259, 260, 261, 262, 230, 231, 233, 234, 235, ...","[253, 254, 255, 256, 257, 258, 259, 260, 261, ...","[253, 254, 255, 256, 257, 258, 259, 260, 261, ...",[]
