In [1]:
import os
import cPickle as pickle
from collections import defaultdict

import numpy as np
import pandas as pd

# Getting path
# Working directory of the notebook, obtained with os.getcwd() instead of the
# IPython-only `!pwd` shell escape (which needs a POSIX shell). It is kept as a
# one-element list so that the `curr_dir[0]` look-ups used in this notebook
# keep working unchanged.
curr_dir = [os.getcwd()]

# Reading the list of filtered domains
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this pipeline.
filtered_domains_path = os.path.join(curr_dir[0], "..", "5.domains_stats", "filtered10_list.pik")
with open(filtered_domains_path, 'rb') as handle:
    filtered_domains_list10 = pickle.load(handle)
# Sort in place so the domains are processed in a reproducible order
filtered_domains_list10.sort()

In [2]:
# Per-residue flexibility parameters, from Vihinen et al, 1994.
# Row layout: (residue, 'avg', '0', '1', '2'). 'avg' is the value thresholded
# at 1 by flexibility(); the digit columns are selected by the number of direct
# neighbours (0, 1 or 2) whose own 'avg' is not above 1.
_FLEX_COLUMNS = ('avg', '0', '1', '2')
_FLEX_ROWS = (
    ('A', 0.984, 1.315, 0.994, 0.783),
    ('C', 0.906, 1.196, 0.939, 0.785),
    ('D', 1.068, 1.372, 1.022, 0.822),
    ('E', 1.094, 1.376, 1.052, 0.826),
    ('F', 0.915, 1.247, 0.934, 0.774),
    ('G', 1.031, 1.382, 1.018, 0.784),
    ('H', 0.950, 1.279, 0.967, 0.777),
    ('I', 0.927, 1.241, 0.977, 0.776),
    ('K', 1.102, 1.367, 1.029, 0.834),
    ('L', 0.935, 1.234, 0.982, 0.783),
    ('M', 0.952, 1.269, 0.963, 0.806),
    ('N', 1.048, 1.380, 1.022, 0.799),
    ('P', 1.049, 1.342, 1.050, 0.809),
    ('Q', 1.037, 1.342, 1.041, 0.817),
    ('R', 1.008, 1.310, 1.026, 0.807),
    ('S', 1.046, 1.381, 1.025, 0.811),
    ('T', 0.997, 1.324, 0.998, 0.795),
    ('V', 0.931, 1.235, 0.968, 0.781),
    ('W', 0.904, 1.186, 0.938, 0.796),
    ('Y', 0.929, 1.199, 0.981, 0.788),
)
flex_scores = {
    row[0]: dict(zip(_FLEX_COLUMNS, row[1:]))
    for row in _FLEX_ROWS
}


def flexibility(seq):
    """Return the window-weighted flexibility profile of an amino-acid sequence.

    Each residue is looked up in ``flex_scores`` using the number of its direct
    neighbours (0, 1 or 2) whose average score is not above 1. The looked-up
    values are then combined over a sliding window of 9 residues, with weights
    that rise linearly from 0.25 at the window edges to 1 at its centre.

    seq: string of one-letter residue codes; every character must be a key of
         ``flex_scores``, otherwise a KeyError is raised.

    Returns a list holding one (unnormalised) weighted sum per window centre,
    i.e. ``len(seq) - 8`` values, or an empty list when ``seq`` is shorter than 9.
    """
    half_window = 4
    weights = [0.25, 0.4375, 0.625, 0.8125, 1, 0.8125, 0.625, 0.4375, 0.25]
    last_pos = len(seq) - 1

    # Flag per residue: 1 when its average score is at most 1, 0 when it is above 1
    low_avg = [0 if flex_scores[residue]['avg'] > 1 else 1 for residue in seq]

    profile = []
    for centre in range(half_window, len(seq) - half_window):
        window_positions = range(centre - half_window, centre + half_window + 1)
        total = 0
        for weight, pos in zip(weights, window_positions):
            # The two ends of the sequence only have one neighbour to count
            if pos == 0:
                flagged = low_avg[pos + 1]
            elif pos == last_pos:
                flagged = low_avg[pos - 1]
            else:
                flagged = low_avg[pos - 1] + low_avg[pos + 1]

            # Accumulate left to right across the window
            total += weight * flex_scores[seq[pos]][str(flagged)]

        profile.append(total)

    return profile


flexibility('DDDDDDDDD')

[7.203]

In [21]:
# Mapping of domain name -> gene -> flexibility profile of the domain sequence
# found on the gene's canonic protein
flex_dict = defaultdict(dict)

# The input folder is the same for every domain, so build it once
in_path = curr_dir[0]+"/../3.parse_HMMER/hmm_domains/pfam-v30/"

for domain_name in filtered_domains_list10:
    filename = domain_name+".csv"
    domain_data = pd.read_csv(in_path+filename, sep='\t', index_col=0, dtype={"chrom_num": str})

    # Sort the domain data
    sorted_domain_data = domain_data.sort_values(by=["chrom_num", "gene", "TargetStart"])
    sorted_domain_data = sorted_domain_data.reset_index(drop=True)

    # Get the canonic protein id
    # NOTE(review): unpickling can execute arbitrary code -- only load files
    # that were produced by this pipeline.
    with open(curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v30/"+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    # Genes already scored for *this* domain. flex_dict itself is keyed by
    # domain name, so testing `gene in flex_dict` never detects a repeated gene
    # and wrongly skips a gene whose symbol equals an already stored domain name.
    domain_flex = flex_dict[domain_name]

    for gene in sorted_domain_data.loc[:,'gene']:
        # No need to process a gene twice
        if gene in domain_flex:
            continue
        # Get sequence: first row of the sorted table that belongs to the
        # gene's canonic protein
        protein = canonic_protein[gene]
        seq = sorted_domain_data.loc[sorted_domain_data.loc[:,'prot'] == protein,'Target_Seq'].values[0]
        # Compute flexibility on the aligned sequence with '-' and 'X' characters removed
        domain_flex[gene] = flexibility(seq.replace('-','').replace('X','').upper())
    print("Finished domain "+domain_name)
            
# Save to file
#with open('flex_dict.pik', 'wb') as handle:
#    pickle.dump(flex_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Finished domain 2OG-FeII_Oxy_3
Finished domain 7TM_GPCR_Srsx
Finished domain 7tm_1
Finished domain 7tm_2
Finished domain 7tm_3
Finished domain 7tm_4
Finished domain A2M
Finished domain A2M_N
Finished domain A2M_N_2
Finished domain A2M_comp
Finished domain A2M_recep
Finished domain AAA
Finished domain AAA_11
Finished domain AAA_12
Finished domain AAA_17
Finished domain AAA_18
Finished domain AAA_33
Finished domain AAA_5
Finished domain AAA_6
Finished domain AAA_7
Finished domain AAA_8
Finished domain AAA_9
Finished domain AA_permease
Finished domain AA_permease_2
Finished domain ABC2_membrane_3
Finished domain ABC_membrane
Finished domain ABC_tran
Finished domain ADAM_CR
Finished domain ADAM_spacer1
Finished domain ADH_N
Finished domain ADH_zinc_N
Finished domain ADK
Finished domain AMP-binding
Finished domain AMP-binding_C
Finished domain ANAPC3
Finished domain ANAPC4_WD40
Finished domain ANATO
Finished domain ANF_receptor
Finished domain APC_r
Finished domain APOBEC_C
Finished domain 

Finished domain LBP_BPI_CETP
Finished domain LCE
Finished domain LIM
Finished domain LRRC37
Finished domain LRRCT
Finished domain LRRNT
Finished domain LRR_4
Finished domain LRR_5
Finished domain LRR_6
Finished domain LRR_8
Finished domain LRR_9
Finished domain LSM
Finished domain Laminin_B
Finished domain Laminin_EGF
Finished domain Laminin_G_1
Finished domain Laminin_G_2
Finished domain Laminin_G_3
Finished domain Laminin_N
Finished domain Ldl_recept_a
Finished domain Ldl_recept_b
Finished domain Lectin_C
Finished domain Lig_chan
Finished domain Lig_chan-Glu_bd
Finished domain Linker_histone
Finished domain Lipocalin
Finished domain Lys
Finished domain MAGE
Finished domain MAGE_N
Finished domain MAM
Finished domain MARVEL
Finished domain MBOAT
Finished domain MBT
Finished domain MFS_1
Finished domain MH1
Finished domain MHC_I
Finished domain MHC_II_alpha
Finished domain MHC_II_beta
Finished domain MHC_I_C
Finished domain MIF4G
Finished domain MIP
Finished domain MIT
Finished domain M

Finished domain zf-C3HC4
Finished domain zf-C3HC4_2
Finished domain zf-C3HC4_3
Finished domain zf-C3HC4_4
Finished domain zf-C4
Finished domain zf-CCCH
Finished domain zf-CCHC
Finished domain zf-CXXC
Finished domain zf-DHHC
Finished domain zf-FCS
Finished domain zf-H2C2_2
Finished domain zf-H2C2_5
Finished domain zf-HC5HC2H
Finished domain zf-HC5HC2H_2
Finished domain zf-MYND
Finished domain zf-NF-X1
Finished domain zf-RING_11
Finished domain zf-RING_2
Finished domain zf-RING_5
Finished domain zf-RING_UBOX
Finished domain zf-RanBP
Finished domain zf-TRAF
Finished domain zf-UBP
Finished domain zf-met
Finished domain zf-rbx1


In [28]:
# Inspect the domain table left over from the last iteration of the per-domain loop
sorted_domain_data

Unnamed: 0,#TargetID,pfam_id,domain_name,E-value,BitScore,TargetStart,TargetEnd,HMM_Seq,Target_Seq,HMM_Pos,...,transcript,gene_biotype,transcript_biotype,hgncID,hugoSymbol,refseq,entrez,length,HMMStart,HMMEnd
0,ENSP00000338157.4,PF12874,zf-met,1.700000e-06,22.5,518.0,540.0,fyCelCnvsftsetqlksHlrgKkH,YICSECNRTFPSHTALKRHLRS--H,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000335953.4,protein_coding,protein_coding,12930,ZBTB16,NP_005997,7704.0,673.0,1.0,25.0
1,ENSP00000376721.2,PF12874,zf-met,1.700000e-06,22.5,518.0,540.0,fyCelCnvsftsetqlksHlrgKkH,YICSECNRTFPSHTALKRHLRS--H,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000392996.2,protein_coding,protein_coding,12930,ZBTB16,NP_001018011,7704.0,673.0,1.0,25.0
2,ENSP00000419607.1,PF12874,zf-met,1.900000e-08,30.2,16.0,40.0,fyCelCnvsftsetqlksHlrgKkH,FRCCLCHVTTANRPSLDAHLGGRKH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000476907.1,protein_coding,protein_coding,26184,TUT1,XP_005274243,64852.0,874.0,1.0,25.0
3,ENSP00000308000.7,PF12874,zf-met,1.900000e-08,30.2,54.0,78.0,fyCelCnvsftsetqlksHlrgKkH,FRCCLCHVTTANRPSLDAHLGGRKH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000308436.7,protein_coding,protein_coding,26184,TUT1,NP_073741,64852.0,912.0,1.0,25.0
4,ENSP00000350574.4,PF12874,zf-met,2.200000e-06,22.1,487.0,511.0,fyCelCnvsftsetqlksHlrgKkH,YECKTCGAMFTNSGNLIVHLRSLNH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000357899.4,protein_coding,protein_coding,25001,ZBTB44,XP_005271580,29068.0,570.0,1.0,25.0
5,ENSP00000380861.1,PF12874,zf-met,2.200000e-06,22.1,487.0,511.0,fyCelCnvsftsetqlksHlrgKkH,YECKTCGAMFTNSGNLIVHLRSLNH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000397753.1,protein_coding,protein_coding,25001,ZBTB44,XP_005271580,29068.0,570.0,1.0,25.0
6,ENSP00000377849.2,PF12874,zf-met,1.100000e-11,39.7,54.0,78.0,fyCelCnvsftsetqlksHlrgKkH,ISCNICQIRFNSQSQAEAHYKGNRH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000394313.2,protein_coding,protein_coding,17521,ZNF385A,NP_056296,25946.0,366.0,1.0,25.0
7,ENSP00000446913.1,PF12874,zf-met,1.100000e-11,39.7,54.0,78.0,fyCelCnvsftsetqlksHlrgKkH,ISCNICQIRFNSQSQAEAHYKGNRH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000546970.1,protein_coding,protein_coding,17521,ZNF385A,XP_005268841,25946.0,366.0,1.0,25.0
8,ENSP00000447162.1,PF12874,zf-met,1.100000e-11,39.7,54.0,78.0,fyCelCnvsftsetqlksHlrgKkH,ISCNICQIRFNSQSQAEAHYKGNRH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000551771.1,protein_coding,protein_coding,17521,ZNF385A,,25946.0,285.0,1.0,25.0
9,ENSP00000449161.1,PF12874,zf-met,1.100000e-11,39.7,54.0,78.0,fyCelCnvsftsetqlksHlrgKkH,ISCNICQIRFNSQSQAEAHYKGNRH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000551109.1,protein_coding,protein_coding,17521,ZNF385A,XP_005268840,25946.0,366.0,1.0,25.0
