## Extract the protein conservation scores in the form of JSD

Uses the JSD scores saved in: "conservation_scores/proteins_ucsc_con/Homo_sapiens.GRCh37/"

Higher JSD -> higher divergence from BLOSUM62 -> more conserved

### Output:
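Creates a new dictionary for each domain with the JSD score added at each position (under the key `100-way-BLOSUM_JSD`). Positions whose reference amino acid does not match the amino acid in the JSD table get the sentinel value -1.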

In [1]:
import pandas as pd 
import numpy as np
import pickle
from collections import defaultdict
import datetime

In [3]:
curr_dir = !pwd
pfam_version = "32"
domains_th = "10"
update_same_file = True

if (update_same_file):
    input_path = curr_dir[0]+"/ext_features_dicts/pfam-v"+pfam_version+"/"
else:
    input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v"+pfam_version+"/"

#Read the list of domains
if (pfam_version == "32"):
    with open(curr_dir[0]+"/../5.domains_stats/pfam-v"+pfam_version+"/regular_human_domains_list.pik", 'rb') as handle:
        filtered_domains_list = pickle.load(handle)
else:
    with open(curr_dir[0]+"/../5.domains_stats/pfam-v"+pfam_version+"/filtered"+domains_th+"_list.pik", 'rb') as handle:
        filtered_domains_list = pickle.load(handle)
filtered_domains_list = filtered_domains_list.sort()
print (len(filtered_domains_list))

6503


In [4]:
%%time
mismatch_dict = defaultdict(list)
mismatch_count = 0
for domain_name in filtered_domains_list:
    
    dirfiles = !ls -t $input_path$domain_name
    filename = dirfiles[0]
    with open(input_path+domain_name+"/"+filename, 'rb') as handle:
        states_dict = pickle.load(handle)
    
    with open(curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v"+pfam_version+"/"+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)
    
    for state in states_dict.keys():
        
        for d in states_dict[state]:
            
            #Find the protein ID
            gene = d["ens_gene"]
            protein = canonic_protein[gene]
            
            #Find JSD scores file by chromosome/gene id/protein id
            jsd_table = pd.read_csv(curr_dir[0]+"/../conservation_scores/proteins_ucsc_con/Homo_sapiens.GRCh37/"+d["chrom"]+"/"+gene+"/"+protein+".jsd.txt", skiprows=1, header=None, sep='\t')
            jsd_table.columns = ["Prot_idx", "AA", "JSD"]
            
            #Find the JSD of the corresponding protein position
            prot_pos = d["prot_pos"]
            prot_pos_adj = (prot_pos - 1) #Adjust the position because JSDs indices starts at 0.
            jsd_of_pos = float(jsd_table[jsd_table["Prot_idx"] == prot_pos_adj]["JSD"])
            
            #Sanity check: the AA match
            jsd_table_aa = jsd_table[jsd_table["Prot_idx"] == prot_pos_adj]["AA"].values[0]
            states_dict_aa = d["aa_ref_orig"]
            
            if (jsd_table_aa != "-" and jsd_table_aa.upper() != states_dict_aa.upper()):
                if not (jsd_table_aa == "*" and states_dict_aa.upper() == "X"): #mismatch isn't stop codon notation
                    mismatch_count += 1
                    mismatch_dict["chrom"].append(d["chrom"])
                    mismatch_dict["gene"].append(gene)
                    mismatch_dict["protein"].append(protein)
                    mismatch_dict["prot_pos"].append(prot_pos)
                    mismatch_dict["aa_ref"].append(states_dict_aa.upper())
                    mismatch_dict["aa_jsd"].append(jsd_table_aa.upper())
                    mismatch_dict["domain"].append(domain_name)
                    d["100-way-BLOSUM_JSD"] = -1
            else:
                d["100-way-BLOSUM_JSD"] = jsd_of_pos
            
    #Saving the updated dictionary
    !mkdir -p ext_features_dicts/pfam-v32/$domain_name
    
    with open(curr_dir[0]+"/ext_features_dicts/pfam-v"+pfam_version+"/"+domain_name+"/"+domain_name+"_hmm_states_dict_"+datetime.date.today().strftime("%m.%d.%y")+".pik", 'wb') as handle:
        pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    print "Finished "+domain_name

Finished 1-cysPrx_C
Finished 14-3-3
Finished 2-Hacid_dh
Finished 2-Hacid_dh_C
Finished 2-oxoacid_dh
Finished 2-oxogl_dehyd_N
Finished 23ISL
Finished 2Fe-2S_thioredx
Finished 2OG-FeII_Oxy
Finished 2OG-FeII_Oxy_2
Finished 2OG-FeII_Oxy_3
Finished 2OG-FeII_Oxy_4
Finished 3-HAO
Finished 3-PAP
Finished 3Beta_HSD
Finished 3HCDH
Finished 3HCDH_N
Finished 40S_S4_C
Finished 40S_SA_C
Finished 4F5
Finished 4HBT
Finished 4HBT_2
Finished 4HBT_3
Finished 4_1_CTD
Finished 5-FTHF_cyc-lig
Finished 5-nucleotidase
Finished 53-BP1_Tudor
Finished 5HT_transport_N
Finished 5_nucleotid
Finished 5_nucleotid_C
Finished 60KD_IMP
Finished 6PF2K
Finished 6PGD
Finished 7TM_GPCR_Srsx
Finished 7TM_GPCR_Srv
Finished 7TM_GPCR_Srw
Finished 7TM_GPCR_Srx
Finished 7tm_1
Finished 7tm_2
Finished 7tm_3
Finished 7tm_4
Finished A1_Propeptide
Finished A2M
Finished A2M_BRD
Finished A2M_recep
Finished AAA
Finished AAA_11
Finished AAA_12
Finished AAA_16
Finished AAA_17
Finished AAA_18
Finished AAA_19
Finished AAA_2
Finished AAA_21
F

Finished Arv1
Finished Arylesterase
Finished Ashwin
Finished Asn_synthase
Finished Asp
Finished Asp-B-Hydro_N
Finished Asp_Arg_Hydrox
Finished Asp_Glu_race_2
Finished Asp_protease
Finished Asp_protease_2
Finished Asparaginase
Finished Asparaginase_2
Finished Asparaginase_C
Finished AstE_AspA
Finished Astacin
Finished Atg14
Finished Atg8
Finished Atrophin-1
Finished Atthog
Finished Atx10homo_assoc
Finished Atypical_Card
Finished Augurin
Finished Aurora-A_bind
Finished Auto_anti-p27
Finished Autophagy_C
Finished Autophagy_N
Finished Autophagy_act_C
Finished Auts2
Finished Avl9
Finished Ax_dynein_light
Finished Axin_b-cat_bind
Finished B-block_TFIIIC
Finished B12-binding
Finished B12-binding_2
Finished B12D
Finished B2-adapt-app_C
Finished B3_4
Finished B5
Finished B56
Finished B9-C2
Finished BAALC_N
Finished BAAT_C
Finished BACK
Finished BAF
Finished BAF250_C
Finished BAG
Finished BAG6
Finished BAH
Finished BAMBI
Finished BAR
Finished BAR_3
Finished BAR_3_WASP_bdg
Finished BASP1
Finished

Finished CTD
Finished CTD_bind
Finished CTF_NFI
Finished CTNNB1_binding
Finished CTNNBL
Finished CTP_synth_N
Finished CTP_transf_1
Finished CTP_transf_3
Finished CTP_transf_like
Finished CTU2
Finished CUB
Finished CUE
Finished CUT
Finished CUTL
Finished CWC25
Finished CX9C
Finished CXCL17
Finished CXCR4_N
Finished CXCXC
Finished CYLD_phos_site
Finished CYSTM
Finished CYTH
Finished CYTL1
Finished CYYR1
Finished CaKB
Finished CaM-KIIN
Finished CaMBD
Finished CaMKII_AD
Finished CaM_bdg_C0
Finished Ca_chan_IQ
Finished Ca_hom_mod
Finished CactinC_cactus
Finished Cactin_mid
Finished Cadherin
Finished Cadherin-like
Finished Cadherin_2
Finished Cadherin_3
Finished Cadherin_C
Finished Cadherin_C_2
Finished Cadherin_pro
Finished Cadherin_tail
Finished Calc_CGRP_IAPP
Finished Calcipressin
Finished Calcyon
Finished Caldesmon
Finished Calpain_III
Finished Calpain_inhib
Finished Calpain_u2
Finished Calponin
Finished Calreticulin
Finished Calsarcin
Finished Calsequestrin
Finished Calx-beta
Finished C

Finished DUF2043
Finished DUF2045
Finished DUF2046
Finished DUF2048
Finished DUF2052
Finished DUF2053
Finished DUF2054
Finished DUF21
Finished DUF2151
Finished DUF2152
Finished DUF2181
Finished DUF2205
Finished DUF2228
Finished DUF2315
Finished DUF2340
Finished DUF2347
Finished DUF2353
Finished DUF2358
Finished DUF2362
Finished DUF2367
Finished DUF2368
Finished DUF2369
Finished DUF2371
Finished DUF2373
Finished DUF2428
Finished DUF2431
Finished DUF2439
Finished DUF2448
Finished DUF2451
Finished DUF2452
Finished DUF2462
Finished DUF2464
Finished DUF2465
Finished DUF2475
Finished DUF2476
Finished DUF2477
Finished DUF2615
Finished DUF2678
Finished DUF2723
Finished DUF2781
Finished DUF2870
Finished DUF3128
Finished DUF3314
Finished DUF3337
Finished DUF3338
Finished DUF3342
Finished DUF3350
Finished DUF3361
Finished DUF3371
Finished DUF3377
Finished DUF3381
Finished DUF3384
Finished DUF3385
Finished DUF3395
Finished DUF3398
Finished DUF3399
Finished DUF3402
Finished DUF3429
Finished DUF3432

Finished DeoC
Finished Dermcidin
Finished Det1
Finished Dexa_ind
Finished Dicer_dimer
Finished Dickkopf_N
Finished Dimer_Tnp_hAT
Finished Dimerisation2
Finished Diphthami_syn_2
Finished Diphthamide_syn
Finished Dis3l2_C_term
Finished Dishevelled
Finished Disintegrin
Finished Dmrt1
Finished Dna2
Finished DnaB_C
Finished DnaJ
Finished DnaJ_C
Finished DnaJ_CXXCXGXG
Finished Dopey_N
Finished Doppel
Finished Dor1
Finished DoxX
Finished DoxX_2
Finished Dpoe2NT
Finished Dppa2_A
Finished Dpy-30
Finished Dpy19
Finished Draxin
Finished Drf_DAD
Finished Drf_FH1
Finished Drf_FH3
Finished Drf_GBD
Finished Dsh_C
Finished DuoxA
Finished Dus
Finished Dymeclin
Finished Dynactin
Finished Dynactin_p22
Finished Dynactin_p62
Finished Dynamin_M
Finished Dynamin_N
Finished Dynamitin
Finished Dynein_AAA_lid
Finished Dynein_C
Finished Dynein_IC2
Finished Dynein_attach_N
Finished Dynein_heavy
Finished Dynein_light
Finished Dysbindin
Finished Dzip-like_N
Finished E1-E2_ATPase
Finished E1_4HB
Finished E1_DerP2_De

Finished Forkhead_N
Finished Formin_GBD_N
Finished Formyl_trans_C
Finished Formyl_trans_N
Finished Fox-1_C
Finished Fra10Ac1
Finished Frag1
Finished FragX_IP
Finished Frataxin_Cyay
Finished Fringe
Finished Frizzled
Finished Frtz
Finished Fructosamin_kin
Finished FtsH_ext
Finished FtsJ
Finished Fucokinase
Finished Fucosidase_C
Finished FumaraseC_C
Finished Fumble
Finished Furin-like
Finished Furin-like_2
Finished Fz
Finished Fzo_mitofusin
Finished G-alpha
Finished G-gamma
Finished G-patch
Finished G-patch_2
Finished G0-G1_switch_2
Finished G10
Finished G2BR
Finished G2F
Finished G6B
Finished G6PD_C
Finished G6PD_N
Finished G8
Finished GABP-alpha
Finished GAD
Finished GAF
Finished GAF_2
Finished GAF_3
Finished GAGE
Finished GAIN
Finished GAIN_A
Finished GAPT
Finished GARS_A
Finished GARS_C
Finished GARS_N
Finished GAS
Finished GAS2
Finished GAT
Finished GATA
Finished GATA-N
Finished GATase
Finished GATase_5
Finished GATase_6
Finished GATase_7
Finished GBP
Finished GBP_C
Finished GBR2_CC


Finished Hint
Finished HipN
Finished Hira
Finished His_Phos_1
Finished His_Phos_2
Finished Hist_deacetyl
Finished Hist_rich_Ca-bd
Finished Histone
Finished Histone_H2A_C
Finished HlyIII
Finished HnRNPA1
Finished HnRNP_M
Finished Homeobox_KN
Finished Homeodomain
Finished Homez
Finished Hormone_1
Finished Hormone_2
Finished Hormone_3
Finished Hormone_4
Finished Hormone_5
Finished Hormone_6
Finished Hormone_recep
Finished Hox9_act
Finished HoxA13_N
Finished HpcH_HpaI
Finished Hrs_helical
Finished HscB_4_cys
Finished Humanin
Finished Hus1
Finished Hyccin
Finished Hyd_WA
Finished Hydant_A_N
Finished Hydantoinase_A
Finished Hydantoinase_B
Finished Hydin_ADK
Finished Hydrolase
Finished Hydrolase_4
Finished Hydrolase_6
Finished Hydrolase_like
Finished I-EGF_1
Finished I-set
Finished IATP
Finished IBB
Finished IBN_N
Finished IBR
Finished ICA69
Finished ICAM_N
Finished ICAP-1_inte_bdg
Finished ICAT
Finished ICMT
Finished IDO
Finished IER
Finished IF-2
Finished IF-2B
Finished IF3_C
Finished IF3_N

Finished MAJIN
Finished MALT1_Ig
Finished MAM
Finished MAM33
Finished MANEC
Finished MAP17
Finished MAP1B_neuraxin
Finished MAP2_projctn
Finished MAP65_ASE1
Finished MAP7
Finished MAPEG
Finished MAPKK1_Int
Finished MARCKS
Finished MARVEL
Finished MAS20
Finished MAT1
Finished MATH
Finished MBD
Finished MBD_C
Finished MBDa
Finished MBF1
Finished MBOAT
Finished MBT
Finished MCC-bdg_PDZ
Finished MCCD1
Finished MCD
Finished MCD_N
Finished MCLC
Finished MCM
Finished MCM2_N
Finished MCM3AP_GANP
Finished MCM6_C
Finished MCM_N
Finished MCM_OB
Finished MCM_bind
Finished MCM_lid
Finished MCRS_N
Finished MCU
Finished MDD_C
Finished MDFI
Finished MDM1
Finished MEA1
Finished MEF2_binding
Finished MEIOC
Finished MENTAL
Finished MFAP1
Finished MFS_1
Finished MFS_1_like
Finished MFS_2
Finished MFS_5
Finished MG1
Finished MG2
Finished MG3
Finished MG4
Finished MGAT2
Finished MGC-24
Finished MGS
Finished MH1
Finished MH2
Finished MHC2-interact
Finished MHC_I
Finished MHC_II_alpha
Finished MHC_II_beta
Fin

Finished NOA36
Finished NOB1_Zn_bind
Finished NOC3p
Finished NOD
Finished NOD2_WH
Finished NODP
Finished NOG1
Finished NOG1_N
Finished NOGCT
Finished NOP5NT
Finished NOPS
Finished NOT2_3_5
Finished NO_synthase
Finished NPAT_C
Finished NPBW
Finished NPC1_N
Finished NPDC1
Finished NPF
Finished NPFF
Finished NPIP
Finished NPL4
Finished NPM1-C
Finished NPP
Finished NPR2
Finished NPR3
Finished NRBF2
Finished NRBF2_MIT
Finished NRDE-2
Finished NRIP1_repr_1
Finished NRIP1_repr_2
Finished NRIP1_repr_3
Finished NRIP1_repr_4
Finished NRN1
Finished NR_Repeat
Finished NT-C2
Finished NT5C
Finished NTF2
Finished NTP_transf_2
Finished NTP_transf_3
Finished NTP_transf_7
Finished NTP_transferase
Finished NTPase_1
Finished NTPase_I-T
Finished NTR
Finished NUC129
Finished NUC130_3NT
Finished NUC153
Finished NUC173
Finished NUC194
Finished NUC202
Finished NUC205
Finished NUDE_C
Finished NUDIX
Finished NUDIX-like
Finished NUDIX_2
Finished NUDIX_4
Finished NUDIX_5
Finished NUFIP1
Finished NUFIP2
Finished NU

Finished PNP_phzG_C
Finished PNPase
Finished PNRC
Finished PNTB
Finished PNTB_4TM
Finished POB3_N
Finished POLO_box
Finished POM121
Finished POP1
Finished POPLD
Finished POT1
Finished POT1PC
Finished PP-binding
Finished PP1_bind
Finished PP1_inhibitor
Finished PP1c_bdg
Finished PP28
Finished PP2C
Finished PP2C_C
Finished PPAK
Finished PPARgamma_N
Finished PPDFL
Finished PPIP5K2_N
Finished PPI_Ypi1
Finished PPP1R26_N
Finished PPP1R32
Finished PPP1R35_C
Finished PPP4R2
Finished PPP5
Finished PPPI_inhib
Finished PPR
Finished PPR_1
Finished PPR_2
Finished PPR_3
Finished PPR_long
Finished PPTA
Finished PQ-loop
Finished PQQ_2
Finished PQQ_3
Finished PRA1
Finished PRAP
Finished PRAS
Finished PRC2_HTH_1
Finished PRCC
Finished PRELI
Finished PRIMA1
Finished PRK
Finished PRKCSH
Finished PRKCSH-like
Finished PRKCSH_1
Finished PRKG1_interact
Finished PRMT5
Finished PRMT5_C
Finished PRMT5_TIM
Finished PRNT
Finished PRO8NT
Finished PROCN
Finished PROCT
Finished PROL5-SMR
Finished PRORP
Finished PRP1

Finished RICTOR_phospho
Finished RIG-I_C
Finished RIG-I_C-RD
Finished RIH_assoc
Finished RII_binding_1
Finished RIIa
Finished RILP
Finished RINGv
Finished RINT1_TIP1
Finished RIO1
Finished RITA
Finished RIX1
Finished RL
Finished RL10P_insert
Finished RLI
Finished RLL
Finished RMI1_C
Finished RMI1_N
Finished RMI2
Finished RMMBL
Finished RMP
Finished RNA_GG_bind
Finished RNA_POL_M_15KD
Finished RNA_bind
Finished RNA_helicase
Finished RNA_ligase
Finished RNA_pol
Finished RNA_polI_A34
Finished RNA_pol_3_Rpc31
Finished RNA_pol_A_bac
Finished RNA_pol_I_A49
Finished RNA_pol_L
Finished RNA_pol_L_2
Finished RNA_pol_N
Finished RNA_pol_Rbc25
Finished RNA_pol_Rpa2_4
Finished RNA_pol_Rpb1_1
Finished RNA_pol_Rpb1_2
Finished RNA_pol_Rpb1_3
Finished RNA_pol_Rpb1_4
Finished RNA_pol_Rpb1_5
Finished RNA_pol_Rpb1_6
Finished RNA_pol_Rpb1_7
Finished RNA_pol_Rpb1_R
Finished RNA_pol_Rpb2_1
Finished RNA_pol_Rpb2_2
Finished RNA_pol_Rpb2_3
Finished RNA_pol_Rpb2_4
Finished RNA_pol_Rpb2_5
Finished RNA_pol_Rpb2_6
F

Finished SAYSvFN
Finished SBDS
Finished SBDS_C
Finished SBF
Finished SBF2
Finished SBF_like
Finished SBP56
Finished SBP_bac_3
Finished SCA7
Finished SCAI
Finished SCAMP
Finished SCAN
Finished SCAPER_N
Finished SCF
Finished SCHIP-1
Finished SCIMP
Finished SCNM1_acidic
Finished SCO1-SenC
Finished SCP-1
Finished SCP2
Finished SCRG1
Finished SDA1
Finished SDF
Finished SE
Finished SEA
Finished SEEEED
Finished SEEK1
Finished SEFIR
Finished SEP
Finished SERTA
Finished SET
Finished SF1-HH
Finished SF3A2
Finished SF3A3
Finished SF3a60_bindingd
Finished SF3b1
Finished SF3b10
Finished SFTA2
Finished SGIII
Finished SGL
Finished SGS
Finished SGT1
Finished SGTA_dimer
Finished SH2
Finished SH2_2
Finished SH3-RhoG_link
Finished SH3-WW_linker
Finished SH3BGR
Finished SH3BP5
Finished SH3_1
Finished SH3_10
Finished SH3_12
Finished SH3_15
Finished SH3_19
Finished SH3_2
Finished SH3_3
Finished SH3_9
Finished SHIPPO-rpt
Finished SHMT
Finished SHNi-TPR
Finished SHQ1
Finished SHR-BD
Finished SHS2_Rpb7-N
Finis

Finished TAXi_N
Finished TB
Finished TB2_DP1_HVA22
Finished TBCA
Finished TBCC
Finished TBCC_N
Finished TBD
Finished TBK1_CCD1
Finished TBK1_ULD
Finished TBP
Finished TBP-binding
Finished TBPIP
Finished TBX
Finished TC1
Finished TCL1_MTCP1
Finished TCR
Finished TCRP1
Finished TCR_zetazeta
Finished TCTP
Finished TDP43_N
Finished TDRP
Finished TEA
Finished TED_complement
Finished TEP1_N
Finished TERB2
Finished TERF2_RBM
Finished TEX12
Finished TEX13
Finished TEX15
Finished TEX19
Finished TEX29
Finished TEX33
Finished TFA2_Winged_2
Finished TFCD_C
Finished TFIIA
Finished TFIIA_gamma_C
Finished TFIIA_gamma_N
Finished TFIIB
Finished TFIID-18kDa
Finished TFIID-31kDa
Finished TFIID_20kDa
Finished TFIID_30kDa
Finished TFIID_NTD2
Finished TFIIE-A_C
Finished TFIIE_alpha
Finished TFIIE_beta
Finished TFIIF_alpha
Finished TFIIF_beta
Finished TFIIF_beta_N
Finished TFIIIC_delta
Finished TFIIIC_sub6
Finished TFIIS_C
Finished TFIIS_M
Finished TFR_dimer
Finished TF_AP-2
Finished TF_Otx
Finished TF_Zn_Ri

Finished UN_NPL4
Finished UPA
Finished UPAR_LY6
Finished UPAR_LY6_2
Finished UPA_2
Finished UPF0004
Finished UPF0016
Finished UPF0020
Finished UPF0029
Finished UPF0054
Finished UPF0061
Finished UPF0086
Finished UPF0113
Finished UPF0113_N
Finished UPF0121
Finished UPF0139
Finished UPF0160
Finished UPF0172
Finished UPF0176_N
Finished UPF0183
Finished UPF0184
Finished UPF0193
Finished UPF0203
Finished UPF0220
Finished UPF0239
Finished UPF0240
Finished UPF0258
Finished UPF0444
Finished UPF0449
Finished UPF0489
Finished UPF0492
Finished UPF0515
Finished UPF0524
Finished UPF0542
Finished UPF0547
Finished UPF0552
Finished UPF0556
Finished UPF0560
Finished UPF0561
Finished UPF0564
Finished UPF0565
Finished UPF0640
Finished UPF0669
Finished UPF0688
Finished UPF0697
Finished UPF0728
Finished UPF0731
Finished UPF0767
Finished UPF1_Zn_bind
Finished UPRTase
Finished UQ_con
Finished URO-D
Finished USP19_linker
Finished USP7_C2
Finished USP7_ICP0_bdg
Finished USP8_dimer
Finished USP8_interact
Finishe

Finished zf-MYST
Finished zf-NADH-PPase
Finished zf-NF-X1
Finished zf-NOSIP
Finished zf-NPL4
Finished zf-Nse
Finished zf-PARP
Finished zf-RAG1
Finished zf-RING-like
Finished zf-RING_10
Finished zf-RING_11
Finished zf-RING_12
Finished zf-RING_14
Finished zf-RING_2
Finished zf-RING_4
Finished zf-RING_5
Finished zf-RING_6
Finished zf-RING_9
Finished zf-RING_UBOX
Finished zf-RNPHF
Finished zf-RRN7
Finished zf-RanBP
Finished zf-SAP30
Finished zf-SCNM1
Finished zf-SNAP50_C
Finished zf-Sec23_Sec24
Finished zf-TAZ
Finished zf-TFIIIC
Finished zf-TRAF
Finished zf-TRAF_2
Finished zf-TRM13_CCCH
Finished zf-Tim10_DDP
Finished zf-U1
Finished zf-U11-48K
Finished zf-UBP
Finished zf-UBP_var
Finished zf-UBR
Finished zf-WRNIP1_ubi
Finished zf-ZPR1
Finished zf-dskA_traR
Finished zf-met
Finished zf-nanos
Finished zf-piccolo
Finished zf-primase
Finished zf-rbx1
Finished zf-tcix
Finished zf_C2H2_10
Finished zf_C2H2_6
Finished zf_C2H2_ZHX
Finished zf_C2HC_14
Finished zf_CCCH_4
Finished zf_CCCH_5
Finished zf_H

In [5]:
# One row per mismatching position; each key of mismatch_dict becomes a column.
mismatch_df = pd.DataFrame(mismatch_dict)
# Optional export of the mismatches table:
#mismatch_df.to_csv(curr_dir[0]+"/aa_mismatch.csv", sep='\t')

In [6]:
# (number of mismatching positions, number of recorded fields)
mismatch_df.shape

(907, 7)

In [7]:
# Domains that contain at least one mismatching position
mismatch_df["domain"].unique()

array(['CARD', 'FGF', 'GSHPx', 'JmjC', 'NAP', 'PDZ', 'PDZ_6', 'PHD',
       'PLU-1', 'RBM1CTR', 'Rdx', 'SelP_C', 'SelP_N', 'SelR',
       'Sep15_SelM', 'T4_deiodinase', 'VCX_VCY', 'eIF-1a', 'zf-C5HC2'],
      dtype=object)

In [8]:
# Keep only mismatches where the JSD table has no gap ("-") and the reference amino acid
# is not "*", then renumber the rows from 0.
is_non_gap = mismatch_df["aa_jsd"] != "-"
is_non_stop_ref = mismatch_df["aa_ref"] != "*"
non_gap_mismatch_df = mismatch_df[is_non_gap & is_non_stop_ref].reset_index(drop=True)
# Optional export of the filtered mismatches table:
#non_gap_mismatch_df.to_csv(curr_dir[0]+"/non_gap_mismatch_df.csv", sep='\t')

In [9]:
# (number of mismatching positions left after filtering, number of recorded fields)
non_gap_mismatch_df.shape

(882, 7)