In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
import re
from scipy.stats import fisher_exact
import math


In [2]:
# Notebook-wide display and warning configuration.
import warnings

# Render at most 25 rows of a DataFrame, but never truncate its columns.
for option_name, option_value in (('display.max_rows', 25), ('display.max_columns', None)):
    pd.set_option(option_name, option_value)

# Switch off pandas' chained-assignment warning (the pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

# Silence the whole RuntimeWarning category for the rest of the session
# (this is category-wide, not a single specific warning).
warnings.simplefilter("ignore", category=RuntimeWarning)


# **1. Read Files**

## **1.1 GNOMAD**

In [9]:
# Load the gnomAD variant table written by the upstream pipeline and show its row count.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted pipeline run.
gnomad = pd.read_pickle(
    "../data/pipeline_results/gnomad_variants.pkl"
)
gnomad.shape[0]

15941

In [10]:
# Remove the literal text "chr" from the gnomAD chromosome labels (e.g. "chr1" -> "1").
# regex=False states explicitly that 'chr' is a plain substring, not a pattern.
chrom_labels = gnomad['#CHROM']
gnomad['#CHROM'] = chrom_labels.str.replace('chr', '', regex=False)


In [11]:
# Number of gnomAD rows per gene symbol, most frequent first.
gnomad['SYMBOL'].value_counts()

SYMBOL
HLA-A       59
AHNAK2      57
HLA-B       53
HLA-C       51
HLA-DQB1    47
            ..
NEFM         1
ADAMDEC1     1
ADAM28       1
ENTPD4       1
RBMY1J       1
Name: count, Length: 7188, dtype: int64

## **1.2 CLINVAR**

In [12]:
# Load the ClinVar variant table written by the upstream pipeline and show its row count.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted pipeline run.
clinvar = pd.read_pickle(
    "../data/pipeline_results/clinvar_variants.pkl"
)
clinvar.shape[0]

31252

In [13]:
# Row count of the ClinVar table.
# NOTE(review): this repeats the count already displayed by the loading cell; candidate for removal.
clinvar.shape[0]

31252

## **1.3 PAVS**

In [14]:
# Load the PAVS variant table written by the upstream pipeline and show its row count.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted pipeline run.
pavs = pd.read_pickle(
    "../data/pipeline_results/pavs_variants.pkl"
)
pavs.shape[0]

481

# **2. Concatenate**

In [15]:
# Tag every row with the dataset it came from so the origin survives concatenation.
# Each frame is modified in place.
for frame, source_label in (
    (clinvar, 'Pathogenic'),
    (gnomad, 'Common'),
    (pavs, 'Pavs'),
):
    frame['Source'] = source_label

In [16]:
# Stack the three labelled tables row-wise (ClinVar, then gnomAD, then PAVS).
# Columns missing from one table are left empty for its rows,
# and ignore_index=True assigns a fresh 0..n-1 index.
frames_to_stack = [clinvar, gnomad, pavs]
concatenated_df = pd.concat(
    frames_to_stack,
    ignore_index=True,
)
concatenated_df

Unnamed: 0,#CHROM,POS,ID,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,ALLELEID,CLNDN,CLNDNINCL,CLNDISDB,CLNDISDBINCL,CLNHGVS,CLNREVSTAT,CLNSIG,CLNSIGCONF,CLNSIGINCL,CLNVC,CLNVCSO,CLNVI,DBVARID,MC,ORIGIN,RS,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,RefSeq,SIFT,PolyPhen,DOMAINS,CLIN_SIG,SOMATIC,PHENO,Conservation,BLOSUM62,GENEINFO_NAME,GENEINFO_ID,RefSeq_noversion,UniProt_IDs,PDB_path,Residue_position,Residue,DSSP_path,intra_contacts,is_catalytic,pLDDT,secondary_structure,accessibility,total energy,Backbone Hbond,Sidechain Hbond,Van der Waals,Electrostatics,Solvation Polar,Solvation Hydrophobic,Van der Waals clashes,entropy sidechain,entropy mainchain,sloop_entropy,mloop_entropy,cis_bond,torsional clash,backbone clash,helix dipole,water bridge,disulfide,electrostatic kon,partial covalent bonds,energy Ionisation,Entropy Complex,Source,AC,AF,AN,QD,variant_type,vep,flags,hgvsAllele,geneSymbol,protein,zygosityLabel,phenoLabels,sex,pavs_mutation,vep_mutation
0,1,197429459,1048144,G,C,,,,1036051,Leber_congenital_amaurosis_8,,"MONDO:MONDO:0013453,MedGen:C3151202,OMIM:61383...",,NC_000001.11:g.197429459G>C,"criteria_provided,_single_submitter",Likely_pathogenic,,,single_nucleotide_variant,SO:0001483,,,"SO:0001583|missense_variant,SO:0001619|non-cod...",0,1571544334,C,missense_variant,MODERATE,CRB1,ENSG00000134376,Transcript,ENST00000367400,protein_coding,8/12,,,,2848/4958,2687/4221,896/1406,C/S,tGt/tCt,CM1610693,,1,SNV,HGNC,HGNC:2343,YES,NM_201253.3,,ENSP00000356370,P82279.210,A0A7D6VM04.8,UPI0000073345,P82279-1,NM_201253.3,deleterious(0),probably_damaging(0.947),Gene3D:2.10.25.10&AFDB-ENSP_mappings:AF-P82279...,,,1,2.050,-1,CRB1,23418,NM_201253,"[P82279-1, A0A7D6VM04]",/ibex/scratch/projects/c2102/databases/alphafo...,896,Cys,/ibex/scratch/projects/c2102/databases/dssp/al...,"[902, 903, 909, 910, 911, 912, 920, 921, 922]",False,78.820000,,23,5.248440,0.073737,0.000000e+00,0.751425,0.000000e+00,0.134081,1.298280,0.956765,-0.603521,-0.434252,0.0,0.0,0.0,0.113566,0.006548,0.000000e+00,0.0,2.95836,0.0,0.0,0.000000,0.0,Pathogenic,,,,,,,,,,,,,,,
1,1,197429466,1213987,T,G,,,,1203980,Retinitis_pigmentosa_12,,"MONDO:MONDO:0010818,MedGen:C1838647,OMIM:60010...",,NC_000001.11:g.197429466T>G,"criteria_provided,_single_submitter",Likely_pathogenic,,,single_nucleotide_variant,SO:0001483,,,"SO:0001583|missense_variant,SO:0001619|non-cod...",1,,G,missense_variant,MODERATE,CRB1,ENSG00000134376,Transcript,ENST00000367400,protein_coding,8/12,,,,2855/4958,2694/4221,898/1406,N/K,aaT/aaG,,,1,SNV,HGNC,HGNC:2343,YES,NM_201253.3,,ENSP00000356370,P82279.210,A0A7D6VM04.8,UPI0000073345,P82279-1,NM_201253.3,deleterious(0),benign(0.204),Gene3D:2.10.25.10&AFDB-ENSP_mappings:AF-P82279...,,,,-4.100,0,CRB1,23418,NM_201253,"[P82279-1, A0A7D6VM04]",/ibex/scratch/projects/c2102/databases/alphafo...,898,Asn,/ibex/scratch/projects/c2102/databases/dssp/al...,"[914, 917, 921, 922, 923, 924, 925]",False,78.600006,T,91,0.097497,0.658705,5.965850e-01,-0.087486,-6.183120e-02,-0.394596,-0.590773,0.029495,-0.287550,0.260123,0.0,0.0,0.0,-0.038429,-0.131667,1.325550e-02,0.0,0.00000,0.0,0.0,0.000000,0.0,Pathogenic,,,,,,,,,,,,,,,
2,1,197429467,1963628,G,A,,,,2017299,Retinitis_pigmentosa_12|Leber_congenital_amaur...,,"MONDO:MONDO:0010818,MedGen:C1838647,OMIM:60010...",,NC_000001.11:g.197429467G>A,"criteria_provided,_single_submitter",Likely_pathogenic,,,single_nucleotide_variant,SO:0001483,,,"SO:0001583|missense_variant,SO:0001619|non-cod...",1,,A,missense_variant,MODERATE,CRB1,ENSG00000134376,Transcript,ENST00000367400,protein_coding,8/12,,,,2856/4958,2695/4221,899/1406,G/R,Gga/Aga,COSV66329886,,1,SNV,HGNC,HGNC:2343,YES,NM_201253.3,,ENSP00000356370,P82279.210,A0A7D6VM04.8,UPI0000073345,P82279-1,NM_201253.3,deleterious(0.03),possibly_damaging(0.624),Gene3D:2.10.25.10&AFDB-ENSP_mappings:AF-P82279...,,1,1,1.220,-2,CRB1,23418,NM_201253,"[P82279-1, A0A7D6VM04]",/ibex/scratch/projects/c2102/databases/alphafo...,899,Gly,/ibex/scratch/projects/c2102/databases/dssp/al...,"[912, 914]",False,78.271996,T,71,2.038980,0.000000,2.842170e-14,-0.358241,-1.058960e-01,0.362182,-0.547692,1.342100,0.127956,1.138920,0.0,0.0,0.0,0.079655,0.057777,1.199040e-14,0.0,0.00000,0.0,0.0,0.000000,0.0,Pathogenic,,,,,,,,,,,,,,,
3,1,197429468,2202910,G,C,,,,1928560,Retinitis_pigmentosa_12|Leber_congenital_amaur...,,"MONDO:MONDO:0010818,MedGen:C1838647,OMIM:60010...",,NC_000001.11:g.197429468G>C,"criteria_provided,_single_submitter",Pathogenic,,,single_nucleotide_variant,SO:0001483,,,"SO:0001583|missense_variant,SO:0001619|non-cod...",1,,C,missense_variant,MODERATE,CRB1,ENSG00000134376,Transcript,ENST00000367400,protein_coding,8/12,,,,2857/4958,2696/4221,899/1406,G/A,gGa/gCa,CM130814&COSV66329474,,1,SNV,HGNC,HGNC:2343,YES,NM_201253.3,,ENSP00000356370,P82279.210,A0A7D6VM04.8,UPI0000073345,P82279-1,NM_201253.3,deleterious(0.01),possibly_damaging(0.887),Gene3D:2.10.25.10&AFDB-ENSP_mappings:AF-P82279...,,0&1,1&1,2.050,0,CRB1,23418,NM_201253,"[P82279-1, A0A7D6VM04]",/ibex/scratch/projects/c2102/databases/alphafo...,899,Gly,/ibex/scratch/projects/c2102/databases/dssp/al...,"[912, 914]",False,78.271996,T,71,2.305920,0.000000,0.000000e+00,-0.146505,0.000000e+00,0.192758,-0.226396,1.353180,-0.000509,1.133390,0.0,0.0,0.0,0.000000,0.038944,0.000000e+00,0.0,0.00000,0.0,0.0,0.000000,0.0,Pathogenic,,,,,,,,,,,,,,,
4,1,197429473,978993,G,T,,,,967053,Autosomal_recessive_retinitis_pigmentosa,,MedGen:C0339526,,NC_000001.11:g.197429473G>T,no_assertion_criteria_provided,Pathogenic,,,single_nucleotide_variant,SO:0001483,,,"SO:0001583|missense_variant,SO:0001619|non-cod...",1,1664767158,T,missense_variant,MODERATE,CRB1,ENSG00000134376,Transcript,ENST00000367400,protein_coding,8/12,,,,2862/4958,2701/4221,901/1406,V/F,Gtt/Ttt,rs1664767158&CM1618654&CS107850,,1,SNV,HGNC,HGNC:2343,YES,NM_201253.3,,ENSP00000356370,P82279.210,A0A7D6VM04.8,UPI0000073345,P82279-1,NM_201253.3,deleterious(0),possibly_damaging(0.83),Gene3D:2.10.25.10&AFDB-ENSP_mappings:AF-P82279...,pathogenic,,1&1&1,-0.545,-1,CRB1,23418,NM_201253,"[P82279-1, A0A7D6VM04]",/ibex/scratch/projects/c2102/databases/alphafo...,901,Val,/ibex/scratch/projects/c2102/databases/dssp/al...,"[910, 911, 912, 913, 914, 922]",False,78.329994,E,91,-0.116954,-0.027761,0.000000e+00,-0.244795,0.000000e+00,0.128989,-0.357719,0.014282,0.176529,0.301131,0.0,0.0,0.0,-0.114363,-0.018459,0.000000e+00,0.0,0.00000,0.0,0.0,0.006753,0.0,Pathogenic,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47669,2,232530046,NM_000751.2:c.727C>T,C,T,,,,,,,,,,,,,,,,,,,,,T,missense_variant,MODERATE,CHRND,ENSG00000135902,Transcript,ENST00000258385,protein_coding,7/12,,,,783/2962,727/1554,243/517,R/C,Cgc/Tgc,rs201733876&COSV51320564,,1,SNV,HGNC,HGNC:1965,YES,NM_000751.3,,ENSP00000258385,Q07001.202,,UPI000012525E,Q07001-1,NM_000751.3,deleterious(0),,TIGRFAM:TIGR00860&CDD:cd19028&PANTHER:PTHR1894...,likely_benign&uncertain_significance&conflicti...,0&1,1&1,2.150,-3,,,NM_000751,[Q07001-1],/ibex/scratch/projects/c2102/databases/alphafo...,243,Arg,/ibex/scratch/projects/c2102/databases/dssp/al...,"[161, 162, 163, 210, 211, 212, 213, 214, 495, ...",False,95.250000,E,91,1.610230,1.131200,1.092110e+00,0.911003,6.924240e-01,-1.690690,0.931905,-0.029911,-1.856800,0.450153,0.0,0.0,0.0,-0.025516,0.000000,4.352280e-03,0.0,0.00000,0.0,0.0,0.000000,0.0,Pavs,,,,,,,,NM_000751.2:c.727C>T,CHRND,NP_000742.1:p.(Arg243Cys),[heterozygous],[Blue nevus|Corneal opacity|Global development...,[MALE],Arg243Cys,Arg243Cys
47670,11,117338662,NM_014956.4:c.76G>A,G,A,,,,,,,,,,,,,,,,,,,,,A,missense_variant,MODERATE,CEP164,ENSG00000110274,Transcript,ENST00000278935,protein_coding,3/33,,,,224/5629,76/4383,26/1460,E/K,Gag/Aag,rs1565416876,,1,SNV,HGNC,HGNC:29182,YES,NM_014956.5,,ENSP00000278935,Q9UPV0.159,,UPI00001FA422,Q9UPV0-1,NM_014956.5,deleterious(0.01),possibly_damaging(0.715),Gene3D:3.30.1470.10&PANTHER:PTHR18902&PANTHER:...,uncertain_significance,,1,1.870,1,,,NM_014956,[Q9UPV0-1],/ibex/scratch/projects/c2102/databases/alphafo...,26,Glu,/ibex/scratch/projects/c2102/databases/dssp/al...,[51],False,88.692001,H,125,0.946732,0.784688,8.032740e-01,-0.105241,4.335570e-02,-0.326249,-0.422321,-0.000057,-0.664469,0.174407,0.0,0.0,0.0,-0.180566,-0.071483,8.399100e-01,0.0,0.00000,0.0,0.0,0.000000,0.0,Pavs,,,,,,,,NM_014956.4:c.76G>A,CEP164,NP_055771.4:p.(Glu26Lys),[homozygous],[Seizure|Thrombocytopenia],[MALE],Glu26Lys,Glu26Lys
47671,9,99149226,NM_004612.3:c.1433A>G,A,G,,,,,,,,,,,,,,,,,,,,,G,missense_variant,MODERATE,TGFBR1,ENSG00000106799,Transcript,ENST00000374994,protein_coding,9/9,,,,1526/6492,1433/1512,478/503,N/S,aAt/aGt,rs141259922&CM064318,,1,SNV,HGNC,HGNC:11772,YES,NM_004612.4,,ENSP00000364133,P36897.236,Q5T7S2.170,UPI000011D62A,P36897-1,NM_004612.4,deleterious_low_confidence(0.03),possibly_damaging(0.46),AFDB-ENSP_mappings:AF-P36897-F1.A&Gene3D:1.10....,uncertain_significance,,1&1,4.100,1,,,NM_004612,"[P36897-1, Q5T7S2, B4DXN7]",/ibex/scratch/projects/c2102/databases/alphafo...,478,Asn,/ibex/scratch/projects/c2102/databases/dssp/al...,"[382, 398]",False,98.080002,S,105,0.546138,0.000000,-1.421090e-14,0.069871,-9.237060e-14,-0.005548,0.228123,0.000000,-0.012204,0.266062,0.0,0.0,0.0,-0.000165,-0.017823,0.000000e+00,0.0,0.00000,0.0,0.0,0.000000,0.0,Pavs,,,,,,,,NM_004612.3:c.1433A>G,TGFBR1,NP_004603.1:p.(Asn478Ser),[heterozygous],[Abnormal facial shape|Cleft palate|Delayed sp...,[FEMALE],Asn478Ser,Asn478Ser
47672,11,68357846,NM_002335.3:c.685C>T,C,T,,,,,,,,,,,,,,,,,,,,,T,missense_variant&splice_region_variant,MODERATE,LRP5,ENSG00000162337,Transcript,ENST00000294304,protein_coding,3/23,,,,809/5177,685/4848,229/1615,R/W,Cgg/Tgg,rs766589610&COSV53713441,,1,SNV,HGNC,HGNC:6697,YES,NM_002335.4,,ENSP00000294304,O75197.205,,UPI0000073246,,NM_002335.4,deleterious_low_confidence(0),probably_damaging(0.997),PIRSF:PIRSF036314&PROSITE_profiles:PS51120&Sup...,uncertain_significance,0&1,1&1,1.260,-3,,,NM_002335,[O75197],/ibex/scratch/projects/c2102/databases/alphafo...,229,Arg,/ibex/scratch/projects/c2102/databases/dssp/al...,"[190, 191, 192, 193, 194, 211, 218, 220, 221, ...",False,90.259995,,75,3.785530,1.609630,3.306860e+00,1.555740,9.140490e-01,-2.290390,1.087520,-0.005376,-1.755700,-0.606421,0.0,0.0,0.0,-0.009837,0.153157,0.000000e+00,0.0,0.00000,0.0,0.0,-0.020528,0.0,Pavs,,,,,,,,NM_002335.3:c.685C>T,LRP5,NP_002326.2:p.(Arg229Trp),[nan],[Autosomal dominant inheritance|Autosomal rece...,[nan],Arg229Trp,Arg229Trp


In [17]:
# Amino_acids holds "reference/alternate" residue codes (e.g. "C/S").
# Split on "/" into two columns: REFAA (before the slash) and ALTAA (after it).
amino_acid_parts = concatenated_df['Amino_acids'].str.split('/', expand=True)
concatenated_df[['REFAA', 'ALTAA']] = amino_acid_parts


In [18]:
# Rows with no secondary-structure code (missing values, not the string 'nan')
# are relabelled 'L'; the per-code counts are then displayed.
concatenated_df['secondary_structure'] = concatenated_df['secondary_structure'].fillna('L')
concatenated_df.value_counts(subset='secondary_structure')

secondary_structure
H    15891
L    14955
E     7454
T     4017
S     3299
G     1357
B      371
I      330
Name: count, dtype: int64

# **3. Normalize accessibility**

In [19]:
# Maximum solvent accessibility per residue, keyed by three-letter amino-acid code;
# used below as the denominator when normalising the raw `accessibility` values.
# NOTE(review): the numbers appear to match the Tien et al. (2013) theoretical
# max-ASA scale; confirm the intended source and cite it here.
x_values = {
    'Ala': 129.0,
    'Arg': 274.0,
    'Asn': 195.0,
    'Asp': 193.0,
    'Cys': 167.0,
    'Glu': 223.0,
    'Gln': 225.0,
    'Gly': 104.0,
    'His': 224.0,
    'Ile': 197.0,
    'Leu': 201.0,
    'Lys': 236.0,
    'Met': 224.0,
    'Phe': 240.0,
    'Pro': 159.0,
    'Ser': 155.0,
    'Thr': 172.0,
    'Trp': 285.0,
    'Tyr': 263.0,
    'Val': 174.0
}

# Look up every row's denominator in one vectorised pass instead of a row-wise
# apply(axis=1), which is slow on a frame of tens of thousands of rows.
residue_max_accessibility = concatenated_df['Residue'].map(x_values)

# A residue label that is present but absent from x_values is a data problem:
# fail up front with the offending labels listed, keeping the KeyError type
# the original row-wise lookup raised.
unknown_residues = concatenated_df.loc[
    residue_max_accessibility.isna() & concatenated_df['Residue'].notna(),
    'Residue',
].unique()
if len(unknown_residues) > 0:
    raise KeyError(
        f"No maximum accessibility defined for residue(s): {list(unknown_residues)}"
    )

# Create the 'ACCESIBILITY_NORMALIZED' column (spelling kept as-is because the
# column name is persisted to the output files).
# Rows whose Residue is missing get NaN here; the original lambda raised KeyError on them.
concatenated_df['ACCESIBILITY_NORMALIZED'] = (
    concatenated_df['accessibility'] / residue_max_accessibility
)


# **4. Save**

In [20]:
# Persist the merged table twice: a CSV without the index column, and a pickle.
# NOTE(review): list-valued columns (e.g. UniProt_IDs, intra_contacts) are presumably
# written to the CSV as their string form; verify before reading them back from CSV.
concatenated_df.to_csv(
    "../results/1_merged.csv",
    index=False,
)
concatenated_df.to_pickle(
    "../results/1_merged.pkl"
)