In [None]:
################################################################################
## This script contains the code to process frameshit VCF from VEP.           ##
## Mensah & Niskanen et al.                                                   ##
## Disruption of nucleolar phase separation in human genetic disease 2022     ##
## Author: Alexandre P Magalhaes                                              ##
################################################################################

In [1]:
import matplotlib.pyplot as plt
import os
import pandas as pd
from numpy import arange
import re, gzip, time, itertools, io

In [2]:
df = pd.read_csv('HS_ALL_IDR_FS_variant_effect_output.txt', sep = '\t')
df.head()

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,Extra
0,rs775855727,1:943999-944011,-,ENSG00000188976,ENST00000327044,Transcript,downstream_gene_variant,-,-,-,-,-,-,IMPACT=MODIFIER;DISTANCE=192;STRAND=-1;Wildtyp...
1,rs775855727,1:943999-944011,-,ENSG00000187634,ENST00000341065,Transcript,frameshift_variant,1615-1627,1616-1628,539-543,ELGTG/X,gAACTCGGCACAGGa/ga,-,IMPACT=HIGH;STRAND=1;FLAGS=cds_start_NF;Frames...
2,rs775855727,1:943999-944011,-,ENSG00000187634,ENST00000342066,Transcript,frameshift_variant,1982-1994,1892-1904,631-635,ELGTG/X,gAACTCGGCACAGGa/ga,-,IMPACT=HIGH;STRAND=1;FrameshiftSequence=MSKGIL...
3,rs775855727,1:943999-944011,-,ENSG00000187634,ENST00000455979,Transcript,frameshift_variant,1471-1483,1472-1484,491-495,ELGTG/X,gAACTCGGCACAGGa/ga,-,IMPACT=HIGH;STRAND=1;FLAGS=cds_start_NF;Frames...
4,rs775855727,1:943999-944011,-,ENSG00000187634,ENST00000464948,Transcript,downstream_gene_variant,-,-,-,-,-,-,IMPACT=MODIFIER;DISTANCE=1107;STRAND=1


In [3]:
dff = df[df['Consequence']== 'frameshift_variant']
droplist = [ 'Allele', 'Existing_variation', 'Feature_type']
dff = dff.drop(droplist, 1)
dff.info()
dff.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32108 entries, 1 to 95394
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Uploaded_variation  32108 non-null  object
 1   Location            32108 non-null  object
 2   Gene                32108 non-null  object
 3   Feature             32108 non-null  object
 4   Consequence         32108 non-null  object
 5   cDNA_position       32108 non-null  object
 6   CDS_position        32108 non-null  object
 7   Protein_position    32108 non-null  object
 8   Amino_acids         32108 non-null  object
 9   Codons              32108 non-null  object
 10  Extra               32108 non-null  object
dtypes: object(11)
memory usage: 2.9+ MB


Unnamed: 0,Uploaded_variation,Location,Gene,Feature,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Extra
1,rs775855727,1:943999-944011,ENSG00000187634,ENST00000341065,frameshift_variant,1615-1627,1616-1628,539-543,ELGTG/X,gAACTCGGCACAGGa/ga,IMPACT=HIGH;STRAND=1;FLAGS=cds_start_NF;Frames...
2,rs775855727,1:943999-944011,ENSG00000187634,ENST00000342066,frameshift_variant,1982-1994,1892-1904,631-635,ELGTG/X,gAACTCGGCACAGGa/ga,IMPACT=HIGH;STRAND=1;FrameshiftSequence=MSKGIL...
3,rs775855727,1:943999-944011,ENSG00000187634,ENST00000455979,frameshift_variant,1471-1483,1472-1484,491-495,ELGTG/X,gAACTCGGCACAGGa/ga,IMPACT=HIGH;STRAND=1;FLAGS=cds_start_NF;Frames...
11,rs775855727,1:943999-944011,ENSG00000187634,ENST00000616016,frameshift_variant,2890-2902,2381-2393,794-798,ELGTG/X,gAACTCGGCACAGGa/ga,IMPACT=HIGH;STRAND=1;FrameshiftSequence=MPAVKK...
12,rs775855727,1:943999-944011,ENSG00000187634,ENST00000616125,frameshift_variant,1568-1580,1568-1580,523-527,ELGTG/X,gAACTCGGCACAGGa/ga,IMPACT=HIGH;STRAND=1;FLAGS=cds_start_NF;Frames...


In [4]:
p1 = re.compile(r'IMPACT=(?P<IMPACT>.+?);'
                + r'STRAND=(?P<STRAND>.+?);'
                + r'FrameshiftSequence=(?P<FrameshiftSequence>.+?);'
                + r'WildtypeProtein=(?P<WildtypeProtein>.+?)$')

In [5]:
dff = pd.concat([
    dff,
    (
        dff.Extra.str.extractall(p1)
          .reset_index('match', drop=True)
    )
], axis=1).fillna('')
dff = dff.drop('STRAND', 1)
dff = dff.drop('Extra', 1)
dff = dff[dff['WildtypeProtein'].astype(str).str.startswith('M')]
dff.info()
dff.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29774 entries, 2 to 95394
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Uploaded_variation  29774 non-null  object
 1   Location            29774 non-null  object
 2   Gene                29774 non-null  object
 3   Feature             29774 non-null  object
 4   Consequence         29774 non-null  object
 5   cDNA_position       29774 non-null  object
 6   CDS_position        29774 non-null  object
 7   Protein_position    29774 non-null  object
 8   Amino_acids         29774 non-null  object
 9   Codons              29774 non-null  object
 10  IMPACT              29774 non-null  object
 11  FrameshiftSequence  29774 non-null  object
 12  WildtypeProtein     29774 non-null  object
dtypes: object(13)
memory usage: 3.2+ MB


Unnamed: 0,Uploaded_variation,Location,Gene,Feature,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,IMPACT,FrameshiftSequence,WildtypeProtein
2,rs775855727,1:943999-944011,ENSG00000187634,ENST00000342066,frameshift_variant,1982-1994,1892-1904,631-635,ELGTG/X,gAACTCGGCACAGGa/ga,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...
11,rs775855727,1:943999-944011,ENSG00000187634,ENST00000616016,frameshift_variant,2890-2902,2381-2393,794-798,ELGTG/X,gAACTCGGCACAGGa/ga,HIGH,MPAVKKEFPGREDLALALATFHPTLAALPLPPLPGYLAPLPAAAAL...,MPAVKKEFPGREDLALALATFHPTLAALPLPPLPGYLAPLPAAAAL...
12,rs775855727,1:943999-944011,ENSG00000187634,ENST00000616125,frameshift_variant,1568-1580,1568-1580,523-527,ELGTG/X,gAACTCGGCACAGGa/ga,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...
13,rs775855727,1:943999-944011,ENSG00000187634,ENST00000617307,frameshift_variant,1832-1844,1832-1844,611-615,ELGTG/X,gAACTCGGCACAGGa/ga,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...
14,rs775855727,1:943999-944011,ENSG00000187634,ENST00000618181,frameshift_variant,1517-1529,1517-1529,506-510,ELGTG/X,gAACTCGGCACAGGa/ga,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...


In [6]:
Annot = pd.read_csv('/Users/magalhae/Desktop/IDRClinVar/FULLproteome/HG38_Canonical_pep_IDRanotF20_Final.csv')
keepList = ['TrscId', 'ID', 'gene_symbol', 'PEPId', 'TrueIDRstart']
Annot = Annot[keepList]
Annot = Annot.rename(columns={'TrscId': 'Feature',})
Annot.head()

Unnamed: 0,Feature,ID,gene_symbol,PEPId,TrueIDRstart
0,ENST00000390369,ENSP00000374892.2,TRBV7-4,ENSP00000374892,61.0
1,ENST00000610285,ENSP00000484195.1,NPIPA3,ENSP00000484195,235.0
2,ENST00000611400,ENSP00000477946.1,TEX264,ENSP00000477946,215.0
3,ENST00000357443,ENSP00000350028.2,MOV10,ENSP00000350028,969.0
4,ENST00000426095,ENSP00000409151.2,HSPA1L,ENSP00000409151,619.0


In [7]:
dffAnnot = pd.merge(dff, Annot , how="left", on="Feature")
dffAnnot = dffAnnot.dropna(subset = ['TrueIDRstart'])
dffAnnot = dffAnnot.rename(columns={'PEPId': 'PepID',})
dffAnnot = dffAnnot.drop('ID', 1)
dffAnnot.info()
dffAnnot.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8294 entries, 0 to 29773
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Uploaded_variation  8294 non-null   object 
 1   Location            8294 non-null   object 
 2   Gene                8294 non-null   object 
 3   Feature             8294 non-null   object 
 4   Consequence         8294 non-null   object 
 5   cDNA_position       8294 non-null   object 
 6   CDS_position        8294 non-null   object 
 7   Protein_position    8294 non-null   object 
 8   Amino_acids         8294 non-null   object 
 9   Codons              8294 non-null   object 
 10  IMPACT              8294 non-null   object 
 11  FrameshiftSequence  8294 non-null   object 
 12  WildtypeProtein     8294 non-null   object 
 13  gene_symbol         8294 non-null   object 
 14  PepID               8294 non-null   object 
 15  TrueIDRstart        8294 non-null   float64
dtypes: fl

Unnamed: 0,Uploaded_variation,Location,Gene,Feature,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,IMPACT,FrameshiftSequence,WildtypeProtein,gene_symbol,PepID,TrueIDRstart
0,rs775855727,1:943999-944011,ENSG00000187634,ENST00000342066,frameshift_variant,1982-1994,1892-1904,631-635,ELGTG/X,gAACTCGGCACAGGa/ga,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0
8,rs763263330,1:943999-944000,ENSG00000187634,ENST00000342066,frameshift_variant,1982-1983,1892-1893,631,E/EIGX,gaa/gaAATCGGCAa,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0
16,rs764300897,1:944012-944013,ENSG00000187634,ENST00000342066,frameshift_variant,1995-1996,1905-1906,635-636,GE/GX,ggAGag/ggag,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0
24,rs1557612630,1:944020,ENSG00000187634,ENST00000342066,frameshift_variant,2003,1913,638,P/X,cCc/cc,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0
32,rs1557612652,1:944027,ENSG00000187634,ENST00000342066,frameshift_variant,2010,1920,640,S/X,tcC/tc,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0


In [8]:
p3 = re.compile(r'(?P<wtaa>[A-Z]*)\W'
                +r'(?P<mutaa>[A-Z]*)')
dffAnnot = pd.concat([
    dffAnnot,
    (
        dffAnnot.Amino_acids.str.extractall(p3)
          .reset_index('match', drop=True)
    )
], axis=1).fillna('')
dffAnnot.info()
dffAnnot.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9325 entries, 0 to 29773
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Uploaded_variation  9325 non-null   object 
 1   Location            9325 non-null   object 
 2   Gene                9325 non-null   object 
 3   Feature             9325 non-null   object 
 4   Consequence         9325 non-null   object 
 5   cDNA_position       9325 non-null   object 
 6   CDS_position        9325 non-null   object 
 7   Protein_position    9325 non-null   object 
 8   Amino_acids         9325 non-null   object 
 9   Codons              9325 non-null   object 
 10  IMPACT              9325 non-null   object 
 11  FrameshiftSequence  9325 non-null   object 
 12  WildtypeProtein     9325 non-null   object 
 13  gene_symbol         9325 non-null   object 
 14  PepID               9325 non-null   object 
 15  TrueIDRstart        9325 non-null   float64
 16  wtaa 

Unnamed: 0,Uploaded_variation,Location,Gene,Feature,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,IMPACT,FrameshiftSequence,WildtypeProtein,gene_symbol,PepID,TrueIDRstart,wtaa,mutaa
0,rs775855727,1:943999-944011,ENSG00000187634,ENST00000342066,frameshift_variant,1982-1994,1892-1904,631-635,ELGTG/X,gAACTCGGCACAGGa/ga,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,ELGTG,X
8,rs763263330,1:943999-944000,ENSG00000187634,ENST00000342066,frameshift_variant,1982-1983,1892-1893,631,E/EIGX,gaa/gaAATCGGCAa,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,E,EIGX
16,rs764300897,1:944012-944013,ENSG00000187634,ENST00000342066,frameshift_variant,1995-1996,1905-1906,635-636,GE/GX,ggAGag/ggag,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,GE,GX
24,rs1557612630,1:944020,ENSG00000187634,ENST00000342066,frameshift_variant,2003,1913,638,P/X,cCc/cc,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,P,X
32,rs1557612652,1:944027,ENSG00000187634,ENST00000342066,frameshift_variant,2010,1920,640,S/X,tcC/tc,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,S,X


In [9]:
dffAnnot['ID'] = dffAnnot['PepID'].astype(str) + ":p." + dffAnnot['wtaa'].astype(str) + dffAnnot['Protein_position'].astype(str) + dffAnnot['mutaa'].astype(str)
dffAnnot.head()

Unnamed: 0,Uploaded_variation,Location,Gene,Feature,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,IMPACT,FrameshiftSequence,WildtypeProtein,gene_symbol,PepID,TrueIDRstart,wtaa,mutaa,ID
0,rs775855727,1:943999-944011,ENSG00000187634,ENST00000342066,frameshift_variant,1982-1994,1892-1904,631-635,ELGTG/X,gAACTCGGCACAGGa/ga,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,ELGTG,X,ENSP00000342313:p.ELGTG631-635X
8,rs763263330,1:943999-944000,ENSG00000187634,ENST00000342066,frameshift_variant,1982-1983,1892-1893,631,E/EIGX,gaa/gaAATCGGCAa,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,E,EIGX,ENSP00000342313:p.E631EIGX
16,rs764300897,1:944012-944013,ENSG00000187634,ENST00000342066,frameshift_variant,1995-1996,1905-1906,635-636,GE/GX,ggAGag/ggag,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,GE,GX,ENSP00000342313:p.GE635-636GX
24,rs1557612630,1:944020,ENSG00000187634,ENST00000342066,frameshift_variant,2003,1913,638,P/X,cCc/cc,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,P,X,ENSP00000342313:p.P638X
32,rs1557612652,1:944027,ENSG00000187634,ENST00000342066,frameshift_variant,2010,1920,640,S/X,tcC/tc,HIGH,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,SAMD11,ENSP00000342313,631.0,S,X,ENSP00000342313:p.S640X


In [10]:
WTdffAnnot = dffAnnot[['Uploaded_variation','Location','Feature', 'PepID', 'Gene', 'gene_symbol', 'WildtypeProtein', 'TrueIDRstart']]
MutdffAnnot = dffAnnot[['Uploaded_variation','Location','ID','Feature', 'PepID', 'Gene', 'gene_symbol', 'FrameshiftSequence', 'TrueIDRstart','Amino_acids','Protein_position']]
WTdffAnnot = WTdffAnnot.rename(columns={'WildtypeProtein': 'Sequence',})
MutdffAnnot = MutdffAnnot.rename(columns={'FrameshiftSequence': 'Sequence',})
WTdffAnnot['SeqType'] = 'WildType'
MutdffAnnot['SeqType'] = 'Frameshift'
WTdffAnnot['ID'] = WTdffAnnot['PepID']
WTdffAnnot['Uploaded_variation'] = ""
WTdffAnnot['Location'] =  ""
WTdffAnnot = WTdffAnnot.drop_duplicates(subset=['ID'])
WTdffAnnot.to_csv('WT_Frameshift.csv', index=False)
MutdffAnnot.to_csv('Mut_Frameshift.csv', index=False)