In [None]:
################################################################################
## This script contains the code to reproduce the NMD analysis.               ##
## Mensah & Niskanen et al.                                                   ##
## Disruption of nucleolar phase separation in human genetic disease 2022     ##
## Author: Alexandre P Magalhaes                                              ##
################################################################################

In [1]:
from __future__ import print_function
import re, gzip, time, itertools, io
import sys
import pandas as pd
import numpy as np
import csv
import requests
from pandarallel import pandarallel

In [2]:
# Load the master variant table from CSV.
df = pd.read_csv('MasterTable_wNMD_wVariantDisease_18012021.csv', low_memory=False)
# Drop the existing 'NMD' column. `columns=` is used instead of the positional
# axis argument (`drop('NMD', 1)`), which pandas deprecated and later removed.
df = df.drop(columns='NMD')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13002 entries, 0 to 13001
Columns: 196 entries, Uploaded_variation to gnomAD_SAS_AF
dtypes: bool(3), float64(150), int64(9), object(34)
memory usage: 19.2+ MB


  df = df.drop('NMD', 1)


In [3]:
def CDS_stop(x):
    """Return ``x['FullLength'] * 3 + 3`` for a single row.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row-like object exposing a numeric 'FullLength' entry.

    Returns
    -------
    Three times the 'FullLength' value, plus three.
    """
    # presumably FullLength is a length in residues, so *3 converts to
    # nucleotides and +3 adds one more codon -- TODO confirm against the data.
    return x['FullLength'] * 3 + 3
    
    
    
# Vectorised form of the per-row computation FullLength * 3 + 3: one column
# operation instead of a Python-level call for every row of the table.
df["CDS_stop"] = df["FullLength"] * 3 + 3
df.head()

Unnamed: 0,Uploaded_variation,Location,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,seqTypeExp,...,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,CDS_stop
0,692054,16:86511252-86511259,ENST00000262426,ENSP00000262426,ENSG00000103241,FOXF1,MSSAPEKQQPPHGGGGGGGGGGGAAMDPASSGPSKAKKTNAGIRRP...,207,Frameshift,Frameshift,...,,,,,,,,,,873
1,1013621,21:34792484-34792490,ENST00000399240,ENSP00000382184,ENSG00000159216,RUNX1,MASDSIFESFPSYPQCFMRECILGMNPSRDVHDASTSRRFTPPSTA...,360,Frameshift,Frameshift,...,,,,,,,,,,1773
2,rs1057519478,6:1611038-1611056,ENST00000645831,ENSP00000493906,ENSG00000054598,FOXC1,MQARYSVSSPNSLGVVPYLGGEQSYYRAAAAAAGGGYTAMPAPMSV...,141,Frameshift,Frameshift,...,,,,,,,,,,924
3,rs1179926739,11:17720334-17720335,ENST00000250003,ENSP00000250003,ENSG00000129152,MYOD1,MELLSPPLRDVDLTAPDGSLCSFATTDDFYDDPCFDSPDLRFFEDL...,262,Frameshift,Frameshift,...,,,,,,,,,,831
4,rs1603388837,18:59269381,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187,Frameshift,Frameshift,...,,,,,,,,,,852


In [5]:
# Load the tab-separated exon coordinate table.
exons = pd.read_csv('ExonCoordinates.txt', sep='\t', low_memory=False)
# `columns=` replaces the positional axis argument (`drop(..., 1)`), which
# pandas deprecated and later removed.
exons = exons.drop(columns='Gene_stable_ID')
# Keep only exon rows that have a CDS_start value.
exons = exons[exons['CDS_start'].notna()]
exons.head()

  exons = exons.drop('Gene_stable_ID', 1)


Unnamed: 0,Transcript_stable_ID,Exon_region_start,Exon_region_end,Exon_rank_in_transcript,Genomic_coding_start,Genomic_coding_end,cDNA_coding_start,cDNA_coding_end,CDS_start,CDS_end,Protein_stable_ID,Exon_stable_ID,Constitutive_exon
0,ENST00000005286,60924481,60924733,1,60924634.0,60924733.0,154.0,253.0,1.0,100.0,ENSP00000005286,ENSE00002270579,0
1,ENST00000005286,60927204,60927418,2,60927204.0,60927418.0,254.0,468.0,101.0,315.0,ENSP00000005286,ENSE00003633778,0
2,ENST00000005286,60927641,60927859,3,60927641.0,60927859.0,469.0,687.0,316.0,534.0,ENSP00000005286,ENSE00003601138,0
3,ENST00000005286,60928629,60928960,4,60928629.0,60928960.0,688.0,1019.0,535.0,866.0,ENSP00000005286,ENSE00003537152,0
4,ENST00000005286,60930510,60930659,5,60930510.0,60930659.0,1020.0,1169.0,867.0,1016.0,ENSP00000005286,ENSE00003788120,0


In [None]:
def isoneExon(x, exon_table=None):
    """Return True when the transcript of row ``x`` has exactly one exon row.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row-like object with a 'Transcript_stable_ID' entry.
    exon_table : pandas.DataFrame, optional
        Table with a 'Transcript_stable_ID' column. Defaults to the
        notebook-level ``exons`` frame.

    Returns
    -------
    bool
        Whether exactly one row of ``exon_table`` matches the transcript ID.
    """
    # NOTE(review): the original body selected the matching rows but had no
    # return statement, so it always returned None. The boolean result is
    # inferred from the function name -- confirm this is the intended contract.
    if exon_table is None:
        exon_table = exons
    feature = x['Transcript_stable_ID']
    exonarray = exon_table.loc[exon_table['Transcript_stable_ID'] == feature]
    return len(exonarray) == 1
    

In [14]:
# Initialise pandarallel with a progress bar and a fixed count of 11 workers.
# NOTE(review): nb_workers is hard-coded -- confirm it suits the host machine.
pandarallel.initialize(progress_bar=True, nb_workers = 11)

def NMD_check(x, exon_table=None):
    """Classify one variant row against the transcript's coding-exon layout.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row-like object with 'Feature' (matched against
        'Transcript_stable_ID') and 'CDS_stop' (a position compared against
        the exon 'CDS_start' / 'CDS_end' values).
    exon_table : pandas.DataFrame, optional
        Table with 'Transcript_stable_ID', 'CDS_start' and 'CDS_end' columns.
        Defaults to the notebook-level ``exons`` frame.

    Returns
    -------
    str or None
        * ``"Empty"`` when no exon row matches the transcript.
        * ``"NMD_escaping_variant"`` when any of these holds: the transcript
          has a single exon row; ``CDS_stop <= 100``; ``CDS_stop`` is at or
          past the CDS_start of the last exon; ``CDS_stop`` is within 51 of
          the CDS_end of the second-to-last exon.
        * ``None`` otherwise (no label is assigned).
    """
    # presumably the start-proximal and "last ~50 nt of the penultimate exon"
    # NMD-escape rules -- TODO confirm the thresholds with the authors.
    START_PROXIMAL_LIMIT = 100
    PENULTIMATE_EXON_WINDOW = 51

    if exon_table is None:
        exon_table = exons

    feature = x['Feature']
    stop = x['CDS_stop']

    transcript_exons = exon_table.loc[exon_table['Transcript_stable_ID'] == feature]
    if len(transcript_exons) == 0:
        return "Empty"
    if len(transcript_exons) == 1:
        return "NMD_escaping_variant"
    if stop <= START_PROXIMAL_LIMIT:
        return "NMD_escaping_variant"

    # Order by CDS position so "last" and "second-to-last" do not depend on
    # the row order of the input file, and read the columns by name rather
    # than by position in a list.
    transcript_exons = transcript_exons.sort_values('CDS_start')
    last_exon_start = transcript_exons['CDS_start'].iloc[-1]
    penultimate_exon_end = transcript_exons['CDS_end'].iloc[-2]

    # The original also tested `stop >= CDS_end` of the last exon; that is
    # implied by this check whenever CDS_end >= CDS_start.
    if stop >= last_exon_start:
        return "NMD_escaping_variant"
    if stop >= penultimate_exon_end - PENULTIMATE_EXON_WINDOW:
        return "NMD_escaping_variant"

    # No rule matched: left unlabelled, as in the original implicit
    # fall-through.
    return None
            
# Time the parallel row-wise classification. perf_counter() is a monotonic
# clock, so the difference is unaffected by system clock adjustments.
start = time.perf_counter()
df['NMD_check'] = df.parallel_apply(NMD_check, axis = 1)
end = time.perf_counter()
print(end - start)
# NOTE(review): the suffix '2200511' has seven digits, unlike the '220313'
# suffix used by a later export cell -- confirm the intended file name.
df.to_csv('MasterTable_wNMD_wVariantDisease_2200511.csv', index=False)

INFO: Pandarallel will run on 11 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1182), Label(value='0 / 1182'))), …

6.67624568939209


In [105]:
# Write the current table to CSV without the index column.
# NOTE(review): the cell that computes NMD_check already writes df to a
# differently named CSV -- confirm which export is the canonical output.
df.to_csv('MasterTable_wNMD_wVariantDisease_220313.csv', index=False)

In [None]:
# Write the current table to 'df.csv' without the index column.
# NOTE(review): this cell has no execution count; it looks like a scratch
# export -- confirm whether it is still needed.
df.to_csv('df.csv', index=False)

In [7]:
# Manual spot-check of one transcript. The ID and stop value match row 0 of
# the df.head() output above (FOXF1, CDS_stop 873).
feature = 'ENST00000262426'
stop = 873
# Select the exon rows of that transcript.
exonarray = exons.loc[exons['Transcript_stable_ID'] == feature]

In [9]:
# Display the exon rows selected for the spot-checked transcript.
exonarray

Unnamed: 0,Transcript_stable_ID,Exon_region_start,Exon_region_end,Exon_rank_in_transcript,Genomic_coding_start,Genomic_coding_end,cDNA_coding_start,cDNA_coding_end,CDS_start,CDS_end,Protein_stable_ID,Exon_stable_ID,Constitutive_exon
3369,ENST00000262426,86510527,86511548,1,86510570.0,86511548.0,44.0,1022.0,1.0,979.0,ENSP00000262426,ENSE00001286040,1
3370,ENST00000262426,86512925,86515422,2,86512925.0,86513085.0,1023.0,1183.0,980.0,1140.0,ENSP00000262426,ENSE00001121720,1


In [10]:
# Convert the selected exon rows to a list of row lists. The result gets its
# own name so that `exonarray` stays a DataFrame: rebinding it to a list made
# this cell fail on re-run, because a list has no `.values` attribute.
exon_rows = exonarray.values.tolist()
exon_rows

[['ENST00000262426',
  86510527,
  86511548,
  1,
  86510570.0,
  86511548.0,
  44.0,
  1022.0,
  1.0,
  979.0,
  'ENSP00000262426',
  'ENSE00001286040',
  1],
 ['ENST00000262426',
  86512925,
  86515422,
  2,
  86512925.0,
  86513085.0,
  1023.0,
  1183.0,
  980.0,
  1140.0,
  'ENSP00000262426',
  'ENSE00001121720',
  1]]

In [13]:
# Number of exon rows held for the spot-checked transcript.
len(exonarray)

2