In [None]:
################################################################################
## This script contains the code to make the Master Table.                    ##
## Mensah & Niskanen et al.                                                   ##
## Disruption of nucleolar phase separation in human genetic disease 2022     ##
## Author: Alexandre P Magalhaes                                              ##
################################################################################

In [1]:
import os
import re, gzip, time, itertools, io
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
import pickle
import metapredict as meta
from Bio import SeqIO
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqIO.FastaIO import SimpleFastaParser
from localcider.sequenceParameters import SequenceParameters

In [51]:
df = pd.read_csv('WT_Frameshift.csv')
df1 = pd.read_csv('Mut_Frameshift.csv')
df2 = pd.read_csv('WT_Stop_gained.csv')
df3 = pd.read_csv('Mut_Stop_gained.csv')

In [52]:
df['seqTypeExp'] = "WildTypeFS"
df1['seqTypeExp'] = "Frameshift"
df2['seqTypeExp'] = "WildTypeSG"
df3['seqTypeExp'] = "Stop_gained"
df['Vartype'] = "Frameshift"
df1['Vartype'] = "Frameshift"
df2['Vartype'] = "Stop_gained"
df3['Vartype'] = "Stop_gained"

In [53]:
pdList = [df, df1]
dfVar = pd.concat(pdList, ignore_index=True)
dfVar = dfVar.drop_duplicates(subset=['ID'])
dfVar['TrueIDRstart'] = dfVar['TrueIDRstart'].astype(int)
dfVar['FullLength'] = dfVar['Sequence'].str.len()
dfVar = dfVar.dropna(subset = ['Sequence'])
dfVar['FullLength'] = dfVar['FullLength'].astype(int)
dfVar.info()
dfVar.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12367 entries, 0 to 12660
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Uploaded_variation  9037 non-null   object
 1   Location            9037 non-null   object
 2   Feature             12367 non-null  object
 3   PepID               12367 non-null  object
 4   Gene                12367 non-null  object
 5   gene_symbol         12367 non-null  object
 6   Sequence            12367 non-null  object
 7   TrueIDRstart        12367 non-null  int64 
 8   SeqType             12367 non-null  object
 9   ID                  12367 non-null  object
 10  seqTypeExp          12367 non-null  object
 11  Vartype             12367 non-null  object
 12  Amino_acids         9037 non-null   object
 13  Protein_position    9037 non-null   object
 14  Target              6 non-null      object
 15  FullLength          12367 non-null  int64 
dtypes: int64(2), object(14

Unnamed: 0,Uploaded_variation,Location,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,ID,seqTypeExp,Vartype,Amino_acids,Protein_position,Target,FullLength
0,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153,WildType,ENSP00000359393,WildTypeFS,Frameshift,,,,200
1,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187,WildType,ENSP00000256852,WildTypeFS,Frameshift,,,,346
2,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631,WildType,ENSP00000342313,WildTypeFS,Frameshift,,,,681
3,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697,WildType,ENSP00000317992,WildTypeFS,Frameshift,,,,749
4,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490,WildType,ENSP00000368717,WildTypeFS,Frameshift,,,,576


In [54]:
dfVar['IDRseq'] = dfVar.apply(lambda x : str(x['Sequence'])[x['TrueIDRstart']:x['FullLength']],1)

In [55]:
dfVar['IDRLength'] = dfVar['IDRseq'].str.len()
nan_value = float("NaN")
dfVar.replace("", nan_value, inplace=True)
dfVar_ = dfVar.dropna(subset = ['IDRseq'])
dfVar_.to_csv('dfVar.csv', index=False)
dfVar_.info()
dfVar_.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12259 entries, 0 to 12660
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Uploaded_variation  8929 non-null   object
 1   Location            8929 non-null   object
 2   Feature             12259 non-null  object
 3   PepID               12259 non-null  object
 4   Gene                12259 non-null  object
 5   gene_symbol         12259 non-null  object
 6   Sequence            12259 non-null  object
 7   TrueIDRstart        12259 non-null  int64 
 8   SeqType             12259 non-null  object
 9   ID                  12259 non-null  object
 10  seqTypeExp          12259 non-null  object
 11  Vartype             12259 non-null  object
 12  Amino_acids         8929 non-null   object
 13  Protein_position    8929 non-null   object
 14  Target              6 non-null      object
 15  FullLength          12259 non-null  int64 
 16  IDRseq              12

Unnamed: 0,Uploaded_variation,Location,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,ID,seqTypeExp,Vartype,Amino_acids,Protein_position,Target,FullLength,IDRseq,IDRLength
0,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153,WildType,ENSP00000359393,WildTypeFS,Frameshift,,,,200,EKDVADYKSKGKFDGAKGPAKVARKKVEEEDEEEEEEEEEEEEEEDE,47
1,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187,WildType,ENSP00000256852,WildTypeFS,Frameshift,,,,346,RAKWRRQEKLEVSSMKLQDSPLLSFSRSPPSATLSPLGAGPGSGGG...,159
2,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631,WildType,ENSP00000342313,WildTypeFS,Frameshift,,,,681,LGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPS...,50
3,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697,WildType,ENSP00000317992,WildTypeFS,Frameshift,,,,749,DDEEDEEEGEEDSSNSEDGDPDAEAGLAPGELQQLAQGPEDELEDL...,52
4,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490,WildType,ENSP00000368717,WildTypeFS,Frameshift,,,,576,LQSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRS...,86


In [7]:
def frac_X(seq, x):
    """Return fraction of sequence for arbitrary amino acid X."""
    return seq.count(x) / len(seq)
def frac_group(seq, group):
    """Return fraction of sequence for arbitrary group of amino acids where the group is a string of amino acid symbols."""
    count = 0
    for sym in group:
        count += seq.count(sym)
    return count / len(seq)
def frac_acidic(seq):
    return frac_group(seq, 'DE')
def frac_basic(seq):
    return frac_group(seq, 'RKH')
def frac_RK(seq):
    return frac_group(seq, 'RK')
def frac_aromatic(seq):
    return frac_group(seq, 'FYW')

In [8]:
IDRFracDisoPromoting = []
IDRFracPos = []
IDRFracNeg = []
IDRFracCharged = []
IDRMeanNetCharge = []
IDRfrac_acidic = []
IDRfrac_basic = []
IDRfrac_RK = []
IDRfrac_L = []
IDRfrac_R = []
IDRfrac_M = []
IDRfrac_aromatic = []
IDRsec_struc = []
FullpepFracDisoPromoting = []
FullpepFracPos = []
FullpepFracNeg = []
FullpepFracCharged = []
FullpepMeanNetCharge = []
Fullpepfrac_acidic = []
Fullpepfrac_basic = []
Fullpepfrac_basic = []
Fullpepfrac_RK = []
Fullpepfrac_L = []
Fullpepfrac_R = []
Fullpepfrac_M = []
Fullpepfrac_aromatic = []
Fullpepsec_struc = []

In [9]:
for i, row in dfVar_.iterrows():
    seq = dfVar_['IDRseq'][i]
    SeqOb = SequenceParameters(dfVar_['IDRseq'][i])
    IDRFracDisoPromoting.append(SeqOb.get_fraction_disorder_promoting())
    IDRFracPos.append(SeqOb.get_fraction_positive())
    IDRFracNeg.append(SeqOb.get_fraction_negative())
    IDRFracCharged.append(SeqOb.get_FCR(pH=None))
    IDRMeanNetCharge.append(SeqOb.get_mean_net_charge(pH=None))
    IDRfrac_acidic.append(frac_acidic(seq))
    IDRfrac_basic.append(frac_basic(seq))
    IDRfrac_RK.append(frac_RK(seq))
    IDRfrac_L.append(frac_X(seq, 'L'))
    IDRfrac_R.append(frac_X(seq, 'R'))
    IDRfrac_M.append(frac_X(seq, 'M'))
    IDRfrac_aromatic.append(frac_aromatic(seq))
    X = ProteinAnalysis(seq)
    sec_struc = X.secondary_structure_fraction()
    IDRsec_struc.append(sec_struc[0])
    seq2 = dfVar_['Sequence'][i]
    SeqOb2 = SequenceParameters(dfVar_['Sequence'][i])
    FullpepFracDisoPromoting.append(SeqOb2.get_fraction_disorder_promoting())
    FullpepFracPos.append(SeqOb2.get_fraction_positive())
    FullpepFracNeg.append(SeqOb2.get_fraction_negative())
    FullpepFracCharged.append(SeqOb2.get_FCR(pH=None))
    FullpepMeanNetCharge.append(SeqOb2.get_mean_net_charge(pH=None))
    Fullpepfrac_acidic.append(frac_acidic(seq2))
    Fullpepfrac_basic.append(frac_basic(seq2))
    Fullpepfrac_RK.append(frac_RK(seq2))
    Fullpepfrac_L.append(frac_X(seq2, 'L'))
    Fullpepfrac_R.append(frac_X(seq2, 'R'))
    Fullpepfrac_M.append(frac_X(seq2, 'M'))
    Fullpepfrac_aromatic.append(frac_aromatic(seq2))
    X2 = ProteinAnalysis(seq2)
    sec_struc2 = X2.secondary_structure_fraction()
    Fullpepsec_struc.append(sec_struc2[0])

In [56]:
s1 = pd.Series(IDRFracDisoPromoting, name='IDRFracDisoPromoting')
s4 = pd.Series(IDRFracPos, name='IDRFracPos')
s5 = pd.Series(IDRFracNeg, name='IDRFracNeg')
s6 = pd.Series(IDRFracCharged, name='IDRFracCharged')
s7 = pd.Series(IDRMeanNetCharge, name='IDRMeanNetCharge')
s8 = pd.Series(IDRfrac_acidic, name='IDRfrac_acidic')
s9 = pd.Series(IDRfrac_basic, name='IDRfrac_basic')
s10 = pd.Series(IDRfrac_L, name='IDRfrac_L')
s11 = pd.Series(IDRfrac_R, name='IDRfrac_R')
s12 = pd.Series(IDRfrac_M, name='IDRfrac_M')
s13 = pd.Series(IDRfrac_aromatic, name='IDRfrac_aromatic')
s14 = pd.Series(IDRsec_struc, name='IDRsec_struc')
s17 = pd.Series(FullpepFracDisoPromoting, name='FullpepFracDisoPromoting')    
s20 = pd.Series(FullpepFracPos, name='FullpepFracPos')    
s21 = pd.Series(FullpepFracNeg, name='FullpepFracNeg')
s22 = pd.Series(FullpepFracCharged, name='FullpepFracCharged')
s23 = pd.Series(FullpepMeanNetCharge, name='FullpepMeanNetCharge')
s24 = pd.Series(Fullpepfrac_acidic, name='Fullpepfrac_acidic')
s25 = pd.Series(Fullpepfrac_basic, name='Fullpepfrac_basic')
s26 = pd.Series(Fullpepfrac_L, name='Fullpepfrac_L')
s27 = pd.Series(Fullpepfrac_R, name='Fullpepfrac_R')
s28 = pd.Series(Fullpepfrac_M, name='Fullpepfrac_M')
s29 = pd.Series(Fullpepfrac_aromatic, name='Fullpepfrac_aromatic')
s30 = pd.Series(Fullpepsec_struc, name='Fullpepsec_struc')
s32 = pd.Series(IDRfrac_RK, name='IDRfrac_RK')
s33 = pd.Series(Fullpepfrac_RK, name='Fullpepfrac_RK')
s32 = pd.Series(IDRfrac_RK, name='IDRfrac_RK')

In [57]:
dfVar_ = dfVar_.assign(IDRFracDisoPromoting=s1.values)
dfVar_ = dfVar_.assign(IDRFracPos=s4.values)
dfVar_ = dfVar_.assign(IDRFracNeg=s5.values)
dfVar_ = dfVar_.assign(IDRFracCharged=s6.values)
dfVar_ = dfVar_.assign(IDRMeanNetCharge=s7.values)
dfVar_ = dfVar_.assign(IDRfrac_acidic=s8.values)
dfVar_ = dfVar_.assign(IDRfrac_basic=s9.values)
dfVar_ = dfVar_.assign(IDRfrac_RK=s32.values)
dfVar_ = dfVar_.assign(IDRfrac_L=s10.values)
dfVar_ = dfVar_.assign(IDRfrac_R=s11.values)
dfVar_ = dfVar_.assign(IDRfrac_M=s12.values)
dfVar_ = dfVar_.assign(IDRfrac_aromatic=s13.values)
dfVar_ = dfVar_.assign(IDRsec_struc=s14.values)
dfVar_ = dfVar_.assign(FullpepFracDisoPromoting=s17.values)
dfVar_ = dfVar_.assign(FullpepFracPos=s20.values)
dfVar_ = dfVar_.assign(FullpepFracNeg=s21.values)
dfVar_ = dfVar_.assign(FullpepFracCharged=s22.values)
dfVar_ = dfVar_.assign(FullpepMeanNetCharge=s23.values)
dfVar_ = dfVar_.assign(Fullpepfrac_acidic=s24.values)
dfVar_ = dfVar_.assign(Fullpepfrac_basic=s25.values)
dfVar_ = dfVar_.assign(Fullpepfrac_RK=s33.values)
dfVar_ = dfVar_.assign(Fullpepfrac_L=s26.values)
dfVar_ = dfVar_.assign(Fullpepfrac_R=s27.values)
dfVar_ = dfVar_.assign(Fullpepfrac_M=s28.values)
dfVar_ = dfVar_.assign(Fullpepfrac_aromatic=s29.values)
dfVar_ = dfVar_.assign(Fullpepsec_struc=s30.values)

In [58]:
dfVar_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12259 entries, 0 to 12660
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Uploaded_variation        8929 non-null   object 
 1   Location                  8929 non-null   object 
 2   Feature                   12259 non-null  object 
 3   PepID                     12259 non-null  object 
 4   Gene                      12259 non-null  object 
 5   gene_symbol               12259 non-null  object 
 6   Sequence                  12259 non-null  object 
 7   TrueIDRstart              12259 non-null  int64  
 8   SeqType                   12259 non-null  object 
 9   ID                        12259 non-null  object 
 10  seqTypeExp                12259 non-null  object 
 11  Vartype                   12259 non-null  object 
 12  Amino_acids               8929 non-null   object 
 13  Protein_position          8929 non-null   object 
 14  Target

In [12]:
IDRpLDDT = []
IDRdisorder_score = []
for i, row in dfVar_.iterrows():
    seq = dfVar_['IDRseq'][i]
    predict_pLDDT = meta.predict_pLDDT(seq)
    predict_disorder = meta.predict_disorder(seq)
    IDRpLDDT.append(np.mean(predict_pLDDT))
    IDRdisorder_score.append(np.mean(predict_disorder))

In [13]:
IDRkappa = []
for i, row in dfVar_.iterrows():
    SeqOb = SequenceParameters(dfVar_['IDRseq'][i])
    IDRkappa.append(SeqOb.get_kappa())

In [59]:
s2 = pd.Series(IDRkappa, name='IDRkappa')
s15 = pd.Series(IDRpLDDT, name='IDRpLDDT')
s16 = pd.Series(IDRdisorder_score, name='IDRdisorder_score')

dfVar_ = dfVar_.assign(IDRkappa=s2.values)
dfVar_ = dfVar_.assign(IDRpLDDT=s15.values)
dfVar_ = dfVar_.assign(IDRdisorder_score=s16.values)


In [60]:
dfVar_.to_csv('dfVar_.csv', index=False)

In [61]:
dfwt = dfVar_.loc[dfVar_['SeqType'] == 'WildType']
droplist = ['Feature', 'Gene', 'gene_symbol', 'SeqType', 'ID', 'Amino_acids', 'Protein_position', 'TrueIDRstart']
dfwt = dfwt.drop(droplist, 1)
dfwt = dfwt.rename(columns={'Sequence': 'WTSequence', 'IDRseq': 'WTIDRseq', 'FullLength':'WTFullLength' , 'IDRLength':'WTIDRLength'})
dfwt = dfwt.drop_duplicates(subset=['PepID'])
dfwt.info()
dfwt.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3330 entries, 0 to 3329
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Uploaded_variation        0 non-null      object 
 1   Location                  0 non-null      object 
 2   PepID                     3330 non-null   object 
 3   WTSequence                3330 non-null   object 
 4   seqTypeExp                3330 non-null   object 
 5   Vartype                   3330 non-null   object 
 6   Target                    0 non-null      object 
 7   WTFullLength              3330 non-null   int64  
 8   WTIDRseq                  3330 non-null   object 
 9   WTIDRLength               3330 non-null   int64  
 10  IDRFracDisoPromoting      3330 non-null   float64
 11  IDRFracPos                3330 non-null   float64
 12  IDRFracNeg                3330 non-null   float64
 13  IDRFracCharged            3330 non-null   float64
 14  IDRMeanN

Unnamed: 0,Uploaded_variation,Location,PepID,WTSequence,seqTypeExp,Vartype,Target,WTFullLength,WTIDRseq,WTIDRLength,...,Fullpepfrac_basic,Fullpepfrac_RK,Fullpepfrac_L,Fullpepfrac_R,Fullpepfrac_M,Fullpepfrac_aromatic,Fullpepsec_struc,IDRkappa,IDRpLDDT,IDRdisorder_score
0,,,ENSP00000359393,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,WildTypeFS,Frameshift,,200,EKDVADYKSKGKFDGAKGPAKVARKKVEEEDEEEEEEEEEEEEEEDE,47,...,0.24,0.235,0.02,0.035,0.03,0.09,0.17,0.624053,61.228585,0.898298
1,,,ENSP00000256852,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,WildTypeFS,Frameshift,,346,RAKWRRQEKLEVSSMKLQDSPLLSFSRSPPSATLSPLGAGPGSGGG...,159,...,0.135838,0.115607,0.101156,0.066474,0.008671,0.063584,0.202312,0.180977,61.581404,0.583981
2,,,ENSP00000342313,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,WildTypeFS,Frameshift,,681,LGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPS...,50,...,0.142438,0.110132,0.117474,0.082232,0.011747,0.042584,0.212922,0.281729,56.186376,0.84814
3,,,ENSP00000317992,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,WildTypeFS,Frameshift,,749,DDEEDEEEGEEDSSNSEDGDPDAEAGLAPGELQQLAQGPEDELEDL...,52,...,0.145527,0.128171,0.118825,0.072096,0.016021,0.080107,0.29506,0.357836,58.35995,0.894269
4,,,ENSP00000368717,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,WildTypeFS,Frameshift,,576,LQSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRS...,86,...,0.145833,0.105903,0.119792,0.078125,0.006944,0.059028,0.243056,0.236036,55.530934,0.826291


In [62]:
dfwt= dfwt.rename(columns={'IDRFracDisoPromoting' : 'WTIDRFracDisoPromoting'})
dfwt= dfwt.rename(columns={'IDRFracPos' : 'WTIDRFracPos'})
dfwt= dfwt.rename(columns={'IDRFracNeg' : 'WTIDRFracNeg'})
dfwt= dfwt.rename(columns={'IDRFracCharged' : 'WTIDRFracCharged'})
dfwt= dfwt.rename(columns={'IDRMeanNetCharge' : 'WTIDRMeanNetCharge'})
dfwt= dfwt.rename(columns={'IDRfrac_acidic' : 'WTIDRfrac_acidic'})
dfwt= dfwt.rename(columns={'IDRfrac_basic' : 'WTIDRfrac_basic'})
dfwt= dfwt.rename(columns={'IDRfrac_RK' : 'WTIDRfrac_RK'})
dfwt= dfwt.rename(columns={'IDRfrac_L' : 'WTIDRfrac_L'})
dfwt= dfwt.rename(columns={'IDRfrac_R' : 'WTIDRfrac_R'})
dfwt= dfwt.rename(columns={'IDRfrac_M' : 'WTIDRfrac_M'})
dfwt= dfwt.rename(columns={'IDRfrac_aromatic' : 'WTIDRfrac_aromatic'})
dfwt= dfwt.rename(columns={'IDRpLDDT' : 'WTIDRpLDDT'})
dfwt= dfwt.rename(columns={'IDRdisorder_score' : 'WTIDRdisorder_score'})
dfwt= dfwt.rename(columns={'IDRkappa' : 'WTIDRkappa'})
dfwt= dfwt.rename(columns={'IDRsec_struc' : 'WTIDRsec_struc'})
dfwt= dfwt.rename(columns={'FullpepFracDisoPromoting' : 'WTFullpepFracDisoPromoting'})
dfwt= dfwt.rename(columns={'FullpepFracPos' : 'WTFullpepFracPos'})
dfwt= dfwt.rename(columns={'FullpepFracNeg' : 'WTFullpepFracNeg'})
dfwt= dfwt.rename(columns={'FullpepFracCharged' : 'WTFullpepFracCharged'})
dfwt= dfwt.rename(columns={'FullpepMeanNetCharge' : 'WTFullpepMeanNetCharge'})
dfwt= dfwt.rename(columns={'FullpepMeanNetCharge' : 'WTFullpepMeanNetCharge'})
dfwt= dfwt.rename(columns={'Fullpepfrac_acidic' : 'WTFullpepfrac_acidic'})
dfwt= dfwt.rename(columns={'Fullpepfrac_basic' : 'WTFullpepfrac_basic'})
dfwt= dfwt.rename(columns={'Fullpepfrac_L' : 'WTFullpepfrac_L'})
dfwt= dfwt.rename(columns={'Fullpepfrac_R' : 'WTFullpepfrac_R'})
dfwt= dfwt.rename(columns={'Fullpepfrac_M' : 'WTFullpepfrac_M'})
dfwt= dfwt.rename(columns={'Fullpepsec_struc' : 'WTFullpepsec_struc'})
dfwt= dfwt.rename(columns={'Fullpepfrac_aromatic' : 'WTFullpepfrac_aromatic'})
dfwt= dfwt.rename(columns={'Fullpepfrac_RK' : 'WTFullpepfrac_RK'})

In [63]:
dfVar_ = pd.merge(dfVar_, dfwt , how="left", on="PepID")
dfVar_.replace("", nan_value, inplace=True)
dfVar_.head()

Unnamed: 0,Uploaded_variation_x,Location_x,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,ID,...,WTFullpepfrac_basic,WTFullpepfrac_RK,WTFullpepfrac_L,WTFullpepfrac_R,WTFullpepfrac_M,WTFullpepfrac_aromatic,WTFullpepsec_struc,WTIDRkappa,WTIDRpLDDT,WTIDRdisorder_score
0,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153,WildType,ENSP00000359393,...,0.24,0.235,0.02,0.035,0.03,0.09,0.17,0.624053,61.228585,0.898298
1,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187,WildType,ENSP00000256852,...,0.135838,0.115607,0.101156,0.066474,0.008671,0.063584,0.202312,0.180977,61.581404,0.583981
2,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631,WildType,ENSP00000342313,...,0.142438,0.110132,0.117474,0.082232,0.011747,0.042584,0.212922,0.281729,56.186376,0.84814
3,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697,WildType,ENSP00000317992,...,0.145527,0.128171,0.118825,0.072096,0.016021,0.080107,0.29506,0.357836,58.35995,0.894269
4,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490,WildType,ENSP00000368717,...,0.145833,0.105903,0.119792,0.078125,0.006944,0.059028,0.243056,0.236036,55.530934,0.826291


In [64]:
dfVar_['DELTA_IDRFracDisoPromoting'] = dfVar_['WTIDRFracDisoPromoting'].astype(float) - dfVar_['IDRFracDisoPromoting'].astype(float)
dfVar_['DELTA_IDRFracPos'] = dfVar_['WTIDRFracPos'].astype(float) - dfVar_['IDRFracPos'].astype(float)
dfVar_['DELTA_IDRFracNeg'] = dfVar_['WTIDRFracNeg'].astype(float) - dfVar_['IDRFracNeg'].astype(float)
dfVar_['DELTA_IDRFracCharged'] = dfVar_['WTIDRFracCharged'].astype(float) - dfVar_['IDRFracCharged'].astype(float)
dfVar_['DELTA_IDRMeanNetCharge'] = dfVar_['WTIDRMeanNetCharge'].astype(float) - dfVar_['IDRMeanNetCharge'].astype(float)
dfVar_['DELTA_IDRfrac_acidic'] = dfVar_['WTIDRfrac_acidic'].astype(float) - dfVar_['IDRfrac_acidic'].astype(float)
dfVar_['DELTA_IDRfrac_basic'] = dfVar_['WTIDRfrac_basic'].astype(float) - dfVar_['IDRfrac_basic'].astype(float)
dfVar_['DELTA_IDRfrac_RK'] = dfVar_['WTIDRfrac_RK'].astype(float) - dfVar_['IDRfrac_RK'].astype(float)
dfVar_['DELTA_IDRfrac_L'] = dfVar_['WTIDRfrac_L'].astype(float) - dfVar_['IDRfrac_L'].astype(float)
dfVar_['DELTA_IDRfrac_R'] = dfVar_['WTIDRfrac_R'].astype(float) - dfVar_['IDRfrac_R'].astype(float)
dfVar_['DELTA_IDRfrac_M'] = dfVar_['WTIDRfrac_M'].astype(float) - dfVar_['IDRfrac_M'].astype(float)
dfVar_['DELTA_IDRfrac_aromatic'] = dfVar_['WTIDRfrac_aromatic'].astype(float) - dfVar_['IDRfrac_aromatic'].astype(float)
dfVar_['DELTA_IDRsec_struc'] = dfVar_['WTIDRsec_struc'].astype(float) - dfVar_['IDRsec_struc'].astype(float)
dfVar_['DELTA_IDRpLDDT'] = dfVar_['WTIDRpLDDT'].astype(float) - dfVar_['IDRpLDDT'].astype(float)
dfVar_['DELTA_IDRdisorder_score'] = dfVar_['WTIDRdisorder_score'].astype(float) - dfVar_['IDRdisorder_score'].astype(float)
dfVar_['DELTA_IDRkappa'] = dfVar_['WTIDRkappa'].astype(float) - dfVar_['IDRkappa'].astype(float)
dfVar_['DELTA_FullpepFracDisoPromoting'] = dfVar_['WTFullpepFracDisoPromoting'].astype(float) - dfVar_['FullpepFracDisoPromoting'].astype(float)
dfVar_['DELTA_FullpepFracPos'] = dfVar_['WTFullpepFracPos'].astype(float) - dfVar_['FullpepFracPos'].astype(float)
dfVar_['DELTA_FullpepFracNeg'] = dfVar_['WTFullpepFracNeg'].astype(float) - dfVar_['FullpepFracNeg'].astype(float)
dfVar_['DELTA_FullpepFracCharged'] = dfVar_['WTFullpepFracCharged'].astype(float) - dfVar_['FullpepFracCharged'].astype(float)
dfVar_['DELTA_FullpepMeanNetCharge'] = dfVar_['WTFullpepMeanNetCharge'].astype(float) - dfVar_['FullpepMeanNetCharge'].astype(float)
dfVar_['DELTA_Fullpepfrac_acidic'] = dfVar_['WTFullpepfrac_acidic'].astype(float) - dfVar_['Fullpepfrac_acidic'].astype(float)
dfVar_['DELTA_Fullpepfrac_basic'] = dfVar_['WTFullpepfrac_basic'].astype(float) - dfVar_['Fullpepfrac_basic'].astype(float)
dfVar_['DELTA_Fullpepfrac_RK'] = dfVar_['WTFullpepfrac_RK'].astype(float) - dfVar_['Fullpepfrac_RK'].astype(float)
dfVar_['DELTA_Fullpepfrac_L'] = dfVar_['WTFullpepfrac_L'].astype(float) - dfVar_['Fullpepfrac_L'].astype(float)
dfVar_['DELTA_Fullpepfrac_R'] = dfVar_['WTFullpepfrac_R'].astype(float) - dfVar_['Fullpepfrac_R'].astype(float)
dfVar_['DELTA_Fullpepfrac_M'] = dfVar_['WTFullpepfrac_M'].astype(float) - dfVar_['Fullpepfrac_M'].astype(float)
dfVar_['DELTA_Fullpepfrac_aromatic'] = dfVar_['WTFullpepfrac_aromatic'].astype(float) - dfVar_['Fullpepfrac_aromatic'].astype(float)
dfVar_['DELTA_Fullpepsec_struc'] = dfVar_['WTFullpepsec_struc'].astype(float) - dfVar_['Fullpepsec_struc'].astype(float)
dfVar_['DELTA_FullLength'] = dfVar_['WTFullLength'].astype(float) - dfVar_['FullLength'].astype(float)
dfVar_['DELTA_IDRLength'] = dfVar_['WTIDRLength'].astype(float) - dfVar_['IDRLength'].astype(float)

In [65]:
dfVar_.to_csv('MasterTable_FSVar_Full.csv', index=False)

In [66]:
dfVar_.head()

Unnamed: 0,Uploaded_variation_x,Location_x,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,ID,...,DELTA_Fullpepfrac_acidic,DELTA_Fullpepfrac_basic,DELTA_Fullpepfrac_RK,DELTA_Fullpepfrac_L,DELTA_Fullpepfrac_R,DELTA_Fullpepfrac_M,DELTA_Fullpepfrac_aromatic,DELTA_Fullpepsec_struc,DELTA_FullLength,DELTA_IDRLength
0,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153,WildType,ENSP00000359393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187,WildType,ENSP00000256852,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631,WildType,ENSP00000342313,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697,WildType,ENSP00000317992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490,WildType,ENSP00000368717,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
pdList2 = [df, df1,df2, df3]
dfVarSG = pd.concat(pdList2, ignore_index=True)
dfVarSG = dfVarSG.drop_duplicates(subset=['ID'])
dfVarSG['TrueIDRstart'] = dfVarSG['TrueIDRstart'].astype(int)
dfVarSG['FullLength'] = dfVarSG['Sequence'].str.len()
dfVarSG = dfVarSG.dropna(subset = ['Sequence'])
dfVarSG['FullLength'] = dfVarSG['FullLength'].astype(int)
dfVarSG['IDRseq'] = dfVarSG.apply(lambda x : str(x['Sequence'])[x['TrueIDRstart']:x['FullLength']],1)
dfVarSG['IDRLength'] = dfVarSG['IDRseq'].str.len()
dfVarSG.to_csv('MasterTable_Var_FUll.csv', index=False)
dfVarSG.info()
dfVarSG.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25065 entries, 0 to 28043
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Uploaded_variation  19362 non-null  object
 1   Location            19362 non-null  object
 2   Feature             25065 non-null  object
 3   PepID               25065 non-null  object
 4   Gene                25065 non-null  object
 5   gene_symbol         25065 non-null  object
 6   Sequence            25065 non-null  object
 7   TrueIDRstart        25065 non-null  int64 
 8   SeqType             12367 non-null  object
 9   ID                  25065 non-null  object
 10  seqTypeExp          25065 non-null  object
 11  Vartype             25065 non-null  object
 12  Amino_acids         19362 non-null  object
 13  Protein_position    19362 non-null  object
 14  Target              6 non-null      object
 15  seqType             12698 non-null  object
 16  FullLength          25

Unnamed: 0,Uploaded_variation,Location,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,ID,seqTypeExp,Vartype,Amino_acids,Protein_position,Target,seqType,FullLength,IDRseq,IDRLength
0,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153,WildType,ENSP00000359393,WildTypeFS,Frameshift,,,,,200,EKDVADYKSKGKFDGAKGPAKVARKKVEEEDEEEEEEEEEEEEEEDE,47
1,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187,WildType,ENSP00000256852,WildTypeFS,Frameshift,,,,,346,RAKWRRQEKLEVSSMKLQDSPLLSFSRSPPSATLSPLGAGPGSGGG...,159
2,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631,WildType,ENSP00000342313,WildTypeFS,Frameshift,,,,,681,LGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPS...,50
3,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697,WildType,ENSP00000317992,WildTypeFS,Frameshift,,,,,749,DDEEDEEEGEEDSSNSEDGDPDAEAGLAPGELQQLAQGPEDELEDL...,52
4,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490,WildType,ENSP00000368717,WildTypeFS,Frameshift,,,,,576,LQSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRS...,86


In [68]:
dfVarSG.to_csv('MasterTable_Var_FUll.csv', index=False)

In [69]:
dfVar_= pd.read_csv('MasterTable_FSVar_FUll.csv',low_memory=False)
dropl = ['Uploaded_variation_y','Location_y','seqTypeExp_y','Vartype_y']
dfVar_ = dfVar_.drop(dropl,1)
dfVar_[['FS_Start', 'FS_stop' ]] = dfVar_['Protein_position'].str.split(pat="-",expand=True)
dfVar_ = dfVar_.drop('FS_stop', 1)

In [70]:
subdf = dfVar_[['ID', 'FS_Start']]
subdf = subdf.dropna(subset = ['FS_Start'])
subdf[['ID', 'Extra' ]] = subdf['ID'].str.split(pat=":",expand=True)
subdf=subdf.drop('Extra', 1)
subdf = subdf.drop_duplicates('ID', keep='first')
subdf = subdf.set_index('ID')
subdf.info()
subdf.head()

<class 'pandas.core.frame.DataFrame'>
Index: 3320 entries, ENSP00000359393 to ENSP00000372547
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   FS_Start  3320 non-null   object
dtypes: object(1)
memory usage: 51.9+ KB


Unnamed: 0_level_0,FS_Start
ID,Unnamed: 1_level_1
ENSP00000359393,153
ENSP00000250003,186
ENSP00000262426,231
ENSP00000256852,222
ENSP00000399240,380


In [71]:
dfVar_ = dfVar_.set_index('ID')
dfVar_.update(subdf, overwrite=False)

In [72]:
dfVar_ = dfVar_.dropna(subset = ['FS_Start'])
dfVar_['FS_Start'] = dfVar_['FS_Start'].astype(int)
dfVar_['FullLength'] = dfVar_['FullLength'].astype(int)
dfVar_['PFSseq'] = dfVar_.apply(lambda x : str(x['Sequence'])[x['FS_Start']:x['FullLength']],1)
dfVar_['PFSseqLength'] = dfVar_['PFSseq'].str.len()
dfVar_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12246 entries, ENSP00000359393 to ENSP00000372547:p.S190X
Columns: 114 entries, Uploaded_variation_x to PFSseqLength
dtypes: float64(29), int64(5), object(80)
memory usage: 10.7+ MB


In [73]:
dfVar_.to_csv('test.csv', index=True)

In [74]:
PFSFracDisoPromoting = []
PFSFracPos = []
PFSFracNeg = []
PFSFracCharged = []
PFSMeanNetCharge = []
PFSfrac_acidic = []
PFSfrac_basic = []
PFSfrac_RK = []
PFSfrac_L = []
PFSfrac_R = []
PFSfrac_M = []
PFSfrac_aromatic = []
PFSsec_struc = []
PFSpLDDT = []
PFSdisorder_score = []
PFSkappa = []
for i, row in dfVar_.iterrows():
    if dfVar_['PFSseqLength'][i] == 0:
        PFSFracDisoPromoting.append('NaN')
        PFSFracPos.append('NaN')
        PFSFracNeg.append('NaN')
        PFSFracCharged.append('NaN')
        PFSMeanNetCharge.append('NaN')
        PFSfrac_acidic.append('NaN')
        PFSfrac_basic.append('NaN')
        PFSfrac_RK.append('NaN')
        PFSfrac_L.append('NaN')
        PFSfrac_R.append('NaN')
        PFSfrac_M.append('NaN')
        PFSfrac_aromatic.append('NaN')
        PFSsec_struc.append('NaN')
        PFSpLDDT.append('NaN')
        PFSdisorder_score.append('NaN')
        PFSkappa.append('NaN')
    else:
        seq = dfVar_['PFSseq'][i]
        SeqOb = SequenceParameters(dfVar_['PFSseq'][i])
        PFSFracDisoPromoting.append(SeqOb.get_fraction_disorder_promoting())
        PFSFracPos.append(SeqOb.get_fraction_positive())
        PFSFracNeg.append(SeqOb.get_fraction_negative())
        PFSFracCharged.append(SeqOb.get_FCR(pH=None))
        PFSMeanNetCharge.append(SeqOb.get_mean_net_charge(pH=None))
        PFSfrac_acidic.append(frac_acidic(seq))
        PFSfrac_basic.append(frac_basic(seq))
        PFSfrac_RK.append(frac_RK(seq))
        PFSfrac_L.append(frac_X(seq, 'L'))
        PFSfrac_R.append(frac_X(seq, 'R'))
        PFSfrac_M.append(frac_X(seq, 'M'))
        PFSfrac_aromatic.append(frac_aromatic(seq))
        X = ProteinAnalysis(seq)
        sec_struc = X.secondary_structure_fraction()
        PFSsec_struc.append(sec_struc[0])
        predict_pLDDT = meta.predict_pLDDT(seq)
        predict_disorder = meta.predict_disorder(seq)
        PFSpLDDT.append(np.mean(predict_pLDDT))
        PFSdisorder_score.append(np.mean(predict_disorder))
        PFSkappa.append(SeqOb.get_kappa())

In [75]:
s1 = pd.Series(PFSFracDisoPromoting, name='PFSFracDisoPromoting')
s2 = pd.Series(PFSfrac_RK, name='PFSfrac_RK')
s4 = pd.Series(PFSFracPos, name='PFSFracPos')
s5 = pd.Series(PFSFracNeg, name='PFSFracNeg')
s6 = pd.Series(PFSFracCharged, name='PFSFracCharged')
s7 = pd.Series(PFSMeanNetCharge, name='PFSMeanNetCharge')
s8 = pd.Series(PFSfrac_acidic, name='PFSfrac_acidic')
s9 = pd.Series(PFSfrac_basic, name='PFSfrac_basic')
s10 = pd.Series(PFSfrac_L, name='PFSfrac_L')
s11 = pd.Series(PFSfrac_R, name='PFSfrac_R')
s12 = pd.Series(PFSfrac_M, name='PFSfrac_M')
s13 = pd.Series(PFSfrac_aromatic, name='PFSfrac_aromatic')
s14 = pd.Series(PFSsec_struc, name='PFSsec_struc')
s2 = pd.Series(PFSkappa, name='PFSkappa')
s15 = pd.Series(PFSpLDDT, name='PFSpLDDT')
s16 = pd.Series(PFSdisorder_score, name='PFSdisorder_score')


In [76]:
dfVar_ = dfVar_.assign(PFSFracDisoPromoting=s1.values)
dfVar_ = dfVar_.assign(PFSFracPos=s4.values)
dfVar_ = dfVar_.assign(PFSFracNeg=s5.values)
dfVar_ = dfVar_.assign(PFSFracCharged=s6.values)
dfVar_ = dfVar_.assign(PFSMeanNetCharge=s7.values)
dfVar_ = dfVar_.assign(PFSfrac_acidic=s8.values)
dfVar_ = dfVar_.assign(PFSfrac_basic=s9.values)
dfVar_ = dfVar_.assign(PFSfrac_RK=s2.values)
dfVar_ = dfVar_.assign(PFSfrac_L=s10.values)
dfVar_ = dfVar_.assign(PFSfrac_R=s11.values)
dfVar_ = dfVar_.assign(PFSfrac_M=s12.values)
dfVar_ = dfVar_.assign(PFSfrac_aromatic=s13.values)
dfVar_= dfVar_.assign(PFSsec_struc=s14.values)
dfVar_ = dfVar_.assign(PFSkappa=s2.values)
dfVar_ = dfVar_.assign(PFSpLDDT=s15.values)
dfVar_ = dfVar_.assign(PFSdisorder_score=s16.values)

In [77]:
dfVar_.to_csv('MasterTable_Var_FINAL.csv', index=True)

In [78]:
dfwt2 = dfVar_.loc[dfVar_['SeqType'] == 'WildType']
list2 = ['PepID', 'PFSseqLength','PFSFracDisoPromoting' ,'PFSFracPos','PFSFracNeg','PFSFracCharged','PFSMeanNetCharge','PFSfrac_acidic','PFSfrac_basic','PFSfrac_RK','PFSfrac_L','PFSfrac_R','PFSfrac_M','PFSfrac_aromatic','PFSsec_struc','PFSkappa','PFSpLDDT', 'PFSdisorder_score']
dfwt2 = dfwt2[list2]
dfwt2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3317 entries, ENSP00000359393 to ENSP00000372547
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   PepID                 3317 non-null   object
 1   PFSseqLength          3317 non-null   int64 
 2   PFSFracDisoPromoting  3317 non-null   object
 3   PFSFracPos            3317 non-null   object
 4   PFSFracNeg            3317 non-null   object
 5   PFSFracCharged        3317 non-null   object
 6   PFSMeanNetCharge      3317 non-null   object
 7   PFSfrac_acidic        3317 non-null   object
 8   PFSfrac_basic         3317 non-null   object
 9   PFSfrac_RK            3317 non-null   object
 10  PFSfrac_L             3317 non-null   object
 11  PFSfrac_R             3317 non-null   object
 12  PFSfrac_M             3317 non-null   object
 13  PFSfrac_aromatic      3317 non-null   object
 14  PFSsec_struc          3317 non-null   object
 15  PFSkappa          

In [79]:
dfwt2 = dfwt2.rename(columns={'PFSFracDisoPromoting' : 'WTPFSFracDisoPromoting'})
dfwt2 = dfwt2.rename(columns={'PFSFracPos' : 'WTPFSFracPos'})
dfwt2 = dfwt2.rename(columns={'PFSFracNeg' : 'WTPFSFracNeg'})
dfwt2 = dfwt2.rename(columns={'PFSFracCharged' : 'WTPFSFracCharged'})
dfwt2 = dfwt2.rename(columns={'PFSMeanNetCharge' : 'WTPFSMeanNetCharge'})
dfwt2 = dfwt2.rename(columns={'PFSfrac_acidic' : 'WTPFSfrac_acidic'})
dfwt2 = dfwt2.rename(columns={'PFSfrac_basic' : 'WTPFSfrac_basic'})
dfwt2 = dfwt2.rename(columns={'PFSfrac_RK' : 'WTPFSfrac_RK'})
dfwt2 = dfwt2.rename(columns={'PFSfrac_L' : 'WTPFSfrac_L'})
dfwt2 = dfwt2.rename(columns={'PFSfrac_R' : 'WTPFSfrac_R'})
dfwt2 = dfwt2.rename(columns={'PFSfrac_M' : 'WTPFSfrac_M'})
dfwt2 = dfwt2.rename(columns={'PFSfrac_aromatic' : 'WTPFSfrac_aromatic'})
dfwt2 = dfwt2.rename(columns={'PFSpLDDT' : 'WTPFSpLDDT'})
dfwt2 = dfwt2.rename(columns={'PFSdisorder_score' : 'WTPFSdisorder_score'})
dfwt2 = dfwt2.rename(columns={'PFSkappa' : 'WTPFSkappa'})
dfwt2 = dfwt2.rename(columns={'PFSsec_struc' : 'WTPFSsec_struc'})
dfwt2 = dfwt2.rename(columns={'PFSseqLength' : 'WTPFSseqLength'})
dfwt2 = dfwt2.drop_duplicates(subset=['PepID'])
dfwt2.info()
dfwt2.head()

<class 'pandas.core.frame.DataFrame'>
Index: 3317 entries, ENSP00000359393 to ENSP00000372547
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   PepID                   3317 non-null   object
 1   WTPFSseqLength          3317 non-null   int64 
 2   WTPFSFracDisoPromoting  3317 non-null   object
 3   WTPFSFracPos            3317 non-null   object
 4   WTPFSFracNeg            3317 non-null   object
 5   WTPFSFracCharged        3317 non-null   object
 6   WTPFSMeanNetCharge      3317 non-null   object
 7   WTPFSfrac_acidic        3317 non-null   object
 8   WTPFSfrac_basic         3317 non-null   object
 9   WTPFSfrac_RK            3317 non-null   object
 10  WTPFSfrac_L             3317 non-null   object
 11  WTPFSfrac_R             3317 non-null   object
 12  WTPFSfrac_M             3317 non-null   object
 13  WTPFSfrac_aromatic      3317 non-null   object
 14  WTPFSsec_struc          3317 non-nul

Unnamed: 0_level_0,PepID,WTPFSseqLength,WTPFSFracDisoPromoting,WTPFSFracPos,WTPFSFracNeg,WTPFSFracCharged,WTPFSMeanNetCharge,WTPFSfrac_acidic,WTPFSfrac_basic,WTPFSfrac_RK,WTPFSfrac_L,WTPFSfrac_R,WTPFSfrac_M,WTPFSfrac_aromatic,WTPFSsec_struc,WTPFSkappa,WTPFSpLDDT,WTPFSdisorder_score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ENSP00000359393,ENSP00000359393,47,0.893617,0.191489,0.510638,0.702128,0.319149,0.510638,0.191489,0.624053,0.0,0.0212766,0,0.0425532,0.106383,0.624053,61.2286,0.898298
ENSP00000256852,ENSP00000256852,124,0.758065,0.0483871,0.0483871,0.0967742,0.0,0.0483871,0.0564516,0.238029,0.129032,0.016129,0,0.0645161,0.217742,0.238029,63.7788,0.586355
ENSP00000342313,ENSP00000342313,50,0.8,0.02,0.06,0.08,0.04,0.06,0.04,0.281729,0.14,0.0,0,0.02,0.16,0.281729,56.1864,0.84814
ENSP00000317992,ENSP00000317992,2,1.0,0.0,1.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,0,0.0,0.0,-1.0,13.8822,1.0
ENSP00000368717,ENSP00000368717,85,0.823529,0.117647,0.117647,0.235294,0.0,0.117647,0.152941,0.233977,0.0823529,0.0941176,0,0.0470588,0.164706,0.233977,55.7003,0.831


In [80]:
dfVar_ = pd.merge(dfVar_, dfwt2 , how="left", on="PepID")
dfVar_.replace("", nan_value, inplace=True)
dfVar_.head()

Unnamed: 0,Uploaded_variation_x,Location_x,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,seqTypeExp_x,...,WTPFSfrac_basic,WTPFSfrac_RK,WTPFSfrac_L,WTPFSfrac_R,WTPFSfrac_M,WTPFSfrac_aromatic,WTPFSsec_struc,WTPFSkappa,WTPFSpLDDT,WTPFSdisorder_score
0,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153,WildType,WildTypeFS,...,0.191489,0.624053,0.0,0.0212766,0,0.0425532,0.106383,0.624053,61.2286,0.898298
1,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187,WildType,WildTypeFS,...,0.0564516,0.238029,0.129032,0.016129,0,0.0645161,0.217742,0.238029,63.7788,0.586355
2,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631,WildType,WildTypeFS,...,0.04,0.281729,0.14,0.0,0,0.02,0.16,0.281729,56.1864,0.84814
3,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697,WildType,WildTypeFS,...,0.0,-1.0,0.0,0.0,0,0.0,0.0,-1.0,13.8822,1.0
4,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490,WildType,WildTypeFS,...,0.152941,0.233977,0.0823529,0.0941176,0,0.0470588,0.164706,0.233977,55.7003,0.831


In [81]:
dfVar_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12246 entries, 0 to 12245
Columns: 147 entries, Uploaded_variation_x to WTPFSdisorder_score
dtypes: float64(93), int64(5), object(49)
memory usage: 13.8+ MB


In [82]:
dfVar_['DELTA_PFSFracDisoPromoting'] = dfVar_['WTPFSFracDisoPromoting'].astype(float) - dfVar_['PFSFracDisoPromoting'].astype(float)
dfVar_['DELTA_PFSFracPos'] = dfVar_['WTPFSFracPos'].astype(float) - dfVar_['PFSFracPos'].astype(float)
dfVar_['DELTA_PFSFracNeg'] = dfVar_['WTPFSFracNeg'].astype(float) - dfVar_['PFSFracNeg'].astype(float)
dfVar_['DELTA_PFSFracCharged'] = dfVar_['WTPFSFracCharged'].astype(float) - dfVar_['PFSFracCharged'].astype(float)
dfVar_['DELTA_PFSMeanNetCharge'] = dfVar_['WTPFSMeanNetCharge'].astype(float) - dfVar_['PFSMeanNetCharge'].astype(float)
dfVar_['DELTA_PFSfrac_acidic'] = dfVar_['WTPFSfrac_acidic'].astype(float) - dfVar_['PFSfrac_acidic'].astype(float)
dfVar_['DELTA_PFSfrac_basic'] = dfVar_['WTPFSfrac_basic'].astype(float) - dfVar_['PFSfrac_basic'].astype(float)
dfVar_['DELTA_PFSfrac_RK'] = dfVar_['WTPFSfrac_RK'].astype(float) - dfVar_['PFSfrac_RK'].astype(float)
dfVar_['DELTA_PFSfrac_L'] = dfVar_['WTPFSfrac_L'].astype(float) - dfVar_['PFSfrac_L'].astype(float)
dfVar_['DELTA_PFSfrac_R'] = dfVar_['WTPFSfrac_R'].astype(float) - dfVar_['PFSfrac_R'].astype(float)
dfVar_['DELTA_PFSfrac_M'] = dfVar_['WTPFSfrac_M'].astype(float) - dfVar_['PFSfrac_M'].astype(float)
dfVar_['DELTA_PFSfrac_aromatic'] = dfVar_['WTPFSfrac_aromatic'].astype(float) - dfVar_['PFSfrac_aromatic'].astype(float)
dfVar_['DELTA_PFSsec_struc'] = dfVar_['WTPFSsec_struc'].astype(float) - dfVar_['PFSsec_struc'].astype(float)
dfVar_['DELTA_PFSpLDDT'] = dfVar_['WTPFSpLDDT'].astype(float) - dfVar_['PFSpLDDT'].astype(float)
dfVar_['DELTA_PFSdisorder_score'] = dfVar_['WTPFSdisorder_score'].astype(float) - dfVar_['PFSdisorder_score'].astype(float)
dfVar_['DELTA_PFSkappa'] = dfVar_['WTPFSkappa'].astype(float) - dfVar_['PFSkappa'].astype(float)
dfVar_['DELTA_PFSseqLength'] = dfVar_['WTPFSseqLength'].astype(float) - dfVar_['PFSseqLength'].astype(float)
dfVar_ = dfVar_.rename(columns={'Uploaded_variation_x' : 'Uploaded_variation'})
dfVar_ = dfVar_.rename(columns={'Location_x' : 'Location'})
dfVar_ = dfVar_.rename(columns={'seqTypeExp_x' : 'seqTypeExp'})
dfVar_ = dfVar_.rename(columns={'Vartype_x' : 'Vartype'})

In [83]:
dfVar_.to_csv('MasterTable_Var_FINAL.csv', index=True)

In [84]:
df = pd.read_csv('WT_Frameshift.csv')
df1 = pd.read_csv('Mut_Frameshift.csv')
df2 = pd.read_csv('WT_Stop_gained.csv')
df3 = pd.read_csv('Mut_Stop_gained.csv')

In [85]:
pdList = [df, df1]
pdList2 = [df2, df3]

In [86]:
df['seqTypeExp'] = "WildTypeFS"
df1['seqTypeExp'] = "Frameshift"
df2['seqTypeExp'] = "WildTypeSG"
df3['seqTypeExp'] = "Stop_gained"
df['Vartype'] = "Frameshift"
df1['Vartype'] = "Frameshift"
df2['Vartype'] = "Stop_gained"
df3['Vartype'] = "Stop_gained"
dfFS = pd.concat(pdList, ignore_index=True)
dfSG = pd.concat(pdList2, ignore_index=True)

In [87]:
dfFS[['FS_Start', 'FS_stop' ]] = dfFS['Protein_position'].str.split(pat="-",expand=True)
dfFS = dfFS.drop('FS_stop', 1)

In [88]:
dfFS = dfFS.set_index('ID')
dfFS.update(subdf, overwrite=False)

In [89]:
dfFS.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12661 entries, ENSP00000359393 to ENSP00000372547:p.S190X
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Uploaded_variation  9331 non-null   object 
 1   Location            9331 non-null   object 
 2   Feature             12661 non-null  object 
 3   PepID               12661 non-null  object 
 4   Gene                12661 non-null  object 
 5   gene_symbol         12661 non-null  object 
 6   Sequence            12661 non-null  object 
 7   TrueIDRstart        12661 non-null  float64
 8   SeqType             12661 non-null  object 
 9   seqTypeExp          12661 non-null  object 
 10  Vartype             12661 non-null  object 
 11  Amino_acids         9331 non-null   object 
 12  Protein_position    9331 non-null   object 
 13  Target              6 non-null      object 
 14  FS_Start            12648 non-null  object 
dtypes: float64(1), object(14)


In [90]:
dfFS = dfFS.dropna(subset = ['FS_Start'])
dfFS['FullLength'] = dfFS['Sequence'].str.len()
dfFS['FullLength'] = dfFS['FullLength'].astype(int)
dfFS['FS_Start'] = dfFS['FS_Start'].astype(int)
dfFS['PFSseq'] = dfFS.apply(lambda x : str(x['Sequence'])[x['FS_Start']:x['FullLength']],1)
dfFS['PFSseqLength'] = dfFS['PFSseq'].str.len()
dfFS.info()
dfFS.head()

<class 'pandas.core.frame.DataFrame'>
Index: 12648 entries, ENSP00000359393 to ENSP00000372547:p.S190X
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Uploaded_variation  9331 non-null   object 
 1   Location            9331 non-null   object 
 2   Feature             12648 non-null  object 
 3   PepID               12648 non-null  object 
 4   Gene                12648 non-null  object 
 5   gene_symbol         12648 non-null  object 
 6   Sequence            12648 non-null  object 
 7   TrueIDRstart        12648 non-null  float64
 8   SeqType             12648 non-null  object 
 9   seqTypeExp          12648 non-null  object 
 10  Vartype             12648 non-null  object 
 11  Amino_acids         9331 non-null   object 
 12  Protein_position    9331 non-null   object 
 13  Target              6 non-null      object 
 14  FS_Start            12648 non-null  int64  
 15  FullLength          12648 

Unnamed: 0_level_0,Uploaded_variation,Location,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,seqTypeExp,Vartype,Amino_acids,Protein_position,Target,FS_Start,FullLength,PFSseq,PFSseqLength
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ENSP00000359393,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153.0,WildType,WildTypeFS,Frameshift,,,,153,200,EKDVADYKSKGKFDGAKGPAKVARKKVEEEDEEEEEEEEEEEEEEDE,47
ENSP00000256852,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187.0,WildType,WildTypeFS,Frameshift,,,,222,346,PLGAGPGSGGGPAGGALPLESWLGPPLPGGGATALQSLPGFGPPAQ...,124
ENSP00000342313,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631.0,WildType,WildTypeFS,Frameshift,,,,631,681,LGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPS...,50
ENSP00000317992,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697.0,WildType,WildTypeFS,Frameshift,,,,747,749,DD,2
ENSP00000368717,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490.0,WildType,WildTypeFS,Frameshift,,,,491,576,QSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRSP...,85


In [91]:
dfSG = dfSG.dropna(subset = ['Sequence'])
dfSG['FullLength'] = dfSG['Sequence'].str.len()
dfSG['FullLength'] = dfSG['FullLength'].astype(int)
dfSG['TrueIDRstart'] = dfSG['TrueIDRstart'].astype(int)
dfSG['IDRSseq'] = dfSG.apply(lambda x : str(x['Sequence'])[x['TrueIDRstart']:x['FullLength']],1)
dfSG['IDRSseqLength'] = dfSG['IDRSseq'].str.len()

In [92]:
dfSG.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15033 entries, 0 to 15382
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Uploaded_variation  10325 non-null  object 
 1   Location            10325 non-null  object 
 2   ID                  15033 non-null  object 
 3   Feature             15033 non-null  object 
 4   PepID               15033 non-null  object 
 5   Gene                15033 non-null  object 
 6   gene_symbol         15033 non-null  object 
 7   Sequence            15033 non-null  object 
 8   seqType             15033 non-null  object 
 9   TrueIDRstart        15033 non-null  int64  
 10  seqTypeExp          15033 non-null  object 
 11  Vartype             15033 non-null  object 
 12  Amino_acids         10325 non-null  object 
 13  Protein_position    10325 non-null  float64
 14  FullLength          15033 non-null  int64  
 15  IDRSseq             15033 non-null  object 
 16  IDRS

In [93]:
dfSG.to_csv('LengthTable_Stop_Gained.csv', index=False)
dfFS.to_csv('LengthTable_Frameshift.csv', index=False)

In [94]:
dfSG = dfSG.rename(columns={'IDRSseq' : 'TargetSeq'})
dfSG = dfSG.rename(columns={'IDRSseqLength' : 'TargetSeqLength'})
dfFS = dfFS.rename(columns={'PFSseq' : 'TargetSeq'})
dfFS = dfFS.rename(columns={'PFSseqLength' : 'TargetSeqLength'})

In [95]:
dfSG.head()

Unnamed: 0,Uploaded_variation,Location,ID,Feature,PepID,Gene,gene_symbol,Sequence,seqType,TrueIDRstart,seqTypeExp,Vartype,Amino_acids,Protein_position,FullLength,TargetSeq,TargetSeqLength
0,,,ENSP00000345305,ENST00000341690,ENSP00000345305,ENSG00000103168,TAF1C,MLPPLIDPWDPGLTARDLLFRGGCRYRKRPRVVLDVTEQISRFLLD...,WildType,737,WildTypeSG,Stop_gained,,,775,ATTPPHSQASSVRATRSQQHTPVLSSSQPLRKKPRMGF,38
1,,,ENSP00000367815,ENST00000378553,ENSP00000367815,ENSG00000154099,DNAAF1,MHPEPSEPATGGAAELDCAQEPGVEESAGDHGSAGRGGCKEEINDP...,WildType,632,WildTypeSG,Stop_gained,,,725,LEIRKQDTKSPRPLIQELSDEDPSGQLLMPPTCQRDAAPLTSSGDR...,93
2,,,ENSP00000317992,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,WildType,697,WildTypeSG,Stop_gained,,,749,DDEEDEEEGEEDSSNSEDGDPDAEAGLAPGELQQLAQGPEDELEDL...,52
3,,,ENSP00000342313,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,WildType,631,WildTypeSG,Stop_gained,,,681,LGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPS...,50
4,,,ENSP00000368717,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,WildType,490,WildTypeSG,Stop_gained,,,576,LQSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRS...,86


In [96]:
dfFS = dfFS.reset_index()
dfFS.head()

Unnamed: 0,ID,Uploaded_variation,Location,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,seqTypeExp,Vartype,Amino_acids,Protein_position,Target,FS_Start,FullLength,TargetSeq,TargetSeqLength
0,ENSP00000359393,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153.0,WildType,WildTypeFS,Frameshift,,,,153,200,EKDVADYKSKGKFDGAKGPAKVARKKVEEEDEEEEEEEEEEEEEEDE,47
1,ENSP00000256852,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187.0,WildType,WildTypeFS,Frameshift,,,,222,346,PLGAGPGSGGGPAGGALPLESWLGPPLPGGGATALQSLPGFGPPAQ...,124
2,ENSP00000342313,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631.0,WildType,WildTypeFS,Frameshift,,,,631,681,LGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPS...,50
3,ENSP00000317992,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697.0,WildType,WildTypeFS,Frameshift,,,,747,749,DD,2
4,ENSP00000368717,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490.0,WildType,WildTypeFS,Frameshift,,,,491,576,QSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRSP...,85


In [97]:
List = [dfFS, dfSG]
dfAlt = pd.concat(List, ignore_index=True)

In [98]:
dfAlt.to_csv('LengthTable_Full.csv', index=False)

In [99]:
dfAlt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27681 entries, 0 to 27680
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  27681 non-null  object 
 1   Uploaded_variation  19656 non-null  object 
 2   Location            19656 non-null  object 
 3   Feature             27681 non-null  object 
 4   PepID               27681 non-null  object 
 5   Gene                27681 non-null  object 
 6   gene_symbol         27681 non-null  object 
 7   Sequence            27681 non-null  object 
 8   TrueIDRstart        27681 non-null  float64
 9   SeqType             12648 non-null  object 
 10  seqTypeExp          27681 non-null  object 
 11  Vartype             27681 non-null  object 
 12  Amino_acids         19656 non-null  object 
 13  Protein_position    19656 non-null  object 
 14  Target              6 non-null      object 
 15  FS_Start            12648 non-null  float64
 16  Full

In [100]:
df = pd.read_csv('LengthTable_Full.csv')
df.head()

Unnamed: 0,ID,Uploaded_variation,Location,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,seqTypeExp,Vartype,Amino_acids,Protein_position,Target,FS_Start,FullLength,TargetSeq,TargetSeqLength,seqType
0,ENSP00000359393,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153.0,WildType,WildTypeFS,Frameshift,,,,153.0,200,EKDVADYKSKGKFDGAKGPAKVARKKVEEEDEEEEEEEEEEEEEEDE,47,
1,ENSP00000256852,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187.0,WildType,WildTypeFS,Frameshift,,,,222.0,346,PLGAGPGSGGGPAGGALPLESWLGPPLPGGGATALQSLPGFGPPAQ...,124,
2,ENSP00000342313,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631.0,WildType,WildTypeFS,Frameshift,,,,631.0,681,LGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPS...,50,
3,ENSP00000317992,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697.0,WildType,WildTypeFS,Frameshift,,,,747.0,749,DD,2,
4,ENSP00000368717,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490.0,WildType,WildTypeFS,Frameshift,,,,491.0,576,QSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRSP...,85,


In [101]:
df['FullLength'] = df['FullLength'].astype(int)
df['TrueIDRstart'] = df['TrueIDRstart'].astype(int)
df['IDRSseq'] = df.apply(lambda x : str(x['Sequence'])[x['TrueIDRstart']:x['FullLength']],1)
df['IDRSseqLength'] = df['IDRSseq'].str.len()
df.head()

Unnamed: 0,ID,Uploaded_variation,Location,Feature,PepID,Gene,gene_symbol,Sequence,TrueIDRstart,SeqType,...,Amino_acids,Protein_position,Target,FS_Start,FullLength,TargetSeq,TargetSeqLength,seqType,IDRSseq,IDRSseqLength
0,ENSP00000359393,,,ENST00000325307,ENSP00000359393,ENSG00000029993,HMGB3,MAKGDPKKPKGKMSAYAFFVQTCREEHKKKNPEVPVNFAEFSKKCS...,153,WildType,...,,,,153.0,200,EKDVADYKSKGKFDGAKGPAKVARKKVEEEDEEEEEEEEEEEEEEDE,47,,EKDVADYKSKGKFDGAKGPAKVARKKVEEEDEEEEEEEEEEEEEEDE,47
1,ENSP00000256852,,,ENST00000256852,ENSP00000256852,ENSG00000134438,RAX,MHLPGCAPAMADGSFSLAGHLLRSPGGSTSRLHSIEAILGFTKDDG...,187,WildType,...,,,,222.0,346,PLGAGPGSGGGPAGGALPLESWLGPPLPGGGATALQSLPGFGPPAQ...,124,,RAKWRRQEKLEVSSMKLQDSPLLSFSRSPPSATLSPLGAGPGSGGG...,159
2,ENSP00000342313,,,ENST00000342066,ENSP00000342313,ENSG00000187634,SAMD11,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,631,WildType,...,,,,631.0,681,LGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPS...,50,,LGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPS...,50
3,ENSP00000317992,,,ENST00000327044,ENSP00000317992,ENSG00000188976,NOC2L,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,697,WildType,...,,,,747.0,749,DD,2,,DDEEDEEEGEEDSSNSEDGDPDAEAGLAPGELQQLAQGPEDELEDL...,52
4,ENSP00000368717,,,ENST00000379407,ENSP00000368717,ENSG00000187583,PLEKHN1,MGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARS...,490,WildType,...,,,,491.0,576,QSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRSP...,85,,LQSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRS...,86


In [102]:
df = df.drop('seqType', 1)

In [103]:
df.to_csv('LengthTable_Full.csv', index=False)