In [None]:
#importing required modules and packages 
import pandas as pd
import pandasql
from pandasql import sqldf
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
import seaborn as sns
import matplotlib.cbook as cbook

from matplotlib import rcParams

from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
!pip install fastcluster

In [None]:
#import multipletests
from statsmodels.stats.multitest import multipletests

In [None]:
#importing correlation functions
from scipy import stats
from scipy.stats import kendalltau, pearsonr, spearmanr

In [None]:
#Column labels of every protease, listed in the order used for display.
#The cathepsins (CTS*) come first in alphabetical order; AEP, a different type of protease, is placed last.
CTS_type = [
    'CTSA_pH 4.5',
    'CTSB_pH 4.5_pH 5.5',
    'CTSD_pH 3.4_pH 4.5',
    'CTSE_pH 3.4_pH 4.5',
    'CTSF_pH 4.5',
    'CTSK_pH 4.5',
    'CTSL_pH 4.5_pH 5.5',
    'CTSO_pH 5.5',
    'CTSS_pH 4.5_pH 5.5',
    'CTSV_pH 3.4_pH 4.5',
    'CTSX_pH 3.4_pH 4.5',
    'AEP_pH 4.5_pH 5.5',
]

In [None]:
#ASyn

In [None]:
#Read the ASyn sheet of the workbook and index the rows by the P1 site position
df_asyn = (
    pd.read_excel('Supplementary Data 1.xlsx', sheet_name = 'ASyn')
    .set_index('position of P1 site in protein')
)

In [None]:
#Viewing ASyn data (a bare expression on the last line of a cell is rendered as the cell output)
df_asyn

In [None]:
#Put the protease columns in the order given by CTS_type
#(reindex fills any label that is missing from the sheet with an all-NaN column)
df_asyn_rearr = df_asyn.reindex(CTS_type, axis='columns')
df_asyn_rearr.head(2)

In [None]:
#Renaming the columns to short protease names.
#rename() returns a new dataframe that is bound back to the same name, so no inplace mutation is needed
#and re-running the cell gives the same result.
df_asyn_rearr = df_asyn_rearr.rename(columns = {'CTSA_pH 4.5':'A',
            'CTSB_pH 4.5_pH 5.5':'B',
            'CTSD_pH 3.4_pH 4.5':'D',
            'CTSE_pH 3.4_pH 4.5':'E',
            'CTSF_pH 4.5':'F',
            'CTSK_pH 4.5':'K',
            'CTSL_pH 4.5_pH 5.5':'L',
            'CTSO_pH 5.5':'O',
            'CTSS_pH 4.5_pH 5.5':'S',
            'CTSV_pH 3.4_pH 4.5':'V',
            'CTSX_pH 3.4_pH 4.5':'X',
            'AEP_pH 4.5_pH 5.5':'AEP'})

#Excluding columns A, F and O from ASyn dataframe as all the values within these columns correspond to zero (no cleavage)
df_asyn_rearr_wo_AFO = df_asyn_rearr.drop(columns=['A','F','O'])

#Preview the result; a bare expression is only displayed when it is the last statement of the cell
df_asyn_rearr_wo_AFO.head(5)


In [None]:
#Heatmap with correlation coefficients and p values--dpi 100

sns.set_theme(style="white")
cmap = sns.light_palette("mediumaquamarine", as_cmap=True)

#Pairwise correlation of all columns in ASyn dataframe using Spearman correlation method (rounded for display)
asyn_corr = df_asyn_rearr_wo_AFO.corr(method='spearman').round(2)

#P values of those correlations.
#They are computed from the cleavage data itself, NOT from asyn_corr: calling .corr() on the coefficient
#matrix would test how similar the correlation profiles of two proteases are, which is a different statistic
#from the significance of the coefficients annotated in the heatmap.
#With a callable method pandas puts 1.0 on the diagonal, so the identity matrix is subtracted to zero it.
pval_asyn = df_asyn_rearr_wo_AFO.corr(method=lambda x, y: spearmanr(x, y)[1]) - np.eye(*asyn_corr.shape)

#Significance stars: one '*' per threshold met, i.e. *** for p<=0.001, ** for p<=0.01, * for p<=0.05
#(the zeroed diagonal therefore always carries ***, and NaN p values get no star).
#Series.map is applied column by column because DataFrame.applymap is deprecated in recent pandas versions.
p = pval_asyn.apply(lambda col: col.map(lambda x: ''.join(['*' for t in [0.001,0.01,0.05] if x<=t])))

#Correlation coefficients + p values, as the text annotation of every cell
asyn_corr_p = asyn_corr.astype(str) + p
asyn_corr_p_map = asyn_corr_p.to_numpy()


#Plot the heatmap
#Keep only the lower triangle (diagonal included); the upper triangle becomes NaN and is left blank by seaborn
df_asyn_corr_p = asyn_corr.where(np.tril(np.ones(asyn_corr.shape)).astype(np.bool_))

f, ax = plt.subplots(figsize=(12, 10), dpi=100)

#Draw on the axes created above explicitly instead of relying on the implicit current axes
g_asyn = sns.heatmap(df_asyn_corr_p, annot=asyn_corr_p_map, cmap=cmap, vmax=0.5, vmin=-0.5, center=0,
                     square=False, linewidths=.1,
                     cbar_kws={"shrink": 0.5, "pad": -0.1}, annot_kws={"size":15},
                     fmt='', ax=ax)

plt.setp(g_asyn.yaxis.get_majorticklabels(), rotation=0, fontsize=20)
plt.setp(g_asyn.xaxis.get_majorticklabels(), rotation=0, fontsize=20)

plt.show()

In [None]:
#TDP43

In [None]:
#Read the TDP43 sheet of the workbook and index the rows by the P1 site position
df_tdp43 = (
    pd.read_excel('Supplementary Data 1.xlsx', sheet_name = 'TDP43')
    .set_index('position of P1 site in protein')
)

In [None]:
#Viewing TDP43 data (a bare expression on the last line of a cell is rendered as the cell output)
df_tdp43

In [None]:
#Put the protease columns in the order given by CTS_type
#(reindex fills any label that is missing from the sheet with an all-NaN column)
df_tdp43_rearr = df_tdp43.reindex(CTS_type, axis='columns')
df_tdp43_rearr.head(2)

In [None]:
#Replace every long column label by its short protease name, modifying df_tdp43_rearr in place
df_tdp43_rearr.rename(
    {
        'CTSA_pH 4.5': 'A',
        'CTSB_pH 4.5_pH 5.5': 'B',
        'CTSD_pH 3.4_pH 4.5': 'D',
        'CTSE_pH 3.4_pH 4.5': 'E',
        'CTSF_pH 4.5': 'F',
        'CTSK_pH 4.5': 'K',
        'CTSL_pH 4.5_pH 5.5': 'L',
        'CTSO_pH 5.5': 'O',
        'CTSS_pH 4.5_pH 5.5': 'S',
        'CTSV_pH 3.4_pH 4.5': 'V',
        'CTSX_pH 3.4_pH 4.5': 'X',
        'AEP_pH 4.5_pH 5.5': 'AEP',
    },
    axis='columns',
    inplace=True,
)
df_tdp43_rearr.head(5)

In [None]:
#Heatmap with correlation coefficients and p values--dpi 100

sns.set_theme(style="white")
cmap = sns.light_palette("salmon", as_cmap=True)

#Pairwise correlation of all columns in TDP43 dataframe using Spearman correlation method (rounded for display)
tdp43_corr = df_tdp43_rearr.corr(method='spearman').round(2)

#P values of those correlations.
#They are computed from the cleavage data itself, NOT from tdp43_corr: calling .corr() on the coefficient
#matrix would test how similar the correlation profiles of two proteases are, which is a different statistic
#from the significance of the coefficients annotated in the heatmap.
#With a callable method pandas puts 1.0 on the diagonal, so the identity matrix is subtracted to zero it.
pval_tdp43 = df_tdp43_rearr.corr(method=lambda x, y: spearmanr(x, y)[1]) - np.eye(*tdp43_corr.shape)

#Significance stars: one '*' per threshold met, i.e. *** for p<=0.001, ** for p<=0.01, * for p<=0.05
#(the zeroed diagonal therefore always carries ***, and NaN p values get no star).
#Series.map is applied column by column because DataFrame.applymap is deprecated in recent pandas versions.
p = pval_tdp43.apply(lambda col: col.map(lambda x: ''.join(['*' for t in [0.001,0.01,0.05] if x<=t])))

#Correlation coefficients + p values, as the text annotation of every cell
tdp43_corr_p = tdp43_corr.astype(str) + p
tdp43_corr_p_map = tdp43_corr_p.to_numpy()


#Plot the heatmap
#Keep only the lower triangle (diagonal included); the upper triangle becomes NaN and is left blank by seaborn
df_tdp43_corr_p = tdp43_corr.where(np.tril(np.ones(tdp43_corr.shape)).astype(np.bool_))

f, ax = plt.subplots(figsize=(12, 10), dpi=100)

#Draw on the axes created above explicitly instead of relying on the implicit current axes
g_tdp43 = sns.heatmap(df_tdp43_corr_p, annot=tdp43_corr_p_map, cmap=cmap, vmax=0.5, vmin=-0.5, center=0,
                      square=False, linewidths=.1,
                      cbar_kws={"shrink": 0.5, "pad": -0.1}, annot_kws={"size":15},
                      fmt='', ax=ax)

plt.setp(g_tdp43.yaxis.get_majorticklabels(), rotation=0, fontsize=20)
plt.setp(g_tdp43.xaxis.get_majorticklabels(), rotation=0, fontsize=20)

plt.show()

In [None]:
#Tau

In [None]:
#Read the Tau sheet of the workbook and index the rows by the P1 site position
df_tau = (
    pd.read_excel('Supplementary Data 1.xlsx', sheet_name = 'Tau')
    .set_index('position of P1 site in protein')
)

In [None]:
#Viewing tau data (a bare expression on the last line of a cell is rendered as the cell output)
df_tau

In [None]:
#Put the protease columns in the order given by CTS_type
#(reindex fills any label that is missing from the sheet with an all-NaN column)
df_tau_rearr = df_tau.reindex(CTS_type, axis='columns')
df_tau_rearr.head(2)

In [None]:
#Replace every long column label by its short protease name, modifying df_tau_rearr in place
df_tau_rearr.rename(
    {
        'CTSA_pH 4.5': 'A',
        'CTSB_pH 4.5_pH 5.5': 'B',
        'CTSD_pH 3.4_pH 4.5': 'D',
        'CTSE_pH 3.4_pH 4.5': 'E',
        'CTSF_pH 4.5': 'F',
        'CTSK_pH 4.5': 'K',
        'CTSL_pH 4.5_pH 5.5': 'L',
        'CTSO_pH 5.5': 'O',
        'CTSS_pH 4.5_pH 5.5': 'S',
        'CTSV_pH 3.4_pH 4.5': 'V',
        'CTSX_pH 3.4_pH 4.5': 'X',
        'AEP_pH 4.5_pH 5.5': 'AEP',
    },
    axis='columns',
    inplace=True,
)
df_tau_rearr.head(5)

In [None]:
#Heatmap with correlation coefficients and p values--dpi 100

sns.set_theme(style="white")
cmap = sns.light_palette("cornflowerblue", as_cmap=True)

#Pairwise correlation of all columns in tau dataframe using Spearman correlation method (rounded for display)
tau_corr = df_tau_rearr.corr(method='spearman').round(2)

#P values of those correlations.
#They are computed from the cleavage data itself, NOT from tau_corr: calling .corr() on the coefficient
#matrix would test how similar the correlation profiles of two proteases are, which is a different statistic
#from the significance of the coefficients annotated in the heatmap.
#With a callable method pandas puts 1.0 on the diagonal, so the identity matrix is subtracted to zero it.
pval_tau = df_tau_rearr.corr(method=lambda x, y: spearmanr(x, y)[1]) - np.eye(*tau_corr.shape)

#Significance stars: one '*' per threshold met, i.e. *** for p<=0.001, ** for p<=0.01, * for p<=0.05
#(the zeroed diagonal therefore always carries ***, and NaN p values get no star).
#Series.map is applied column by column because DataFrame.applymap is deprecated in recent pandas versions.
p = pval_tau.apply(lambda col: col.map(lambda x: ''.join(['*' for t in [0.001,0.01,0.05] if x<=t])))

#Correlation coefficients + p values, as the text annotation of every cell
tau_corr_p = tau_corr.astype(str) + p
tau_corr_p_map = tau_corr_p.to_numpy()


#Plot the heatmap
#Keep only the lower triangle (diagonal included); the upper triangle becomes NaN and is left blank by seaborn
df_tau_corr_p = tau_corr.where(np.tril(np.ones(tau_corr.shape)).astype(np.bool_))

f, ax = plt.subplots(figsize=(12, 10), dpi=100)

#Draw on the axes created above explicitly instead of relying on the implicit current axes
g_tau = sns.heatmap(df_tau_corr_p, annot=tau_corr_p_map, cmap=cmap, vmax=0.5, vmin=-0.5, center=0,
                    square=False, linewidths=.1,
                    cbar_kws={"shrink": 0.5, "pad": -0.1}, annot_kws={"size":15},
                    fmt='', ax=ax)

plt.setp(g_tau.yaxis.get_majorticklabels(), rotation=0, fontsize=20)
plt.setp(g_tau.xaxis.get_majorticklabels(), rotation=0, fontsize=20)

plt.show()