In [None]:
# importing relevant modules and packages 
import pandas as pd
import pandasql
from pandasql import sqldf
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
import seaborn as sns
import matplotlib.cbook as cbook

from matplotlib import rcParams

from pandasql import sqldf
def pysqldf(q):
    """Run the query string ``q`` through ``sqldf`` using this notebook's global namespace."""
    return sqldf(q, globals())

In [None]:
# importing multipletests (multiple-testing correction)
from statsmodels.stats.multitest import multipletests

In [None]:
#importing correlation functions
from scipy import stats
from scipy.stats import kendalltau, pearsonr, spearmanr

In [None]:
# Raw-data column labels, listed in the order in which the proteases should
# appear downstream. Each string is used verbatim as a column label.
CTS_type = [
    'CTSA_pH 4.5',
    'CTSB_pH 4.5_pH 5.5',
    'CTSD_pH 3.4_pH 4.5',
    'CTSE_pH 3.4_pH 4.5',
    'CTSF_pH 4.5',
    'CTSK_pH 4.5',
    'CTSL_pH 4.5_pH 5.5',
    'CTSO_pH 5.5',
    'CTSS_pH 4.5_pH 5.5',
    'CTSV_pH 3.4_pH 4.5',
    'CTSX_pH 3.4_pH 4.5',
    'AEP_pH 4.5_pH 5.5',
]

In [None]:
#ASyn

In [None]:
# Load the 'ASyn' sheet and index its rows by the P1-site position column
df_asyn = (
    pd.read_excel('Supplementary Data 1.xlsx', sheet_name='ASyn')
    .set_index('position of P1 site in protein')
)

In [None]:
# Display the ASyn frame as loaded (rows indexed by 'position of P1 site in protein')
df_asyn

In [None]:
# Keep only the columns named in CTS_type, in that order.
# NOTE: reindex does not raise on a label missing from the sheet; it adds an
# all-NaN column for it instead.
df_asyn_rearr = df_asyn.reindex(CTS_type, axis='columns')
df_asyn_rearr.head(2)

In [None]:
# Replace each raw column label with its short protease code
# (e.g. 'CTSA_pH 4.5' -> 'A'); the result is bound back to the same name.
df_asyn_rearr = df_asyn_rearr.rename(
    columns={
        'CTSA_pH 4.5': 'A',
        'CTSB_pH 4.5_pH 5.5': 'B',
        'CTSD_pH 3.4_pH 4.5': 'D',
        'CTSE_pH 3.4_pH 4.5': 'E',
        'CTSF_pH 4.5': 'F',
        'CTSK_pH 4.5': 'K',
        'CTSL_pH 4.5_pH 5.5': 'L',
        'CTSO_pH 5.5': 'O',
        'CTSS_pH 4.5_pH 5.5': 'S',
        'CTSV_pH 3.4_pH 4.5': 'V',
        'CTSX_pH 3.4_pH 4.5': 'X',
        'AEP_pH 4.5_pH 5.5': 'AEP',
    }
)
df_asyn_rearr.head(5)



In [None]:
# ASyn -- per protease column, count the rows whose value equals 1;
# the last line shows the total of those counts over all proteases.
asyn_individual_cathepsin = df_asyn_rearr.eq(1).astype(int).sum(axis=0)
asyn_individual_cathepsin.sum()

In [None]:
# ASyn -- express each protease's count as a percentage of the summed counts,
# then store it as a one-column frame ('ASyn') whose index is named 'Proteases'.
ASyn_individual_cathepsin_total_sites = (
    asyn_individual_cathepsin / asyn_individual_cathepsin.sum() * 100
)
df1 = (
    ASyn_individual_cathepsin_total_sites
    .to_frame(name='ASyn')
    .rename_axis('Proteases')
)
df1

In [None]:
# Load the 'TDP43' sheet and index its rows by the P1-site position column
df_tdp43 = (
    pd.read_excel('Supplementary Data 1.xlsx', sheet_name='TDP43')
    .set_index('position of P1 site in protein')
)

In [None]:
# Display the TDP43 frame as loaded (rows indexed by 'position of P1 site in protein')
df_tdp43

In [None]:
# Keep only the columns named in CTS_type, in that order.
# NOTE: reindex does not raise on a label missing from the sheet; it adds an
# all-NaN column for it instead.
df_tdp43_rearr = df_tdp43.reindex(CTS_type, axis='columns')
df_tdp43_rearr.head(2)

In [None]:
# Replace each raw column label with its short protease code
# (e.g. 'CTSA_pH 4.5' -> 'A'); the result is bound back to the same name.
df_tdp43_rearr = df_tdp43_rearr.rename(
    columns={
        'CTSA_pH 4.5': 'A',
        'CTSB_pH 4.5_pH 5.5': 'B',
        'CTSD_pH 3.4_pH 4.5': 'D',
        'CTSE_pH 3.4_pH 4.5': 'E',
        'CTSF_pH 4.5': 'F',
        'CTSK_pH 4.5': 'K',
        'CTSL_pH 4.5_pH 5.5': 'L',
        'CTSO_pH 5.5': 'O',
        'CTSS_pH 4.5_pH 5.5': 'S',
        'CTSV_pH 3.4_pH 4.5': 'V',
        'CTSX_pH 3.4_pH 4.5': 'X',
        'AEP_pH 4.5_pH 5.5': 'AEP',
    }
)
df_tdp43_rearr.head(5)

In [None]:
# TDP43 -- per protease column, count the rows whose value equals 1;
# the last line shows the total of those counts over all proteases.
tdp43_individual_cathepsin = df_tdp43_rearr.eq(1).astype(int).sum(axis=0)
tdp43_individual_cathepsin.sum()

In [None]:
# TDP43 -- express each protease's count as a percentage of the summed counts,
# then store it as a one-column frame ('TDP43') whose index is named 'Proteases'.
TDP43_individual_cathepsin_total_sites = (
    tdp43_individual_cathepsin / tdp43_individual_cathepsin.sum() * 100
)
df2 = (
    TDP43_individual_cathepsin_total_sites
    .to_frame(name='TDP43')
    .rename_axis('Proteases')
)
df2

In [None]:
# Load the 'Tau' sheet and index its rows by the P1-site position column
df_tau = (
    pd.read_excel('Supplementary Data 1.xlsx', sheet_name='Tau')
    .set_index('position of P1 site in protein')
)

In [None]:
# Display the Tau frame as loaded (rows indexed by 'position of P1 site in protein')
df_tau

In [None]:
# Keep only the columns named in CTS_type, in that order.
# NOTE: reindex does not raise on a label missing from the sheet; it adds an
# all-NaN column for it instead.
df_tau_rearr = df_tau.reindex(CTS_type, axis='columns')
df_tau_rearr.head(2)

In [None]:
# Replace each raw column label with its short protease code
# (e.g. 'CTSA_pH 4.5' -> 'A'); the result is bound back to the same name.
df_tau_rearr = df_tau_rearr.rename(
    columns={
        'CTSA_pH 4.5': 'A',
        'CTSB_pH 4.5_pH 5.5': 'B',
        'CTSD_pH 3.4_pH 4.5': 'D',
        'CTSE_pH 3.4_pH 4.5': 'E',
        'CTSF_pH 4.5': 'F',
        'CTSK_pH 4.5': 'K',
        'CTSL_pH 4.5_pH 5.5': 'L',
        'CTSO_pH 5.5': 'O',
        'CTSS_pH 4.5_pH 5.5': 'S',
        'CTSV_pH 3.4_pH 4.5': 'V',
        'CTSX_pH 3.4_pH 4.5': 'X',
        'AEP_pH 4.5_pH 5.5': 'AEP',
    }
)
df_tau_rearr.head(5)

In [None]:
# Tau -- per protease column, count the rows whose value equals 1;
# the last line shows the total of those counts over all proteases.
tau_individual_cathepsin = df_tau_rearr.eq(1).astype(int).sum(axis=0)
tau_individual_cathepsin.sum()

In [None]:
# Tau -- express each protease's count as a percentage of the summed counts,
# then store it as a one-column frame ('Tau') whose index is named 'Proteases'.
Tau_individual_cathepsin_total_sites = (
    tau_individual_cathepsin / tau_individual_cathepsin.sum() * 100
)
df3 = (
    Tau_individual_cathepsin_total_sites
    .to_frame(name='Tau')
    .rename_axis('Proteases')
)
df3

In [None]:
# Place the three per-protein percentage frames side by side
# (columns 'ASyn', 'TDP43', 'Tau'; rows aligned on the shared protease index).
df = pd.concat((df1, df2, df3), axis='columns')
df

In [None]:
# Grouped bar plot: one group per row of df (protease), one bar per protein column.
# Global matplotlib settings: white figure/axes backgrounds, black edges,
# bold font weight and 100 dpi.
plt.rcParams.update({
    "figure.facecolor": 'white',
    "figure.edgecolor": 'black',
    "axes.edgecolor": 'black',
    "axes.facecolor": 'white',
    "font.weight": "bold",
    "figure.dpi": 100,
})

y = df[['ASyn', 'TDP43', 'Tau']].plot(
    kind='bar',
    stacked=False,
    color=['mediumaquamarine', 'lightsalmon', 'cornflowerblue'],
    figsize=(40, 20),
    fontsize=60,
    width=0.85,
    linewidth=2.0,
)
# Keep the x tick labels horizontal and put the legend in the upper-left corner (loc=2)
y.xaxis.set_tick_params(labelsize=60, rotation=0)
y.legend(loc=2, prop={'size': 40})

In [None]:
# Mean normalisation of the percentage table: every value is divided by the
# mean of its own row (i.e. the protease's mean across the protein columns).
mean = df.mean(axis=1)

# Row means repeated under each column of df. Built from df.columns so the
# labels always match df instead of being hard-coded; kept as a named frame
# in case it is inspected or reused elsewhere in the notebook.
mean_df = pd.DataFrame({col: mean for col in df.columns}, index=df.index)

# axis=0 aligns the row-mean Series with df's index, so each row is divided
# by its own mean. A row whose mean is 0 yields NaN (0/0) rather than an error.
mean_cluster = df.div(mean, axis=0)
mean_cluster

In [None]:
# Hierarchical clustering of the rows of mean_cluster (single linkage);
# columns are left in their given order (col_cluster=False).
# NOTE(review): clustermap has its own figsize argument, so the rc figure size
# set here may not control the size of the clustermap figure -- confirm.
sns.set(rc={'figure.figsize': (120, 360), 'figure.dpi': 100})
sns.set(font_scale=1.5)

g = sns.clustermap(
    mean_cluster,
    method='single',
    col_cluster=False,
    cmap='Greys',
    annot=True,
    linewidth=0.5,
    linecolor='lightgrey',
    cbar=False,
    cbar_pos=(0, 0, 0, 0),
)

# Keep the row labels horizontal
row_tick_labels = g.ax_heatmap.yaxis.get_majorticklabels()
plt.setp(row_tick_labels, rotation=0)

# Drop the colorbar axes and the tick marks on the right side of the heatmap
g.cax.remove()
g.ax_heatmap.tick_params(right=False)
plt.show()