/
idp_in_bacteria_functions.py
469 lines (362 loc) · 14.4 KB
/
idp_in_bacteria_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
# functions for the IDP in bacteria project
### dependencies ###
from IDP_analysis.packages_import import *
### Read fasta ###
def read_fasta(fasta_file: str) -> pd.DataFrame:
"""Processes raw .fasta files
Opens a .fasta file, parses the sequences and their IDs into a
dataframe and returns the dataframe.
Parameters
----------
fasta_file : str
the raw .fasta file directory.
Returns
-------
pd.DataFrame
a dataframe with ID, Sequence and Length columns.
"""
# open the file
handle = open(fasta_file, 'r')
seq_list = list(SeqIO.parse(handle, 'fasta'))
handle.close()
# parse data into lists
ids = [seq_record.id.split('|')[1] for seq_record in seq_list]
seqs = [str(seq_record.seq) for seq_record in seq_list]
lens = [len(seq) for seq in seqs]
# save data into a dataframe
df = pd.DataFrame({'ID':ids, 'Sequence':seqs, 'Length':lens})
return df
### Find longest binary IDR ###
def find_longest_binary_IDR(binary_disorder_list: list) -> int:
"""Finds the longest stretch of binary IDR for a
given protein
Parameters
----------
binary_disorder_list : list
a list of binary values indicating whether the
residue is disordered (1) or not (0)
Returns
-------
int
a number of disordered residues in the longest
IDR
"""
# set max IDR length as zero
max_IDR_len = 0
# set current IDR length as zero
current_IDR_len = 0
# iterate though each residue
for disorder in binary_disorder_list:
if disorder == 1:
# increase current IDR length if disordered
current_IDR_len += 1
else:
# not part of IDR, save current length if max or continue
if current_IDR_len > max_IDR_len:
max_IDR_len = current_IDR_len
current_IDR_len = 0
# check last time
if current_IDR_len > max_IDR_len:
max_IDR_len = current_IDR_len
return max_IDR_len
### Align sequences ###
def align_seqs(seqs: list, ids: list, gap_ins_pen: float=-1.0,
gap_ext_pen: float=-0.5):
"""Multiple sequence alignment using ClustalW
Parameters
----------
seqs : list
A list of sequences to be aligned (as str)
ids : list
A list of sequence IDs (as str)
gap_ins_pen : float
Gap insertion penalty for the alignment.
-1.0 by default
gap_ext_pen : float
Gap extension penalty for the alignment.
-0.5 by default
Returns
-------
Alignment object
"""
# create seq records
records = [SeqRecord(Seq(seq), id) for seq, id in zip(seqs, ids)]
# save records as fasta
with open('unaligned.fas', 'w') as handle:
SeqIO.write(records, handle, 'fasta')
# align using ClustalW
cline = ClustalwCommandline(infile='unaligned.fas', outfile='aligned.fas',
gapext=gap_ext_pen, gapopen=gap_ins_pen,
type='PROTEIN', outorder='INPUT',
output='FASTA')
cline()
# read the output file
align = AlignIO.read('aligned.fas', 'fasta')
return align
### Align disorder ###
def align_disorder(disorder_values: list, seqs: list, ids: list,
gap_ins_pen: float=-1.0, gap_ext_pen: float=-0.5) -> tuple:
"""Aligns disorder values based on AA sequences
Parameters:
-----------
disorder_values : list
A list of disorder values to be plotted
seqs : list
A list of sequences to be aligned
ids : list
A list of sequence IDs (as str)
gap_ins_pen : float
Gap insertion penalty for the alignment.
-1.0 by default
gap_ext_pen : float
Gap extension penalty for the alignment.
-0.5 by default
Returns:
--------
(sequence_alignment, disorder_alignment)
"""
aligned_disorders = []
# align sequences
alignment = align_seqs(seqs, ids, gap_ins_pen=gap_ins_pen, gap_ext_pen=gap_ext_pen)
# insert gaps in disorder values
for a, d in zip(alignment, disorder_values):
seq_disorder_full = [*d.copy()]
for i, residue in enumerate(a.seq):
if residue == '-':
seq_disorder_full.insert(i, np.nan)
seq_disorder_full = pd.Series(seq_disorder_full).interpolate().tolist()
aligned_disorders.append(seq_disorder_full)
return (alignment, aligned_disorders)
### Get charge / hydrophobicity ###
def get_charge_hydrophobicity(seq: str) -> tuple:
"""Calculates mean absolute charge and hydrophobicity
Parameters
----------
seq : str
AA sequence to calculate values for
Returns:
--------
tuple
tuple with two float values:
1) mean absolute charge
2) mean hydrophobicity (scaled 0-1)
"""
# calculate mean absolute charge
prot_analysis = ProteinAnalysis(seq)
mean_abs_charge = abs(prot_analysis.charge_at_pH(7)) / prot_analysis.length
# get Kyle-Doolitle scale
KyteDoolitle = gravy_scales.get('KyteDoolitle', -1)
KyteDoolitle['X'] = 0
KyteDoolitle['U'] = 0
# normalize scale to 0-1 range
scaler = MinMaxScaler()
KD_norm = {k:v[0] for k, v in zip(KyteDoolitle.keys(), scaler.fit_transform(np.array([*KyteDoolitle.values()]).reshape(-1, 1)))}
# calculate mean hydrophobicity
hydrophobicity_sum = sum(KD_norm[aa] for aa in seq)
mean_hydrophobicity = hydrophobicity_sum / prot_analysis.length
return mean_abs_charge, mean_hydrophobicity
### Plot aligned cluster ###
def plot_aligned_cluster(df:pd.DataFrame, cluster:str, mav:int=50, errorbar=('ci', 95), scatter:bool=False) -> None:
"""Create a plot of disorder scores for a given cluster using aligned disorder values.
The function generates a plot showing the predicted disorder scores along the sequence positions for the given cluster.
Each line in the plot represents a species within the cluster, and the plot also includes error bars to indicate the
standard deviation of disorder scores. The horizontal dashed line at 0.5 serves as a reference point to distinguish
ordered regions (below 0.5) from disordered regions (above 0.5).
----------
Parameters:
df (pd.DataFrame) : Dataframe that contains aligned disorder data
cluster (str) : The identifier of the cluster for which the disorder scores are to be plotted.
mav (int, optional) : The moving average window size used to smooth the disorder scores. The default value is 50.
errorbar (tuple or str, optional) : Type of error bar to be used, inherited from sns.lineplot(). Default is ('ci', 95).
scatter (bool, optional) : Whether to include a scatterplot for each species. Default is False.
Returns:
None
"""
# extract disorder data on the specified cluster
data = df[df['cluster']==cluster]
aligned_disorders = data['disorder_aligned']
# calculate the adjusted mav based on data length
mav = int(np.round(mav * len(aligned_disorders.iloc[0]) / 500)) + 1
# create figure and axis
fig, ax = plt.subplots(figsize=(15, 8))
# set limits and labels
ax.set_ylim(0, 1)
ax.set_xlabel('Position in sequence', fontsize=18)
ax.set_ylabel('Predicted disorder', fontsize=18)
# apply mav
ys = [np.convolve(ad, np.ones(mav)/mav) for ad in aligned_disorders]
ys = pd.Series([y[int(mav/2):len(y)-int(mav/2)+1] for y in ys], index=data.index)
data['ys'] = ys
# iterate through each group and plot a line with error
for group in set(data.group):
group_data = data[data['group']==group]
color = group_data['color'].iloc[0]
group_data = pd.DataFrame({group_data['species_tag'].iloc[i]:group_data['ys'].iloc[i] for i in range(len(group_data))})
group_data['position'] = group_data.index + 1
group_data = group_data.melt(id_vars=['position'], value_name='disorder', var_name='species_tag')
sns.lineplot(data=group_data, x='position', y='disorder', errorbar=errorbar, c=color, label=group)
# scatter for individual species if needed
if scatter:
plt.scatter(x=group_data.position, y=group_data.disorder, marker='.', color=color, alpha=0.1)
# add 0.5 threshold line and legend
ax.axhline(0.5, ls='--', c='black')
ax.legend()
fig.show()
###################################
### OLD FUNCTIONS - no longer used
# ### Read pickle file ###
# def read_pickle_file(file_name: str):
# """A simple function to read pickle files
# Parameters
# ----------
# file_name : srt
# location of the pickle file
# Returns
# -------
# any format, but best with pd.DataFrames or lists
# """
# print(file_name + ' loading...')
# with open(file_name, 'rb') as f:
# result = pickle.load(f)
# print(file_name + ' loaded!')
# return result
# ### Binary disorder ###
# def binary_disorder(disorder_list: list, threshold: float=0.5) -> list:
# """Returns a list of binary values for disordered
# residues given a threshold
# Parameters
# ----------
# disorder_list : list
# a list of disorder values
# threshold : float
# a binary threshold, 0.5 by default. Disorder score
# above the threshold means that the residue is
# considered to be disordered
# Returns
# -------
# list
# a list of binary values indicating whether the
# residue is disordered (1) or not (0)
# """
# return [1 if dis > threshold else 0 for dis in disorder_list]
# ### Fix -infinitives ###
# def fix_neg_inf(df: pd.DataFrame, replacement=np.nan) -> pd.DataFrame:
# """Replaces -inf values in a given dataframe
# Parameters
# ----------
# df : pd.DataFrame
# The dataframe with -inf to be replaced
# replacement
# The value to replace -inf with. Default
# is np.nan
# Returns:
# --------
# pd.DataFrame
# """
# df_new = df.copy()
# df_new[df.astype(float) < 0] = replacement
# return df_new
# ### Drop unavailable rows ###
# def drop_unavailable(proteome: pd.DataFrame, reset_index: bool=True) -> pd.DataFrame:
# """Drop rows if not all models are available
# """
# mask = [proteome.disorder.iloc[n].shape[1]==4 for n in range(len(proteome))]
# proteome = proteome[mask]
# if reset_index == True:
# proteome.reset_index(inplace=True, drop=True)
# return proteome
# ### Get additional columns with IDP data ###
# def get_additional_columns(proteome:pd.DataFrame) -> None:
# """Add Additional Columns to Proteome DataFrame
# This function calculates additional columns for the proteome DataFrame including:
# disorder_combined: mean of all disorder models
# disorder_binary: binary disorder values based on a threshold of 0.5
# disorder_mean: mean disorder score for each protein
# disorder_binary_mean: mean binary disorder value (fraction of disorder) for each protein
# longest_IDR: length of the longest binary intrinsically disordered region for each protein
# -----------
# Parameters:
# proteome (pd.DataFrame): The dataframe containing protein information.
# Returns:
# None
# """
# # get means of all models
# proteome['disorder_combined'] = [np.mean(proteome.disorder.iloc[i], axis=1) for i in range(len(proteome))]
# # get binary disorder values
# proteome['disorder_binary'] = proteome.disorder_combined.apply(binary_disorder, threshold=0.5)
# # get mean disorder values for each protein (mean disorder score)
# proteome['disorder_mean'] = proteome.disorder_combined.apply(np.mean)
# # get mean binary disorder values for each protein (fraction of disorder)
# proteome['disorder_binary_mean'] = proteome.disorder_binary.apply(np.mean)
# # find longest binary IDR for each protein
# proteome['longest_IDR'] = proteome.disorder_binary.apply(find_longest_binary_IDR)
# return
# ### Get and process pairwise OMA file ###
# def get_OMA(species_tag:str, common_species_tag:str='THET8', oma_type:str='any') -> pd.DataFrame:
# """Obtains pairwise OMA and returns a dataframe
# -----------
# Parameters:
# species_tag (str): Tag for species to use in OMA.
# common_species_tag (str): Tag for the second species to use in OMA. This should be the same
# for all OMA files. 'THET8' by default.
# oma_type (str): Type of OMA relationships to keep. 'any' by default. Other possibilities are
# '1:1', 'n:1', 'm:1' and 'n:m'
# Returns:
# pd.DataFrame
# """
# # get file
# path = f'https://omabrowser.org/cgi-bin/gateway.pl?f=PairwiseOrthologs&p1={common_species_tag}&p2={species_tag}&p3=UniProt'
# try_retrieve_url(path, '_')
# OMA = pd.read_csv('_', sep='\t')
# if len(OMA) == 0:
# path = f'https://omabrowser.org/cgi-bin/gateway.pl?f=PairwiseOrthologs&p1={species_tag}&p2={common_species_tag}&p3=UniProt'
# try_retrieve_url(path, '_')
# OMA = pd.read_csv('_', sep='\t')
# # process file
# OMA.columns = [common_species_tag, species_tag, 'oma_type', 'OMA_group']
# if oma_type != 'any':
# OMA = OMA[OMA['oma_type'] == oma_type]
# OMA.drop(['OMA_group', 'oma_type'], axis='columns', inplace=True)
# OMA[common_species_tag] = OMA[common_species_tag].str.replace(common_species_tag, '')
# OMA[common_species_tag] = OMA[common_species_tag].str.replace('_', '')
# # OMA = OMA[OMA[common_species_tag].str.len()==6]
# OMA[species_tag] = OMA[species_tag].str.replace(species_tag, '')
# OMA[species_tag] = OMA[species_tag].str.replace('_', '')
# OMA.reset_index(inplace=True, drop=True)
# return OMA
# ### Read fIDPnn disorder files ###
# def read_fIDPnn_disorder(path, insert_into_df=False, original_df=None):
# """Processes sequence data from a file and returns DataFrames or updates an existing DataFrame
# ----------
# Parameters:
# path (str) : The path to the input file containing sequence data.
# insert_into_df (bool, optional) : Whether to insert processed data into an existing DataFrame. Default is False.
# original_df (pd.DataFrame, optionsl) : Original DataFrame to insert into if insert_into_df=True. Default is None.
# Returns:
# None if insert_into_df is True, otherwise a dictionary of DataFrames.
# """
# dfs = {}
# lists = {}
# dflines = []
# with open(path, 'rb') as f:
# for i, line in enumerate(tqdm(f.readlines())):
# if line.decode('utf-8')[:8] == 'Sequence':
# continue
# elif line.decode('utf-8')[0] == '>':
# if dflines != []:
# columns = dflines[0].lower().replace(' ', '_').strip().split(',')
# df = pd.DataFrame([l.strip().split(',') for l in dflines[1:]], columns=columns)
# df.iloc[:, 2:] = df.iloc[:, 2:].astype('float')
# df[df.columns[0]] = df.iloc[:, 0].astype('float')
# dfs[name] = df
# lists[name] = df.predicted_score_for_disorder.tolist()
# name = line.decode('utf-8').split('|')[1]
# dflines = []
# else:
# dflines.append(line.decode('utf-8'))
# if insert_into_df:
# m = original_df.disorder_fIDPnn.isna()
# original_df.loc[m, 'disorder_fIDPnn'] = original_df.ID.map(lists)
# return
# else:
# return dfs