/
correlations.py
138 lines (120 loc) · 5.58 KB
/
correlations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import numpy as np
import scipy
import scipy.stats
import random
from tqdm import tqdm
# Most of the code in this module assumes that the original data comes
# in a Pandas DataFrame.
random.seed(8)
################################################################################
## COMPUTE PEARSON CORRELATIONS
################################################################################
# Compute correlations between samples and save them in a list
def correlation_upper_triangle(sample, first_feature):
corr_values = []
for i in range(len(sample)):
x = sample.iloc[i][first_feature:]
for j in range(i+1, len(sample)):
y = sample.iloc[j][first_feature:]
corr, p_val = scipy.stats.pearsonr(x, y)
corr_values.append(corr)
return corr_values
# Compute the full correlation matrix between all samples
# Complexity (N^2)/2
def correlation_matrix(sample, first_feature):
return np.corrcoef(sample)
################################################################################
## SAMPLE SUB-MATRICES FROM A CORRELATION MATRIX
################################################################################
# Get the upper triangle of a symmetric submatrix
# Takes as input the row indices
# Returns the list of values in the upper triangle
def sample_upper_triangle(sample_rows, cmatrix):
sub_matrix = cmatrix[sample_rows, sample_rows]
values = []
for i in range(len(sample_rows)):
for j in range(i+1, len(sample_rows)):
row = sample_rows[i]
col = sample_rows[j]
values.append(cmatrix[row, col])
return values
# Extract a non-symmetric submatrix from the correlation matrix
# Assumes sample1 and sample 2 are disjoint sets of replicates
def sample_rectangular_matrix(sample1, sample2, cmatrix):
sub_matrix = cmatrix[sample1, :]
sub_matrix = sub_matrix[:, sample2]
return sub_matrix
# Compute the mean row of a self-correlation matrix, ignoring the
# diagonal values (assuming these are the maximum values)
def correlation_median_row(cmatrix):
X = np.sort(cmatrix, axis=0)
median_idx = int( (X.shape[0]-1)/2 )
return X[median_idx,:]
# Extract asymmetric matrix of alleles vs controls
# This heavily depends on metadata. For each allele, we want control that are in the same plate
def allele_to_control_matrix(allele_index, metadata, plate_field, ctl_mask, control_samples, corr_matrix):
control_corr_matrix = []
for i in allele_index:
a = metadata.loc[i]
#WARN: Here a complex indexing is applied using both indices and values
# of ctl_mask. More info at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html
# "An alignable boolean Series. The index of the key will be aligned
# before masking.""
ctl_idx = metadata[metadata[plate_field] == a[plate_field]].loc[ctl_mask].index
if len(ctl_idx) > control_samples:
ctl_idx = list(ctl_idx)
random.shuffle(ctl_idx)
ctl_idx = ctl_idx[0:control_samples]
elif len(ctl_idx) < control_samples:
ctl_idx = list(metadata[ctl_mask].index)
random.shuffle(ctl_idx)
ctl_idx = ctl_idx[0:control_samples]
m = sample_rectangular_matrix([i], ctl_idx, corr_matrix)
#return
control_corr_matrix.append(m)
return np.concatenate(control_corr_matrix)
################################################################################
## COMPUTE STATISTICS ON CORRELATION DISTRIBUTIONS
################################################################################
# Creates groups of size "sample_size" from some rows in the
# correlation matrix "cmatrix". The process is repeated several times
# to create a large enough background distribution
# TODO: choose random groups of samples that do not share the same treatment
def null_distribution(sample_rows, cmatrix, sample_size, repeats=30):
distribution = []
for i in tqdm(range(repeats)):
index = np.asarray([ k for k in sample_rows])
np.random.shuffle(index)
for j in range(0, len(index), sample_size):
median_corr = median_correlation(index[j:j+sample_size], cmatrix)
distribution.append(median_corr)
return distribution
# For one particular treatment compute the median of replicates,
# and also create a null distribution for comparison.
# Return mean of replicate correlations and 95th percentile of the null
def median_correlation(sample_index, cmatrix):
corr_values = sample_upper_triangle(sample_index, cmatrix)
median = np.median(corr_values)
return median
# Compute replicate correlation for all treatments, and report
# the fraction of treatments whose replicate correlation is greater than
# the 95th percentile of the null.
def fraction_strong_test(treated_samples, treatments, corr_matrix, null, treatment_field, sample_size):
null.sort()
p95 = null[ int( 0.95*len(null) ) ]
results = {}
for t in tqdm(treatments):
sample = treated_samples[treated_samples[treatment_field] == t]
replicates = len(sample)
if replicates >= sample_size:
sample_data = sample.sample(sample_size)
median = median_correlation(sample_data.index, corr_matrix)
results[t] = median
#replicates -= sample_size
# Evaluate results:
fraction = np.sum([m > p95 for m in results.values()])/len(results)
print("Treatments tested:", len(results))
print("At 95th percentile of the null")
print("Fraction strong: {:5.2f}%".format(fraction*100))
print("Null threshold: {:6.4f}".format(p95))
return results