# MOA Evaluation using enrichment analysis

- Method adopted from: https://www.nature.com/articles/s41467-019-10154-8
- Explained here: https://yulab-smu.github.io/clusterProfiler-book/chapter2.html

### Procedure:
1. Compute similarity / correlation / affinity matrix
1. Define MOA matching criteria based on metadata
  - https://github.com/carpenterlab/2018_rohban_natcomm/blob/master/code/moa_evaluations.R#L67
  - https://github.com/carpenterlab/2018_rohban_natcomm/blob/master/code/evaluate.R#L190
1. Find the threshold of top connections (percentile)
1. Run enrichment analysis (one-sided version of Fisher's exact test)
  - https://github.com/carpenterlab/2018_rohban_natcomm/blob/master/code/moa_evaluations.R#L97
  - https://github.com/carpenterlab/2018_rohban_natcomm/blob/master/code/evaluate.R#L205

In [None]:
import pickle
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

sys.path.append("../profiling/")
import metrics
import quality
import profiling

# Load correlation matrix

In [None]:
# Input: CSV file holding the similarity matrix to evaluate
# (the "cos_" prefix presumably means cosine similarity -- confirm).
SIM_MATRIX = "data/cos_efn128combinedplatesout_conv6a_1e-2_e30.csv"
# Output path prefix; ".pkl" and ".csv" are appended when results are saved.
OUT_RESUTS = "data/efn128combinedplatesout_conv6a_1e-2_e30"

In [None]:
# X is used below as the similarity table (its columns are selected with
# Y.Var1); Y is used as the metadata table ("Var1", "Metadata_moa.x").
X, Y = profiling.load_similarity_matrix(SIM_MATRIX)

# MOA matching

In [None]:
# Number of rows (non-null "Var1") for each MOA annotation string.
Y.groupby("Metadata_moa.x")["Var1"].count()

In [None]:
# Build the MOA match matrix: entry (k, j) is True when row j of Y shares at
# least one MOA annotation with row k.
moa_matches = []
# "|" separates multiple MOAs in one annotation. Swap it for a token that is
# not a regex metacharacter; regex=False makes the replacement literal
# regardless of the pandas version's default.
Y["Ref_moa"] = Y["Metadata_moa.x"].str.replace("|", "___", regex=False)
for _, row in Y.iterrows():
    moas = row["Metadata_moa.x"].split("|")
    # Escape every name so characters such as "(", ")", "+" or "." inside an
    # MOA label are matched literally instead of being read as regex syntax.
    alternatives = "|".join(re.escape(m) for m in moas)
    # Anchor on start/end or the separator so only whole labels match, never
    # a substring of a longer label. One alternation replaces the OR of one
    # search per MOA.
    pattern = r"(?:^|___)(?:{})(?:$|___)".format(alternatives)
    moa_matches.append(Y["Ref_moa"].str.contains(pattern))

In [None]:
# Stack the per-row boolean Series into one 2-D array (row k = matches of
# row k of Y) and show it as an image for a quick visual check.
moa_matches = np.asarray(moa_matches)
plt.imshow(moa_matches)

# Enrichment analysis

In [None]:
# Run the enrichment analysis once per query and keep the result by row index.
results = {}
# Select the similarity columns in the order given by Y.Var1.
SIM = np.asarray(X[Y.Var1])
# Only rows whose column in the match matrix has more than one hit are used
# as queries (presumably: at least one match besides the row itself).
is_query = moa_matches.sum(axis=0) > 1

for i in range(SIM.shape[0]):
    if is_query[i]:
        # Leave the query itself out of both the similarities and the labels.
        idx = [j for j in range(SIM.shape[1]) if j != i]
        results[i] = quality.enrichment_analysis(SIM[i,idx], moa_matches[i,idx], 99.)
        # "x is np.nan" is an identity test and misses NaN values produced by
        # arithmetic; pd.isna detects any missing/NaN scalar.
        if pd.isna(results[i]["ods_ratio"]):
            print(results[i]["V"], i)

In [None]:
# Gather the odds ratio of every query and report the mean.
folds = [entry["ods_ratio"] for entry in results.values()]
print("Average folds of enrichment at top 1%:", np.mean(folds))

In [None]:
# Transpose so that each query index becomes a row and each field of the
# enrichment result becomes a column.
enrichment_results = pd.DataFrame(data=results).T

# Average precision analysis

In [None]:
def precision_at_k(sim_matrix, moa_matches, rank_pos=None):
    """Compute precision at ``rank_pos`` for every query row.

    A row is a query when its column in ``moa_matches`` sums to more than
    one. For each query, all other entries are ranked by decreasing
    similarity and their match flags, in ranked order, are handed to
    ``metrics.precision_at_k``.

    Args:
        sim_matrix: 2-D array indexed as ``sim_matrix[query, candidate]``.
        moa_matches: 2-D boolean array indexed the same way as ``sim_matrix``.
        rank_pos: Cut-off forwarded unchanged to ``metrics.precision_at_k``.

    Returns:
        Dict mapping each query row index to
        ``{"precision_at_k": value, "pk": rank_pos}``.
    """
    results = {}
    is_query = moa_matches.sum(axis=0) > 1
    for i in range(sim_matrix.shape[0]):
        if is_query[i]:
            # Rank the full row so that the ranked positions are valid column
            # indices of moa_matches. (The previous code sliced with an
            # undefined local "idx", picking up whatever global of that name
            # happened to exist.)
            ranking = np.argsort(-sim_matrix[i, :])
            # Remove the query explicitly instead of assuming it sorts first;
            # ties with identical profiles could otherwise drop a real hit.
            ranking = ranking[ranking != i]
            pk = metrics.precision_at_k(moa_matches[i, ranking], rank_pos)
            results[i] = {"precision_at_k": pk, "pk": rank_pos}
    return results

In [None]:
# Rank cut-offs 5, 10, ..., 50 and the query-averaged precision at each one.
positions = list(range(5, 55, 5))
average_precision_at_k = [
    np.mean([
        entry["precision_at_k"]
        for entry in precision_at_k(SIM, moa_matches, pos).values()
    ])
    for pos in positions
]

In [None]:
# Query-averaged precision (y) against the rank cut-off (x).
plt.figure(figsize=(10,6))
plt.plot(positions, average_precision_at_k)

In [None]:
# Cut-off equal to 1% of the rows of X, but never fewer than one result.
top_1percent = max(1, int(X.shape[0]*0.01))
top_prec = precision_at_k(SIM, moa_matches, top_1percent)
# Average the per-query precision at that cut-off.
avg_top_prec = np.mean([entry["precision_at_k"] for entry in top_prec.values()])
print(f"Average of Precision At Top 1% ({top_1percent} results) => ", avg_top_prec)

In [None]:
# Per-query table: one row per query index, columns "precision_at_k" and "pk".
prec_at_top1 = pd.DataFrame(data=top_prec).T

# Recall analysis

In [None]:
def recall_at(sim_matrix, moa_matches, rank_pos=None):
    """Compute recall within the top ``rank_pos`` results for every query row.

    A row is a query when its column in ``moa_matches`` sums to more than
    one. For each query, all other entries are ranked by decreasing
    similarity; recall is the fraction of the query's matching entries
    (the query itself excluded) found among the first ``rank_pos`` of them.

    Args:
        sim_matrix: 2-D array indexed as ``sim_matrix[query, candidate]``.
        moa_matches: 2-D boolean array indexed the same way as ``sim_matrix``.
        rank_pos: Number of top-ranked results to inspect; ``None`` inspects
            the whole ranking.

    Returns:
        Dict mapping each query row index to
        ``{"recall_at_k": value, "rk": rank_pos}``.
    """
    results = {}
    is_query = moa_matches.sum(axis=0) > 1
    for i in range(sim_matrix.shape[0]):
        if is_query[i]:
            ranking = np.argsort(-sim_matrix[i, :])
            # Drop the query explicitly rather than assuming it is ranked
            # first, so the same candidates feed numerator and denominator.
            ranking = ranking[ranking != i]
            # Exactly rank_pos results; the old slice [1:rank_pos] returned
            # one fewer than requested.
            retrieved = ranking[:rank_pos]
            # The query is not a retrievable result, so it must not count as
            # relevant either -- otherwise recall could never reach 1.
            relevant = np.sum(moa_matches[i, ranking])
            hits = np.sum(moa_matches[i, retrieved])
            # With nothing to retrieve, report 0.0 rather than dividing by 0.
            rc = hits / relevant if relevant else 0.0
            results[i] = {"recall_at_k": rc, "rk": rank_pos}
    return results

In [None]:
# Query-averaged recall at each rank cut-off in `positions`.
recall = [
    np.mean([
        entry["recall_at_k"]
        for entry in recall_at(SIM, moa_matches, pos).values()
    ])
    for pos in positions
]

In [None]:
# Query-averaged recall (y) against the rank cut-off (x).
plt.figure(figsize=(10,6))
plt.plot(positions, recall)

In [None]:
# Recall at a cut-off ten times the 1% cut-off, then averaged over queries.
recall_top_10 = recall_at(SIM, moa_matches, top_1percent*10)
avg_recall_at_top = np.mean([entry["recall_at_k"] for entry in recall_top_10.values()])
print(f"Average Recall At Top 10% ({top_1percent*10} results) => ", avg_recall_at_top)

In [None]:
# Per-query table: one row per query index, columns "recall_at_k" and "rk".
recall_at_top10 = pd.DataFrame(data=recall_top_10).T

# Interpolated Recall-Precision Curve

In [None]:
# The project helper returns the x (recall) and y (precision) values of the
# curve; presumably the precision is interpolated and averaged over queries
# -- confirm against metrics.interpolated_precision_recall_curve.
recall_axis, average_precision = metrics.interpolated_precision_recall_curve(moa_matches, SIM)

plt.figure(figsize=(10,6))
plt.plot(recall_axis, average_precision)

# Reported MAP is the plain mean of the curve's precision values.
print("Mean Average Precision (MAP): \t", np.mean(average_precision))
# NOTE(review): only "import sklearn" appears at the top of the notebook; a
# bare package import does not necessarily expose the sklearn.metrics
# submodule, so this line may depend on another module importing it -- confirm.
print("Area Under the PR curve: \t", sklearn.metrics.auc(recall_axis, average_precision))

# Save Results

In [None]:
# Bundle the summary metrics and curves and pickle them to "<OUT_RESUTS>.pkl".
# Note: this rebinds `results`, which earlier held the per-query enrichment
# output (already captured in `enrichment_results`).
results = {
    "ranking": positions,
    "precision_at_k": average_precision_at_k,
    "recall": recall,
    "avg_prec@top1": avg_top_prec,
    # Legacy key: despite its name, this is the average recall at the top-10%
    # cut-off. Kept so existing readers of the pickle continue to work.
    "avg_recall@top1": avg_recall_at_top,
    # Same value under a name that matches what was measured.
    "avg_recall@top10": avg_recall_at_top,
    "recall_axis": recall_axis,
    "precision_axis": average_precision,
    "mean_average_precision": np.mean(average_precision),
    "reference_library_size": len(X),
    "number_of_queries": len(enrichment_results)
}

with open(OUT_RESUTS + ".pkl", "bw") as out:
    pickle.dump(results, out)

In [None]:
# Join the treatment identifiers with the three per-query tables on their
# index; each merge keeps only indices present on both sides.
all_results = X["Var1"]
for per_query_table in (enrichment_results, prec_at_top1, recall_at_top10):
    all_results = pd.merge(all_results, per_query_table, left_index=True, right_index=True)

In [None]:
# Write the merged per-query table to "<OUT_RESUTS>.csv", index included.
all_results.to_csv(OUT_RESUTS + ".csv", index=True)