In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pk
from tqdm import tqdm
import sys
sys.path.append('scripts/')
from functions import *

In [5]:
# MEME file holding the input motifs for archetype construction
mots_meme_fn = 'results/Archetypes/input-pwms.meme'
mots_meme = readMEME(mots_meme_fn)

# Split the parsed result into its first two elements:
#   mots_pfms  - used below as a mapping keyed by motif name
#   mots_names - iterated below as the sequence of motif names
mots_pfms, mots_names = mots_meme[0], mots_meme[1]


In [13]:
# Unique motif names, kept in first-seen order.
# dict.fromkeys de-duplicates in a single pass while preserving insertion
# order; the earlier list-membership loop rescanned the growing list for
# every name, which is quadratic in the number of motifs.
# NOTE(review): assumes the names are hashable (e.g. strings) - confirm
# against what readMEME returns.
mots_names_unique = list(dict.fromkeys(mots_names))

len(mots_names_unique)

5043

In [6]:
from joblib import Parallel, delayed
import numpy as np
from tqdm import tqdm

# Freeze the motif ordering once; a position in `keys` is the matching
# row/column position in the matrix below
keys = list(mots_pfms)
n = len(keys)

# Square n-by-n matrix of zeros that will receive the pairwise values
jMap = np.zeros(shape=(n, n))

# Define a function for parallel computation
def compute_similarity(i, mots_pfms, keys):
    """Compare motif ``i`` with every motif that follows it in ``keys``.

    Only pairs ``(i, j)`` with ``j > i`` are evaluated, i.e. one row of the
    strict upper triangle of the pairwise matrix.

    Parameters
    ----------
    i : int
        Position in ``keys`` of the reference motif.
    mots_pfms : mapping
        Motif key -> PFM, indexed as ``mots_pfms[keys[k]]``.
    keys : sequence
        Ordered motif keys; positions in this sequence are the indices
        reported in the result.

    Returns
    -------
    list of (int, int, value)
        One ``(i, j, dist)`` tuple per ``j`` in ``range(i + 1, len(keys))``,
        where ``dist`` is the first element returned by ``bestOverlapPFMs``.
        Empty when ``i`` is the last position.
    """
    similarities = []
    base = mots_pfms[keys[i]]
    # Bound the loop by len(keys) instead of the notebook-global `n`, so the
    # worker depends only on its own arguments and cannot pick up a stale or
    # undefined global when run in another kernel state or process.
    for j in range(i + 1, len(keys)):
        other = mots_pfms[keys[j]]
        dist, _, _ = bestOverlapPFMs(base, other, overlap='complete', agg='sum')
        similarities.append((i, j, dist))
    return similarities

# One delayed task per row index; each task yields that row's share of the
# upper triangle as (i, j, dist) tuples
row_tasks = (delayed(compute_similarity)(i, mots_pfms, keys) for i in tqdm(range(n)))
results = Parallel(n_jobs=-1)(row_tasks)

# Copy every computed value into both mirrored cells so jMap is symmetric
for res in results:
    for i, j, dist in res:
        jMap[i, j] = jMap[j, i] = dist

100%|██████████| 5043/5043 [12:35<00:00,  6.68it/s]


In [7]:
# Persist the pairwise matrix so later cells can reload it without recomputing
jsd_pickle_path = 'results/Archetypes/motif-similarity-JSD.pkl'
with open(jsd_pickle_path, 'wb') as f:
    pk.dump(jMap, f)

In [8]:
# Read the cached pairwise JSD matrix back from disk
with open('results/Archetypes/motif-similarity-JSD.pkl', 'rb') as f:
    jMap = pk.load(f)

# The same motif keys label both axes of the square matrix
col_names = list(mots_pfms.keys())
row_names = col_names

# Wide matrix -> long table: one (motif1, motif2, JSD) row per matrix cell.
# melt keeps the row labels in the index; reset_index then exposes them as a
# column called 'index', which is renamed to 'motif1'.
df = (
    pd.DataFrame(jMap, index=row_names, columns=col_names)
    .melt(ignore_index=False, var_name='motif2', value_name='JSD')
    .reset_index()
    .rename(columns={'index': 'motif1'})
)

# Write the long table as a gzip-compressed, tab-separated file
import gzip
with gzip.open('results/Archetypes/motif-similarity-JSD.tsv.gz', 'wt') as f:
    df.to_csv(f, sep='\t', index=False)

In [9]:
# Preview the first rows of the long-format (motif1, motif2, JSD) table
df.head()

Unnamed: 0,motif1,motif2,JSD
0,M00115_2.00,M00115_2.00,0.0
1,M01659_2.00,M00115_2.00,1.120823
2,M01660_2.00,M00115_2.00,0.150321
3,M01664_2.00,M00115_2.00,0.176049
4,M01667_2.00,M00115_2.00,0.15202
