### TF - motif bias matrix

1. download the gene - motif name mapping from here: https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/snapshots/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl
2. then load the motif -> gene enrichment scores from here: https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.scores.feather
3. then we replace the motif names with gene names
4. then we take the max enrichment when the same gene-motif pair occurs more than once
5. then we renormalize this per gene and save it as our new attention bias

----

### PPI bias matrix

1. we get the human PPI matrix from stringdb here: https://string-db.org/cgi/download?sessionId=bwVBywlOX6i8&species_text=Homo+sapiens
2. we get the same one for other species (e.g. mus musculus): https://stringdb-downloads.org/download/protein.links.v12.0/10090.protein.links.v12.0.txt.gz
3. we open them and use Ensembl's BioMart to map protein IDs (ENSP) to gene IDs (ENSG), merging pairs that end up identical
4. we map it to a grn with associated scores.
5. we use them as our new PPI bias matrix

In [3]:
import pandas as pd
from scdataloader.utils import load_genes
from scdataloader.utils import getBiomartTable
import numpy as np
from tqdm import tqdm
import torch
from scipy import sparse

%load_ext autoreload
%autoreload 2


[92m→[0m connected lamindb: jkobject/scprint


In [2]:
# Reference gene table from scdataloader; its `symbol` column is used below to
# keep only motif annotations that point to a known gene.
genedf = load_genes()

In [3]:
# Enrichment score table, indexed by its `motifs` column.
da = pd.read_feather('~/main_scenic+_database.feather').set_index('motifs')

# Column names for the motif annotation table, which is read with header=None.
# NOTE(review): if the file carries its own header line, that line is read as a
# data row here -- confirm against the downloaded file.
TBL_COLUMNS = [
    'motif_id',
    'motif_name',
    'motif_description',
    'source_name',
    'source_version',
    'gene_name',
    'motif_similarity_qvalue',
    'similar_motif_id',
    'similar_motif_description',
    'orthologous_identity',
    'orthologous_gene_name',
    'orthologous_species',
    'description',
]
tbl = pd.read_csv(
    '~/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl',
    sep='\t',
    header=None,
    names=TBL_COLUMNS,
).set_index('motif_id')

# Keep only annotations whose gene symbol exists in the reference gene table.
tbl = tbl[tbl['gene_name'].isin(genedf.symbol)]


  tbl = pd.read_csv('~/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl'


In [None]:
# Map every motif row of `da` to the gene(s) annotated for it in `tbl`.
#   rn:    motif id -> first annotated gene (used to rename the row afterwards)
#   toadd: every further annotated gene -> that motif's score vector
#          (appended as extra rows afterwards)
rn = {}
toadd = {}
unannotated = []  # motifs without any annotation left in `tbl`
# `tqdm` is imported as `from tqdm import tqdm`, so it is called directly
# (`tqdm.tqdm(...)` raises AttributeError with that import).
for i in tqdm(da.index):
    try:
        res = tbl.loc[i, 'gene_name']
    except KeyError:
        # No annotation for this motif: schedule the row for removal and skip
        # the rest of the body. Without the `continue`, `res` would still hold
        # the previous motif's gene(s) and be recorded for this motif.
        unannotated.append(i)
        continue
    if not isinstance(res, str):
        # Several annotation rows share this motif id, so `.loc` returned a
        # Series: the first gene renames the row, the others get their own copy.
        scores = da.loc[i].values
        for v in res.values[1:]:
            # A gene reached through several motifs keeps its element-wise max,
            # as described in step 4 of the notebook introduction.
            toadd[v] = np.maximum(toadd[v], scores) if v in toadd else scores
        res = res.values[0]
    rn[i] = res

# Drop all unannotated motifs in one go instead of copying `da` per miss.
da = da.drop(index=unannotated)

In [35]:
# Rename motif rows to their primary gene and append the rows collected for the
# additional genes of multi-gene motifs.
da = da.rename(index=rn)
da = pd.concat([da, pd.DataFrame(toadd, index=da.columns).T])
# Several motifs can map to the same gene. The notebook introduction (step 4)
# calls for the max enrichment in that case, so collapse duplicate row labels
# with an element-wise max instead of keeping whichever row came first.
# sort=False keeps the rows in order of first appearance.
da = da.groupby(level=0, sort=False).max()


In [None]:
# Cache the gene-renamed score table so the steps above need not be repeated.
da.to_parquet('../data/main/main_scenic+.parquet')

In [3]:
# Reload the cached table written by the previous cell (rebinds `da`).
da = pd.read_parquet("../data/main/main_scenic+.parquet")

In [2]:
# Fetch the BioMart table, additionally requesting the mouse homolog gene id.
# The printed column list below shows the result: ensembl_gene_id,
# external_gene_name, mmusculus_homolog_ensembl_gene.
biomart = getBiomartTable(attributes=['mmusculus_homolog_ensembl_gene'])
biomart

downloading gene names from biomart
['ensembl_gene_id', 'external_gene_name', 'mmusculus_homolog_ensembl_gene']


Unnamed: 0,ensembl_gene_id,external_gene_name,mmusculus_homolog_ensembl_gene
0,ENSG00000210049,MT-TF,
1,ENSG00000211459,MT-RNR1,
2,ENSG00000210077,MT-TV,
3,ENSG00000210082,MT-RNR2,
4,ENSG00000209082,MT-TL1,
...,...,...,...
76061,ENSG00000197312,DDI2,
76062,ENSG00000215695,RSC1A1,ENSMUSG00000078515
76063,ENSG00000215695,RSC1A1,ENSMUSG00000040715
76064,ENSG00000271742,,


In [4]:
# Build `mp`: gene symbol -> list of Ensembl gene ids carrying that symbol.
# The first two biomart columns are (ensembl_gene_id, external_gene_name),
# so `v` is the Ensembl id and `k` the symbol.
mp = {}
for v, k in biomart.iloc[:, :2].values:
    # pd.isna instead of `is np.nan`: an identity test only matches the np.nan
    # singleton and misses other NaN objects (float('nan'), None, pd.NA).
    if pd.isna(v) or pd.isna(k):
        continue
    ids = mp.setdefault(k, [])
    if v not in ids:
        ids.append(v)


In [5]:
# Bounds of the range each row of `da` is rescaled to in the next cell.
MAX=5
MIN=0

In [6]:
# Min-max rescale every row of `da` to the [MIN, MAX] range.
row_min = da.min(axis=1).values[:, None]
row_range = da.max(axis=1).values[:, None] - row_min
# A constant row has zero range; dividing by it would fill the row with NaN
# (0/0). Use 1 instead so such rows simply end up at MIN.
row_range = np.where(row_range == 0, 1, row_range)
da = MIN + (da - row_min) * (MAX - MIN) / row_range


In [7]:
def _expand_labels(labels, mapping):
    """Translate axis labels through `mapping`, duplicating one-to-many hits.

    Args:
        labels: iterable of the current axis labels.
        mapping: dict from a label to the list of labels replacing it.
            Labels missing from `mapping` are kept unchanged.

    Returns:
        (positions, new_labels): two lists of equal length. `positions[n]` is
        the position in `labels` whose data belongs under `new_labels[n]`.
    """
    positions, new_labels = [], []
    for pos, label in enumerate(labels):
        for target in mapping.get(label, [label]):
            positions.append(pos)
            new_labels.append(target)
    return positions, new_labels


# Remap row and column labels through `mp`. A label with several targets is
# repeated once per target, with its data copied.
row_positions, new_index = _expand_labels(da.index, mp)
col_positions, new_columns = _expand_labels(da.columns, mp)

# The column labels are kept as an ordered list built alongside the positions.
# Collecting them in a set (as before) loses the order, so the labels no longer
# line up with the data columns they are attached to.
new_da = pd.DataFrame(
    da.values[np.ix_(row_positions, col_positions)],
    index=new_index,
    columns=new_columns,
)


100%|██████████| 1692/1692 [00:26<00:00, 63.12it/s]


In [8]:
# Make `new_da` square: both axes get the sorted union of the row and column
# labels, and cells that did not exist before are filled with 0.
all_labels = sorted(set(new_da.index) | set(new_da.columns))
new_da = new_da.reindex(index=all_labels, columns=all_labels, fill_value=0)
new_da


Unnamed: 0,AAED1,AARS,AATK-AS1,ABHD11-AS1,ACPP,ACPT,ACRC,ACTN1-AS1,ADCK3,ADCK4,...,ZNF664-FAM101A,ZNF705E,ZNF720,ZNF724P,ZNF788,ZNF806,ZNRD1,ZNRD1ASP,ZRANB2-AS2,ZUFSP
AAED1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AARS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AATK-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ABHD11-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACPP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZNRD1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZNRD1ASP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZRANB2-AS2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Keep only labels containing 'ENSG'. The same row-label mask is applied to
# both axes, which relies on `new_da` being square with identical labels.
is_ensg = new_da.index.str.contains('ENSG')
new_da = new_da.loc[is_ensg, is_ensg]

In [15]:
# Swap rows and columns of the bias matrix.
# NOTE: not idempotent -- running this cell a second time undoes the transpose.
new_da = new_da.T

## protein

In [17]:
# Load the space-separated STRING links file (9606 = the taxon id in the file
# name); the cells below use its `protein1` / `protein2` columns.
string = pd.read_csv('../data/main/9606.protein.links.v12.0.txt.gz', sep=' ')

In [20]:
# NOTE: this cell used to run `del string`. It was executed as In [20], i.e.
# after the cells below, but in top-to-bottom order it removes `string` right
# before those cells read it and breaks "Restart & Run All". The deletion is
# therefore not performed here; free `string` only after `rel` has been built.

In [18]:
# Keep only the second dot-separated field of each protein id (presumably this
# drops a "<taxon>." prefix -- verify against the file).
# NOTE: not idempotent -- ids without a dot become NaN on a second run.
for col in ('protein1', 'protein2'):
    string[col] = string[col].str.split('.').str[1]

In [10]:
# Build `rel`: a symmetric adjacency map, id -> set of ids it is linked to.
# NOTE(review): `rn` is used here to translate the ids of the first two
# `string` columns; the only `rn` built in this notebook maps motif ids to gene
# names, so this cell presumably relies on a protein -> gene mapping from a
# cell that is no longer present -- confirm before re-running.
rel = {}
for prot_a, prot_b in string.iloc[:, :2].values:
    gene_a = rn.get(prot_a, "")
    gene_b = rn.get(prot_b, "")
    # Skip links where either end could not be translated.
    if not (gene_a and gene_b):
        continue
    rel.setdefault(gene_a, set()).add(gene_b)
    rel.setdefault(gene_b, set()).add(gene_a)
len(rel)

19193

In [26]:
# Empty (all-zero) square matrix with one row and one column per key of `rel`,
# in the key insertion order of `rel`.
ppi_genes = list(rel)
res = pd.DataFrame(
    np.zeros((len(ppi_genes), len(ppi_genes))),
    index=ppi_genes,
    columns=ppi_genes,
)

In [12]:
# Remove the placeholder '' label if present. The loop that builds `rel` skips
# empty ids, so '' is normally absent and a plain drop raises KeyError;
# errors='ignore' turns this cell into a harmless safety net.
res = res.drop(index=[''], columns=[''], errors='ignore')

In [25]:
# Mark every link recorded in `rel` with a 1 in the adjacency matrix.
# `tqdm` is imported as `from tqdm import tqdm`, so it is called directly
# (`tqdm.tqdm(...)` raises AttributeError with that import).
for i, j in tqdm(rel.items()):
    res.loc[i, list(j)] = 1


19193it [00:20, 917.76it/s] 


In [None]:
# Mean of the column means; for the square 0/1 matrix built above this is the
# fraction of cells set to 1.
res.mean().mean()

0.03608647264434039

In [None]:
# Cache the adjacency matrix on disk.
res.to_parquet('../data/main/stringdb_bias.parquet')

In [21]:
# Reload the cached adjacency matrix (rebinds `res`).
res = pd.read_parquet("../data/main/stringdb_bias.parquet")

In [30]:
# Display the (truncated) matrix as a visual sanity check.
res

Unnamed: 0,ENSG00000075292,ENSG00000172531,ENSG00000180745,ENSG00000156886,ENSG00000185069,ENSG00000125817,ENSG00000177459,ENSG00000074201,ENSG00000109390,ENSG00000165066,...,ENSG00000152592,ENSG00000164778,ENSG00000130957,ENSG00000165084,ENSG00000114487,ENSG00000178093,ENSG00000203950,ENSG00000126953,ENSG00000185985,ENSG00000180210
ENSG00000075292,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000172531,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ENSG00000180745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000156886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000185069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000178093,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000203950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000126953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
ENSG00000185985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Displays the element-wise sum of `res` and its transpose. The result is not
# assigned, so `res` itself is left unchanged by this cell.
res + res.T

In [29]:
# Row sums; with the 0/1 entries written above this is the number of links per row.
res.sum(1)

ENSG00000075292    1221.0
ENSG00000172531    3264.0
ENSG00000180745     466.0
ENSG00000156886     287.0
ENSG00000185069     671.0
                    ...  
ENSG00000178093    1031.0
ENSG00000203950       0.0
ENSG00000126953     852.0
ENSG00000185985     719.0
ENSG00000180210    1276.0
Length: 19193, dtype: float64

In [66]:
# Put both bias matrices on one shared, sorted label set (the union of their
# row labels); cells missing from either matrix are filled with 0.
all_indices = sorted(set(new_da.index) | set(res.index))
da_aligned = new_da.reindex(index=all_indices, columns=all_indices, fill_value=0)
res_aligned = res.reindex(index=all_indices, columns=all_indices, fill_value=0)

# Combined bias: element-wise sum of the two aligned matrices.
da = da_aligned.add(res_aligned)

In [67]:
# Size of the combined square bias matrix.
da.shape

(28206, 28206)

In [68]:
# Free the matrices that were only needed to build the combined `da`.
del new_da
del res
# The aligned copies are separate full-size frames and are no longer needed
# either; `da` is a new object, so removing them does not affect it.
del da_aligned
del res_aligned

In [16]:
# Read the gene list stored in the checkpoint's hyper-parameters.
# map_location='cpu' lets the checkpoint load on machines without the device it
# was saved from, and avoids putting tensors on an accelerator just to read a
# list. NOTE: torch.load unpickles the file -- only load trusted checkpoints.
genes = torch.load('../data/temp/vbd8bavn/epoch=17-step=90000.ckpt', map_location='cpu')['hyper_parameters']['genes']

In [79]:
# Rebuild `mp` as: human Ensembl gene id -> list of mouse homolog gene ids
# (biomart columns 0 and 2). This replaces the symbol -> id map of the same name.
mp = {}
for k, v in biomart.iloc[:, [0, 2]].values:
    # pd.isna instead of `is np.nan`: an identity test only matches the np.nan
    # singleton and misses other NaN objects (float('nan'), None, pd.NA).
    if pd.isna(v) or pd.isna(k):
        continue
    homologs = mp.setdefault(k, [])
    if v not in homologs:
        homologs.append(v)

In [81]:
# Restrict and order both axes exactly like `genes` (labels missing from `da`
# become all-zero rows/columns, labels not in `genes` are dropped), then store
# the values as a CSR sparse matrix.
da_on_vocab = da.reindex(index=genes, columns=genes, fill_value=0)
da = sparse.csr_matrix(da_on_vocab.to_numpy())


In [None]:
# (human gene id, mouse homolog gene id) pairs: biomart columns 0 and 2,
# restricted to rows that have a mouse homolog.
has_mouse_homolog = biomart.mmusculus_homolog_ensembl_gene.notna()
biomart_filtered = biomart[has_mouse_homolog].iloc[:, [0, 2]]
# NOTE(review): keep=False removes *every* copy of a repeated pair, not just
# the extra ones -- confirm this is intended rather than drop_duplicates().
biomart_filtered = biomart_filtered.loc[~biomart_filtered.duplicated(keep=False)]

# Keep only pairs where both genes are in `genes`. This filter used to be
# computed and then thrown away (a bare expression in the middle of the cell).
known_genes = set(genes)
biomart_filtered = biomart_filtered[
    biomart_filtered.mmusculus_homolog_ensembl_gene.isin(known_genes)
    & biomart_filtered.ensembl_gene_id.isin(known_genes)
]
biomart_filtered


Unnamed: 0,ensembl_gene_id,mmusculus_homolog_ensembl_gene
5,ENSG00000198888,ENSMUSG00000064341
9,ENSG00000198763,ENSMUSG00000064345
15,ENSG00000198804,ENSMUSG00000064351
18,ENSG00000198712,ENSMUSG00000064354
20,ENSG00000228253,ENSMUSG00000064356
...,...,...
76056,ENSG00000116771,ENSMUSG00000040706
76059,ENSG00000252417,ENSMUSG00002075659
76062,ENSG00000215695,ENSMUSG00000078515
76063,ENSG00000215695,ENSMUSG00000040715


In [None]:
# Position of every gene in the `genes` list.
gene_to_index = {gene: idx for idx, gene in enumerate(genes)}

# Replace each gene id by its position in `genes`; ids that are not in `genes`
# are left as they are. Column-wise Series.map is used instead of
# DataFrame.applymap, which is deprecated in recent pandas releases.
biomart_filtered = biomart_filtered.apply(
    lambda col: col.map(lambda x: gene_to_index.get(x, x))
)

# Keep only the rows where both ids were translated to an integer position.
both_mapped = biomart_filtered.apply(
    lambda col: col.map(lambda x: isinstance(x, int))
).all(axis=1)
biomart_filtered = biomart_filtered[both_mapped]


  biomart_filtered = biomart_filtered.applymap(lambda x: gene_to_index.get(x, x))
  biomart_filtered = biomart_filtered[biomart_filtered.applymap(lambda x: isinstance(x, int)).all(axis=1)]


In [None]:
# Copy the bias between every pair of human genes onto the matching pair of
# mouse homologs: column 0 holds the human positions, column 1 the mouse
# positions. The (k, 1) x (k,) index pair selects the full k x k sub-matrix.
human_pos = biomart_filtered.values[:, 0]
mouse_pos = biomart_filtered.values[:, 1]
da[mouse_pos[:, None], mouse_pos] = da[human_pos[:, None], human_pos]

  self._set_arrayXarray_sparse(i, j, x)


In [None]:
# Write the final sparse bias matrix to bias_sparse.npz in the working directory.
sparse.save_npz("bias_sparse.npz", da)


In [25]:
# Same call as the previous cell: writes the sparse bias matrix to
# bias_sparse.npz again, overwriting the file.
sparse.save_npz("bias_sparse.npz", da)
