### TF - motif bias matrix

1. download the motif -> gene name mapping from here: https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/snapshots/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl
2. then load the motif -> gene enrichment scores from here: https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.scores.feather
3. then we replace the motif name with gene names
4. then, when several motifs map to the same gene, we take the max enrichment across them
5. then renormalize this per gene and save it as our new attention bias

----

### PPI bias matrix

1. we get the human PPI matrix from stringdb here: https://string-db.org/cgi/download?sessionId=bwVBywlOX6i8&species_text=Homo+sapiens
2. we get the same one for other species (e.g. mus musculus): https://stringdb-downloads.org/download/protein.links.v12.0/10090.protein.links.v12.0.txt.gz
3. we open them and use ensembl's biomart to map ENSP to ENSG. we merge all similar pairs
4. we map it to a grn with associated scores.
5. we use them as our new PPI bias matrix

In [15]:
import pandas as pd
from scdataloader.utils import load_genes
from scdataloader.utils import getBiomartTable
import numpy as np
import tqdm

In [2]:
# Gene reference table; its `symbol` column is used below to filter the motif annotations.
genedf = load_genes()

In [3]:
# Score table, indexed by its `motifs` column.
da = pd.read_feather('~/main_scenic+_database.feather').set_index('motifs')

# Motif annotation table. It is read without a header row, so the column
# names are supplied explicitly; rows are then keyed by `motif_id`.
tbl = pd.read_csv(
    '~/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl',
    sep='\t',
    header=None,
    names=[
        'motif_id',
        'motif_name',
        'motif_description',
        'source_name',
        'source_version',
        'gene_name',
        'motif_similarity_qvalue',
        'similar_motif_id',
        'similar_motif_description',
        'orthologous_identity',
        'orthologous_gene_name',
        'orthologous_species',
        'description',
    ],
).set_index('motif_id')

# Keep only annotations whose gene name is a known symbol in `genedf`.
tbl = tbl[tbl.gene_name.isin(genedf.symbol)]


  tbl = pd.read_csv('~/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl'


In [None]:
# Map every motif row of `da` to the gene(s) annotated for it in `tbl`.
#   rn       : motif id -> first annotated gene name (used to rename the index)
#   toadd    : extra gene name -> the motif's score row, for motifs annotated
#              with more than one gene (a later motif mapping to the same
#              gene overwrites the earlier entry)
#   unmapped : motif ids with no annotation in `tbl`; removed from `da` below
rn = {}
toadd = {}
unmapped = []
for i in tqdm.tqdm(da.index):
    try:
        res = tbl.loc[i, 'gene_name']
    except KeyError:
        # No gene annotation for this motif. Skip it explicitly: without the
        # `continue`, execution fell through and reused `res` from the
        # previous iteration (or raised NameError on the first one).
        unmapped.append(i)
        continue
    if not isinstance(res, str):
        # `.loc` returns a Series when the motif id occurs several times in
        # `tbl`: keep the first gene for the rename, queue the others.
        for v in res.values[1:]:
            toadd[v] = da.loc[i].values
        res = res.values[0]
    rn[i] = res

# Drop all unmapped motifs in one pass instead of copying `da` on every miss.
da = da.drop(index=unmapped)

In [35]:
# Relabel motif rows with their gene name and append the extra gene rows
# collected in `toadd` (one row per additional gene, same columns as `da`).
da = da.rename(index=rn)
da = pd.concat([da, pd.DataFrame(toadd, index=da.columns).T])
# Several motifs can map to the same gene. As described in the notebook intro,
# keep the max enrichment per gene instead of an arbitrary first row.
# `sort=False` keeps genes in order of first appearance.
da = da.groupby(level=0, sort=False).max()


In [37]:
# Persist the gene-indexed score table.
da.to_parquet('~/main_scenic+.parquet')

## protein

In [3]:
# Load the STRING protein-links file (space-separated); its `protein1` / `protein2` columns are used below.
string = pd.read_csv('~/9606.protein.links.v12.0.txt.gz', sep=' ')

In [4]:
# Keep only the part after the first '.' in each protein identifier
# (presumably `<taxon>.<ENSP id>` -> `<ENSP id>`; confirm against the file).
for col in ('protein1', 'protein2'):
    string[col] = string[col].str.split('.').str[1]

In [7]:
# Fetch the Biomart table, requesting the `ensembl_peptide_id` attribute;
# its gene-id / peptide-id columns are used below to build the ENSP -> ENSG map.
biomart = getBiomartTable(attributes=['ensembl_peptide_id'])
biomart

downloading gene names from biomart
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'ensembl_peptide_id']


Unnamed: 0,ensembl_gene_id,hgnc_symbol,gene_biotype,entrezgene_id,ensembl_peptide_id
0,ENSG00000210049,MT-TF,Mt_tRNA,,
1,ENSG00000211459,MT-RNR1,Mt_rRNA,,
2,ENSG00000210077,MT-TV,Mt_tRNA,,
3,ENSG00000210082,MT-RNR2,Mt_rRNA,,
4,ENSG00000209082,MT-TL1,Mt_tRNA,,
...,...,...,...,...,...
191604,ENSG00000271742,ENSG00000271742,lncRNA,,
191605,ENSG00000116786,PLEKHM2,protein_coding,23207.0,ENSP00000364956
191606,ENSG00000116786,PLEKHM2,protein_coding,23207.0,ENSP00000364950
191607,ENSG00000116786,PLEKHM2,protein_coding,23207.0,


In [8]:
# Build the peptide id (ENSP) -> gene id (ENSG) lookup.
# Rows without a peptide id hold NaN, not None, so the previous
# `k is not None` test (which also checked the gene id rather than the
# peptide id) let NaN keys into the mapping. Drop incomplete rows instead.
pep2gene = biomart[['ensembl_gene_id', 'ensembl_peptide_id']].dropna()
# If a peptide id appears more than once, the last row wins (as before).
rn = dict(zip(pep2gene['ensembl_peptide_id'], pep2gene['ensembl_gene_id']))
len(rn)

122615

In [10]:
# Symmetric adjacency: gene id -> set of gene ids it is linked to.
# Only the first two columns of `string` are read; pairs where either
# protein has no entry in `rn` are skipped.
rel = {}
for prot_a, prot_b in string.iloc[:, :2].values:
    gene_a = rn.get(prot_a, "")
    gene_b = rn.get(prot_b, "")
    if not (gene_a and gene_b):
        continue
    rel.setdefault(gene_a, set()).add(gene_b)
    rel.setdefault(gene_b, set()).add(gene_a)
len(rel)

19193

In [26]:
# Square all-zero matrix with one row and one column per key of `rel`.
genes = list(rel.keys())
da = pd.DataFrame(data=np.zeros((len(genes), len(genes))), index=genes, columns=genes)

In [12]:
# Remove the empty-string label if it is present. `rel` is only filled with
# non-empty ids, so on a fresh run there is nothing to drop and the strict
# `drop` raised KeyError; `errors='ignore'` makes this cell safe to re-run.
da = da.drop(index=[''], columns=[''], errors='ignore')

In [25]:
# Mark every (gene, partner) pair recorded in `rel` with 1.
for gene, partners in tqdm.tqdm(rel.items()):
    da.loc[gene, list(partners)] = 1


19193it [00:20, 917.76it/s] 


In [None]:
# Mean of the column means; since entries are set to 0 or 1 above, this is the fraction of filled pairs.
da.mean().mean()

0.03608647264434039