## working with ESM2 embeddings

In [None]:
! esm-extract esm2_t33_650M_UR50D data/temp/test_esm.fasta data/temp/ --include mean

In [97]:
import io
import os
from biomart import BiomartServer

def _fetchFromServer(ensemble_server, attributes):
    server = BiomartServer(ensemble_server, verbose=True)
    ensmbl = server.datasets["hsapiens_gene_ensembl"]
    print(attributes)
    res = pd.read_csv(
        io.StringIO(
            ensmbl.search({"attributes": attributes}, header=1).content.decode()
        ),
        sep="\t",
    )
    return res

def createFoldersFor(filepath):
    """
    will recursively create folders if needed until having all the folders required to save the file in this filepath
    """
    prevval = ""
    for val in os.path.expanduser(filepath).split("/")[:-1]:
        prevval += val + "/"
        if not os.path.exists(prevval):
            os.mkdir(prevval)

def getBiomartTable(
    ensemble_server="http://feb2023.archive.ensembl.org/biomart",
    useCache=False,
    cache_folder="/tmp/biomart/",
    attributes=[],
    bypass_attributes=False,
):
    """generate a genelist dataframe from ensembl's biomart

    Args:
        ensemble_server ([type], optional): [description]. Defaults to ENSEMBL_SERVER_V.
        useCache (bool, optional): [description]. Defaults to False.
        cache_folder ([type], optional): [description]. Defaults to CACHE_PATH.

    Raises:
        ValueError: [description]

    Returns:
        [type]: [description]
    """
    attr = (
        [
            "ensembl_gene_id",
            "hgnc_symbol",
            "gene_biotype",
            "entrezgene_id",
        ]
        if not bypass_attributes
        else []
    )
    assert cache_folder[-1] == "/"

    cache_folder = os.path.expanduser(cache_folder)
    createFoldersFor(cache_folder)
    cachefile = os.path.join(cache_folder, ".biomart.csv")
    if useCache & os.path.isfile(cachefile):
        print("fetching gene names from biomart cache")
        res = pd.read_csv(cachefile)
    else:
        print("downloading gene names from biomart")

        res = _fetchFromServer(ensemble_server, attr + attributes)
        res.to_csv(cachefile, index=False)
    
    res.columns = attr + attributes
    if type(res) is not type(pd.DataFrame()):
        raise ValueError("should be a dataframe")
    res = res[~(res["ensembl_gene_id"].isna() & res["hgnc_symbol"].isna())]
    res.loc[res[res.hgnc_symbol.isna()].index, "hgnc_symbol"] = res[
        res.hgnc_symbol.isna()
    ]["ensembl_gene_id"]

    return res

In [107]:
biomart2 = getBiomartTable()

downloading gene names from biomart
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] is alive.
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] Fetching datasets
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] Fetching databases
[BiomartDatabase:'Ensembl Genes 109'] Fetching datasets
[BiomartDatabase:'Mouse strains 109'] Fetching datasets
[BiomartDatabase:'Sequence'] Fetching datasets
[BiomartDatabase:'Ontology'] Fetching datasets
[BiomartDatabase:'Genomic features 109'] Fetching datasets
[BiomartDatabase:'Ensembl Variation 109'] Fetching datasets
[BiomartDatabase:'Ensembl Regulation 109'] Fetching datasets
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id']
[BiomartDataset:'hsapiens_gene_ensembl'] Searching using following params:
{'attributes': ['ensembl_gene_id',
                'hgnc_symbol',
                'gene_biotype',
                'entrezgene_id']}
[BiomartDataset:'hsapiens_gene_ensemb

In [101]:
biomart = getBiomartTable(attributes=               
    ["ensembl_transcript_id",
        "protein_id",
        "ensembl_peptide_id",
        #"peptide",
       # "uniprotswissprot",
    ], bypass_attributes=False, useCache=False)

downloading gene names from biomart
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] is alive.
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] Fetching datasets
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] Fetching databases
[BiomartDatabase:'Ensembl Genes 109'] Fetching datasets
[BiomartDatabase:'Mouse strains 109'] Fetching datasets
[BiomartDatabase:'Sequence'] Fetching datasets
[BiomartDatabase:'Ontology'] Fetching datasets
[BiomartDatabase:'Genomic features 109'] Fetching datasets
[BiomartDatabase:'Ensembl Variation 109'] Fetching datasets
[BiomartDatabase:'Ensembl Regulation 109'] Fetching datasets
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'ensembl_transcript_id', 'protein_id', 'ensembl_peptide_id']
[BiomartDataset:'hsapiens_gene_ensembl'] Searching using following params:
{'attributes': ['ensembl_gene_id',
                'hgnc_symbol',
                'gene_biotype',
          

In [106]:
biomart[biomart.gene_biotype.isin(['Mt_tRNA', 'snRNA', 'sRNA'])]

Unnamed: 0,ensembl_gene_id,hgnc_symbol,gene_biotype,entrezgene_id,ensembl_transcript_id,protein_id,ensembl_peptide_id


In [110]:
biomart2[biomart2.gene_biotype.isin(['Mt_tRNA', 'snRNA', 'sRNA'])].head(20)

Unnamed: 0,ensembl_gene_id,hgnc_symbol,gene_biotype,entrezgene_id
0,ENSG00000210049,MT-TF,Mt_tRNA,
2,ENSG00000210077,MT-TV,Mt_tRNA,
4,ENSG00000209082,MT-TL1,Mt_tRNA,
6,ENSG00000210100,MT-TI,Mt_tRNA,
7,ENSG00000210107,MT-TQ,Mt_tRNA,
8,ENSG00000210112,MT-TM,Mt_tRNA,
10,ENSG00000210117,MT-TW,Mt_tRNA,
11,ENSG00000210127,MT-TA,Mt_tRNA,
12,ENSG00000210135,MT-TN,Mt_tRNA,
13,ENSG00000210140,MT-TC,Mt_tRNA,


In [105]:
main_genes

Unnamed: 0,ensembl_gene_id,hgnc_symbol,gene_biotype,entrezgene_id,ensembl_transcript_id,protein_id,ensembl_peptide_id
0,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,CAA24026,ENSP00000354687
1,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAB58943,ENSP00000354687
2,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,BAA07290,ENSP00000354687
3,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAP89036,ENSP00000354687
4,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAP89049,ENSP00000354687
...,...,...,...,...,...,...,...
901591,ENSG00000162437,RAVER2,protein_coding,55225.0,ENST00000418058,,ENSP00000397069
901592,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000697276,,ENSP00000514413
901593,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000699524,,ENSP00000514414
901594,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000697277,,ENSP00000514416


In [109]:
import gget

In [118]:
len(ls)/2

47.0

In [124]:
ls[val*2].split(' ')[0][1:]

'ENST00000576342'

In [None]:
res = []
size = 600
from multiprocessing import Pool


genes = biomart2[biomart2.gene_biotype.isin(['protein_coding'])].ensembl_gene_id.tolist()
groups = [genes[i*size:(i+1)*size] for i in range(len(genes)//size)] + [genes[(len(genes)//size)*size:]]
with Pool() as p:
    results = [p.apply(gget.seq, args=(sub, True, False)) for sub in groups]

res = [item for sublist in results for item in sublist]

In [141]:
from scprint.utils import get_seq
%reload_ext autoreload
%autoreload 2

In [150]:
get_seq(biomart2[biomart2.gene_biotype.isin(['protein_coding'])].ensembl_gene_id.tolist()[:4], translate=True, isoforms=False, save=True)

Sun Nov 12 16:33:29 2023 INFO Requesting amino acid sequence of the canonical transcript ENST00000361390 of gene ENSG00000198888 from UniProt.
Sun Nov 12 16:33:29 2023 INFO Requesting amino acid sequence of the canonical transcript ENST00000361453 of gene ENSG00000198763 from UniProt.
Sun Nov 12 16:33:29 2023 INFO Requesting amino acid sequence of the canonical transcript ENST00000361624 of gene ENSG00000198804 from UniProt.
Sun Nov 12 16:33:29 2023 INFO Requesting amino acid sequence of the canonical transcript ENST00000361739 of gene ENSG00000198712 from UniProt.


> [0;32m/Users/jkobject/Documents/code/scPRINT/scprint/utils/get_seq.py[0m(300)[0;36mseq[0;34m()[0m
[0;32m    298 [0;31m            [0;32mimport[0m [0mpdb[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    299 [0;31m            [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 300 [0;31m            [0mdf_uniprot[0m [0;34m=[0m [0mdf_uniprot[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0minfo_df[0m[0;34m,[0m [0mon[0m[0;34m=[0m[0;34m"canonical_transcript"[0m[0;34m,[0m [0mhow[0m[0;34m=[0m[0;34m"inner"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    301 [0;31m[0;34m[0m[0m
[0m[0;32m    302 [0;31m[0;34m[0m[0m
[0m
             index         ensembl_id uniprot_id pdb_id ncbi_gene_id  \
0  ENSG00000198888  ENSG00000198888.2        NaN    NaN          NaN   
1  ENSG00000198763  ENSG00000198763.3        NaN    NaN          NaN   
2  ENSG00000198804  ENSG00000198804.2        NaN    NaN          NaN   

In [None]:
python launch/predict.py --config="pretrained/extract_embedding.yml" \
--data_path="./data/examples/example.fasta" --save_dir="./resuts" \
--save_frequency 1 --save_embeddings

In [102]:
biomart

Unnamed: 0,ensembl_gene_id,hgnc_symbol,gene_biotype,entrezgene_id,ensembl_transcript_id,protein_id,ensembl_peptide_id
0,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,CAA24026,ENSP00000354687
1,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAB58943,ENSP00000354687
2,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,BAA07290,ENSP00000354687
3,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAP89036,ENSP00000354687
4,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAP89049,ENSP00000354687
...,...,...,...,...,...,...,...
901591,ENSG00000162437,RAVER2,protein_coding,55225.0,ENST00000418058,,ENSP00000397069
901592,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000697276,,ENSP00000514413
901593,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000699524,,ENSP00000514414
901594,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000697277,,ENSP00000514416


In [12]:
import os
import glob
import torch

tensor_list = []
for file in glob.glob("../../data/temp/*.pt"):
    tensor = torch.load(file)['mean_representations'][33]
    tensor_list.append(tensor)

concatenated_tensor = torch.cat(tensor_list, dim=0)


{33: tensor([-0.0224,  0.0423, -0.0291,  ..., -0.1396, -0.0235, -0.0449])}

In [None]:
# get all genes in the dataset, all species
# load them from biomart (for each species)
# for all protein codings
    # get the fasta file from uniprot using gget seq
    # if not available use gget seq from ensembl
        # use https://github.com/prestevez/dna2proteins to convert to protein
        # merge with the uniprot fasta
    # get the embedding of the fasta file using esm
    

# for all non protein codings (RNA based)
    # get the fasta file from ensembl using gget seq
    # use https://github.com/ml4bio/RNA-FM to embed the sequence

# can we have learned embeddings for just a subset of the elements of the transformer?
# can we have two KQV matrices, one for the protein coding, one for the RNA based? -> we would need at least to not have a skip connection for this first layer
# else we have an additional FCN layer that maps both to the actual embedding size
# adds 



In [None]:
def get_structural_embeddings(biotype, ids):

    if biotype == "proteins":

    if biotype == "protein_coding_genes":

    if biotype == "non_coding_genes":

    if biotype == "coding_transcripts":

    if biotype == "non_coding_transcripts":
        # get fasta file
        # subset fasta file
        # 

In [None]:
store it as an additional array in the dataset (make it behave as a varm)

add anndata idioms in the dataset

add GRN as a sparse array in the data (make it behave as a varp)

In [None]:

https://ftp.ensembl.org/pub/release-110/fasta/mus_musculus/pep/Mus_musculus.GRCm39.pep.all.fa.gz

https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz

https://ftp.ensembl.org/pub/release-110/fasta/mus_musculus/ncrna/Mus_musculus.GRCm39.ncrna.fa.gz

In [2]:
import ftplib
import os

def list_files(ftp, match=''):
    files = ftp.nlst()
    return [file for file in files if file.endswith(match)]

def load_fasta_species(species="homo_sapiens")
    ftp = ftplib.FTP('ftp.ensembl.org')
    ftp.login()
    ftp.cwd('/pub/release-110/fasta/homo_sapiens/pep/')
    file = list_files(ftp, '.all.fa.gz')[0]
    local_file_path = '../../data/fasta/' + file
    if not os.path.exists(local_file_path):
        with open(local_file_path, 'wb') as local_file:
            ftp.retrbinary('RETR ' + file, local_file.write)
    ftp.cwd('/pub/release-110/fasta/homo_sapiens/ncrna/')
    file = list_files(ftp, '.ncrna.fa.gz')[0]
    local_file_path = '../../data/fasta/' + file
    if not os.path.exists(local_file_path):
        with open(local_file_path, 'wb') as local_file:
            ftp.retrbinary('RETR ' + file, local_file.write)
    ftp.quit()

'221 Goodbye.'

In [17]:
'*' in record.seq

False

In [9]:
import bionty as bt

In [10]:
genedf = bt.Gene().df()
protgenedf = genedf[genedf.biotype=="protein_coding"]

  for level in list(self.possible_simple_keys):


In [20]:
from Bio import SeqIO
genes_found = set()
with open("../../data/fasta/Homo_sapiens.GRCh38.pep.all.fa", "r") as original_fasta:
    for record in SeqIO.parse(original_fasta, "fasta"):
        break

In [21]:
record

SeqRecord(seq=Seq('EI'), id='ENSP00000451042.1', name='ENSP00000451042.1', description='ENSP00000451042.1 pep chromosome:GRCh38:14:22438547:22438554:1 gene:ENSG00000223997.1 transcript:ENST00000415118.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD1 description:T cell receptor delta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12254]', dbxrefs=[])

In [None]:
# given a gene file and organism
#load the organism fasta if not already done
#subset the fasta
#subset the gene file
# embed
#load the data and erase / zip the rest
# return the embedding and gene file

In [23]:
subset_fasta(protgenedf.ensembl_gene_id.tolist()[:10], subfasta_path="../../data/fasta/subset_test.fa")

had to drop duplicates for ENSG00000000419
had to drop duplicates for ENSG00000000419
had to drop duplicates for ENSG00000000419
had to drop duplicates for ENSG00000000419
had to drop duplicates for ENSG00000000419
had to drop duplicates for ENSG00000000419
had to drop duplicates for ENSG00000001036
had to drop duplicates for ENSG00000000460
had to drop duplicates for ENSG00000000460
had to drop duplicates for ENSG00000000460
had to drop duplicates for ENSG00000000460
had to drop duplicates for ENSG00000000460
had to drop duplicates for ENSG00000000460
had to drop duplicates for ENSG00000000460
had to drop duplicates for ENSG00000000457
had to drop duplicates for ENSG00000000457
had to drop duplicates for ENSG00000000457
had to drop duplicates for ENSG00000001167
had to drop duplicates for ENSG00000000003
had to drop duplicates for ENSG00000001084
had to drop duplicates for ENSG00000001084
had to drop duplicates for ENSG00000001084
had to drop duplicates for ENSG00000001084
had to drop

{'ENSG00000000003',
 'ENSG00000000005',
 'ENSG00000000419',
 'ENSG00000000457',
 'ENSG00000000460',
 'ENSG00000000938',
 'ENSG00000000971',
 'ENSG00000001036',
 'ENSG00000001084',
 'ENSG00000001167'}

In [1]:
#! gunzip ../../data/fasta/subset_test.fa.gz
! esm-extract esm2_t33_650M_UR50D ../../data/fasta/Homo_sapiens.GRCh38.pep.all.fa ../../data/fasta/ --include mean

/bin/bash: esm-extract: command not found


In [None]:
from scrint.data_loader import rna_embedder

In [None]:
RNABert = rna_embedder.RNABert()

In [None]:
RNABert()

In [None]:
seqs, label, test_dl  = data.load_data_EMB(args.data_embedding) 
features = train.make_feature(model, test_dl, seqs)

In [None]:
embed(genesdf, organism="homo_sapiens", cache=True, fasta_path="/tmp/data/fasta/", cache=True)

In [1]:
import bionty as bt
genedf = bt.Gene().df()
protgenedf = genedf[genedf.biotype=="protein_coding"]

In [2]:
from scprint import utils
from scprint.dataloader import PROTBERT, RNABERT
from torch.nn.functional import avg_pool1d
import os
from scprint.utils.utils import run_command
import pandas as pd
%reload_ext autoreload
%autoreload 2

💡 lamindb instance: jkobject/scprint


In [3]:
organism="homo_sapiens"
cache=True
fasta_path="/tmp/data/fasta/"
config="esm-extract"
pretrained_model="esm2_t33_650M_UR50D"

In [6]:
# given a gene file and organism
# load the organism fasta if not already done
utils.load_fasta_species(species=organism, output_path=fasta_path, cache=cache)
# subset the fasta
fasta_file = next(
    file for file in os.listdir(fasta_path) if file.endswith(".all.fa.gz")
)

In [17]:
fasta_path + "subset.fa"

'/tmp/data/fasta/subset.fa'

In [18]:
protgenedf = genedf[genedf["biotype"] == "protein_coding"]
#subprocess.run(["gunzip", fasta_path+fasta_file], check=True)
utils.subset_fasta(
    protgenedf["ensembl_gene_id"].tolist(),
    subfasta_path=fasta_path + "subset.fa",
    fasta_path=fasta_path+fasta_file[:-3],
    drop_unknown_seq=True,
)

dropped 98733 duplicates
dropped 112 weird sequences


{'ENSG00000204296',
 'ENSG00000185127',
 'ENSG00000277058',
 'ENSG00000228570',
 'ENSG00000273575',
 'ENSG00000078098',
 'ENSG00000183734',
 'ENSG00000139679',
 'ENSG00000206457',
 'ENSG00000079841',
 'ENSG00000278306',
 'ENSG00000291672',
 'ENSG00000187258',
 'ENSG00000079819',
 'ENSG00000114948',
 'ENSG00000144644',
 'ENSG00000141971',
 'ENSG00000165914',
 'ENSG00000108826',
 'ENSG00000144010',
 'ENSG00000198211',
 'ENSG00000108395',
 'ENSG00000126759',
 'ENSG00000151498',
 'ENSG00000171847',
 'ENSG00000135932',
 'ENSG00000291431',
 'ENSG00000204571',
 'ENSG00000187416',
 'ENSG00000164078',
 'ENSG00000187605',
 'ENSG00000224103',
 'ENSG00000198792',
 'ENSG00000263296',
 'ENSG00000197050',
 'ENSG00000141316',
 'ENSG00000151413',
 'ENSG00000004478',
 'ENSG00000285397',
 'ENSG00000168930',
 'ENSG00000100253',
 'ENSG00000229597',
 'ENSG00000148308',
 'ENSG00000119669',
 'ENSG00000172992',
 'ENSG00000166938',
 'ENSG00000069535',
 'ENSG00000185883',
 'ENSG00000161057',
 'ENSG00000261247',


In [51]:
# subset the gene file
# embed
prot_embedder = PROTBERT()
prot_embeddings = prot_embedder(
    fasta_path + "subset.fa", output_folder=fasta_path + "esm_out/", cache=True
)
# load the data and erase / zip the rest
utils.utils.run_command(["gzip", fasta_path+fasta_file[:-3]])

gzip: /tmp/data/fasta/Homo_sapiens.GRCh38.pep.all.fa.gz: No such file or directory


KeyboardInterrupt: 

In [101]:
prot_embeddings.columns = prot_embeddings.columns.astype(str)

In [102]:
prot_embeddings.to_parquet(fasta_path + "prot_embeddings.parquet")

In [6]:
prot_embeddings = pd.read_parquet(fasta_path + "prot_embeddings.parquet")

In [8]:
fasta_file

NameError: name 'fasta_file' is not defined

In [9]:
rnagenedf

Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
20,ENSG00000002079,MYH16,,transcribed_unitary_pseudogene,myosin heavy chain 16 pseudogene [Source:HGNC ...,MYH16P|MHC20|MYH5
382,ENSG00000018607,ZNF285CP,,transcribed_unprocessed_pseudogene,"zinc finger protein 285C, pseudogene [Source:H...",ZNF806
399,ENSG00000020219,CCT8L1P,,processed_pseudogene,"chaperonin containing TCP1 subunit 8 like 1, p...",CCT8L1
584,ENSG00000042304,C2orf83,,transcribed_unprocessed_pseudogene,chromosome 2 open reading frame 83 [Source:HGN...,DKFZP547H025
765,ENSG00000056678,KIFC1,3833,lncRNA,kinesin family member C1 [Source:HGNC Symbol;A...,KNSL2|HSET
...,...,...,...,...,...,...
75712,ENSG00000292367,ELOCP24,,processed_pseudogene,elongin C pseudogene 24 [Source:HGNC Symbol;Ac...,TCEB1P24|TCEB1P25
75713,ENSG00000292368,TRPC6P1,,processed_pseudogene,TRPC6 pseudogene 1 [Source:HGNC Symbol;Acc:HGN...,TRPC6P|TRPC6L
75714,ENSG00000292369,,,lncRNA,novel transcript,
75715,ENSG00000292370,WASIR1,,lncRNA,WASH and IL9R antisense RNA 1 [Source:HGNC Sym...,NCRNA00286B


In [14]:
fasta_file =  'Homo_sapiens.GRCh38.ncrna.fa'

In [11]:
[
    file for file in os.listdir(fasta_path) if file.endswith(".ncrna.fa.gz")
]

[]

In [7]:
# return the embedding and gene file
# do the same for RNA
rnagenedf = genedf[genedf["biotype"] != "protein_coding"]
fasta_file = next(
    file for file in os.listdir(fasta_path) if file.endswith(".ncrna.fa.gz")
)
utils.utils.run_command(["gunzip", fasta_path+fasta_file])

StopIteration: 

In [13]:
utils.subset_fasta(
    rnagenedf["ensembl_gene_id"].tolist(),
    subfasta_path=fasta_path + "subset.ncrna.fa",
    fasta_path=fasta_path+fasta_file[:-3],
    drop_unknown_seq=True,
)

NameError: name 'fasta_file' is not defined

In [17]:
rna_embeddings.shape

(10741, 120)

In [24]:
rna_embedder = RNABERT()
rna_embeddings = rna_embedder(fasta_path + "subset.ncrna.fa")
# Check if the sizes of the cembeddings are not the same
utils.utils.run_command(["gzip", fasta_path+fasta_file[:-3]])

  self.max_length = self.config.max_position_embeddings


device:  cuda
-----start-------
> /home/ml4ig1/Documents code/scPRINT/scprint/RNABERT/main.py(81)__call__()
     79 
     80         pdb.set_trace()
---> 81         features = self.make_feature(self.model, dataloader, seqs)
     82         features = np.array([np.array(embedding).sum(0) for embedding in features])
     83 



gzip: /tmp/data/fasta/Homo_sapiens.GRCh38.ncrna: No such file or directory
  utils.utils.run_command(["gzip", fasta_path+fasta_file[:-3]])


1

In [36]:
from torch.nn import AdaptiveAvgPool1d
import torch

In [50]:
emb_size=512

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
ENSG00000105655,0.014333,-0.010633,0.013968,0.030830,0.024772,0.026067,0.051075,-0.031466,0.072350,3.868613e-02,...,-0.006279,0.043659,0.031526,-0.010682,-0.010194,0.026657,0.038430,0.009889,0.021101,-0.006732
ENSG00000244624,0.014796,0.013769,0.033292,-0.003722,0.061670,0.075587,-0.110301,-0.121495,0.145035,-7.702387e-03,...,-0.035593,0.115917,-0.095334,-0.014428,0.028780,-0.007508,-0.039843,0.032129,0.011211,-0.035082
ENSG00000281020,-0.028017,-0.027368,-0.005342,0.043397,0.052755,0.136662,0.085750,0.077198,0.036610,-8.414690e-02,...,-0.097886,0.048462,-0.039388,0.014468,0.038682,-0.022038,-0.065837,0.003867,-0.046382,-0.045088
ENSG00000103510,-0.002395,0.000779,0.006410,0.014631,0.029880,0.016721,0.076056,0.003724,0.035240,5.431109e-02,...,0.004630,0.021648,0.026764,0.021950,-0.020560,0.051313,-0.002275,-0.032072,-0.037507,0.023426
ENSG00000267140,-0.014375,-0.008790,0.033140,0.066227,0.028542,0.040251,0.039163,0.013008,-0.000148,1.982154e-02,...,-0.038076,0.020250,0.085799,0.046093,-0.023583,0.014465,0.047460,0.011711,-0.014058,-0.043279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000201114,1.131868,1.131868,1.131868,1.131868,15.795394,30.458919,30.458919,30.458919,15.229460,-4.687230e-18,...,11.032937,5.062331,-0.908275,-0.908275,-0.908275,-0.453977,0.000321,0.000321,0.000321,0.000321
ENSG00000206659,-5.791424,-5.791424,-5.791424,-5.791424,4.884549,15.560523,15.560523,15.560523,7.780261,4.015249e-18,...,6.063670,-10.920573,-27.904815,-27.904815,-27.904815,-13.952602,-0.000388,-0.000388,-0.000388,-0.000388
ENSG00000252484,17.999512,17.999512,17.999512,17.999512,-3.346543,-24.692599,-24.692599,-24.692599,-12.346300,6.923504e-17,...,20.597301,16.078656,11.560012,11.560012,11.560012,5.778884,-0.002244,-0.002244,-0.002244,-0.002244
ENSG00000274121,0.208830,0.208830,0.208830,0.208830,15.979470,31.750110,31.750110,31.750110,15.875055,1.294281e-17,...,-11.187131,-10.434742,-9.682354,-9.682354,-9.682354,-4.840921,0.000512,0.000512,0.000512,0.000512


In [None]:
if prot_embeddings.shape[1] < emb_size:
    prot_embeddings = inverse_pooling(prot_embeddings, emb_size)
elif prot_embeddings.shape[1] > emb_size:
    prot_embeddings = AvgPool1d(prot_embeddings, emb_size)

if rna_embeddings.shape[1] < emb_size:
    rna_embeddings = inverse_pooling(rna_embeddings, emb_size)
elif rna_embeddings.shape[1] > emb_size:
    rna_embeddings = AvgPool1d(rna_embeddings, emb_size)


## finalize

In [24]:
from scprint.dataloader.embedder import embed
import lnschema_bionty as lb
import pandas as pd
%reload_ext autoreload
%autoreload 2


In [3]:
organism = lb.Organism.filter(ontology_id="NCBITaxon:9606").one()

In [8]:
genedf = lb.Gene.filter(organism_id=organism.id).df()

In [16]:
embeddings = embed(genedf=genedf,
    organism="homo_sapiens",
    cache=False,
    fasta_path="/tmp/data/fasta/",
    embedding_size=1024)

  utils.utils.run_command(["gunzip", fasta_path + fasta_file])


18039  genes had duplicates
dropped 112 weird sequences
> /home/ml4ig1/Documents code/scPRINT/scprint/dataloader/embedder.py(38)embed()
     36 
     37     pdb.set_trace()
---> 38     prot_embeddings = prot_embedder(
     39         fasta_path + "subset.fa", output_folder=fasta_path + "esm_out/", cache=cache
     40     )

> /home/ml4ig1/Documents code/scPRINT/scprint/dataloader/embedder.py(39)embed()
     37     pdb.set_trace()
     38     prot_embeddings = prot_embedder(
---> 39         fasta_path + "subset.fa", output_folder=fasta_path + "esm_out/", cache=cache
     40     )
     41     # load the data and erase / zip the rest

> /home/ml4ig1/Documents code/scPRINT/scprint/dataloader/embedder.py(38)embed()
     36 
     37     pdb.set_trace()
---> 38     prot_embeddings = prot_embedder(
     39         fasta_path + "subset.fa", output_folder=fasta_path + "esm_out/", cache=cache
     40     )

running protbert
b'Transferred model to GPU'
b'Read /tmp/data/fasta/subset.fa with 23149 s

  run_command(cmd, shell=True)


> /home/ml4ig1/Documents code/scPRINT/scprint/dataloader/embedder.py(42)embed()
     40     )
     41     # load the data and erase / zip the rest
---> 42     utils.utils.run_command(["gzip", fasta_path + fasta_file[:-3]])
     43     # return the embedding and gene file
     44     # do the same for RNA



  utils.utils.run_command(["gzip", fasta_path + fasta_file[:-3]])


> /home/ml4ig1/Documents code/scPRINT/scprint/dataloader/embedder.py(45)embed()
     43     # return the embedding and gene file
     44     # do the same for RNA
---> 45     rnagenedf = genedf[genedf["biotype"] != "protein_coding"]
     46     fasta_file = next(
     47         file for file in os.listdir(fasta_path) if file.endswith(".ncrna.fa.gz")

> /home/ml4ig1/Documents code/scPRINT/scprint/dataloader/embedder.py(46)embed()
     44     # do the same for RNA
     45     rnagenedf = genedf[genedf["biotype"] != "protein_coding"]
---> 46     fasta_file = next(
     47         file for file in os.listdir(fasta_path) if file.endswith(".ncrna.fa.gz")
     48     )

> /home/ml4ig1/Documents code/scPRINT/scprint/dataloader/embedder.py(47)embed()
     45     rnagenedf = genedf[genedf["biotype"] != "protein_coding"]
     46     fasta_file = next(
---> 47         file for file in os.listdir(fasta_path) if file.endswith(".ncrna.fa.gz")
     48     )
     49     utils.utils.run_command(["gunzi

gzip: /tmp/data/fasta/Homo_sapiens.GRCh38.ncrna.fa already exists;	not overwritten
  utils.utils.run_command(["gunzip", fasta_path + fasta_file])


6781  genes had duplicates
dropped 0 weird sequences


  self.config = get_config(file_path=config)


device:  cuda
-----start-------
> /home/ml4ig1/Documents code/scPRINT/scprint/RNABERT/main.py(81)__call__()
     79         features = np.array([np.array(embedding).sum(0) for embedding in features])
     80 
---> 81         return pd.DataFrame(features, index=names)
     82 
     83     def make_feature(self, model, dataloader, seqs):

> /home/ml4ig1/Documents code/scPRINT/scprint/RNABERT/main.py(82)__call__()
     80 
     81         return pd.DataFrame(features, index=names)
---> 82 
     83     def make_feature(self, model, dataloader, seqs):
     84         model.eval()



gzip: /tmp/data/fasta/Homo_sapiens.GRCh38.ncrna.fa.gz already exists;	not overwritten
  utils.utils.run_command(["gzip", fasta_path + fasta_file[:-3]])


In [17]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
ENSG00000105655,0.033054,0.014233,-0.005222,-0.004395,-0.002865,-0.012969,0.077506,0.022427,0.035671,0.022441,...,0.004184,0.024460,0.049430,0.016470,0.038377,0.006617,0.027568,-0.023355,-0.023004,-0.014181
ENSG00000170632,0.060958,0.041447,0.029791,0.008027,-0.015393,0.027763,0.059350,0.090556,-0.007572,0.037182,...,-0.002642,0.009855,0.041328,-0.062031,-0.058393,0.032410,-0.023121,-0.014434,-0.015568,0.006333
ENSG00000274611,0.059348,0.029684,0.033241,-0.017488,0.007841,0.077359,0.101719,0.097066,0.044031,0.098204,...,0.002306,-0.005977,-0.047774,-0.083684,-0.103021,-0.008538,0.062130,0.010444,-0.022874,0.001780
ENSG00000125772,0.022788,-0.003619,0.005735,-0.026457,0.006911,0.001660,0.000381,0.034589,0.031418,0.072625,...,0.007978,0.018537,0.028154,-0.031829,0.012631,0.059014,0.021746,-0.061535,-0.102304,-0.021641
ENSG00000158270,-0.002063,-0.045581,0.010904,0.018350,0.040196,0.097059,0.049399,0.074891,0.022556,0.036207,...,0.027617,-0.019420,0.019151,-0.035153,-0.060060,0.013986,-0.066521,-0.054233,-0.008349,0.090384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000201114,1.131868,1.131868,1.131868,1.131868,1.131868,1.131868,1.131868,1.131868,15.795394,30.458919,...,-0.908275,-0.453977,0.000321,0.000321,0.000321,0.000321,0.000321,0.000321,0.000321,0.000321
ENSG00000206659,-5.791424,-5.791424,-5.791424,-5.791424,-5.791424,-5.791424,-5.791424,-5.791424,4.884549,15.560523,...,-27.904815,-13.952602,-0.000388,-0.000388,-0.000388,-0.000388,-0.000388,-0.000388,-0.000388,-0.000388
ENSG00000252484,17.999512,17.999512,17.999512,17.999512,17.999512,17.999512,17.999512,17.999512,-3.346543,-24.692599,...,11.560012,5.778884,-0.002244,-0.002244,-0.002244,-0.002244,-0.002244,-0.002244,-0.002244,-0.002244
ENSG00000274121,0.208830,0.208830,0.208830,0.208830,0.208830,0.208830,0.208830,0.208830,15.979470,31.750110,...,-9.682354,-4.840921,0.000512,0.000512,0.000512,0.000512,0.000512,0.000512,0.000512,0.000512


In [20]:
genedf = genedf.set_index('ensembl_gene_id')

In [25]:
genedf = pd.concat([genedf.loc[embeddings.index], embeddings], axis=1, join='inner')

In [27]:
genedf

Unnamed: 0,uid,symbol,stable_id,ncbi_gene_ids,biotype,description,synonyms,organism_id,bionty_source_id,updated_at,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
ENSG00000105655,lc640mxPf8qF,ISYNA1,,51477,protein_coding,inositol-3-phosphate synthase 1 [Source:HGNC S...,INO1|INOS|IPS,2,9.0,2023-11-22 13:16:56.558676+00:00,...,0.004184,0.024460,0.049430,0.016470,0.038377,0.006617,0.027568,-0.023355,-0.023004,-0.014181
ENSG00000170632,CznlMn2vaNw7,ARMC10,,83787,protein_coding,armadillo repeat containing 10 [Source:HGNC Sy...,SVH|MGC3195,2,9.0,2023-11-22 13:16:57.139114+00:00,...,-0.002642,0.009855,0.041328,-0.062031,-0.058393,0.032410,-0.023121,-0.014434,-0.015568,0.006333
ENSG00000274611,Ti9CWRipX0xM,TBC1D3,,729873,protein_coding,TBC1 domain family member 3 [Source:HGNC Symbo...,DKFZP434P2235|PRC17|TBC1D3A,2,9.0,2023-11-22 13:17:00.027097+00:00,...,0.002306,-0.005977,-0.047774,-0.083684,-0.103021,-0.008538,0.062130,0.010444,-0.022874,0.001780
ENSG00000125772,ecy2QwNwMARj,GPCPD1,,56261,protein_coding,glycerophosphocholine phosphodiesterase 1 [Sou...,KIAA1434|GDPD6|GDE5,2,9.0,2023-11-22 13:16:56.785577+00:00,...,0.007978,0.018537,0.028154,-0.031829,0.012631,0.059014,0.021746,-0.061535,-0.102304,-0.021641
ENSG00000158270,0Q6kihSvsAHD,COLEC12,,81035,protein_coding,collectin subfamily member 12 [Source:HGNC Sym...,CL-P1|SCARA4|SRCL,2,9.0,2023-11-22 13:16:57.006646+00:00,...,0.027617,-0.019420,0.019151,-0.035153,-0.060060,0.013986,-0.066521,-0.054233,-0.008349,0.090384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000201114,JxKILeF1CGa7,Y_RNA,,,misc_RNA,Y RNA [Source:RFAM;Acc:RF00019],,2,9.0,2023-11-22 13:16:57.439984+00:00,...,-0.908275,-0.453977,0.000321,0.000321,0.000321,0.000321,0.000321,0.000321,0.000321,0.000321
ENSG00000206659,sR265qYPZy70,Y_RNA,,,misc_RNA,Y RNA [Source:RFAM;Acc:RF00019],,2,9.0,2023-11-22 13:16:57.866342+00:00,...,-27.904815,-13.952602,-0.000388,-0.000388,-0.000388,-0.000388,-0.000388,-0.000388,-0.000388,-0.000388
ENSG00000252484,PEY7YG25HysR,RN7SKP49,,,misc_RNA,RN7SK pseudogene 49 [Source:HGNC Symbol;Acc:HG...,,2,9.0,2023-11-22 13:16:58.970772+00:00,...,11.560012,5.778884,-0.002244,-0.002244,-0.002244,-0.002244,-0.002244,-0.002244,-0.002244,-0.002244
ENSG00000274121,UNgHRjTeGOgl,,,,misc_RNA,,,2,9.0,2023-11-22 13:17:00.007539+00:00,...,-9.682354,-4.840921,0.000512,0.000512,0.000512,0.000512,0.000512,0.000512,0.000512,0.000512


In [30]:
genedf.columns = genedf.columns.astype(str)

In [31]:
genedf.to_parquet('../../data/temp/genesdf.parquet')