## Preprocessing notebooks that makes expression matrix analysis for further ##

In [1]:
import polars as pl
from pathlib import Path
from pycomfort.files import *
from pycomfort import files
import pyarrow
import pandas as pd
from functional import seq
from typing import *
import functools

In [2]:
import genomepy
from genotations import *
from genotations import ensembl
from genotations.genomes import *
from genotations.quantification import *

  from .autonotebook import tqdm as notebook_tqdm


Adding source paths

In [3]:
import sys

base = Path("..")
local = (base / "dashboard").resolve()
if local.exists():
    sys.path.insert(0, Path("..").absolute().as_posix())
    sys.path.insert(0, local)
    print(sys.path)
else:
    base = Path(".")
%load_ext autoreload
%autoreload 2

[PosixPath('/home/antonkulaga/expressions-dashboard/dashboard'), '/home/antonkulaga/expressions-dashboard/notebooks/..', '/home/antonkulaga/expressions-dashboard/notebooks', '/home/antonkulaga/micromamba/envs/dashboard/lib/python310.zip', '/home/antonkulaga/micromamba/envs/dashboard/lib/python3.10', '/home/antonkulaga/micromamba/envs/dashboard/lib/python3.10/lib-dynload', '', '/home/antonkulaga/micromamba/envs/dashboard/lib/python3.10/site-packages']


Configure polars to display more info in notebooks

In [4]:
pl.Config.set_tbl_width_chars(10000)
pl.Config.set_fmt_str_lengths(1000)
pl.Config.set_tbl_rows(20)

polars.cfg.Config

In [5]:
from dashboard.prepare import *
from dashboard.models import *

## Setting up Paths

In [6]:
data = base / "data"
inputs = data / "inputs"
inputs.mkdir(exist_ok=True)
interim = data / "interim"
interim.mkdir(exist_ok=True)
output = data / "output"
output.mkdir(exist_ok=True)

## Loading data

In [10]:
skip_if_exists: bool = True

In [38]:
bioprojects: list[Bioproject] = with_ext(inputs, ".tsv").map(lambda p: Bioproject(p.stem, inputs)).to_list()
bioprojects

[<dashboard.models.Bioproject at 0x7f8053b15900>,
 <dashboard.models.Bioproject at 0x7f8053b16bf0>,
 <dashboard.models.Bioproject at 0x7f8053b15330>]

In [40]:
p = bioprojects[0]
p.genes

gene,gene_name
str,str
"""ENSMUSG00000100595""","""Gm19087"""
"""ENSMUSG00000097426""","""Gm8941"""
"""ENSMUSG00000104385""","""Gm7449"""
"""ENSMUSG00000101231""","""Gm28283"""
"""ENSMUSG00000102135""","""Gm37108"""
"""ENSMUSG00000103282""","""Gm37275"""
"""ENSMUSG00000101097""","""Gm6679"""
"""ENSMUSG00000102534""","""Gm37225"""
"""ENSMUSG00000100831""","""Gm17847"""
"""ENSMUSG00000100884""","""Gm28281"""


In [30]:
load(bioprojects[0]).annotations_df.join(bioprojects[0].genes, on="gene")

gene,gene_name,SRR9089320,SRR9089321,SRR9089322,SRR9089323,SRR9089324,SRR9089319
str,str,f64,f64,f64,f64,f64,f64


In [37]:
quants_from_bioproject(Path("/data/samples/muscle_differentiation/PRJNA797288"), name_part="quant.genes.sf")

OrderedDict([('SRR17619478',
              shape: (116357, 4)
              ┌────────────────────┬──────────┬─────────────────┬──────────┐
              │ gene               ┆ TPM      ┆ EffectiveLength ┆ NumReads │
              │ ---                ┆ ---      ┆ ---             ┆ ---      │
              │ str                ┆ f64      ┆ f64             ┆ f64      │
              ╞════════════════════╪══════════╪═════════════════╪══════════╡
              │ ENSMUST00000213427 ┆ 0.0      ┆ 4.0             ┆ 0.0      │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
              │ ENSMUST00000117697 ┆ 0.0      ┆ 688.0           ┆ 0.0      │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
              │ ENSMUST00000214850 ┆ 0.0      ┆ 6.0             ┆ 0.0      │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
              │ ENSMUST00000121437 ┆ 0.0      ┆ 14.0            ┆ 0.0      │
              

In [35]:
tprint(Path("/data/samples/muscle_differentiation/PRJNA797288"))

PRJNA797288
	SRR17619478
		SRR17619478.json
		quant_SRR17619478
			cmd_info.json
			lib_format_counts.json
			quant.sf
			quant.genes.sf
			aux_info
				expected_bias.gz
				observed_bias.gz
				observed_bias_3p.gz
				unmapped_names.txt
				obs_gc.gz
				ambig_info.tsv
				meta_info.json
				exp_gc.gz
				fld.gz
				obs5_seq.gz
				exp3_seq.gz
				obs3_seq.gz
				exp5_seq.gz
				bootstrap
					names.tsv.gz
					bootstraps.gz
			logs
				salmon_quant.log
			libParams
				flenDist.txt
		report
			fastp.html
			fastp.json
	SRR17619477
		SRR17619477.json
		quant_SRR17619477
			cmd_info.json
			lib_format_counts.json
			quant.sf
			quant.genes.sf
			aux_info
				expected_bias.gz
				observed_bias.gz
				observed_bias_3p.gz
				unmapped_names.txt
				obs_gc.gz
				ambig_info.tsv
				meta_info.json
				exp_gc.gz
				fld.gz
				obs5_seq.gz
				exp3_seq.gz
				obs3_seq.gz
				exp5_seq.gz
				bootstrap
					names.tsv.gz
					bootstraps.gz
			logs
				salmon_quant.log
			libParams
				flenDist.t

In [24]:
expressions: pl.DataFrame = expressions_from_bioproject(Path("/data/samples/muscle_differentiation")/ "PRJNA543661", True)
expressions

transcript,SRR9089320,SRR9089321,SRR9089322,SRR9089323,SRR9089324,SRR9089319
str,f64,f64,f64,f64,f64,f64
"""ENSMUST00000178537""",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUST00000178862""",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUST00000196221""",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUST00000179664""",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUST00000177564""",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUST00000179520""",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUST00000179883""",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUST00000195858""",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUST00000179932""",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUST00000180001""",0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
genome = genomes.mouse.genome
annotations = genomes.mouse.annotations

In [26]:
summarized_expressions = with_expressions_summaries(expressions, pl.col("^SRR[a-zA-Z0-9]+$"))
summarized_expressions

transcript,SRR9089320,SRR9089321,SRR9089322,SRR9089323,SRR9089324,SRR9089319,sum_TPM,avg_TPM
str,f64,f64,f64,f64,f64,f64,f64,f64
"""ENSMUST00000082407""",51402.680674,56580.70452,55664.676759,52401.340584,47473.629331,49060.764263,312583.796131,52097.299355
"""ENSMUST00000188837""",19676.999212,19434.871789,22760.88867,16397.931394,18245.143152,18753.311916,115269.146133,19211.524356
"""ENSMUST00000227614""",16742.068951,16529.718368,16175.696122,17681.900143,19585.856418,22166.156102,108881.396104,18146.899351
"""ENSMUST00000118430""",11057.544635,7900.532889,11868.707526,17289.49997,15998.877962,21687.356475,85802.519457,14300.419909
"""ENSMUST00000121411""",12830.31681,11606.327118,13425.557301,13310.650102,13546.985701,14931.569663,79651.406695,13275.234449
"""ENSMUST00000176683""",7073.435975,6909.161714,7750.666221,9266.199011,9618.407732,9798.51455,50416.385203,8402.730867
"""ENSMUST00000042235""",7340.308323,7825.27558,7811.268633,7637.342655,7882.101002,7930.875465,46427.171658,7737.861943
"""ENSMUST00000082408""",7366.565203,7763.058704,7941.216947,7413.65407,6929.471648,7280.120767,44694.087339,7449.014557
"""ENSMUST00000082409""",6368.531742,7194.357054,7830.050186,7552.675006,7082.670636,7497.24198,43525.526604,7254.254434
"""ENSMUST00000178135""",5603.666849,4691.558245,7042.249709,5539.936095,7539.215868,9280.22131,39696.848076,6616.141346


In [28]:
with_exons_info = annotations.with_genes_transcripts_exons_coordinates_only().extend_with_annotations(summarized_expressions)
with_exons_info.write_parquet("/data/test.parquet")

In [None]:
with_exons_info#%% md
## Preprocessing notebooks that makes expression matrix analysis for further ##

In [None]:
import polars as pl
from pathlib import Path
from pycomfort.files import *
from pycomfort import files
import pyarrow
import pandas as pd
from functional import seq
from typing import *
import functools

In [None]:
import genomepy
from genotations import *

Adding source paths

In [None]:
import sys

base = Path("..")
local = (base / "dashboard").resolve()
if local.exists():
    sys.path.insert(0, Path("..").absolute().as_posix())
    sys.path.insert(0, local)
    print(sys.path)
else:
    base = Path(".")
%load_ext autoreload
%autoreload 2

Configure polars to display more info in notebooks

In [None]:
pl.Config.set_tbl_width_chars(10000)
pl.Config.set_fmt_str_lengths(1000)
pl.Config.set_tbl_rows(20)

In [None]:
from dashboard.prepare import *

## Setting up Paths

In [None]:
samples = Path("/") / "data" / "samples" / "muscle_differentiation"
bioprojects = dirs(samples)

In [None]:
data = base / "data"
inputs = data / "inputs"
inputs.mkdir(exist_ok=True)
interim = data / "interim"
interim.mkdir(exist_ok=True)
output = data / "output"
output.mkdir(exist_ok=True)

## Loading data

In [None]:
skip_if_exists: bool = True

In [None]:
expressions: pl.DataFrame = expressions_from_bioproject(Path("/data/samples/muscle_differentiation")/ "PRJNA543661", True)
expressions

In [None]:
genome = genomes.mouse.genome
annotations = genomes.mouse.annotations

In [None]:
summarized_expressions = with_expressions_summaries(expressions, pl.col("^SRR[a-zA-Z0-9]+$"))
summarized_expressions

In [31]:
with_exons_info = annotations.with_genes_transcripts_exons_coordinates_only().extend_with_annotations_and_sequences(summarized_expressions, genome)
with_exons_info.head(10)

There are (780104, 15) annotations, loading sequences can take quite a while!


gene,gene_name,transcript,transcript_name,exon,transcript_exon,coordinates,SRR9089320,SRR9089321,SRR9089322,SRR9089323,SRR9089324,SRR9089319,sum_TPM,avg_TPM,sequence
str,str,str,str,str,str,list[str],f64,f64,f64,f64,f64,f64,f64,f64,object
"""ENSMUSG00000100595""","""Gm19087""","""ENSMUST00000191430""","""Gm19087-201""","""ENSMUSE00001324949""","""Gm19087-201_1""","[""1"", ""+"", ... ""150984611""]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TGTTGCTATGGACTGTGTATGATGTCCTTCCAACTTAAGAGAGAGAGGCATTACTTCATGTATATATTTTTAGGTCACACCCATCTTACCCTTGGTTCTTCTGATGCTTGGGATTAATAGTCCAGCATTGCAGATACaaaaaaaaaaaaaaaaaGACCCTGATATTATATATAAAGAGGGGAATTTGCATTCTAAATTGTTGGGGACCATTCCTCCTAGGAGAAACAGttttttttttttttttttttttttttttttttttttttttttttttACAAGAAGTCAGAAATCTTGTTGTTACTGCTGAAGAACAGAGACTATCCCCTGGCCGTGGGCTCAATCCCAGGAGGTGTCTTAGAGGATATGAAAATTTGCTCTTGGACTAAAACTCTAAGCATCAGAATGTAATGTTGATAGAAACAGTGAGCATCCCTCTACACCCTCAAATGTTGAGCGTCCATTAGATGGAAAGATTTTATATGTCAAAAAGATTCCAGTATGGGATTTCATTTTGAACAAGATCTTAACAAGAAATCAGTGGCCACTTTAGTATTGGGTATGCTTCTCCAGTACACAGTGACACCTGAAAGCAACTGGCAGAGAAGTTGGTGGTGACAGGTGGCACCTCTGTGTTACCAGGATCTTCACAGATCATAAACAGAAGGGACTTGGtaaatatataactatataaatataAAACAATTTGTGACCCCAAGGCACTCTGAATTCATATTCCTCCTGCAGAGGCTCATTCTTTAACCTGGTTGGGAGGGATTAGTTTGGAACCATCACAAGGTATCCTTAAGAACCATTCAGCTTCAAAAGAATATTGTAATTAGATGGGCTGTATACGGATTGGGGTTTTCTCAACAAATCACTGTTAGAAACAGTATTTAATATGGGAAATTACAGTCACCGCTGATGAAGAGAGTGTTTTCCATTGAGA
"""ENSMUSG00000097426""","""Gm8941""","""ENSMUST00000181451""","""Gm8941-201""","""ENSMUSE00001158900""","""Gm8941-201_1""","[""1"", ""+"", ... ""151012971""]",0.0,0.0,0.0,0.0,0.0,0.307963,0.307963,0.051327,ATGGTCTACATGTTCCAGTATGACTCTCTACCCATGACAAGTTCAACGGCATAGTCAAGGCTGAGAATGGGAAGCTTGCCATCAACAGGAAGGCCATCACTTTCTTCCAGGAGTGGGATCTCACTAACATCAAATGGGGTGACGCTGTTGCTGAGTATGTTGTGTAGTGTACTGGCATCTTTACCACCATGGAGAAGGCTGGGGACCACTTGAAGAGTGGTGCCAAAAGAGTTATTATATCTGTCCCTTCTGCTGATGTTCCCATGTTTGTGATGGGTGTGAACCACAAGAAGTATGTCTGCAACACAAAATTGTCTGCAATGCTTCCTGCACCACCAACTGCTTAGCCCCACCCACCCCCTTGCCAAGGTCATCCATGACAACTTTGGCATTGTGGAAGGATTCCTTACCACAGTCGATGCCATCACTGCCACCCAGAAGACTGTGGATTGTCCCTCTGGAAAGCTGTGGCGTGATGGCCAAGGGACTGCCCAGAACATCATCCCTGCATCCACTGGTGATGTCAAGACTGTGGGCAAGGTCATCCCAGAGCTGAACAGGAAGCTCACTGTCATGGCCTTCCATGTTCCTACCCCCAATGTATCCGTTGTAGATCTATCATGCCGCCTAGAGAAACCTGCCAAGTATGATGACATCAAGAAAGTGGTGAAGCAGGCATCCTGGGGCCTACTAAAGGGCACTCTGGGATtgtac
"""ENSMUSG00000097426""","""Gm8941""","""ENSMUST00000181451""","""Gm8941-201""","""ENSMUSE00001115039""","""Gm8941-201_2""","[""1"", ""+"", ... ""151013531""]",0.0,0.0,0.0,0.0,0.0,0.307963,0.307963,0.051327,GGCTACACTAAGAACCAGGTTGTCTCCTGCGACTTCAACAGTAACTCCCACTCTTCCACCTTTGATGCTGGGACTGGCATTGTTCTCACTGACAACTTTGTAAAGCCCATTTCCTGTTATGACAATGAATATGGCTACAGCAGCAGGGTGGTAGACCTCATGGCCTACATAGCCTCAAGGAGTAA
"""ENSMUSG00000104385""","""Gm7449""","""ENSMUST00000194393""","""Gm7449-201""","""ENSMUSE00001339893""","""Gm7449-201_1""","[""1"", ""+"", ... ""6981446""]",0.119103,0.257547,0.0,0.154863,0.0,0.0,0.531513,0.0885855,ATGGGCAAAGGAGACCTTAAGAAACCAAGAGGCAAAACGTCCTCCTCATATACATGCTTTGTGCAAACCTGTTGGGAGAAGCACAAGAAGAAGCACCTGGATGCTTCAGTCAACTTCTCAGAGTTCTCCAAGAAGTGCTCAGAGAAGTGGAAGACCATGTCTGCTaaaaaaaaaaaaaaaGTGGGGGGAGGGATTTGAACATATGGCAAAGGCTGACAAGGCTCGTTATGAAAGAGAAATGAAAACCTACATCCCTCCCAAAGGGGAGACCAGAAAGAAGTTCAAGGACCCCAATGCACCCAAGAAGCTTACTTTGGCCTTCTTCTTGCTCTGTTCTAAGTACCGCTCCAAAATTAAAGGCGAGCATCCTGGCTTTTTCATTGGTGATGTTGCAAAGAAACTAGGAAAGATGTGGAATAACACTGCAGTGGATGACAAGCAGCCCTATGAGAAGAAGGCTGTCAAGCTGAAGGAGAAGTACAAGAAGAATATTGCTGCCTACAGAGATGAAGGAAAACCTGATGCAGTgaaaagagggtggtcaaggctgaaaagagcaagaaaaagaaggaagaggaagatgaggaggaggatgaagagcatgaggaagaggagtaagaagaagagatgaagatgatgatgaagaagatgatgatgaataag
"""ENSMUSG00000101231""","""Gm28283""","""ENSMUST00000185509""","""Gm28283-201""","""ENSMUSE00001325295""","""Gm28283-201_1""","[""1"", ""-"", ... ""108540244""]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CTGCAAAGAATGTAAGAATTGAGCTTGATTACATTAAATGTGACTGTTTGAAATGGAAAAAACACATCTTTGGTGTTTTCAAATTGAAGGAGATGATCATTTTCATTATTTGTGTCCCCTTTAACTTCTGCTTGAGCAAGCAACagagaggaggagagagagGCTGGAGCAAAGATAT
"""ENSMUSG00000102135""","""Gm37108""","""ENSMUST00000194605""","""Gm37108-201""","""ENSMUSE00001339810""","""Gm37108-201_1""","[""1"", ""+"", ... ""6987219""]",0.722537,0.188472,0.854944,0.803678,0.719618,0.200622,3.489871,0.581645,ATGGGCATCTCTCGGGACAACTGGCACAAGCGCCGCAAGACCGGGGGTAAGCGAAAACCCTACCACAAGAAGCGAAAGTATGAGCTGGGAGGGCCCGCTGCCAACACAAAGATTGGCCCTCGCCACATACACACAGTCCGAGTTCGAGGAGGCAATAAGAAGTACCGTGCCCTGAGATTGGATATGGGGAACTTTTCCTGGGGCTCTGAGTGTTGTACTCGCAAAACAAGGATCATTGATGTTGTCTACAATGCATCCAACAACGAGCTTGTCTGCACCAAGACCCTGGTGAAGAACTGCATTGTGCTTATTGACAGCACGCCGTACTGACAGTGGTACGAGTCCCACTATGCACTGCCCCTGGGCCTCAAGAAGGGGGCCAAGCTGACTCCTGAGGAGGAAGAGATTTTaaacaaaaaacgatcaaagaaaattca
"""ENSMUSG00000102135""","""Gm37108""","""ENSMUST00000194605""","""Gm37108-201""","""ENSMUSE00001343179""","""Gm37108-201_2""","[""1"", ""+"", ... ""6993812""]",0.722537,0.188472,0.854944,0.803678,0.719618,0.200622,3.489871,0.581645,aagaaatacgacaaaaggaaaaagaatgccaaaaTCAGCAGCCTCCTGGAGGAGCAGTTCCAGCAGGGCAAGCTTCTAGCCTGTATTGCCTCACGACCAGGCCAGTGTGGCAGAGCAGACGGCTATGTGCTCGAAGGCAAGGAGCTGGAGTTCTATCTGCGGATGATCAAAGCCCGGAAAGGCAAATGA
"""ENSMUSG00000103282""","""Gm37275""","""ENSMUST00000191703""","""Gm37275-201""","""ENSMUSE00001337469""","""Gm37275-201_1""","[""1"", ""+"", ... ""7000012""]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CCACCTGACAGAGGAAGAAAATGGCAAATG
"""ENSMUSG00000101097""","""Gm6679""","""ENSMUST00000191467""","""Gm6679-201""","""ENSMUSE00001335952""","""Gm6679-201_1""","[""1"", ""+"", ... ""108699733""]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,atGTCAAAAGGTGTAGGTGATGAAGGTTCATCTAGTCATAATAAATCAAAGGTTACAATGTCTACCTCAGACTCTGGAAGTAGAAACAGATCTGAATTATTTTAAACCTTAAATGGGTCATCTGTTGACTCACAACAATCTCAGTCCATAAATCCATGGTATATTGATGAAGTTGCAGAAGAAACTGCGAAGTCACTTACAGAGAGGTCTTCAACTTTGGACGTTCATCACCTCCACCACAATGTCCTTCCATGAACTCCTTGTCTAGGAAGAACAGATTTCACTCCTTACCCTTTTGCTTGACAAAGATGTCCAATATTAATGACAGCATGGCTCATAGTCCACTCTTTCTGTCACTGCAGTCTGTGGTGGGGGAGCTGAACAGCACACCTGTCCAGGAGAGTCCACCCTCGACCATCTCTTCTGGGAATGCATATGGGCTAGAGGTGGGCTCACTGGTTGATGTAAAAGAGGACCCCCTGTTCTATGGGGTTATCTGTTGGATTTGGCAGCCACCAGGGCTCAGTGACGTGCTAGCTGGACTGAAACTGAATGATGAACGCACAGGCTGTACAGATAGAACTTTCAGGTGCAACTGCTATTTCACCTGTGCCCTGAAGAAGGCACTGTTTGTGAAACTAAAGAGCTGCAGACCGACCAGACTATAGGTTTGAATACTTGTAGTCTGTTTTCACTCAGACTGAAATATGTAACTCTTTAGCATTTGGGGGCTACTTACATGAAGTAGTAGAAGAAAATAATCCATCTAAAATGGAAAAGGAAGGTTTAGAGATAATGATTGGAAAAAACAAAAGACATCCAGGGCAATTACAATTCTTATTACTTATACTCAACTTTGTTCTACTTATTTGCTTTTGTTTTGCCCTGGACACTGTGCTACTTAGACCCAAAGAGAATGATGTGGAATATCAAGAGCTACTAAGGACAGAGATAGTCAATCCTCTGAGAATATATGGATATGTGTGTGCCACAAACATTATGAAACTGAGGAAAACAGTTTAAAATGTTGAGGTTGAATCAGAATTTACTTCTGAAGAAAAAGATCCTGAAGAGTTTCTAAATATCTTGTTTCATGATATTTTAAGGTTTGAACCATTGTTAAAAATAAGGTCAGCCAGTCAAAAAGCTCACGACTGTAACTTTTGTCAAATTTTTAATAAAAGTTGGAGTACCCACAAATCAGCAATTAGAATGGTCTTTTATCAACAGCAACCAGAAATTTGCAGTGGCGTCATCATGGTTGATTATCCAGTTGCCTCAGTTTGGAAAAGTCTTAAGACTAAtttttttttCCTTTCCTGGAATTAAATATAACAGATTTACTTGAAGGCACTCACAGGCAGTGCCACATCTGTGGAGGACTTGCGATCTATAAGTCAACAGAGTACTATGACAACCCAGACATGTCAGTCGGGAAGATCAAACACTTCTGCAAGACTGGCAGCACCTGGGTTCTCCTTCATCTCAGAAGGCTGAATCATACTTACCACCCAGTATCACTTAAAAAGACTTGTCTGACTGGGACTGCAGACTTGTCTGCATCCCCTGACAGAAGATGAAGTCATCTGCTGTTCTCTCCATAGAAACTAGCCACTATGGTACTTTTGTGAAGTACGGGAAAGATGTCTCTGTCTGGCTCTTCTTTGACAGTATGACAGATCGAGATGGCTGTCAGAATGACTTCAAAATTCCACAAGTGAAGGCCTGCCCAGAAGTGGGAGAGTACTTAAAGGTATCTCTGGAGGACCTGAACTCTTTGGACTCCAGAAGGATTCAAGGCTGTGGACGCAGACTTCTTGCAATGCATACATGTGCATGTACCAAAGTCCAACCATGAGCTTGTACAAA
"""ENSMUSG00000102534""","""Gm37225""","""ENSMUST00000193423""","""Gm37225-201""","""ENSMUSE00001337169""","""Gm37225-201_1""","[""1"", ""+"", ... ""7068765""]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GCTGAAGTAGAACAGAAGACGCTGACCTTCCGCAGGTTCACTTACTGTGGCGTGGACCTAGACTATCATCTGCTAGACATGTCCAATGAGCAGCTGGTGCAGTTGTCCAGCACTGACAGTGGCAGAGCCTGAACTGTGGTCTGGAGCAGAAGCAGCATTCACTACTCAAGAGCTTGAGACAGGCCAAGAATGAGGAGCCACCCATGGAGAAGCCCAAGGTAGTGAAGACCCACCTGAGGGACATGATTATCCTGCCTGAGATGGTAGGcagtatggtgggtgtgtagaccttaaatcaggtggagatcaaaccagagataattgg


In [35]:
with_exons_info.drop("coordinates")

gene,gene_name,transcript,transcript_name,exon,transcript_exon,SRR9089320,SRR9089321,SRR9089322,SRR9089323,SRR9089324,SRR9089319,sum_TPM,avg_TPM,sequence
str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,object
"""ENSMUSG00000100595""","""Gm19087""","""ENSMUST00000191430""","""Gm19087-201""","""ENSMUSE00001324949""","""Gm19087-201_1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TGTTGCTATGGACTGTGTATGATGTCCTTCCAACTTAAGAGAGAGAGGCATTACTTCATGTATATATTTTTAGGTCACACCCATCTTACCCTTGGTTCTTCTGATGCTTGGGATTAATAGTCCAGCATTGCAGATACaaaaaaaaaaaaaaaaaGACCCTGATATTATATATAAAGAGGGGAATTTGCATTCTAAATTGTTGGGGACCATTCCTCCTAGGAGAAACAGttttttttttttttttttttttttttttttttttttttttttttttACAAGAAGTCAGAAATCTTGTTGTTACTGCTGAAGAACAGAGACTATCCCCTGGCCGTGGGCTCAATCCCAGGAGGTGTCTTAGAGGATATGAAAATTTGCTCTTGGACTAAAACTCTAAGCATCAGAATGTAATGTTGATAGAAACAGTGAGCATCCCTCTACACCCTCAAATGTTGAGCGTCCATTAGATGGAAAGATTTTATATGTCAAAAAGATTCCAGTATGGGATTTCATTTTGAACAAGATCTTAACAAGAAATCAGTGGCCACTTTAGTATTGGGTATGCTTCTCCAGTACACAGTGACACCTGAAAGCAACTGGCAGAGAAGTTGGTGGTGACAGGTGGCACCTCTGTGTTACCAGGATCTTCACAGATCATAAACAGAAGGGACTTGGtaaatatataactatataaatataAAACAATTTGTGACCCCAAGGCACTCTGAATTCATATTCCTCCTGCAGAGGCTCATTCTTTAACCTGGTTGGGAGGGATTAGTTTGGAACCATCACAAGGTATCCTTAAGAACCATTCAGCTTCAAAAGAATATTGTAATTAGATGGGCTGTATACGGATTGGGGTTTTCTCAACAAATCACTGTTAGAAACAGTATTTAATATGGGAAATTACAGTCACCGCTGATGAAGAGAGTGTTTTCCATTGAGA
"""ENSMUSG00000097426""","""Gm8941""","""ENSMUST00000181451""","""Gm8941-201""","""ENSMUSE00001158900""","""Gm8941-201_1""",0.0,0.0,0.0,0.0,0.0,0.307963,0.307963,0.051327,ATGGTCTACATGTTCCAGTATGACTCTCTACCCATGACAAGTTCAACGGCATAGTCAAGGCTGAGAATGGGAAGCTTGCCATCAACAGGAAGGCCATCACTTTCTTCCAGGAGTGGGATCTCACTAACATCAAATGGGGTGACGCTGTTGCTGAGTATGTTGTGTAGTGTACTGGCATCTTTACCACCATGGAGAAGGCTGGGGACCACTTGAAGAGTGGTGCCAAAAGAGTTATTATATCTGTCCCTTCTGCTGATGTTCCCATGTTTGTGATGGGTGTGAACCACAAGAAGTATGTCTGCAACACAAAATTGTCTGCAATGCTTCCTGCACCACCAACTGCTTAGCCCCACCCACCCCCTTGCCAAGGTCATCCATGACAACTTTGGCATTGTGGAAGGATTCCTTACCACAGTCGATGCCATCACTGCCACCCAGAAGACTGTGGATTGTCCCTCTGGAAAGCTGTGGCGTGATGGCCAAGGGACTGCCCAGAACATCATCCCTGCATCCACTGGTGATGTCAAGACTGTGGGCAAGGTCATCCCAGAGCTGAACAGGAAGCTCACTGTCATGGCCTTCCATGTTCCTACCCCCAATGTATCCGTTGTAGATCTATCATGCCGCCTAGAGAAACCTGCCAAGTATGATGACATCAAGAAAGTGGTGAAGCAGGCATCCTGGGGCCTACTAAAGGGCACTCTGGGATtgtac
"""ENSMUSG00000097426""","""Gm8941""","""ENSMUST00000181451""","""Gm8941-201""","""ENSMUSE00001115039""","""Gm8941-201_2""",0.0,0.0,0.0,0.0,0.0,0.307963,0.307963,0.051327,GGCTACACTAAGAACCAGGTTGTCTCCTGCGACTTCAACAGTAACTCCCACTCTTCCACCTTTGATGCTGGGACTGGCATTGTTCTCACTGACAACTTTGTAAAGCCCATTTCCTGTTATGACAATGAATATGGCTACAGCAGCAGGGTGGTAGACCTCATGGCCTACATAGCCTCAAGGAGTAA
"""ENSMUSG00000104385""","""Gm7449""","""ENSMUST00000194393""","""Gm7449-201""","""ENSMUSE00001339893""","""Gm7449-201_1""",0.119103,0.257547,0.0,0.154863,0.0,0.0,0.531513,0.0885855,ATGGGCAAAGGAGACCTTAAGAAACCAAGAGGCAAAACGTCCTCCTCATATACATGCTTTGTGCAAACCTGTTGGGAGAAGCACAAGAAGAAGCACCTGGATGCTTCAGTCAACTTCTCAGAGTTCTCCAAGAAGTGCTCAGAGAAGTGGAAGACCATGTCTGCTaaaaaaaaaaaaaaaGTGGGGGGAGGGATTTGAACATATGGCAAAGGCTGACAAGGCTCGTTATGAAAGAGAAATGAAAACCTACATCCCTCCCAAAGGGGAGACCAGAAAGAAGTTCAAGGACCCCAATGCACCCAAGAAGCTTACTTTGGCCTTCTTCTTGCTCTGTTCTAAGTACCGCTCCAAAATTAAAGGCGAGCATCCTGGCTTTTTCATTGGTGATGTTGCAAAGAAACTAGGAAAGATGTGGAATAACACTGCAGTGGATGACAAGCAGCCCTATGAGAAGAAGGCTGTCAAGCTGAAGGAGAAGTACAAGAAGAATATTGCTGCCTACAGAGATGAAGGAAAACCTGATGCAGTgaaaagagggtggtcaaggctgaaaagagcaagaaaaagaaggaagaggaagatgaggaggaggatgaagagcatgaggaagaggagtaagaagaagagatgaagatgatgatgaagaagatgatgatgaataag
"""ENSMUSG00000101231""","""Gm28283""","""ENSMUST00000185509""","""Gm28283-201""","""ENSMUSE00001325295""","""Gm28283-201_1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CTGCAAAGAATGTAAGAATTGAGCTTGATTACATTAAATGTGACTGTTTGAAATGGAAAAAACACATCTTTGGTGTTTTCAAATTGAAGGAGATGATCATTTTCATTATTTGTGTCCCCTTTAACTTCTGCTTGAGCAAGCAACagagaggaggagagagagGCTGGAGCAAAGATAT
"""ENSMUSG00000102135""","""Gm37108""","""ENSMUST00000194605""","""Gm37108-201""","""ENSMUSE00001339810""","""Gm37108-201_1""",0.722537,0.188472,0.854944,0.803678,0.719618,0.200622,3.489871,0.581645,ATGGGCATCTCTCGGGACAACTGGCACAAGCGCCGCAAGACCGGGGGTAAGCGAAAACCCTACCACAAGAAGCGAAAGTATGAGCTGGGAGGGCCCGCTGCCAACACAAAGATTGGCCCTCGCCACATACACACAGTCCGAGTTCGAGGAGGCAATAAGAAGTACCGTGCCCTGAGATTGGATATGGGGAACTTTTCCTGGGGCTCTGAGTGTTGTACTCGCAAAACAAGGATCATTGATGTTGTCTACAATGCATCCAACAACGAGCTTGTCTGCACCAAGACCCTGGTGAAGAACTGCATTGTGCTTATTGACAGCACGCCGTACTGACAGTGGTACGAGTCCCACTATGCACTGCCCCTGGGCCTCAAGAAGGGGGCCAAGCTGACTCCTGAGGAGGAAGAGATTTTaaacaaaaaacgatcaaagaaaattca
"""ENSMUSG00000102135""","""Gm37108""","""ENSMUST00000194605""","""Gm37108-201""","""ENSMUSE00001343179""","""Gm37108-201_2""",0.722537,0.188472,0.854944,0.803678,0.719618,0.200622,3.489871,0.581645,aagaaatacgacaaaaggaaaaagaatgccaaaaTCAGCAGCCTCCTGGAGGAGCAGTTCCAGCAGGGCAAGCTTCTAGCCTGTATTGCCTCACGACCAGGCCAGTGTGGCAGAGCAGACGGCTATGTGCTCGAAGGCAAGGAGCTGGAGTTCTATCTGCGGATGATCAAAGCCCGGAAAGGCAAATGA
"""ENSMUSG00000103282""","""Gm37275""","""ENSMUST00000191703""","""Gm37275-201""","""ENSMUSE00001337469""","""Gm37275-201_1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CCACCTGACAGAGGAAGAAAATGGCAAATG
"""ENSMUSG00000101097""","""Gm6679""","""ENSMUST00000191467""","""Gm6679-201""","""ENSMUSE00001335952""","""Gm6679-201_1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,atGTCAAAAGGTGTAGGTGATGAAGGTTCATCTAGTCATAATAAATCAAAGGTTACAATGTCTACCTCAGACTCTGGAAGTAGAAACAGATCTGAATTATTTTAAACCTTAAATGGGTCATCTGTTGACTCACAACAATCTCAGTCCATAAATCCATGGTATATTGATGAAGTTGCAGAAGAAACTGCGAAGTCACTTACAGAGAGGTCTTCAACTTTGGACGTTCATCACCTCCACCACAATGTCCTTCCATGAACTCCTTGTCTAGGAAGAACAGATTTCACTCCTTACCCTTTTGCTTGACAAAGATGTCCAATATTAATGACAGCATGGCTCATAGTCCACTCTTTCTGTCACTGCAGTCTGTGGTGGGGGAGCTGAACAGCACACCTGTCCAGGAGAGTCCACCCTCGACCATCTCTTCTGGGAATGCATATGGGCTAGAGGTGGGCTCACTGGTTGATGTAAAAGAGGACCCCCTGTTCTATGGGGTTATCTGTTGGATTTGGCAGCCACCAGGGCTCAGTGACGTGCTAGCTGGACTGAAACTGAATGATGAACGCACAGGCTGTACAGATAGAACTTTCAGGTGCAACTGCTATTTCACCTGTGCCCTGAAGAAGGCACTGTTTGTGAAACTAAAGAGCTGCAGACCGACCAGACTATAGGTTTGAATACTTGTAGTCTGTTTTCACTCAGACTGAAATATGTAACTCTTTAGCATTTGGGGGCTACTTACATGAAGTAGTAGAAGAAAATAATCCATCTAAAATGGAAAAGGAAGGTTTAGAGATAATGATTGGAAAAAACAAAAGACATCCAGGGCAATTACAATTCTTATTACTTATACTCAACTTTGTTCTACTTATTTGCTTTTGTTTTGCCCTGGACACTGTGCTACTTAGACCCAAAGAGAATGATGTGGAATATCAAGAGCTACTAAGGACAGAGATAGTCAATCCTCTGAGAATATATGGATATGTGTGTGCCACAAACATTATGAAACTGAGGAAAACAGTTTAAAATGTTGAGGTTGAATCAGAATTTACTTCTGAAGAAAAAGATCCTGAAGAGTTTCTAAATATCTTGTTTCATGATATTTTAAGGTTTGAACCATTGTTAAAAATAAGGTCAGCCAGTCAAAAAGCTCACGACTGTAACTTTTGTCAAATTTTTAATAAAAGTTGGAGTACCCACAAATCAGCAATTAGAATGGTCTTTTATCAACAGCAACCAGAAATTTGCAGTGGCGTCATCATGGTTGATTATCCAGTTGCCTCAGTTTGGAAAAGTCTTAAGACTAAtttttttttCCTTTCCTGGAATTAAATATAACAGATTTACTTGAAGGCACTCACAGGCAGTGCCACATCTGTGGAGGACTTGCGATCTATAAGTCAACAGAGTACTATGACAACCCAGACATGTCAGTCGGGAAGATCAAACACTTCTGCAAGACTGGCAGCACCTGGGTTCTCCTTCATCTCAGAAGGCTGAATCATACTTACCACCCAGTATCACTTAAAAAGACTTGTCTGACTGGGACTGCAGACTTGTCTGCATCCCCTGACAGAAGATGAAGTCATCTGCTGTTCTCTCCATAGAAACTAGCCACTATGGTACTTTTGTGAAGTACGGGAAAGATGTCTCTGTCTGGCTCTTCTTTGACAGTATGACAGATCGAGATGGCTGTCAGAATGACTTCAAAATTCCACAAGTGAAGGCCTGCCCAGAAGTGGGAGAGTACTTAAAGGTATCTCTGGAGGACCTGAACTCTTTGGACTCCAGAAGGATTCAAGGCTGTGGACGCAGACTTCTTGCAATGCATACATGTGCATGTACCAAAGTCCAACCATGAGCTTGTACAAA
"""ENSMUSG00000102534""","""Gm37225""","""ENSMUST00000193423""","""Gm37225-201""","""ENSMUSE00001337169""","""Gm37225-201_1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GCTGAAGTAGAACAGAAGACGCTGACCTTCCGCAGGTTCACTTACTGTGGCGTGGACCTAGACTATCATCTGCTAGACATGTCCAATGAGCAGCTGGTGCAGTTGTCCAGCACTGACAGTGGCAGAGCCTGAACTGTGGTCTGGAGCAGAAGCAGCATTCACTACTCAAGAGCTTGAGACAGGCCAAGAATGAGGAGCCACCCATGGAGAAGCCCAAGGTAGTGAAGACCCACCTGAGGGACATGATTATCCTGCCTGAGATGGTAGGcagtatggtgggtgtgtagaccttaaatcaggtggagatcaaaccagagataattgg


In [41]:
with_exons_info.select(pl.col("sequence").cast(pl.Utf8)).head(10)

InvalidOperationError: cannot cast array of type ObjectChunked to arrow datatype

In [None]:
pl.read_parquet("/data/test.parquet")

In [None]:
annotations.with_genes_transcripts_exons_coordinates_only().annotations_df

#### Writing transcript expressions

In [None]:
if not samples.exists():
    if files(inputs).len() == 0:
        raise Exception("No samples and no parquet files")


## Getting gene names

In [None]:
def load_path_transcript_extended(p: Path) -> pl.DataFrame:
    df = pl.read_parquet(str(p))
    return genomes.mouse.annotations.with_genes_transcripts_exons_coordinates_only().extend_with_annotations(df)

In [None]:
mouse_genes_transcripts = genomes.mouse.

In [None]:
def load_path_exon_extended(p: Path) -> pl.DataFrame:
    df = pl.read_parquet(str(p))
    return genomes.mouse.annotations.with_genes_transcripts_exons_coordinates_only().extend_with_annotations(df)

In [None]:
# Just expressions

In [None]:
expressions_short = OrderedDict(with_ext(inputs, "parquet") \
                          .filter(lambda p: "transcripts" in p.name) \
                          .map(lambda p: (p.name.split("_")[0], pl.read_parquet(str(p))))
                          )
expressions_short

# Expressions with sequences

In [None]:
expressions = OrderedDict(with_ext(inputs, "parquet")\
                          .filter(lambda p: "transcripts" in p.name)\
                          .map(lambda p: (p.name.split("_")[0], load_path_extended(p)))
                          )
expressions

### Find genes of interest

Functions that does the search

In [None]:
tpm_columns = pl.col("^SRR[a-zA-Z0-9]+$")
min_average_tpm = 0.01

In [None]:

def search_expressions_in_bioprojects(gene_name: str,
                                      tpm_columns: Union[pl.Expr, list[pl.Expr], "str", list[str]] = tpm_columns,
                                      min_avg_value: float = min_average_tpm ,
                                      exact: bool = True, genome: Optional[genomepy.Genome] = None):
    return OrderedDict([(k, search_in_expressions(v, gene_name, tpm_columns, min_avg_value, exact, genome)) for k,v in expressions.items()])

In [None]:
nf2 = search_expressions_in_bioprojects("Nf2", genome = genomes.mouse.genome)
nf2

In [None]:
nf2.keys()

In [None]:
nf2['PRJNA797288']

In [None]:
nf2['PRJNA543661']

In [None]:
zswim8 = search_expressions_in_bioprojects("Zswim8", min_average_tpm)
zswim8

In [None]:
deltex2 = search_expressions_in_bioprojects("Dtx2", min_average_tpm)
deltex2

In [None]:
traf3 = search_expressions_in_bioprojects("Traf3", min_average_tpm)
traf3

## Add sequences

In [None]:
nf2_test = nf2["PRJNA761115"]
nf2_test.join(mouse_exons.annotations_df.with_column((pl.col())).select("transcript", "sequence", "strand", ), on="transcript")

In [None]:
print("test")

In [None]:
genomes.mouse.annotations.annotations_df.head(10)

In [29]:
pl.read_parquet("/data/test.parquet")

thread '<unnamed>' panicked at 'The column lengths in the DataFrame are not equal.', /home/runner/work/polars/polars/polars/polars-core/src/fmt.rs:343:13
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: The column lengths in the DataFrame are not equal.

In [23]:
annotations.with_genes_transcripts_exons_coordinates_only().annotations_df

gene,gene_name,transcript,transcript_name,exon,transcript_exon,coordinates
str,str,str,str,str,str,list[str]
"""ENSMUSG00000102628""","""Gm37671""","""ENSMUST00000193198""","""Gm37671-201""","""ENSMUSE00001342333""","""Gm37671-201_1""","[""1"", ""+"", ... ""150958296""]"
"""ENSMUSG00000100595""","""Gm19087""","""ENSMUST00000191430""","""Gm19087-201""","""ENSMUSE00001324949""","""Gm19087-201_1""","[""1"", ""+"", ... ""150984611""]"
"""ENSMUSG00000097426""","""Gm8941""","""ENSMUST00000181451""","""Gm8941-201""","""ENSMUSE00001158900""","""Gm8941-201_1""","[""1"", ""+"", ... ""151012971""]"
"""ENSMUSG00000097426""","""Gm8941""","""ENSMUST00000181451""","""Gm8941-201""","""ENSMUSE00001115039""","""Gm8941-201_2""","[""1"", ""+"", ... ""151013531""]"
"""ENSMUSG00000104478""","""Gm38212""","""ENSMUST00000194081""","""Gm38212-201""","""ENSMUSE00001337335""","""Gm38212-201_1""","[""1"", ""+"", ... ""108347562""]"
"""ENSMUSG00000104385""","""Gm7449""","""ENSMUST00000194393""","""Gm7449-201""","""ENSMUSE00001339893""","""Gm7449-201_1""","[""1"", ""+"", ... ""6981446""]"
"""ENSMUSG00000086053""","""Gm15178""","""ENSMUST00000132100""","""Gm15178-201""","""ENSMUSE00000831723""","""Gm15178-201_1""","[""1"", ""-"", ... ""75373007""]"
"""ENSMUSG00000086053""","""Gm15178""","""ENSMUST00000132100""","""Gm15178-201""","""ENSMUSE00000827648""","""Gm15178-201_2""","[""1"", ""-"", ... ""75369089""]"
"""ENSMUSG00000101231""","""Gm28283""","""ENSMUST00000185509""","""Gm28283-201""","""ENSMUSE00001325295""","""Gm28283-201_1""","[""1"", ""-"", ... ""108540244""]"
"""ENSMUSG00000102135""","""Gm37108""","""ENSMUST00000194605""","""Gm37108-201""","""ENSMUSE00001339810""","""Gm37108-201_1""","[""1"", ""+"", ... ""6987219""]"


#### Writing transcript expressions

In [11]:
if not samples.exists():
    if files(inputs).len() == 0:
        raise Exception("No samples and no parquet files")


## Getting gene names

In [22]:
def load_path_transcript_extended(p: Path) -> pl.DataFrame:
    df = pl.read_parquet(str(p))
    return genomes.mouse.annotations.with_genes_transcripts_exons_coordinates_only().extend_with_annotations(df)

In [None]:
mouse_genes_transcripts = genomes.mouse.

In [None]:
def load_path_exon_extended(p: Path) -> pl.DataFrame:
    df = pl.read_parquet(str(p))
    return genomes.mouse.annotations.with_genes_transcripts_exons_coordinates_only().extend_with_annotations(df)

In [68]:
# Just expressions

In [69]:
expressions_short = OrderedDict(with_ext(inputs, "parquet") \
                          .filter(lambda p: "transcripts" in p.name) \
                          .map(lambda p: (p.name.split("_")[0], pl.read_parquet(str(p))))
                          )
expressions_short

OrderedDict([('PRJNA761115',
              shape: (116357, 7)
              ┌────────────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
              │ transcript         ┆ SRR15731249 ┆ SRR15731252 ┆ SRR15731250 ┆ SRR15731251 ┆ SRR15731248 ┆ SRR15731247 │
              │ ---                ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
              │ str                ┆ f64         ┆ f64         ┆ f64         ┆ f64         ┆ f64         ┆ f64         │
              ╞════════════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
              │ ENSMUST00000178537 ┆ 0.0         ┆ 0.0         ┆ 0.0         ┆ 0.0         ┆ 0.0         ┆ 0.0         │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
              │ ENSMUST00000178862 ┆ 0.0         ┆ 0.0         ┆ 0.0         ┆ 0.0         

# Expressions with sequences

In [24]:
expressions = OrderedDict(with_ext(inputs, "parquet")\
                          .filter(lambda p: "transcripts" in p.name)\
                          .map(lambda p: (p.name.split("_")[0], load_path_extended(p)))
                          )
expressions

gene,gene_name,transcript,transcript_name,transcript_exon,coordinates,SRR15731249,SRR15731252,SRR15731250,SRR15731251,SRR15731248,SRR15731247
str,str,str,str,str,list[str],f64,f64,f64,f64,f64,f64
"""ENSMUSG00000100595""","""Gm19087""","""ENSMUST00000191430""","""Gm19087-201""","""Gm19087-201_1""","[""1"", ""150983666"", ""150984611""]",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUSG00000097426""","""Gm8941""","""ENSMUST00000181451""","""Gm8941-201""","""Gm8941-201_1""","[""1"", ""151012258"", ""151012971""]",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUSG00000097426""","""Gm8941""","""ENSMUST00000181451""","""Gm8941-201""","""Gm8941-201_2""","[""1"", ""151013347"", ""151013531""]",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUSG00000104385""","""Gm7449""","""ENSMUST00000194393""","""Gm7449-201""","""Gm7449-201_1""","[""1"", ""6980784"", ""6981446""]",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUSG00000101231""","""Gm28283""","""ENSMUST00000185509""","""Gm28283-201""","""Gm28283-201_1""","[""1"", ""108540067"", ""108540244""]",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUSG00000102135""","""Gm37108""","""ENSMUST00000194605""","""Gm37108-201""","""Gm37108-201_1""","[""1"", ""6986783"", ""6987219""]",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUSG00000102135""","""Gm37108""","""ENSMUST00000194605""","""Gm37108-201""","""Gm37108-201_2""","[""1"", ""6993624"", ""6993812""]",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUSG00000103282""","""Gm37275""","""ENSMUST00000191703""","""Gm37275-201""","""Gm37275-201_1""","[""1"", ""6999983"", ""7000012""]",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUSG00000101097""","""Gm6679""","""ENSMUST00000191467""","""Gm6679-201""","""Gm6679-201_1""","[""1"", ""108697865"", ""108699733""]",0.0,0.0,0.0,0.0,0.0,0.0
"""ENSMUSG00000102534""","""Gm37225""","""ENSMUST00000193423""","""Gm37225-201""","""Gm37225-201_1""","[""1"", ""7068441"", ""7068765""]",0.0,0.0,0.0,0.0,0.0,0.0


### Find genes of interest

Functions that does the search

In [58]:
tpm_columns = pl.col("^SRR[a-zA-Z0-9]+$")
min_average_tpm = 0.01

In [59]:

def search_expressions_in_bioprojects(gene_name: str,
                                      tpm_columns: Union[pl.Expr, list[pl.Expr], "str", list[str]] = tpm_columns,
                                      min_avg_value: float = min_average_tpm ,
                                      exact: bool = True, genome: Optional[genomepy.Genome] = None):
    return OrderedDict([(k, search_in_expressions(v, gene_name, tpm_columns, min_avg_value, exact, genome)) for k,v in expressions.items()])

In [62]:
nf2 = search_expressions_in_bioprojects("Nf2", genome = genomes.mouse.genome)
nf2

There are (103, 14) annotations, loading sequences can take quite a while!
There are (116, 14) annotations, loading sequences can take quite a while!
There are (106, 14) annotations, loading sequences can take quite a while!


OrderedDict([('PRJNA761115',
              shape: (103, 15)
              ┌────────────────────┬───────────┬────────────────────┬─────────────────┬─────┬─────────────┬────────────┬───────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
              │ gene               ┆ gene_name ┆ transcript         ┆ transcript_name ┆ ... ┆ SRR15731247 ┆ sum_TPM    ┆ avg_TPM   ┆ sequen

In [64]:
nf2.keys()

odict_keys(['PRJNA761115', 'PRJNA797288', 'PRJNA543661'])

In [66]:
nf2['PRJNA797288']

gene,gene_name,transcript,transcript_name,transcript_exon,coordinates,SRR17619478,SRR17619477,SRR17619475,SRR17619479,SRR17619476,SRR17619480,sum_TPM,avg_TPM,sequence
str,str,str,str,str,list[str],f64,f64,f64,f64,f64,f64,f64,f64,object
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_1""","[""11"", ""4798872"", ""4799536""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CTCGCAGTTGAACTCCATCTCGGCGTCCATGGTGACGATCCGCACCGTGAATGTCTTGGGCTGCTTCCTCTTGAGTGAGCTGAAGCTCATGCGAGAAGCGATGGCTCCGGCCATGGCGCGAGGTACCGGCCTTGGATCCCCTCACCCCTAGGCCCGCGTTCTGGGTCTCTTAGCCCCCCTGGAGTAGCTGCACAGGCCTCAGGGCCACCATGGTGGCCGGCTGGCCGGCCCGAGACAGACCTAGGCTGCCTGGGTCCCCAGCGGCCAGCATGGGCCGCGAGGACCCGGCTGTGGGTGCGGAAAGATGAGTTAAGGGACAGTCTCTGACGACgggggggtcggggggAGATGCCGGCAAAAGCACGCGCAGCCCCGAGCTTGGAGCTTCGGGCTGGGGACGCCAGGGACGCCTTTGGACAGAAATCCTAGACGGCTGCGGCGCCGACGCCGGGCCTCTCAGCGACCCCCGAGGGTCGCGGGGCCGCGCAGTAGTTGGGAGCGCACACGCGCGTGCGTGTCTGTCACCAGCTCCGCTCCGCGCGCCCTCGGGAAGCGTCCTCTACCAACCCCCACCTCAGCGACCTGCCGGATCCCCGCCCCGCCCCGCCCTCCGGACTCCCACTACGCTACTCAGCGACTCACCCTAACACTTTTTTCTCTAAGAT
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_2""","[""11"", ""4770368"", ""4770493""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CTTCTTGTCCATTTTGAGCCAGGCCACCGTGTCCTTGATTGTATACTGCAGTCCAAAGAACCAGGTTTCCCGAAGCCCCAGTGTCCGGCACACCAAATCAAACAGGTCCTTCCCCTTCCATTTCAT
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_3""","[""11"", ""4768505"", ""4768627""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CTGTAAGAAAAATAAGTGTTGCGTGATCTCTTGAACTAGCTCCTCCTCAGCATTTTCAGGATAAAATTTGGCCAGGAAGTGAAAGGTAACTGGTTCTTCCTTCGAAACATCATGATCCAACAC
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_4""","[""11"", ""4766082"", ""4766165""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CTTGGCCTGGACAGCATATGACGCCAAGAGCACGGACGCCTCGGGAGGGCAGTAGACCTTTTCATCCAAAATCTGCTTCTTCAC
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_5""","[""11"", ""4758257"", ""4758325""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CCTTTTCGGGAGCAATTCCTCTTGGGCTAAAAATCCCCGCTTGTGCACAGAGGGGTCATAGTCGCCATA
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_6""","[""11"", ""4756254"", ""4756336""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CTGGCTCTGCCCCGGTGCTCCGCATACCAAGCCGTAATTCTCTCCTCCCACATTTCCGGAGTCATCTGATAGAGATTTATCAC
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_7""","[""11"", ""4753678"", ""4753753""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CCGGATTGTAAAGTAGTTCACACCATACATCTCCAGGTCCTGAGCTATCTTCAAATACTCCATTTCAGCTTCATCC
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_8""","[""11"", ""4749861"", ""4749995""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CTCCTTGTCGCTGTAGGAGATGTTTCGGATTTCATTCCATGGGAAGGAGATCTTGGGGGTCAGCCTGTTCTCAGGGTCATAGATATGAAGCCCAAGAGCATCCACTCCAAGCAGCAACTCTGTGCCCTTTTTATT
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_9""","[""11"", ""4744843"", ""4744917""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CAGCTTATTAACACGAAGCTTTGAGGAGTTAAATTTGAAGACATCAATTTTCTTATCCAGTGGTTTAATAGTAAA
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_10""","[""11"", ""4744412"", ""4744525""]",9.673427,10.050181,13.708633,9.901915,13.390427,13.730403,70.454986,11.742498,CTGCTTTCTAGCCTTCTCTTCCCTGGCCTGGGCTTTCATCTGCTGAACTTCTAAAGAGTCAGCTTTCCGTCGCCTCATAAATAGGTCATGGTTCCCAATACATAGCTGAAGAAT


In [67]:
nf2['PRJNA543661']

gene,gene_name,transcript,transcript_name,transcript_exon,coordinates,SRR9089320,SRR9089321,SRR9089322,SRR9089323,SRR9089324,SRR9089319,sum_TPM,avg_TPM,sequence
str,str,str,str,str,list[str],f64,f64,f64,f64,f64,f64,f64,f64,object
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_1""","[""11"", ""4798872"", ""4799536""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CTCGCAGTTGAACTCCATCTCGGCGTCCATGGTGACGATCCGCACCGTGAATGTCTTGGGCTGCTTCCTCTTGAGTGAGCTGAAGCTCATGCGAGAAGCGATGGCTCCGGCCATGGCGCGAGGTACCGGCCTTGGATCCCCTCACCCCTAGGCCCGCGTTCTGGGTCTCTTAGCCCCCCTGGAGTAGCTGCACAGGCCTCAGGGCCACCATGGTGGCCGGCTGGCCGGCCCGAGACAGACCTAGGCTGCCTGGGTCCCCAGCGGCCAGCATGGGCCGCGAGGACCCGGCTGTGGGTGCGGAAAGATGAGTTAAGGGACAGTCTCTGACGACgggggggtcggggggAGATGCCGGCAAAAGCACGCGCAGCCCCGAGCTTGGAGCTTCGGGCTGGGGACGCCAGGGACGCCTTTGGACAGAAATCCTAGACGGCTGCGGCGCCGACGCCGGGCCTCTCAGCGACCCCCGAGGGTCGCGGGGCCGCGCAGTAGTTGGGAGCGCACACGCGCGTGCGTGTCTGTCACCAGCTCCGCTCCGCGCGCCCTCGGGAAGCGTCCTCTACCAACCCCCACCTCAGCGACCTGCCGGATCCCCGCCCCGCCCCGCCCTCCGGACTCCCACTACGCTACTCAGCGACTCACCCTAACACTTTTTTCTCTAAGAT
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_2""","[""11"", ""4770368"", ""4770493""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CTTCTTGTCCATTTTGAGCCAGGCCACCGTGTCCTTGATTGTATACTGCAGTCCAAAGAACCAGGTTTCCCGAAGCCCCAGTGTCCGGCACACCAAATCAAACAGGTCCTTCCCCTTCCATTTCAT
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_3""","[""11"", ""4768505"", ""4768627""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CTGTAAGAAAAATAAGTGTTGCGTGATCTCTTGAACTAGCTCCTCCTCAGCATTTTCAGGATAAAATTTGGCCAGGAAGTGAAAGGTAACTGGTTCTTCCTTCGAAACATCATGATCCAACAC
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_4""","[""11"", ""4766082"", ""4766165""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CTTGGCCTGGACAGCATATGACGCCAAGAGCACGGACGCCTCGGGAGGGCAGTAGACCTTTTCATCCAAAATCTGCTTCTTCAC
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_5""","[""11"", ""4758257"", ""4758325""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CCTTTTCGGGAGCAATTCCTCTTGGGCTAAAAATCCCCGCTTGTGCACAGAGGGGTCATAGTCGCCATA
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_6""","[""11"", ""4756254"", ""4756336""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CTGGCTCTGCCCCGGTGCTCCGCATACCAAGCCGTAATTCTCTCCTCCCACATTTCCGGAGTCATCTGATAGAGATTTATCAC
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_7""","[""11"", ""4753678"", ""4753753""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CCGGATTGTAAAGTAGTTCACACCATACATCTCCAGGTCCTGAGCTATCTTCAAATACTCCATTTCAGCTTCATCC
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_8""","[""11"", ""4749861"", ""4749995""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CTCCTTGTCGCTGTAGGAGATGTTTCGGATTTCATTCCATGGGAAGGAGATCTTGGGGGTCAGCCTGTTCTCAGGGTCATAGATATGAAGCCCAAGAGCATCCACTCCAAGCAGCAACTCTGTGCCCTTTTTATT
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_9""","[""11"", ""4744843"", ""4744917""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CAGCTTATTAACACGAAGCTTTGAGGAGTTAAATTTGAAGACATCAATTTTCTTATCCAGTGGTTTAATAGTAAA
"""ENSMUSG00000009073""","""Nf2""","""ENSMUST00000109910""","""Nf2-205""","""Nf2-205_10""","[""11"", ""4744412"", ""4744525""]",21.91233,19.978904,18.334085,18.688448,17.526474,15.183504,111.623745,18.603958,CTGCTTTCTAGCCTTCTCTTCCCTGGCCTGGGCTTTCATCTGCTGAACTTCTAAAGAGTCAGCTTTCCGTCGCCTCATAAATAGGTCATGGTTCCCAATACATAGCTGAAGAAT


In [None]:
zswim8 = search_expressions_in_bioprojects("Zswim8", min_average_tpm)
zswim8

In [None]:
deltex2 = search_expressions_in_bioprojects("Dtx2", min_average_tpm)
deltex2

In [None]:
traf3 = search_expressions_in_bioprojects("Traf3", min_average_tpm)
traf3

## Add sequences

In [None]:
nf2_test = nf2["PRJNA761115"]
nf2_test.join(mouse_exons.annotations_df.with_column((pl.col())).select("transcript", "sequence", "strand", ), on="transcript")

In [None]:
print("test")

In [20]:
genomes.mouse.annotations.annotations_df.head(10)

seqname,source,feature,start,end,score,strand,frame,attribute,gene,gene_name,transcript_biotype,transcript,transcript_name,exon_number
str,str,str,u64,u64,str,cat,str,str,str,str,str,str,str,u64
"""1""","""havana""","""gene""",150956201,150958296,""".""","""+""",""".""","""gene_id ""ENSMUSG00000102628""; gene_version ""2""; gene_name ""Gm37671""; gene_source ""havana""; gene_biotype ""TEC"";""","""ENSMUSG00000102628""","""Gm37671""",,,,
"""1""","""havana""","""transcript""",150956201,150958296,""".""","""+""",""".""","""gene_id ""ENSMUSG00000102628""; gene_version ""2""; transcript_id ""ENSMUST00000193198""; transcript_version ""2""; gene_name ""Gm37671""; gene_source ""havana""; gene_biotype ""TEC""; transcript_name ""Gm37671-201""; transcript_source ""havana""; transcript_biotype ""TEC""; tag ""basic""; tag ""Ensembl_canonical""; transcript_support_level ""NA (assigned to previous version 1)"";""","""ENSMUSG00000102628""","""Gm37671""","""TEC""","""ENSMUST00000193198""","""Gm37671-201""",
"""1""","""havana""","""exon""",150956201,150958296,""".""","""+""",""".""","""gene_id ""ENSMUSG00000102628""; gene_version ""2""; transcript_id ""ENSMUST00000193198""; transcript_version ""2""; exon_number ""1""; gene_name ""Gm37671""; gene_source ""havana""; gene_biotype ""TEC""; transcript_name ""Gm37671-201""; transcript_source ""havana""; transcript_biotype ""TEC""; exon_id ""ENSMUSE00001342333""; exon_version ""2""; tag ""basic""; tag ""Ensembl_canonical""; transcript_support_level ""NA (assigned to previous version 1)"";""","""ENSMUSG00000102628""","""Gm37671""","""TEC""","""ENSMUST00000193198""","""Gm37671-201""",1.0
"""1""","""havana""","""gene""",150983666,150984611,""".""","""+""",""".""","""gene_id ""ENSMUSG00000100595""; gene_version ""2""; gene_name ""Gm19087""; gene_source ""havana""; gene_biotype ""processed_pseudogene"";""","""ENSMUSG00000100595""","""Gm19087""",,,,
"""1""","""havana""","""transcript""",150983666,150984611,""".""","""+""",""".""","""gene_id ""ENSMUSG00000100595""; gene_version ""2""; transcript_id ""ENSMUST00000191430""; transcript_version ""2""; gene_name ""Gm19087""; gene_source ""havana""; gene_biotype ""processed_pseudogene""; transcript_name ""Gm19087-201""; transcript_source ""havana""; transcript_biotype ""processed_pseudogene""; tag ""basic""; tag ""Ensembl_canonical""; transcript_support_level ""NA (assigned to previous version 1)"";""","""ENSMUSG00000100595""","""Gm19087""","""processed_pseudogene""","""ENSMUST00000191430""","""Gm19087-201""",
"""1""","""havana""","""exon""",150983666,150984611,""".""","""+""",""".""","""gene_id ""ENSMUSG00000100595""; gene_version ""2""; transcript_id ""ENSMUST00000191430""; transcript_version ""2""; exon_number ""1""; gene_name ""Gm19087""; gene_source ""havana""; gene_biotype ""processed_pseudogene""; transcript_name ""Gm19087-201""; transcript_source ""havana""; transcript_biotype ""processed_pseudogene""; exon_id ""ENSMUSE00001324949""; exon_version ""2""; tag ""basic""; tag ""Ensembl_canonical""; transcript_support_level ""NA (assigned to previous version 1)"";""","""ENSMUSG00000100595""","""Gm19087""","""processed_pseudogene""","""ENSMUST00000191430""","""Gm19087-201""",1.0
"""1""","""havana""","""gene""",151012258,151013531,""".""","""+""",""".""","""gene_id ""ENSMUSG00000097426""; gene_version ""2""; gene_name ""Gm8941""; gene_source ""havana""; gene_biotype ""processed_pseudogene"";""","""ENSMUSG00000097426""","""Gm8941""",,,,
"""1""","""havana""","""transcript""",151012258,151013531,""".""","""+""",""".""","""gene_id ""ENSMUSG00000097426""; gene_version ""2""; transcript_id ""ENSMUST00000181451""; transcript_version ""2""; gene_name ""Gm8941""; gene_source ""havana""; gene_biotype ""processed_pseudogene""; transcript_name ""Gm8941-201""; transcript_source ""havana""; transcript_biotype ""processed_pseudogene""; tag ""basic""; tag ""Ensembl_canonical""; transcript_support_level ""NA (assigned to previous version 1)"";""","""ENSMUSG00000097426""","""Gm8941""","""processed_pseudogene""","""ENSMUST00000181451""","""Gm8941-201""",
"""1""","""havana""","""exon""",151012258,151012971,""".""","""+""",""".""","""gene_id ""ENSMUSG00000097426""; gene_version ""2""; transcript_id ""ENSMUST00000181451""; transcript_version ""2""; exon_number ""1""; gene_name ""Gm8941""; gene_source ""havana""; gene_biotype ""processed_pseudogene""; transcript_name ""Gm8941-201""; transcript_source ""havana""; transcript_biotype ""processed_pseudogene""; exon_id ""ENSMUSE00001158900""; exon_version ""2""; tag ""basic""; tag ""Ensembl_canonical""; transcript_support_level ""NA (assigned to previous version 1)"";""","""ENSMUSG00000097426""","""Gm8941""","""processed_pseudogene""","""ENSMUST00000181451""","""Gm8941-201""",1.0
"""1""","""havana""","""exon""",151013347,151013531,""".""","""+""",""".""","""gene_id ""ENSMUSG00000097426""; gene_version ""2""; transcript_id ""ENSMUST00000181451""; transcript_version ""2""; exon_number ""2""; gene_name ""Gm8941""; gene_source ""havana""; gene_biotype ""processed_pseudogene""; transcript_name ""Gm8941-201""; transcript_source ""havana""; transcript_biotype ""processed_pseudogene""; exon_id ""ENSMUSE00001115039""; exon_version ""2""; tag ""basic""; tag ""Ensembl_canonical""; transcript_support_level ""NA (assigned to previous version 1)"";""","""ENSMUSG00000097426""","""Gm8941""","""processed_pseudogene""","""ENSMUST00000181451""","""Gm8941-201""",2.0
