## Preprocessing notebook that prepares expression matrices for further analysis ##

In [1]:
import polars as pl
from pathlib import Path
from pycomfort.files import *
from pycomfort import files
import pyarrow
import pandas as pd
from functional import seq
from typing import *
import functools

In [2]:
from genotations import *

  from .autonotebook import tqdm as notebook_tqdm


Adding source paths

In [3]:
import sys

base = Path("..")
local = (base / "dashboard").resolve()
if local.exists():
    sys.path.insert(0, Path("..").absolute().as_posix())
    sys.path.insert(0, local)
    print(sys.path)
else:
    base = Path(".")
%load_ext autoreload
%autoreload 2

[PosixPath('/home/antonkulaga/expressions-dashboard/dashboard'), '/home/antonkulaga/expressions-dashboard/notebooks/..', '/home/antonkulaga/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/223.7571.182.plugins/python/helpers-pro/jupyter_debug', '/home/antonkulaga/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/223.7571.182.plugins/python/helpers/pydev', '/home/antonkulaga/expressions-dashboard/notebooks', '/home/antonkulaga/micromamba/envs/dashboard/lib/python310.zip', '/home/antonkulaga/micromamba/envs/dashboard/lib/python3.10', '/home/antonkulaga/micromamba/envs/dashboard/lib/python3.10/lib-dynload', '', '/home/antonkulaga/micromamba/envs/dashboard/lib/python3.10/site-packages']


Configure polars to display more info in notebooks

In [4]:
# Widen the polars notebook rendering so that tables are easier to inspect
pl.Config.set_tbl_width_chars(10000)  # table width limit, in characters
pl.Config.set_fmt_str_lengths(1000)  # string cell length limit
pl.Config.set_tbl_rows(20)  # number of table rows to print

polars.cfg.Config

In [5]:
from dashboard.preprocess import *

## Setting up Paths

In [6]:
# Absolute location of the quantified samples
samples = Path("/", "data", "samples", "cellfabrik")
# Sub-directories of the samples folder (presumably one per bioproject - verify against pycomfort's `dirs`)
bioprojects = dirs(samples)

In [7]:
# Working folders of the notebook, all located under <base>/data.
# parents=True lets the cell run on a fresh checkout where `data` itself does not
# exist yet (without it mkdir raises FileNotFoundError for the missing parent).
data = base / "data"
inputs = data / "inputs"
inputs.mkdir(parents=True, exist_ok=True)
interim = data / "interim"
interim.mkdir(parents=True, exist_ok=True)
output = data / "output"
output.mkdir(parents=True, exist_ok=True)

In [8]:
genes_from_bioproject(bioprojects[0])

OrderedDict([('SRR15731249',
              shape: (116357, 4)
              ┌────────────────────┬──────────┬─────────────────┬──────────┐
              │ gene               ┆ TPM      ┆ EffectiveLength ┆ NumReads │
              │ ---                ┆ ---      ┆ ---             ┆ ---      │
              │ str                ┆ f64      ┆ f64             ┆ f64      │
              ╞════════════════════╪══════════╪═════════════════╪══════════╡
              │ ENSMUST00000213427 ┆ 0.0      ┆ 20.0            ┆ 0.0      │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
              │ ENSMUST00000117697 ┆ 0.0      ┆ 684.0           ┆ 0.0      │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
              │ ENSMUST00000214850 ┆ 0.0      ┆ 28.0            ┆ 0.0      │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
              │ ENSMUST00000121437 ┆ 0.0      ┆ 35.0            ┆ 0.0      │
              

## Loading data

#### Writing transcript expressions

In [100]:
def write_transcripts():
    """
    Write one `<bioproject>_transcripts.parquet` file per bioproject into `inputs`.

    For every path in the notebook-level `bioprojects` collection the expression table
    is obtained with `expressions_from_bioproject` (called with its default arguments)
    and saved as parquet; the destination is printed before writing.
    """
    for project_dir in bioprojects:
        file_name: str = f"{project_dir.stem}_transcripts.parquet"
        where = inputs / file_name
        table: pl.DataFrame = expressions_from_bioproject(project_dir)
        print(f"results will be written to {where}")
        table.write_parquet(str(where))

#### Writing gene expressions

In [101]:
def write_genes():
    """
    Write one `<bioproject>_genes.parquet` file per bioproject into `inputs`.

    For every path in the notebook-level `bioprojects` collection the expression table
    is obtained with `expressions_from_bioproject(p, False)` and saved as parquet;
    the destination is printed before writing.
    NOTE(review): the second positional argument presumably switches from transcript
    level to gene level quantification - confirm against dashboard.preprocess.
    """
    for project_dir in bioprojects:
        file_name: str = f"{project_dir.stem}_genes.parquet"
        where = inputs / file_name
        table: pl.DataFrame = expressions_from_bioproject(project_dir, False)
        print(f"results will be written to {where}")
        table.write_parquet(str(where))

In [12]:
with_ext(inputs, "parquet")

[PosixPath('../data/inputs/PRJNA761115_genes.parquet'), PosixPath('../data/inputs/PRJNA761115_transcripts.parquet')]

## Getting gene names

In [46]:
# Read every "*transcripts*" parquet file from `inputs`, enrich it with
# `with_mouse_transcript_info` and key the result by the file name part before the first "_"
transcripts = OrderedDict(
    with_ext(inputs, "parquet")
    .filter(lambda path: "transcripts" in path.name)
    .map(
        lambda path: (
            path.name.split("_")[0],
            with_mouse_transcript_info(pl.read_parquet(str(path))),
        )
    )
)
transcripts

OrderedDict([('PRJNA761115',
              shape: (116357, 10)
              ┌────────────────────┬─────────────────┬────────────────────┬───────────┬─────┬─────────────┬─────────────┬─────────────┬─────────────┐
              │ transcript         ┆ transcript_name ┆ gene               ┆ gene_name ┆ ... ┆ SRR15731250 ┆ SRR15731251 ┆ SRR15731248 ┆ SRR15731247 │
              │ ---                ┆ ---             ┆ ---                ┆ ---       ┆     ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
              │ str                ┆ str             ┆ str                ┆ str       ┆     ┆ f64         ┆ f64         ┆ f64         ┆ f64         │
              ╞════════════════════╪═════════════════╪════════════════════╪═══════════╪═════╪═════════════╪═════════════╪═════════════╪═════════════╡
              │ ENSMUST00000191430 ┆ Gm19087-201     ┆ ENSMUSG00000100595 ┆ Gm19087   ┆ ... ┆ 0.0         ┆ 0.0         ┆ 0.0         ┆ 0.0         │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌

### Load genes of interest

In [88]:
def with_transcript_summaries(df: pl.DataFrame, annotation_columns: int = 4):
    """
    Add per-row `sum_TPM` and `avg_TPM` columns and sort the rows by `avg_TPM`, highest first.

    Every column after the first `annotation_columns` columns is treated as a per-sample
    expression column. Already existing `sum_TPM` / `avg_TPM` columns are left out of the
    computation, so applying the function to its own output does not add the previous
    summaries into the new sum.

    :param df: dataframe with the annotation columns first, followed by the sample columns
    :param annotation_columns: number of leading non-expression columns to skip (4 by default)
    :return: dataframe with `sum_TPM` and `avg_TPM` added (or replaced), sorted by `avg_TPM`
    """
    summary_names = ("sum_TPM", "avg_TPM")
    tpms = [name for name in df.columns[annotation_columns:] if name not in summary_names]
    sums = pl.sum(tpms).alias("sum_TPM")  # horizontal (row-wise) sum over the sample columns
    avg = (sums / len(tpms)).alias("avg_TPM")
    return df.with_column(sums).with_column(avg).sort(avg, True)

def search_in_expressions(df: pl.DataFrame, gene_name: str, min_avg_value: float = 0.0, exact: bool = True):
    """
    Select the rows of a summarized expression table that belong to the requested gene.

    :param df: expression dataframe with a `gene_name` column
    :param gene_name: gene name to look for
    :param min_avg_value: rows with `avg_TPM` below this value are dropped
    :param exact: when True `gene_name` must be equal to the column value,
        otherwise it is handed to `str.contains` as the pattern
    :return: the matching rows of `with_transcript_summaries(df)`
    """
    gene_column = pl.col("gene_name")
    if exact:
        search = gene_column == gene_name
    else:
        search = gene_column.str.contains(gene_name)
    summarized = with_transcript_summaries(df)
    expressed_enough = pl.col("avg_TPM") >= min_avg_value
    return summarized.filter(search).filter(expressed_enough)

def search_expressions_in_bioprojects(gene: str, min_avg_value: float = 0.0, exact: bool = True):
    """
    Run `search_in_expressions` on every dataframe of the notebook-level `transcripts` mapping.

    :param gene: gene name to look for
    :param min_avg_value: rows with `avg_TPM` below this value are dropped
    :param exact: forwarded to `search_in_expressions`; True (the default) keeps the
        previous exact-match behaviour, False switches to `str.contains` matching
    :return: OrderedDict with the same keys as `transcripts` and the search results as values
    """
    return OrderedDict([
        (project, search_in_expressions(expressions, gene, min_avg_value, exact))
        for project, expressions in transcripts.items()
    ])

In [94]:
min_average_tpm = 0.01

In [95]:
nf2 = search_expressions_in_bioprojects("Nf2", min_average_tpm)
nf2

OrderedDict([('PRJNA761115',
              shape: (9, 12)
              ┌────────────────────┬─────────────────┬────────────────────┬───────────┬─────┬─────────────┬─────────────┬────────────┬───────────┐
              │ transcript         ┆ transcript_name ┆ gene               ┆ gene_name ┆ ... ┆ SRR15731248 ┆ SRR15731247 ┆ sum_TPM    ┆ avg_TPM   │
              │ ---                ┆ ---             ┆ ---                ┆ ---       ┆     ┆ ---         ┆ ---         ┆ ---        ┆ ---       │
              │ str                ┆ str             ┆ str                ┆ str       ┆     ┆ f64         ┆ f64         ┆ f64        ┆ f64       │
              ╞════════════════════╪═════════════════╪════════════════════╪═══════════╪═════╪═════════════╪═════════════╪════════════╪═══════════╡
              │ ENSMUST00000109910 ┆ Nf2-205         ┆ ENSMUSG00000009073 ┆ Nf2       ┆ ... ┆ 32.998689   ┆ 35.162034   ┆ 220.701185 ┆ 36.783531 │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌

In [96]:
zswim8 = search_expressions_in_bioprojects("Zswim8", min_average_tpm)
zswim8

OrderedDict([('PRJNA761115',
              shape: (13, 12)
              ┌────────────────────┬─────────────────┬────────────────────┬───────────┬─────┬─────────────┬─────────────┬────────────┬───────────┐
              │ transcript         ┆ transcript_name ┆ gene               ┆ gene_name ┆ ... ┆ SRR15731248 ┆ SRR15731247 ┆ sum_TPM    ┆ avg_TPM   │
              │ ---                ┆ ---             ┆ ---                ┆ ---       ┆     ┆ ---         ┆ ---         ┆ ---        ┆ ---       │
              │ str                ┆ str             ┆ str                ┆ str       ┆     ┆ f64         ┆ f64         ┆ f64        ┆ f64       │
              ╞════════════════════╪═════════════════╪════════════════════╪═══════════╪═════╪═════════════╪═════════════╪════════════╪═══════════╡
              │ ENSMUST00000022358 ┆ Zswim8-201      ┆ ENSMUSG00000021819 ┆ Zswim8    ┆ ... ┆ 35.805864   ┆ 36.172884   ┆ 219.304388 ┆ 36.550731 │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌

In [97]:
deltex2 = search_expressions_in_bioprojects("Dtx2", min_average_tpm)
deltex2

OrderedDict([('PRJNA761115',
              shape: (8, 12)
              ┌────────────────────┬─────────────────┬────────────────────┬───────────┬─────┬─────────────┬─────────────┬───────────┬───────────┐
              │ transcript         ┆ transcript_name ┆ gene               ┆ gene_name ┆ ... ┆ SRR15731248 ┆ SRR15731247 ┆ sum_TPM   ┆ avg_TPM   │
              │ ---                ┆ ---             ┆ ---                ┆ ---       ┆     ┆ ---         ┆ ---         ┆ ---       ┆ ---       │
              │ str                ┆ str             ┆ str                ┆ str       ┆     ┆ f64         ┆ f64         ┆ f64       ┆ f64       │
              ╞════════════════════╪═════════════════╪════════════════════╪═══════════╪═════╪═════════════╪═════════════╪═══════════╪═══════════╡
              │ ENSMUST00000111145 ┆ Dtx2-204        ┆ ENSMUSG00000004947 ┆ Dtx2      ┆ ... ┆ 3.24335     ┆ 5.065645    ┆ 81.865575 ┆ 13.644263 │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌

In [98]:
traf3 = search_expressions_in_bioprojects("Traf3", min_average_tpm)
traf3

OrderedDict([('PRJNA761115',
              shape: (3, 12)
              ┌────────────────────┬─────────────────┬────────────────────┬───────────┬─────┬─────────────┬─────────────┬───────────┬───────────┐
              │ transcript         ┆ transcript_name ┆ gene               ┆ gene_name ┆ ... ┆ SRR15731248 ┆ SRR15731247 ┆ sum_TPM   ┆ avg_TPM   │
              │ ---                ┆ ---             ┆ ---                ┆ ---       ┆     ┆ ---         ┆ ---         ┆ ---       ┆ ---       │
              │ str                ┆ str             ┆ str                ┆ str       ┆     ┆ f64         ┆ f64         ┆ f64       ┆ f64       │
              ╞════════════════════╪═════════════════╪════════════════════╪═══════════╪═════╪═════════════╪═════════════╪═══════════╪═══════════╡
              │ ENSMUST00000021706 ┆ Traf3-201       ┆ ENSMUSG00000021277 ┆ Traf3     ┆ ... ┆ 14.471493   ┆ 15.479019   ┆ 84.452268 ┆ 14.075378 │
              ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌