Here I parse RNA-seq data from Mazin _et al._

First, create an environment for processing the RNA data (this is not part of splicevo's functionality, so I keep it separate).

In [None]:
## conda env
# Create and activate a dedicated conda environment named "spliser" (Python 3.10).
conda create -n spliser python=3.10 -y
conda activate spliser
# Install pysam from the bioconda channel and ipykernel from conda-forge.
conda install -c bioconda pysam -y
conda install -c conda-forge ipykernel -y
# Register the environment as a user-level Jupyter kernel called "spliser".
python -m ipykernel install --user --name=spliser 
# Clone the "speedups" branch of SpliSER and install it in editable mode from the checkout.
git clone -b speedups https://github.com/CraigIDent/SpliSER.git
cd SpliSER
pip install -e . --config-settings editable_mode=compat

We will run SpliSER for all samples from Mazin et al. The samples are listed in `samples.txt`.

In [None]:
import pandas as pd
import re

# Matches everything up to and including the first '.' of a Group value.
GROUP_PREFIX = re.compile(r'^[^.]*\.')

samples = pd.read_csv("samples.txt", sep="\t")

# The Group column contains the sample filenames.
# Remove the first part of each Group value, up to and including the first '.'.
# The vectorized .str.replace leaves missing values untouched, whereas calling
# re.sub through .apply raises a TypeError on them.
samples['Group'] = samples['Group'].str.replace(GROUP_PREFIX, '', regex=True)

# Save the modified DataFrame to a new file
samples.to_csv("samples2.txt", sep="\t", index=False)

# Keep this as the last expression so the notebook displays the number of
# samples per cleaned Group value.
samples['Group'].value_counts()

Extract Splice site Strength Estimate (SSE) and other splice site values.

In [None]:
import pandas as pd
from pathlib import Path

# Identifiers that select which combined result table to load.
species = "Mus_musculus"
name = "Mouse"
tissue = "Brain"
group = "0dpb"

# The table is read from results/<species>/<name>.<tissue>.<group>.combined.tsv
filename = Path("results") / species / f"{name}.{tissue}.{group}.combined.tsv"
df = pd.read_table(filename)

# Columns listed for this tab-separated file:
#   Sample, Region, Site, Strand, gene, SSE, alpha_count, beta1_count,
#   beta2_count, MultiGeneFlag, Others, Partners, Competitors

# Keep chromosome (Region), position (Site), SSE and the three read-count columns.
sse_columns = ['Region', 'Site', 'SSE', 'alpha_count', 'beta1_count', 'beta2_count']
df2 = df[sse_columns]
df2.head()