# Goals

* Build STAR genome indexes (genome FASTA + filtered GTF annotation) for various species

# Vars

In [14]:
# Directory holding the project's helper scripts (format-star-ref.py is looked up here).
# This is a relative path, so it resolves against the kernel's working directory.
script_dir = "../../scripts/"
# Root output directory; each organism gets its own sub-directory beneath it.
base_dir = "/scratch/multiomics/nickyoungblut/star_refs"

# Init

In [15]:
import os
from glob import glob

In [8]:
# Ensure the root output directory is present; an already-existing directory is not an error.
os.makedirs(name=base_dir, exist_ok=True)

# Rattus norvegicus

In [53]:
# Assembly name handed to `genomepy install`; also the name of the download sub-directory.
genome_name = "mRatBN7.2"
# Species name as free text (spaces allowed); a path-safe form is derived from it in the next cell.
organism = "Rattus norvegicus"

In [54]:
# Derive a path-safe organism label (spaces -> underscores) and create the
# per-organism working directory under base_dir.
org_str = "_".join(organism.split(" "))
work_dir = os.path.join(base_dir, org_str)
os.makedirs(name=work_dir, exist_ok=True)

In [55]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[0m

In [56]:
# Locate the genome FASTA that genomepy downloaded.
# glob() returns matches in arbitrary order, so sort for a reproducible pick, and
# fail with an explicit message (rather than a bare IndexError) if nothing was downloaded.
fasta_hits = sorted(glob(f"{work_dir}/{genome_name}/*.fa"))
if not fasta_hits:
    raise FileNotFoundError(f"No *.fa file found in {work_dir}/{genome_name}")
fasta_file = fasta_hits[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/mRatBN7.2/mRatBN7.2.fa'

In [57]:
# Locate the annotation GTF that genomepy downloaded alongside the genome.
# glob() returns matches in arbitrary order, so sort for a reproducible pick, and
# fail with an explicit message (rather than a bare IndexError) if no GTF is present.
gtf_hits = sorted(glob(f"{work_dir}/{genome_name}/*.gtf"))
if not gtf_hits:
    raise FileNotFoundError(f"No *.gtf file found in {work_dir}/{genome_name}")
gtf_file = gtf_hits[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/mRatBN7.2/mRatBN7.2.annotation.gtf'

In [58]:
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: mRatBN7.2.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/Rattus_norvegicus/Rattus_norvegicus.gtf
Total records in GTF: 1284446
Filtered 17547 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
snoRNA: 5118
pseudogene: 4926
snRNA: 4536
miRNA: 1332
rRNA: 630
processed_pseudogene: 576
scaRNA: 111
ribozyme: 108
misc_RNA: 81
Mt_tRNA: 66
Y_RNA: 54
Mt_rRNA: 6
vault_RNA: 3
-- Count of biotypes kept --
protein_coding: 2472948
lncRNA: 34794
IG_V_gene: 382
TR_C_gene: 18
TR_J_gene: 18
TR_V_gene: 9
----------------------------


In [62]:
# Filtered GTF produced by format-star-ref.py; the path mirrors the
# "Output GTF" location reported in that script's log (work_dir/org_str/org_str.gtf).
gtf_filt_file = os.path.join(work_dir, org_str, org_str + ".gtf")
# Displayed value shows whether the filtered file is actually on disk.
os.path.exists(gtf_filt_file)

True

In [None]:
# star
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/mRatBN7.2/mRatBN7.2.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/Rattus_norvegicus/Rattus_norvegicus.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Jan 31 13:04:45 ..... started STAR run

Jan 31 13:04:45 ... starting to generate Genome files
