# Goals

* Create STAR reference genome index for various species

# Vars

In [4]:
base_dir = "/scratch/multiomics/nickyoungblut/star_refs"
script_dir = "../../scripts/"

# Init

In [5]:
import os
from glob import glob

In [6]:
os.makedirs(base_dir, exist_ok=True)

# Rattus norvegicus

In [9]:
# set the organism
organism = "Rattus norvegicus"

In [10]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [14]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[32m10:14:24[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading assembly summaries from NCBI, this will take a while...
genbank_historical: 75.7k genomes [00:01, 58.3k genomes/s][0m[0m
refseq_historical: 97.1k genomes [00:00, 109k genomes/s][0m[0m
genbank: 2.84M genomes [00:24, 114k genomes/s][0mm[0m
refseq: 427k genomes [00:01, 241k genomes/s][0mmm[0m
[0m[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mUTH_Rnor_SHR_Utx     NCBI     GCA_023515785.1    10116     [31m✗[39m      Rattus norvegicus                        University of Kentucky                  [0m
[0mUTH_Rnor_WKY_Bbb_1.0 NCBI     GCA_023515805.1    10116     [31m✗[39m      Rattus norvegicus                        Inbred Rat Genome Sequencing Project (UTH/UK/UofL)[0m
[0mRnor_6.0             NCBI     GCA_000001895.4    10116     [32m✓[39m      Rattus norvegicus                        Rat Genome Seq

In [15]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mUTH_Rnor_SHRSP_BbbUtx_1.0 Ensembl  GCA_021556685.1    10116     [32m✓[39m      Rattus norvegicus                        2022-10-Ensembl/2023-01                 [0m
[0mmRatBN7.2            Ensembl  GCA_015227675.2    10116     [32m✓[39m      Rattus norvegicus                        2020-11-Ensembl/2022-03                 [0m
[0mUTH_Rnor_SHR_Utx     Ensembl  GCA_023515785.1    10116     [32m✓[39m      Rattus norvegicus                        2022-10-Ensembl/2023-01                 [0m
[0mUTH_Rnor_WKY_Bbb_1.0 Ensembl  GCA_023515805.1    10116     [32m✓[39m      Rattus norvegicus                        2022-10-Ensembl/2023-01                 [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "mRatBN7.2"

In [55]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[0m

In [6]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/mRatBN7.2/mRatBN7.2.fa'

In [7]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/mRatBN7.2/mRatBN7.2.annotation.gtf'

In [58]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: mRatBN7.2.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/Rattus_norvegicus/Rattus_norvegicus.gtf
Total records in GTF: 1284446
Filtered 17547 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
snoRNA: 5118
pseudogene: 4926
snRNA: 4536
miRNA: 1332
rRNA: 630
processed_pseudogene: 576
scaRNA: 111
ribozyme: 108
misc_RNA: 81
Mt_tRNA: 66
Y_RNA: 54
Mt_rRNA: 6
vault_RNA: 3
-- Count of biotypes kept --
protein_coding: 2472948
lncRNA: 34794
IG_V_gene: 382
TR_C_gene: 18
TR_J_gene: 18
TR_V_gene: 9
----------------------------


In [8]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [9]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/mRatBN7.2/mRatBN7.2.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/Rattus_norvegicus/Rattus_norvegicus.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Jan 31 13:08:11 ..... started STAR run

Jan 31 13:08:11 ... starting to generate Genome files
Jan 31 13:08:47 ..... processing annotations GTF
Jan 31 13:08:59 ... starting to sort Suffix Array. This may take a long time...
Jan 31 13:09:08 ... sorting Suffix Array chunks and saving them to disk...
Jan 31 13:15:51 ... loading chunks from disk, packing SA...
Jan 31 13:16:48 ... finished generating suffix array
Jan 31 13:16:48 ... generating Suffix Array index
Jan 31 13:

# Callithrix jacchus

In [16]:
# set the organism
organism = "Callithrix jacchus"

In [17]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [18]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mCallithrix_jacchus_cj1700_1.0 NCBI     GCA_009663435.1     9483     [31m✗[39m      Callithrix jacchus                       McDonnell Genome Institute at Washington University[0m
[0mmCalJac1.pat         NCBI     GCA_011100535.1     9483     [31m✗[39m      Callithrix jacchus                       Vertebrate Genomes Project              [0m
[0mmCalJac1.pat.X       NCBI     GCA_011100555.1     9483     [31m✗[39m      Callithrix jacchus                       Vertebrate Genomes Project              [0m
[0mCallithrix_jacchus-3.2 NCBI     GCA_000004665.1     9483     [31m✗[39m      Callithrix jacchus                       Washington University (WashU)           [0m
[0mCallithrix_jacchus_cj1700_1.1 NCBI     GCA_009663435.2     9483     [31m✗[39m      Callithrix jacchus                       McDonnell Genome Institute a

In [19]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mmCalJac1.pat.X       Ensembl  GCA_011100555.1     9483     [32m✓[39m      Callithrix jacchus                       2020-08-Ensembl/2020-11                 [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "mCalJac1.pat.X"

In [17]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m13:27:30[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/callithrix_jacchus/dna/Callithrix_jacchus.mCalJac1.pat.X.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 841M/841M [00:27<00:00, 32.4MB/s][0m[0m
[0m[32m13:27:57[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m13:28:11[0m [1m|[0m [34mINFO[0m [1m|[0m name: mCalJac1.pat.X
[32m13:28:11[0m [1m|[0m [34mINFO[0m [1m|[0m local name: mCalJac1.pat.X
[32m13:28:11[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Callithrix_jacchus/mCalJac1.pat.X/mCalJac1.pat.X.fa
Filtering Fasta: 47.2M lines [00:12, 3.92M lines/s][0m[0m
[0m[32m13:29:04[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/callithrix_jacchus/Callithrix_jacchus.mCalJac1.pat.X.113.gtf

In [18]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Callithrix_jacchus/mCalJac1.pat.X/mCalJac1.pat.X.fa'

In [19]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Callithrix_jacchus/mCalJac1.pat.X/mCalJac1.pat.X.annotation.gtf'

In [20]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: mCalJac1.pat.X.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Callithrix_jacchus/Callithrix_jacchus/Callithrix_jacchus.gtf
Total records in GTF: 1448697
Filtered 38300 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
misc_RNA: 12612
snRNA: 9249
snoRNA: 4503
pseudogene: 4061
Y_RNA: 2811
processed_pseudogene: 2598
miRNA: 1620
rRNA: 639
scaRNA: 147
vault_RNA: 48
ribozyme: 12
-- Count of biotypes kept --
protein_coding: 2703693
lncRNA: 87632
IG_V_gene: 709
TR_V_gene: 254
TR_J_gene: 108
TR_C_gene: 35
IG_C_gene: 17
----------------------------


In [21]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [22]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Callithrix_jacchus/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Callithrix_jacchus/mCalJac1.pat.X/mCalJac1.pat.X.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Callithrix_jacchus/Callithrix_jacchus/Callithrix_jacchus.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Jan 31 13:29:49 ..... started STAR run

Jan 31 13:29:49 ... starting to generate Genome files
Jan 31 13:30:27 ..... processing annotations GTF
Jan 31 13:30:40 ... starting to sort Suffix Array. This may take a long time...
Jan 31 13:30:49 ... sorting Suffix Array chunks and saving them to disk...
Jan 31 13:38:42 ... loading chunks from disk, packing SA...
Jan 31 13:39:49 ... finished generating suffix array
Jan 31 13:39:49 ... generating Suffix Array i

# Equus caballus

In [20]:
organism = "Equus caballus"

In [27]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [21]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mASM216690v1          NCBI     GCA_002166905.1     9796     [31m✗[39m      Equus caballus                           Institute of Animal Breeding and Genetics - Vetmeduni Vienna[0m
[0mEquCab2.0            NCBI     GCA_000002305.1     9796     [31m✗[39m      Equus caballus                           The Genome Assembly Team                [0m
[0mEquCab3.0            NCBI     GCA_002863925.1     9796     [31m✗[39m      Equus caballus                           University of Louisville                [0m
[0mAjinai1.0            NCBI     GCA_000696655.1     9796     [31m✗[39m      Equus caballus                           College of Animal Science, Inner Mongolian Agricultural University, China[0m
[0mViralProj15487       NCBI     GCF_000866385.1   333919     [32m✓[39m      Equus caballus papillomavirus 1          Cervic

In [22]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mEquCab3.0            Ensembl  GCA_002863925.1     9796     [32m✓[39m      Equus caballus                           2021-10-Ensembl/2022-01                 [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "EquCab3.0"

In [27]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m13:46:41[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/equus_caballus/dna/Equus_caballus.EquCab3.0.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 785M/785M [00:25<00:00, 32.5MB/s][0m[0m
[0m[32m13:47:07[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m13:47:19[0m [1m|[0m [34mINFO[0m [1m|[0m name: EquCab3.0
[32m13:47:19[0m [1m|[0m [34mINFO[0m [1m|[0m local name: EquCab3.0
[32m13:47:19[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Equus_caballus/EquCab3.0/EquCab3.0.fa
Filtering Fasta: 41.8M lines [00:10, 4.08M lines/s][0m[0m
[0m[32m13:48:06[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/equus_caballus/Equus_caballus.EquCab3.0.113.gtf.gz...
Download: 100%|████████████████████████████

In [28]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Equus_caballus/EquCab3.0/EquCab3.0.fa'

In [29]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Equus_caballus/EquCab3.0/EquCab3.0.annotation.gtf'

In [30]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: EquCab3.0.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Equus_caballus/Equus_caballus/Equus_caballus.gtf
Total records in GTF: 2371117
Filtered 7783 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
pseudogene: 2119
miRNA: 2055
snRNA: 1620
snoRNA: 1443
processed_pseudogene: 294
Mt_tRNA: 66
scaRNA: 60
Y_RNA: 42
rRNA: 36
misc_RNA: 27
ribozyme: 9
vault_RNA: 6
Mt_rRNA: 6
-- Count of biotypes kept --
protein_coding: 4351680
lncRNA: 337930
IG_V_gene: 168
TR_V_gene: 148
TR_C_gene: 81
TR_J_gene: 27
----------------------------


In [31]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [32]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Equus_caballus/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Equus_caballus/EquCab3.0/EquCab3.0.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Equus_caballus/Equus_caballus/Equus_caballus.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Jan 31 13:50:28 ..... started STAR run

Jan 31 13:50:28 ... starting to generate Genome files
Jan 31 13:51:06 ..... processing annotations GTF
Jan 31 13:51:23 ... starting to sort Suffix Array. This may take a long time...
Jan 31 13:51:34 ... sorting Suffix Array chunks and saving them to disk...
Jan 31 13:59:55 ... loading chunks from disk, packing SA...
Jan 31 14:00:55 ... finished generating suffix array
Jan 31 14:00:55 ... generating Suffix Array index
Jan 31 14:04:30 ... compl

# Canis lupus familiaris

In [23]:
organism = "Canis lupus familiaris"

In [24]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [26]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mCanFam3.1            NCBI     GCF_000002285.3     9612     [32m✓[39m      Canis lupus familiaris                   Dog Genome Sequencing Consortium        [0m
[0mBasenji_breed-1.1    NCBI     GCA_004886185.2     9612     [31m✗[39m      Canis lupus familiaris                   University of Missouri                  [0m
[0mASM864105v1          NCBI     GCA_008641055.1     9612     [31m✗[39m      Canis lupus familiaris                   James Cook University                   [0m
[0mASM864105v2          NCBI     GCA_008641055.2     9612     [31m✗[39m      Canis lupus familiaris                   James Cook University                   [0m
[0mUNSW_CanFamBas_1.0   NCBI     GCF_013276365.1     9612     [32m✓[39m      Canis lupus familiaris                   University of New South Wales           [0m
[0mUMICH_Zoey

In [25]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mUMICH_Zoey_3.1       Ensembl  GCA_005444595.1     9615     [32m✓[39m      Canis lupus familiaris                   2019-07-Ensembl/2019-08                 [0m
[0mROS_Cfam_1.0         Ensembl  GCA_014441545.1     9615     [32m✓[39m      Canis lupus familiaris                   2020-10-Ensembl/2020-10                 [0m
[0mUU_Cfam_GSD_1.0      Ensembl  GCA_011100685.1     9615     [32m✓[39m      Canis lupus familiaris                   2021-03-Ensembl/2021-03                 [0m
[0mDog10K_Boxer_Tasha   Ensembl  GCA_000002285.4     9615     [32m✓[39m      Canis lupus familiaris                   2020-10-Ensembl/2020-10                 [0m
[0mBasenji_breed-1.1    Ensembl  GCA_004886185.1     9615     [32m✓[39m      Canis lupus familiaris                   2019-07-Ensembl/2019-08                 [0m
[0m[32m ^[0

In [None]:
# set the genome name
genome_name = "ROS_Cfam_1.0"

In [35]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m14:10:42[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/canis_lupus_familiaris/dna/Canis_lupus_familiaris.ROS_Cfam_1.0.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 742M/742M [00:23<00:00, 32.5MB/s][0m[0m
[0m[32m14:11:06[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m14:11:18[0m [1m|[0m [34mINFO[0m [1m|[0m name: ROS_Cfam_1.0
[32m14:11:18[0m [1m|[0m [34mINFO[0m [1m|[0m local name: ROS_Cfam_1.0
[32m14:11:18[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Canis_lupus_familiaris/ROS_Cfam_1.0/ROS_Cfam_1.0.fa
Filtering Fasta: 39.9M lines [00:09, 4.10M lines/s][0m[0m
[0m[32m14:12:04[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/canis_lupus_familiaris/Canis_lupus_familiaris.ROS_Cfam_1.0

In [36]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Canis_lupus_familiaris/ROS_Cfam_1.0/ROS_Cfam_1.0.fa'

In [37]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Canis_lupus_familiaris/ROS_Cfam_1.0/ROS_Cfam_1.0.annotation.gtf'

In [38]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: ROS_Cfam_1.0.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Canis_lupus_familiaris/Canis_lupus_familiaris/Canis_lupus_familiaris.gtf
Total records in GTF: 1295258
Filtered 14652 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
snRNA: 5250
pseudogene: 3819
miRNA: 2958
snoRNA: 1854
processed_pseudogene: 438
scaRNA: 84
Y_RNA: 81
rRNA: 81
misc_RNA: 66
ribozyme: 18
vault_RNA: 3
-- Count of biotypes kept --
protein_coding: 2485196
lncRNA: 48257
IG_V_gene: 293
IG_C_gene: 276
TR_J_gene: 63
TR_C_gene: 43
TR_V_gene: 38
----------------------------


In [39]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [40]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Canis_lupus_familiaris/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Canis_lupus_familiaris/ROS_Cfam_1.0/ROS_Cfam_1.0.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Canis_lupus_familiaris/Canis_lupus_familiaris/Canis_lupus_familiaris.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Jan 31 14:12:47 ..... started STAR run

Jan 31 14:12:47 ... starting to generate Genome files
Jan 31 14:13:18 ..... processing annotations GTF
Jan 31 14:13:29 ... starting to sort Suffix Array. This may take a long time...
Jan 31 14:13:37 ... sorting Suffix Array chunks and saving them to disk...
Jan 31 14:19:38 ... loading chunks from disk, packing SA...
Jan 31 14:20:38 ... finished generating suffix array
Jan 31 14:20:38 ... generatin

# Bos taurus

In [27]:
organism = "Bos taurus"

In [28]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [29]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mBos_taurus_UMD_3.1.1 NCBI     GCA_000003055.5     9913     [32m✓[39m      Bos taurus                               Center for Bioinformatics and Computational Biology, University of Maryland[0m
[0mBtau_4.6.1           NCBI     GCF_000003205.5     9913     [32m✓[39m      Bos taurus                               Cattle Genome Sequencing International Consortium[0m
[0mBtau_5.0             NCBI     GCA_000003205.5     9913     [32m✓[39m      Bos taurus                               Cattle Genome Sequencing International Consortium[0m
[0mARS-UCD1.2           NCBI     GCF_002263795.1     9913     [32m✓[39m      Bos taurus                               USDA ARS                                [0m
[0mARS-UCD1.3           NCBI     GCF_002263795.2     9913     [32m✓[39m      Bos taurus                               USDA A

In [30]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mUOA_Angus_1          Ensembl  GCA_003369685.2    30522     [32m✓[39m      Bos indicus x Bos taurus                 2018-12-Ensembl/2019-03                 [0m
[0mUOA_Brahman_1        Ensembl  GCA_003369695.2    30522     [32m✓[39m      Bos indicus x Bos taurus                 2018-12-Ensembl/2019-03                 [0m
[0mARS-UCD1.3           Ensembl  GCA_002263795.3     9913     [32m✓[39m      Bos taurus                               2023-02-Ensembl/2023-06                 [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "ARS-UCD1.3"

In [None]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m17:34:02[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/bos_taurus/dna/Bos_taurus.ARS-UCD1.3.dna_sm.toplevel.fa.gz...
Download:  32%|█████████▋                    | 270M/833M [00:10<00:19, 30.7MB/s][0m

In [None]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

In [None]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

In [None]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

In [None]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

In [None]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

# Ovis aries

In [31]:
organism = "Ovis aries"

In [32]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [33]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mOar_v3.1             NCBI     GCF_000298735.1     9940     [32m✓[39m      Ovis aries                               International Sheep Genome Consortium   [0m
[0mARS-UI_Ramb_v2.0     NCBI     GCF_016772045.1     9940     [32m✓[39m      Ovis aries                               University of Idaho                     [0m
[0mOar_v4.0             NCBI     GCA_000298735.2     9940     [31m✗[39m      Ovis aries                               International Sheep Genome Consortium   [0m
[0mOar_rambouillet_v1.0 NCBI     GCA_002742125.1     9940     [31m✗[39m      Ovis aries                               Baylor College of Medicine Human Genome Sequencing Center[0m
[0mOvis_aries_1.0       NCBI     GCA_000005525.1     9940     [31m✗[39m      Ovis aries                               International Sheep Genomics Consortium [

In [34]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mARS-UI_Ramb_v2.0     Ensembl  GCA_016772045.1     9940     [32m✓[39m      Ovis aries                               2022-08-Ensembl                         [0m
[0mOar_v3.1             Ensembl  GCA_000298735.1     9940     [31m✗[39m      Ovis aries                               2012-12-Ensembl/2015-05                 [0m
[0mASM2241668v1         Ensembl  GCA_022416685.1     9940     [32m✓[39m      Ovis aries                               ENS01                                   [0m
[0mNWAFU_Friesian_1.0   Ensembl  GCA_018804185.1     9940     [32m✓[39m      Ovis aries                               ENS01                                   [0m
[0mASM2243283v1         Ensembl  GCA_022432835.1     9940     [32m✓[39m      Ovis aries                               ENS01                                   [0m
[0mASM2241691

In [None]:
# set the genome name
genome_name = "ARS-UI_Ramb_v2.0"

In [12]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m08:13:27[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/ovis_aries_rambouillet/dna/Ovis_aries_rambouillet.ARS-UI_Ramb_v2.0.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 817M/817M [00:26<00:00, 32.5MB/s][0m[0m
[0m[32m08:13:53[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m08:14:06[0m [1m|[0m [34mINFO[0m [1m|[0m name: ARS-UI_Ramb_v2.0
[32m08:14:06[0m [1m|[0m [34mINFO[0m [1m|[0m local name: ARS-UI_Ramb_v2.0
[32m08:14:06[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Ovis_aries/ARS-UI_Ramb_v2.0/ARS-UI_Ramb_v2.0.fa
Filtering Fasta: 43.8M lines [00:10, 4.01M lines/s][0m[0m
[0m[32m08:14:54[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/ovis_aries_rambouillet/Ovis_aries_rambouillet.ARS-

In [13]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Ovis_aries/ARS-UI_Ramb_v2.0/ARS-UI_Ramb_v2.0.fa'

In [14]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Ovis_aries/ARS-UI_Ramb_v2.0/ARS-UI_Ramb_v2.0.annotation.gtf'

In [15]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: ARS-UI_Ramb_v2.0.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Ovis_aries/Ovis_aries/Ovis_aries.gtf
Total records in GTF: 1463309
Filtered 9317 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
snRNA: 3699
snoRNA: 2094
pseudogene: 1901
miRNA: 1059
rRNA: 147
processed_pseudogene: 114
scaRNA: 93
Mt_tRNA: 66
misc_RNA: 57
Y_RNA: 33
vault_RNA: 30
ribozyme: 18
Mt_rRNA: 6
-- Count of biotypes kept --
protein_coding: 2776290
lncRNA: 102265
TR_V_gene: 842
IG_V_gene: 196
TR_J_gene: 195
IG_C_gene: 141
TR_C_gene: 63
----------------------------


In [16]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [17]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Ovis_aries/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Ovis_aries/ARS-UI_Ramb_v2.0/ARS-UI_Ramb_v2.0.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Ovis_aries/Ovis_aries/Ovis_aries.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 08:15:23 ..... started STAR run

Feb 01 08:15:23 ... starting to generate Genome files
Feb 01 08:15:58 ..... processing annotations GTF
Feb 01 08:16:11 ... starting to sort Suffix Array. This may take a long time...
Feb 01 08:16:20 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 08:23:54 ... loading chunks from disk, packing SA...
Feb 01 08:24:52 ... finished generating suffix array
Feb 01 08:24:52 ... generating Suffix Array index
Feb 01 08:28:16 ... completed S

# Sus scrofa

In [35]:
organism = "Sus scrofa"

In [36]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [37]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mSscrofa10.2          NCBI     GCF_000003025.5     9823     [32m✓[39m      Sus scrofa                               The Swine Genome Sequencing Consortium (SGSC)[0m
[0mSscrofa11            NCBI     GCA_000003025.5     9823     [32m✓[39m      Sus scrofa                               The Swine Genome Sequencing Consortium (SGSC)[0m
[0mminipig_v1.0         NCBI     GCA_000325925.2     9823     [31m✗[39m      Sus scrofa                               BGI-shenzhen                            [0m
[0mTibetan_Pig_v1.0     NCBI     GCA_000472085.1     9823     [32m✓[39m      Sus scrofa                               Novogene                                [0m
[0mNSME_pig_1           NCBI     GCA_006511355.1     9823     [31m✗[39m      Sus scrofa scrofa                        University of Messina                   [0m
[0m

In [38]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mPietrain_pig_v1      Ensembl  GCA_001700255.1     9823     [32m✓[39m      Sus scrofa                               2019-01-Ensembl/2019-06                 [0m
[0mLarge_White_v1       Ensembl  GCA_001700135.1     9823     [32m✓[39m      Sus scrofa                               2019-01-Ensembl/2019-06                 [0m
[0mBamei_pig_v1         Ensembl  GCA_001700235.1     9823     [32m✓[39m      Sus scrofa                               2019-01-Ensembl/2019-06                 [0m
[0mTibetan_Pig_v2       Ensembl  GCA_000472085.2     9823     [32m✓[39m      Sus scrofa                               2019-01-Ensembl/2019-06                 [0m
[0mSscrofa11.1          Ensembl  GCA_000003025.6     9823     [32m✓[39m      Sus scrofa                               HAV01                                   [0m
[0mLandrace_p

In [None]:
# set the genome name
genome_name = "Sscrofa11.1"

In [44]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m08:37:06[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/sus_scrofa/dna/Sus_scrofa.Sscrofa11.1.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 766M/766M [00:24<00:00, 32.4MB/s][0m[0m
[0m[32m08:37:31[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m08:37:43[0m [1m|[0m [34mINFO[0m [1m|[0m name: Sscrofa11.1
[32m08:37:43[0m [1m|[0m [34mINFO[0m [1m|[0m local name: Sscrofa11.1
[32m08:37:43[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Sus_scrofa/Sscrofa11.1/Sscrofa11.1.fa
Filtering Fasta: 41.7M lines [00:10, 3.95M lines/s][0m[0m
[0m[32m08:38:30[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/sus_scrofa/Sus_scrofa.Sscrofa11.1.113.gtf.gz...
Download: 100%|████████████████████████████| 17.9M/

In [45]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Sus_scrofa/Sscrofa11.1/Sscrofa11.1.fa'

In [46]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Sus_scrofa/Sscrofa11.1/Sscrofa11.1.annotation.gtf'

In [47]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: Sscrofa11.1.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Sus_scrofa/Sus_scrofa/Sus_scrofa.gtf
Total records in GTF: 1366243
Filtered 16502 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
nonsense_mediated_decay: 6350
snRNA: 3312
pseudogene: 2054
snoRNA: 1773
miRNA: 1164
processed_pseudogene: 697
protein_coding_CDS_not_defined: 273
retained_intron: 242
unitary_pseudogene: 156
unprocessed_pseudogene: 108
scaRNA: 75
Mt_tRNA: 66
rRNA: 63
misc_RNA: 51
transcribed_unprocessed_pseudogene: 46
Y_RNA: 27
ribozyme: 21
translated_unprocessed_pseudogene: 9
vault_RNA: 9
Mt_rRNA: 6
-- Count of biotypes kept --
protein_coding: 2596185
lncRNA: 76916
IG_V_gene: 104
TR_V_gene: 78
TR_J_gene: 36
IG_C_gene: 17
----------------------------


In [48]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [49]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Sus_scrofa/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Sus_scrofa/Sscrofa11.1/Sscrofa11.1.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Sus_scrofa/Sus_scrofa/Sus_scrofa.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 08:38:56 ..... started STAR run

Feb 01 08:38:56 ... starting to generate Genome files
Feb 01 08:39:30 ..... processing annotations GTF
Feb 01 08:39:42 ... starting to sort Suffix Array. This may take a long time...
Feb 01 08:39:50 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 08:45:53 ... loading chunks from disk, packing SA...
Feb 01 08:46:54 ... finished generating suffix array
Feb 01 08:46:54 ... generating Suffix Array index
Feb 01 08:50:19 ... completed Suffix Arra

# Heterocephalus glaber

In [39]:
organism = "Heterocephalus glaber"

In [40]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [41]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mHetGla_1.0           NCBI     GCA_000230445.1    10181     [32m✓[39m      Heterocephalus glaber                    Beijing Genomics Institute              [0m
[0mHetGla_female_1.0    NCBI     GCF_000247695.1    10181     [32m✓[39m      Heterocephalus glaber                    Broad Institute                         [0m
[0mHeter_glaber.v1.7_hic_pac NCBI     GCA_014060925.1    10181     [31m✗[39m      Heterocephalus glaber                    Institute of Zoology, Chinese Academy of Sciences[0m
[0mNaked_mole-rat_maternal NCBI     GCA_944319715.1    10181     [31m✗[39m      Heterocephalus glaber                    hospital for sick children              [0m
[0mNaked_mole-rat_paternal NCBI     GCA_944319725.1    10181     [31m✗[39m      Heterocephalus glaber                    hospital for sick children             

In [42]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mNaked_mole-rat_maternal Ensembl  GCA_944319715.1    10181     [32m✓[39m      Heterocephalus glaber                    2022-08-Ensembl/2022-09                 [0m
[0mNaked_mole-rat_paternal Ensembl  GCA_944319725.1    10181     [32m✓[39m      Heterocephalus glaber                    2022-08-Ensembl/2022-09                 [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "Naked_mole-rat_maternal"

In [53]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m08:53:29[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/heterocephalus_glaber_female/dna/Heterocephalus_glaber_female.Naked_mole-rat_maternal.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 780M/780M [00:25<00:00, 32.4MB/s][0m[0m
[0m[32m08:53:55[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m08:54:07[0m [1m|[0m [34mINFO[0m [1m|[0m name: Naked_mole-rat_maternal
[32m08:54:07[0m [1m|[0m [34mINFO[0m [1m|[0m local name: Naked_mole-rat_maternal
[32m08:54:07[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Heterocephalus_glaber/Naked_mole-rat_maternal/Naked_mole-rat_maternal.fa
Filtering Fasta: 41.7M lines [00:10, 4.04M lines/s][0m[0m
[0m[32m08:54:53[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-

In [54]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Heterocephalus_glaber/Naked_mole-rat_maternal/Naked_mole-rat_maternal.fa'

In [55]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Heterocephalus_glaber/Naked_mole-rat_maternal/Naked_mole-rat_maternal.annotation.gtf'

In [56]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: Naked_mole-rat_maternal.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Heterocephalus_glaber/Heterocephalus_glaber/Heterocephalus_glaber.gtf
Total records in GTF: 1411133
Filtered 22118 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
pseudogene: 13493
snRNA: 5082
snoRNA: 2076
processed_pseudogene: 786
miRNA: 237
misc_RNA: 111
scaRNA: 99
Mt_tRNA: 66
Y_RNA: 57
ribozyme: 51
rRNA: 27
vault_RNA: 27
Mt_rRNA: 6
-- Count of biotypes kept --
protein_coding: 2594851
lncRNA: 151628
IG_V_gene: 234
TR_J_gene: 27
TR_V_gene: 9
TR_C_gene: 9
----------------------------


In [57]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [58]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Heterocephalus_glaber/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Heterocephalus_glaber/Naked_mole-rat_maternal/Naked_mole-rat_maternal.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Heterocephalus_glaber/Heterocephalus_glaber/Heterocephalus_glaber.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 08:55:20 ..... started STAR run

Feb 01 08:55:20 ... starting to generate Genome files
Feb 01 08:55:53 ..... processing annotations GTF
Feb 01 08:56:05 ... starting to sort Suffix Array. This may take a long time...
Feb 01 08:56:13 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 09:02:31 ... loading chunks from disk, packing SA...
Feb 01 09:03:31 ... finished generating suffix array
Feb 01 09:03

# Oryctolagus cuniculus

In [43]:
organism = "Oryctolagus cuniculus"

In [44]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [45]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mUM_NZW_0.0           NCBI     GCA_009806435.1     9986     [31m✗[39m      Oryctolagus cuniculus                    Shanghai Institutes for Biological Sciences[0m
[0mOryCun2.0            NCBI     GCA_000003625.1     9986     [31m✗[39m      Oryctolagus cuniculus                    The Broad Institute of MIT and Harvard  [0m
[0mUM_NZW_1.0           NCBI     GCA_009806435.2     9986     [31m✗[39m      Oryctolagus cuniculus                    Shanghai Institutes for Biological Sciences[0m
[0mViralProj14057       NCBI     GCF_000837785.1   122291     [32m✓[39m      Oryctolagus cuniculus papillomavirus 1   The Jake Gittlen Cancer Research Institute, Department of Pathology, Pennsylvania State College of Medicine[0m
[0mOryCun3.0            NCBI     GCA_013371645.1     9986     [31m✗[39m      Oryctolagus cuniculus cunic

In [46]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mOryCun2.0            Ensembl  GCA_000003625.1     9986     [32m✓[39m      Oryctolagus cuniculus                    2009-11-Ensembl/2019-05                 [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "OryCun2.0"

In [7]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m11:35:01[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/oryctolagus_cuniculus/dna/Oryctolagus_cuniculus.OryCun2.0.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 808M/808M [00:25<00:00, 32.7MB/s][0m[0m
[0m[32m11:35:27[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m11:35:40[0m [1m|[0m [34mINFO[0m [1m|[0m name: OryCun2.0
[32m11:35:40[0m [1m|[0m [34mINFO[0m [1m|[0m local name: OryCun2.0
[32m11:35:40[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Oryctolagus_cuniculus/OryCun2.0/OryCun2.0.fa
Filtering Fasta: 45.6M lines [00:11, 3.95M lines/s][0m[0m
[0m[32m11:36:31[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/oryctolagus_cuniculus/Oryctolagus_cuniculus.OryCun2.0.113.gtf.gz...
Download

In [8]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Oryctolagus_cuniculus/OryCun2.0/OryCun2.0.fa'

In [9]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Oryctolagus_cuniculus/OryCun2.0/OryCun2.0.annotation.gtf'

In [10]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: OryCun2.0.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Oryctolagus_cuniculus/Oryctolagus_cuniculus/Oryctolagus_cuniculus.gtf
Total records in GTF: 1141097
Filtered 12404 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
pseudogene: 4247
snRNA: 3879
snoRNA: 2505
miRNA: 864
processed_pseudogene: 387
misc_RNA: 189
scaRNA: 96
rRNA: 93
Mt_tRNA: 66
Y_RNA: 33
ribozyme: 21
vault_RNA: 18
Mt_rRNA: 6
-- Count of biotypes kept --
protein_coding: 2173515
lncRNA: 57393
TR_J_gene: 63
IG_V_gene: 33
IG_C_gene: 17
TR_V_gene: 15
TR_C_gene: 9
----------------------------


In [11]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [12]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Oryctolagus_cuniculus/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Oryctolagus_cuniculus/OryCun2.0/OryCun2.0.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Oryctolagus_cuniculus/Oryctolagus_cuniculus/Oryctolagus_cuniculus.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 11:36:54 ..... started STAR run

Feb 01 11:36:55 ... starting to generate Genome files
Feb 01 11:37:32 ..... processing annotations GTF
Feb 01 11:37:46 ... starting to sort Suffix Array. This may take a long time...
Feb 01 11:37:56 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 11:45:15 ... loading chunks from disk, packing SA...
Feb 01 11:46:14 ... finished generating suffix array
Feb 01 11:46:14 ... generating Suffix Ar

# Pan troglodytes

In [47]:
organism = "Pan troglodytes"


In [48]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [49]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mPan_troglodytes-2.1.4 NCBI     GCF_000001515.6     9598     [32m✓[39m      Pan troglodytes                          Chimpanzee Sequencing and Analysis Consortium[0m
[0mClint_PTR_v0         NCBI     GCA_002880755.1     9598     [32m✓[39m      Pan troglodytes                          University of Washington                [0m
[0mClint_PTR_v1         NCBI     GCA_002880755.2     9598     [32m✓[39m      Pan troglodytes                          University of Washington                [0m
[0mNHGRI_mPanTro3-v1.1-hic.freeze_pri NCBI     GCF_028858775.1     9598     [32m✓[39m      Pan troglodytes                          National Human Genome Research Institute, National Institutes of Health[0m
[0mNHGRI_mPanTro3-v2.0_pri NCBI     GCF_028858775.2     9598     [32m✓[39m      Pan troglodytes                          Natio

In [50]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mPan_tro_3.0          Ensembl  GCA_000001515.5     9598     [32m✓[39m      Pan troglodytes                          2017-08-Ensembl/2020-03                 [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [51]:
# set the genome name
genome_name = "Pan_tro_3.0"

In [22]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m11:55:36[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/pan_troglodytes/dna/Pan_troglodytes.Pan_tro_3.0.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 960M/960M [00:30<00:00, 32.6MB/s][0m[0m
[0m[32m11:56:07[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m11:56:22[0m [1m|[0m [34mINFO[0m [1m|[0m name: Pan_tro_3.0
[32m11:56:22[0m [1m|[0m [34mINFO[0m [1m|[0m local name: Pan_tro_3.0
[32m11:56:22[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Pan_troglodytes/Pan_tro_3.0/Pan_tro_3.0.fa
Filtering Fasta: 53.9M lines [00:13, 4.04M lines/s][0m[0m
[0m[32m11:57:23[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.113.gtf.gz...
Download: 100%|███████████

In [23]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Pan_troglodytes/Pan_tro_3.0/Pan_tro_3.0.fa'

In [24]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Pan_troglodytes/Pan_tro_3.0/Pan_tro_3.0.annotation.gtf'

In [28]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: Pan_tro_3.0.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Pan_troglodytes/Pan_troglodytes/Pan_troglodytes.gtf
Total records in GTF: 1296549
Filtered 39696 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
lincRNA: 14191
misc_RNA: 6852
miRNA: 6723
snRNA: 5709
snoRNA: 2634
rRNA: 1641
pseudogene: 1013
processed_pseudogene: 696
scaRNA: 141
Mt_tRNA: 66
ribozyme: 24
Mt_rRNA: 6
-- Count of biotypes kept --
protein_coding: 2487876
IG_V_gene: 1108
TR_V_gene: 791
IG_C_gene: 278
TR_C_gene: 119
----------------------------


In [29]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [31]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100 \
    --limitGenomeGenerateRAM=61000000000

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Pan_troglodytes/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Pan_troglodytes/Pan_tro_3.0/Pan_tro_3.0.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Pan_troglodytes/Pan_troglodytes/Pan_troglodytes.gtf --sjdbOverhang 100 --limitGenomeGenerateRAM=61000000000
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 11:59:39 ..... started STAR run

Feb 01 11:59:39 ... starting to generate Genome files
Feb 01 12:01:10 ..... processing annotations GTF
Feb 01 12:01:57 ... starting to sort Suffix Array. This may take a long time...
Feb 01 12:02:24 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 12:12:17 ... loading chunks from disk, packing SA...
Feb 01 12:13:52 ... finished generating suffix array
Feb 01 12:13:52 ... generatin

# Gallus gallus

In [52]:
organism = "Gallus gallus"

In [53]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [55]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mGallus_gallus-4.0    NCBI     GCF_000002315.3     9031     [32m✓[39m      Gallus gallus                            International Chicken Genome Consortium [0m
[0mGallus_gallus-5.0    NCBI     GCF_000002315.4     9031     [32m✓[39m      Gallus gallus                            International Chicken Genome Consortium [0m
[0mGRCg6                NCBI     GCA_000002315.4     9031     [32m✓[39m      Gallus gallus                            Genome Reference Consortium             [0m
[0mbGalGal1.pat.whiteleghornlayer.GRCg7w NCBI     GCF_016700215.1     9031     [32m✓[39m      Gallus gallus                            Vertebrate Genomes Project              [0m
[0mASM2420605v1         NCBI     GCA_024206055.1     9031     [31m✗[39m      Gallus gallus                            University of Vienna                    [

In [54]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mGRCg6a               Ensembl  GCA_000002315.5     9031     [32m✓[39m      Gallus gallus                            2021-09-Ensembl/2022-01                 [0m
[0mbGalGal1.pat.whiteleghornlayer.GRCg7w Ensembl  GCA_016700215.2     9031     [32m✓[39m      Gallus gallus                            2021-09-Ensembl/2022-01                 [0m
[0mbGalGal1.mat.broiler.GRCg7b Ensembl  GCA_016699485.1     9031     [32m✓[39m      Gallus gallus                            2021-09-Ensembl/2022-01                 [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "bGalGal1.mat.broiler.GRCg7b"

In [7]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m12:55:52[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/gallus_gallus/dna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 316M/316M [00:10<00:00, 31.1MB/s][0m[0m
[0m[32m12:56:03[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m12:56:08[0m [1m|[0m [34mINFO[0m [1m|[0m name: bGalGal1.mat.broiler.GRCg7b
[32m12:56:08[0m [1m|[0m [34mINFO[0m [1m|[0m local name: bGalGal1.mat.broiler.GRCg7b
[32m12:56:08[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Gallus_gallus/bGalGal1.mat.broiler.GRCg7b/bGalGal1.mat.broiler.GRCg7b.fa
Filtering Fasta: 17.6M lines [00:04, 4.13M lines/s][0m[0m
[0m[32m12:56:27[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/gallus_gal

In [8]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Gallus_gallus/bGalGal1.mat.broiler.GRCg7b/bGalGal1.mat.broiler.GRCg7b.fa'

In [9]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Gallus_gallus/bGalGal1.mat.broiler.GRCg7b/bGalGal1.mat.broiler.GRCg7b.annotation.gtf'

In [10]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: bGalGal1.mat.broiler.GRCg7b.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Gallus_gallus/Gallus_gallus/Gallus_gallus.gtf
Total records in GTF: 1476088
Filtered 3588 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
miRNA: 2022
snoRNA: 582
rRNA: 288
pseudogene: 255
snRNA: 246
Mt_tRNA: 66
scaRNA: 45
processed_pseudogene: 45
misc_RNA: 15
Y_RNA: 6
ribozyme: 6
vault_RNA: 6
Mt_rRNA: 6
-- Count of biotypes kept --
protein_coding: 2677015
lncRNA: 239034
----------------------------


In [11]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [12]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Gallus_gallus/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Gallus_gallus/bGalGal1.mat.broiler.GRCg7b/bGalGal1.mat.broiler.GRCg7b.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Gallus_gallus/Gallus_gallus/Gallus_gallus.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 12:57:37 ..... started STAR run

Feb 01 12:57:37 ... starting to generate Genome files
Feb 01 12:57:51 ..... processing annotations GTF
Feb 01 12:57:58 ... starting to sort Suffix Array. This may take a long time...
Feb 01 12:58:02 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 12:59:57 ... loading chunks from disk, packing SA...
Feb 01 13:00:20 ... finished generating suffix array
Feb 01 13:00:20 ... generating Suffix Array 

# Xenopus tropicalis

In [56]:
organism = "Xenopus tropicalis"

In [57]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [58]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mXtropicalis_v7       NCBI     GCF_000004195.2     8364     [32m✓[39m      Xenopus tropicalis                       DOE Joint Genome Institute              [0m
[0mXenopus_tropicalis_v9.1 NCBI     GCF_000004195.3     8364     [32m✓[39m      Xenopus tropicalis                       DOE Joint Genome Institute              [0m
[0mUCB_Xtro_10.0        NCBI     GCF_000004195.4     8364     [32m✓[39m      Xenopus tropicalis                       University of California, Berkeley      [0m
[0mASM1336827v1         NCBI     GCA_013368275.1     8364     [31m✗[39m      Xenopus tropicalis                       Southern University of Science and Technology[0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [59]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mUCB_Xtro_10.0        Ensembl  GCA_000004195.4     8364     [32m✓[39m      Xenopus tropicalis                       2020-09-Ensembl/2021-02                 [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "UCB_Xtro_10.0"

In [16]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m13:22:41[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/xenopus_tropicalis/dna/Xenopus_tropicalis.UCB_Xtro_10.0.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 409M/409M [00:13<00:00, 31.2MB/s][0m[0m
[0m[32m13:22:55[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m13:23:01[0m [1m|[0m [34mINFO[0m [1m|[0m name: UCB_Xtro_10.0
[32m13:23:01[0m [1m|[0m [34mINFO[0m [1m|[0m local name: UCB_Xtro_10.0
[32m13:23:01[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Xenopus_tropicalis/UCB_Xtro_10.0/UCB_Xtro_10.0.fa
Filtering Fasta: 24.2M lines [00:05, 4.06M lines/s][0m[0m
[0m[32m13:23:28[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/xenopus_tropicalis/Xenopus_tropicalis.UCB_Xtro_10.0.113.gtf.gz...

In [17]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Xenopus_tropicalis/UCB_Xtro_10.0/UCB_Xtro_10.0.fa'

In [18]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Xenopus_tropicalis/UCB_Xtro_10.0/UCB_Xtro_10.0.annotation.gtf'

In [19]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: UCB_Xtro_10.0.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Xenopus_tropicalis/Xenopus_tropicalis/Xenopus_tropicalis.gtf
Total records in GTF: 1449685
Filtered 7753 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
snRNA: 2340
rRNA: 1599
processed_pseudogene: 1365
snoRNA: 1041
miRNA: 801
pseudogene: 319
misc_RNA: 72
Mt_tRNA: 66
ribozyme: 60
scaRNA: 54
vault_RNA: 18
Y_RNA: 12
Mt_rRNA: 6
-- Count of biotypes kept --
protein_coding: 2861708
IG_V_gene: 49
----------------------------


In [20]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [21]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Xenopus_tropicalis/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Xenopus_tropicalis/UCB_Xtro_10.0/UCB_Xtro_10.0.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Xenopus_tropicalis/Xenopus_tropicalis/Xenopus_tropicalis.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 13:23:56 ..... started STAR run

Feb 01 13:23:56 ... starting to generate Genome files
Feb 01 13:24:15 ..... processing annotations GTF
Feb 01 13:24:23 ... starting to sort Suffix Array. This may take a long time...
Feb 01 13:24:28 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 13:27:56 ... loading chunks from disk, packing SA...
Feb 01 13:28:27 ... finished generating suffix array
Feb 01 13:28:27 ... generating Suffix Array ind

# Danio rerio

In [60]:
organism = "Danio rerio"

In [61]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [62]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mZv9                  NCBI     GCF_000002035.4     7955     [32m✓[39m      Danio rerio                              Wellcome Trust Sanger Institute         [0m
[0mGRCz10               NCBI     GCF_000002035.5     7955     [32m✓[39m      Danio rerio                              Genome Reference Consortium             [0m
[0mCG2v1.0              NCBI     GCA_001483285.1     7955     [31m✗[39m      Danio rerio                              University of Chicago                   [0m
[0mASM3317019v1         NCBI     GCA_033170195.1     7955     [31m✗[39m      Danio rerio                              National Human Genome Research Institute[0m
[0mfDreTuH1.1           NCBI     GCA_903684855.1     7955     [31m✗[39m      Danio rerio                              SC                                      [0m
[0mGRCz11    

In [63]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mGRCz11               Ensembl  GCA_000002035.4     7955     [32m✓[39m      Danio rerio                              ENS01                                   [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "GRCz11"

In [25]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m13:38:58[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna_sm.primary_assembly.fa.gz...
Download: 100%|██████████████████████████████| 418M/418M [00:14<00:00, 30.9MB/s][0m[0m
[0m[32m13:39:13[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m13:39:19[0m [1m|[0m [34mINFO[0m [1m|[0m name: GRCz11
[32m13:39:19[0m [1m|[0m [34mINFO[0m [1m|[0m local name: GRCz11
[32m13:39:19[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Danio_rerio/GRCz11/GRCz11.fa
Filtering Fasta: 22.9M lines [00:05, 4.10M lines/s][0m[0m
[0m[32m13:39:45[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/danio_rerio/Danio_rerio.GRCz11.113.gtf.gz...
Download: 100%|████████████████████████████| 17.3M/17.3M [00:01<00:0

In [26]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Danio_rerio/GRCz11/GRCz11.fa'

In [27]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Danio_rerio/GRCz11/GRCz11.annotation.gtf'

In [28]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: GRCz11.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Danio_rerio/Danio_rerio/Danio_rerio.gtf
Total records in GTF: 1161868
Filtered 75702 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
processed_transcript: 19525
retained_intron: 15599
nonsense_mediated_decay: 12561
lincRNA: 11049
rRNA: 5898
antisense: 4077
snRNA: 1623
unprocessed_pseudogene: 1575
miRNA: 1305
snoRNA: 741
polymorphic_pseudogene: 380
transcribed_unprocessed_pseudogene: 358
misc_RNA: 282
sense_intronic: 281
processed_pseudogene: 92
Mt_tRNA: 66
non_stop_decay: 60
TEC: 59
pseudogene: 57
sense_overlapping: 45
scaRNA: 33
sRNA: 12
ribozyme: 12
IG_pseudogene: 6
Mt_rRNA: 6
-- Count of biotypes kept --
protein_coding: 2183567
TR_V_gene: 884
TR_J_gene: 592
IG_V_pseudogene: 113
IG_C_gene: 34
IG_C_pseudogene: 19
IG_J_pseudogene: 17
TR_D_gene: 14
TR_V_pseudogene: 7
----------------------------


In [29]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [30]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Danio_rerio/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Danio_rerio/GRCz11/GRCz11.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Danio_rerio/Danio_rerio/Danio_rerio.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 13:47:32 ..... started STAR run

Feb 01 13:47:32 ... starting to generate Genome files
Feb 01 13:47:50 ..... processing annotations GTF
Feb 01 13:47:57 ... starting to sort Suffix Array. This may take a long time...
Feb 01 13:48:02 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 13:51:29 ... loading chunks from disk, packing SA...
Feb 01 13:52:01 ... finished generating suffix array
Feb 01 13:52:01 ... generating Suffix Array index
Feb 01 13:54:26 ... completed Suffix Array ind

# Drosophila melanogaster

In [64]:
organism = "Drosophila melanogaster"

In [65]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [66]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mASM339711v1          NCBI     GCA_003397115.1     7227     [32m✓[39m      Drosophila melanogaster                  University of California - Irvine       [0m
[0mDGRP732              NCBI     GCA_004798075.1     7227     [32m✓[39m      Drosophila melanogaster                  Rutgers University                      [0m
[0mDL_Dmel_Horezu1-2_FlyeQ10reads_1 NCBI     GCA_026123175.1     7227     [31m✗[39m      Drosophila melanogaster                  University of Bucharest                 [0m
[0mRelease_6_plus_ISO1_MT NCBI     GCF_000001215.4     7227     [32m✓[39m      Drosophila melanogaster                  The FlyBase Consortium/Berkeley Drosophila Genome Project/Celera Genomics[0m
[0mASM802v1             NCBI     GCF_000008025.1   163164     [32m✓[39m      Wolbachia endosymbiont of Drosophila melanogaster TIG

In [67]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mBDGP6.46             Ensembl  GCA_000001215.4     7227     [32m✓[39m      Drosophila melanogaster                  dmel_r6.46_FB2022_03                    [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "BDGP6.46"

In [34]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m14:12:15[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/drosophila_melanogaster/dna/Drosophila_melanogaster.BDGP6.46.dna_sm.toplevel.fa.gz...
Download: 100%|████████████████████████████| 42.8M/42.8M [00:02<00:00, 21.6MB/s][0m[0m
[0m[32m14:12:18[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m14:12:18[0m [1m|[0m [34mINFO[0m [1m|[0m name: BDGP6.46
[32m14:12:18[0m [1m|[0m [34mINFO[0m [1m|[0m local name: BDGP6.46
[32m14:12:18[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Drosophila_melanogaster/BDGP6.46/BDGP6.46.fa
Filtering Fasta: 2.40M lines [00:00, 3.93M lines/s][0m[0m
[0m[32m14:12:22[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.113.gtf.gz...
Down

In [35]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Drosophila_melanogaster/BDGP6.46/BDGP6.46.fa'

In [36]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Drosophila_melanogaster/BDGP6.46/BDGP6.46.annotation.gtf'

In [37]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: BDGP6.46.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Drosophila_melanogaster/Drosophila_melanogaster/Drosophila_melanogaster.gtf
Total records in GTF: 567802
Filtered 34166 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
transposable_element: 17694
ncRNA: 12804
pseudogene: 1376
tRNA: 952
snoRNA: 899
rRNA: 345
snRNA: 96
-- Count of biotypes kept --
protein_coding: 1053286
----------------------------


In [38]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [39]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Drosophila_melanogaster/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Drosophila_melanogaster/BDGP6.46/BDGP6.46.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Drosophila_melanogaster/Drosophila_melanogaster/Drosophila_melanogaster.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 14:12:32 ..... started STAR run

Feb 01 14:12:32 ... starting to generate Genome files
Feb 01 14:12:36 ..... processing annotations GTF
Feb 01 14:12:39 ... starting to sort Suffix Array. This may take a long time...
Feb 01 14:12:40 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 14:12:54 ... loading chunks from disk, packing SA...
Feb 01 14:12:58 ... finished generating suffix array
Feb 01 14:12:58 ... generating S

# Caenorhabditis elegans

In [68]:
organism = "Caenorhabditis elegans" 

In [69]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [70]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mASM148330v1          NCBI     GCA_001483305.1     6239     [32m✓[39m      Caenorhabditis elegans                   Hong Kong Baptist University            [0m
[0mWBcel235             NCBI     GCF_000002985.6     6239     [32m✓[39m      Caenorhabditis elegans                   C. elegans Sequencing Consortium        [0m
[0mC_elegans_Bristol_N2_v1_5_4 NCBI     GCA_000939815.1     6239     [31m✗[39m      Caenorhabditis elegans                   WTSI                                    [0m
[0mCael_CB4856_1.0      NCBI     GCA_000975215.1     6239     [31m✗[39m      Caenorhabditis elegans                   University of Washington                [0m
[0mASM148330v2          NCBI     GCA_001483305.2     6239     [31m✗[39m      Caenorhabditis elegans                   Hong Kong Baptist University            [0m
[0mASM

In [71]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mWBcel235             Ensembl  GCA_000002985.3     6239     [32m✓[39m      Caenorhabditis elegans                   WS282                                   [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "WBcel235"

In [43]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m14:42:25[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna_sm.toplevel.fa.gz...
Download: 100%|████████████████████████████| 30.4M/30.4M [00:01<00:00, 19.6MB/s][0m[0m
[0m[32m14:42:27[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m14:42:27[0m [1m|[0m [34mINFO[0m [1m|[0m name: WBcel235
[32m14:42:27[0m [1m|[0m [34mINFO[0m [1m|[0m local name: WBcel235
[32m14:42:27[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Caenorhabditis_elegans/WBcel235/WBcel235.fa
Filtering Fasta: 1.67M lines [00:00, 3.85M lines/s][0m[0m
[0m[32m14:42:30[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/caenorhabditis_elegans/Caenorhabditis_elegans.WBcel235.113.gtf.gz...
Download:

In [44]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Caenorhabditis_elegans/WBcel235/WBcel235.fa'

In [45]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Caenorhabditis_elegans/WBcel235/WBcel235.annotation.gtf'

In [46]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: WBcel235.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Caenorhabditis_elegans/Caenorhabditis_elegans/Caenorhabditis_elegans.gtf
Total records in GTF: 703291
Filtered 94369 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
piRNA: 46089
ncRNA: 28764
pseudogene: 13162
tRNA: 1931
miRNA: 1725
snoRNA: 1038
lincRNA: 816
antisense_RNA: 391
snRNA: 387
rRNA: 66
-- Count of biotypes kept --
protein_coding: 1203105
----------------------------


In [47]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [48]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Caenorhabditis_elegans/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Caenorhabditis_elegans/WBcel235/WBcel235.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Caenorhabditis_elegans/Caenorhabditis_elegans/Caenorhabditis_elegans.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 14:42:42 ..... started STAR run

Feb 01 14:42:42 ... starting to generate Genome files
Feb 01 14:42:43 ..... processing annotations GTF
Feb 01 14:42:45 ... starting to sort Suffix Array. This may take a long time...
Feb 01 14:42:45 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 14:42:53 ... loading chunks from disk, packing SA...
Feb 01 14:42:56 ... finished generating suffix array
Feb 01 14:42:56 ... generating Suffix

# Schistosoma mansoni

In [72]:
organism = "Schistosoma mansoni"

In [73]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [74]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mASM23792v2           NCBI     GCF_000237925.1     6183     [32m✓[39m      Schistosoma mansoni                      Schistosoma Genome Network              [0m
[0mGCA_000237925.3      NCBI     GCA_000237925.4     6183     [31m✗[39m      Schistosoma mansoni                      Schistosoma Genome Network              [0m
[0mSM_V9                NCBI     GCA_000237925.5     6183     [31m✗[39m      Schistosoma mansoni                      Schistosoma Genome Network              [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [75]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mSmansoni_v7          Ensembl  GCA_000237925.3     6183     [32m✓[39m      Schistosoma mansoni                      WBPS12                                  [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "Smansoni_v7"

In [52]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m14:44:32[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/metazoa/fasta/schistosoma_mansoni/dna/Schistosoma_mansoni.Smansoni_v7.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 122M/122M [00:04<00:00, 28.0MB/s][0m[0m
[0m[32m14:44:37[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m14:44:39[0m [1m|[0m [34mINFO[0m [1m|[0m name: Smansoni_v7
[32m14:44:39[0m [1m|[0m [34mINFO[0m [1m|[0m local name: Smansoni_v7
[32m14:44:39[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Schistosoma_mansoni/Smansoni_v7/Smansoni_v7.fa
Filtering Fasta: 6.83M lines [00:01, 4.05M lines/s][0m[0m
[0m[32m14:44:47[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/metazoa/gtf/schistosoma_mansoni/Schistosoma_mansoni.Sman

In [53]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Schistosoma_mansoni/Smansoni_v7/Smansoni_v7.fa'

In [54]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Schistosoma_mansoni/Smansoni_v7/Smansoni_v7.annotation.gtf'

In [55]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: Smansoni_v7.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Schistosoma_mansoni/Schistosoma_mansoni/Schistosoma_mansoni.gtf
Total records in GTF: 335384
Filtered 170 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
pseudogene: 170
-- Count of biotypes kept --
protein_coding: 660284
----------------------------


In [56]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [57]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Schistosoma_mansoni/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Schistosoma_mansoni/Smansoni_v7/Smansoni_v7.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Schistosoma_mansoni/Schistosoma_mansoni/Schistosoma_mansoni.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 14:45:03 ..... started STAR run

Feb 01 14:45:03 ... starting to generate Genome files
Feb 01 14:45:08 ..... processing annotations GTF
Feb 01 14:45:11 ... starting to sort Suffix Array. This may take a long time...
Feb 01 14:45:12 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 14:45:58 ... loading chunks from disk, packing SA...
Feb 01 14:46:07 ... finished generating suffix array
Feb 01 14:46:07 ... generating Suffix Array in

# Anopheles gambiae

In [76]:
organism = "Anopheles gambiae"

In [77]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [78]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0midAnoGambNW_F1_1     NCBI     GCF_943734735.2     7165     [32m✓[39m      Anopheles gambiae                        WELLCOME SANGER INSTITUTE               [0m
[0mAgamP3               NCBI     GCA_000005575.1     7165     [32m✓[39m      Anopheles gambiae str. PEST              The International Consortium for the Sequencing of Anopheles Genome[0m
[0mg4                   NCBI     GCA_000150785.1     7165     [31m✗[39m      Anopheles gambiae                        J. Craig Venter Institute               [0m
[0mViralProj32101       NCBI     GCF_000880635.1  3052098     [32m✓[39m      Anopheles gambiae densovirus             The W. Harry Feinstone Department of Molecular Microbiology and Immunology, Johns Hopkins Bloomberg School of Public Health[0m
[0mASM101452v1          NCBI     GCA_001014525.1     7165     [31m✗

In [79]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mAgamP4               Ensembl  GCA_000005575.1     7165     [32m✓[39m      Anopheles gambiae                        AgamP4.13                               [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "AgamP4"

In [61]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m15:00:48[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/metazoa/fasta/anopheles_gambiae/dna/Anopheles_gambiae.AgamP4.dna_sm.toplevel.fa.gz...
Download: 100%|████████████████████████████| 80.5M/80.5M [00:03<00:00, 26.6MB/s][0m[0m
[0m[32m15:00:52[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m15:00:53[0m [1m|[0m [34mINFO[0m [1m|[0m name: AgamP4
[32m15:00:53[0m [1m|[0m [34mINFO[0m [1m|[0m local name: AgamP4
[32m15:00:53[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Anopheles_gambiae/AgamP4/AgamP4.fa
Filtering Fasta: 4.72M lines [00:01, 4.06M lines/s][0m[0m
[0m[32m15:00:59[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/metazoa/gtf/anopheles_gambiae/Anopheles_gambiae.AgamP4.60.gtf.gz...
Download: 100%|████

In [62]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Anopheles_gambiae/AgamP4/AgamP4.fa'

In [63]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Anopheles_gambiae/AgamP4/AgamP4.annotation.gtf'

In [64]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: AgamP4.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Anopheles_gambiae/Anopheles_gambiae/Anopheles_gambiae.gtf
Total records in GTF: 230287
Filtered 2221 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
tRNA: 1086
rRNA: 726
pre_miRNA: 231
snRNA: 105
pseudogene: 40
ncRNA: 12
SRP_RNA: 9
snoRNA: 6
RNase_MRP_RNA: 3
RNase_P_RNA: 3
-- Count of biotypes kept --
protein_coding: 443026
lncRNA: 10
----------------------------


In [65]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [66]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Anopheles_gambiae/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Anopheles_gambiae/AgamP4/AgamP4.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Anopheles_gambiae/Anopheles_gambiae/Anopheles_gambiae.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 15:01:08 ..... started STAR run

Feb 01 15:01:08 ... starting to generate Genome files
Feb 01 15:01:12 ..... processing annotations GTF
Feb 01 15:01:14 ... starting to sort Suffix Array. This may take a long time...
Feb 01 15:01:15 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 15:01:40 ... loading chunks from disk, packing SA...
Feb 01 15:01:46 ... finished generating suffix array
Feb 01 15:01:46 ... generating Suffix Array index
Feb 01 15:02:44 

# Arabidopsis thaliana

In [80]:
organism = "Arabidopsis thaliana"

In [81]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [83]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mTAIR10               NCBI     GCF_000001735.3     3702     [32m✓[39m      Arabidopsis thaliana                     The Arabidopsis Information Resource (TAIR)[0m
[0massembly_1           NCBI     GCA_900234365.1     3702     [31m✗[39m      Arabidopsis thaliana                     MPI FOR DEVELOPMENTAL BIOLOGY           [0m
[0mASM2091176v1         NCBI     GCA_020911765.1     3702     [31m✗[39m      Arabidopsis thaliana                     Institute of Genetics and Developmental Biology, The Innovative Academy of Seed Design[0m
[0mCol-CC               NCBI     GCA_028009825.2     3702     [31m✗[39m      Arabidopsis thaliana                     Community-Consensus Arabidopsis Thaliana Reference Genome Assembly Consortium[0m
[0mTAIR10.1             NCBI     GCF_000001735.4     3702     [32m✓[39m      Arabidopsis th

In [82]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mTAIR10               Ensembl  GCA_000001735.1     3702     [32m✓[39m      Arabidopsis thaliana                     Araport11                               [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "TAIR10"

In [71]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m15:04:59[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna_sm.toplevel.fa.gz...
Download: 100%|████████████████████████████| 36.7M/36.7M [00:06<00:00, 6.08MB/s][0m[0m
[0m[32m15:05:05[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m15:05:06[0m [1m|[0m [34mINFO[0m [1m|[0m name: TAIR10
[32m15:05:06[0m [1m|[0m [34mINFO[0m [1m|[0m local name: TAIR10
[32m15:05:06[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Arabidopsis_thaliana/TAIR10/TAIR10.fa
Filtering Fasta: 1.99M lines [00:00, 3.95M lines/s][0m[0m
[0m[32m15:05:09[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/plants/gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.60.gtf.gz...
Downlo

In [72]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Arabidopsis_thaliana/TAIR10/TAIR10.fa'

In [73]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Arabidopsis_thaliana/TAIR10/TAIR10.annotation.gtf'

In [74]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: TAIR10.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Arabidopsis_thaliana/Arabidopsis_thaliana/Arabidopsis_thaliana.gtf
Total records in GTF: 888095
Filtered 6110 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
tRNA: 2157
ncRNA: 1464
miRNA: 975
snoRNA: 861
nontranslating_CDS: 362
snRNA: 246
rRNA: 45
-- Count of biotypes kept --
protein_coding: 1710971
lncRNA: 21987
----------------------------


In [75]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [76]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Arabidopsis_thaliana/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Arabidopsis_thaliana/TAIR10/TAIR10.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Arabidopsis_thaliana/Arabidopsis_thaliana/Arabidopsis_thaliana.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 15:05:52 ..... started STAR run

Feb 01 15:05:52 ... starting to generate Genome files
Feb 01 15:05:53 ..... processing annotations GTF
Feb 01 15:05:56 ... starting to sort Suffix Array. This may take a long time...
Feb 01 15:05:56 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 15:06:06 ... loading chunks from disk, packing SA...
Feb 01 15:06:09 ... finished generating suffix array
Feb 01 15:06:09 ... generating Suffix Array index
F

# Oryza sativa

In [84]:
organism = "Oryza sativa"

In [85]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [86]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mHR-12_Indica_denovo_asembly NCBI     GCA_000725085.1     4530     [32m✓[39m      Oryza sativa Indica Group                Centre for Cellular and Molecular Platforms[0m
[0mZS97RS1              NCBI     GCA_001623345.1     4530     [32m✓[39m      Oryza sativa Indica Group                Huazhong Agricultural University        [0m
[0mZS97RS2              NCBI     GCA_001623345.2     4530     [31m✗[39m      Oryza sativa Indica Group                Huazhong Agricultural University        [0m
[0mMH63RS1              NCBI     GCA_001623365.1     4530     [32m✓[39m      Oryza sativa Indica Group                Huazhong Agricultural University        [0m
[0mASM195236v2          NCBI     GCA_001952365.2     4530     [31m✗[39m      Oryza sativa aus subgroup                University of Arizona                   [0m
[0m

In [87]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mOs127742RS1          Ensembl  GCA_009831045.1    39946     [32m✓[39m      Oryza sativa Indica Group                2020-12                                 [0m
[0mMH63RS2              Ensembl  GCA_001618785.1    39946     [32m✓[39m      Oryza sativa Indica Group                2020-12                                 [0m
[0mOs132424RS1          Ensembl  GCA_009831025.1    39946     [32m✓[39m      Oryza sativa Indica Group                2020-12                                 [0m
[0mOsN22RS2             Ensembl  GCA_001952365.2  1736659     [32m✓[39m      Oryza sativa aus subgroup                2020-12                                 [0m
[0mAzucenaRS1           Ensembl  GCA_009830595.1  1736656     [32m✓[39m      Oryza sativa tropical japonica subgroup  2020-12                                 [0m
[0mIRGSP-1.0 

In [88]:
# set the genome name
genome_name = "IRGSP-1.0"

In [82]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m15:07:45[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/plants/fasta/oryza_sativa/dna/Oryza_sativa.IRGSP-1.0.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 117M/117M [00:04<00:00, 26.6MB/s][0m[0m
[0m[32m15:07:50[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m15:07:51[0m [1m|[0m [34mINFO[0m [1m|[0m name: IRGSP-1.0
[32m15:07:51[0m [1m|[0m [34mINFO[0m [1m|[0m local name: IRGSP-1.0
[32m15:07:51[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Oryza_sativa/IRGSP-1.0/IRGSP-1.0.fa
Filtering Fasta: 6.25M lines [00:01, 4.08M lines/s][0m[0m
[0m[32m15:07:58[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/plants/gtf/oryza_sativa/Oryza_sativa.IRGSP-1.0.60.gtf.gz...
Download: 100%|█████████████

In [83]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Oryza_sativa/IRGSP-1.0/IRGSP-1.0.fa'

In [84]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Oryza_sativa/IRGSP-1.0/IRGSP-1.0.annotation.gtf'

In [85]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: IRGSP-1.0.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Oryza_sativa/Oryza_sativa/Oryza_sativa.gtf
Total records in GTF: 625162
Filtered 12884 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
ncRNA: 9816
rRNA: 825
tRNA: 734
snoRNA: 648
pre_miRNA: 291
sense_intronic: 267
snRNA: 249
antisense_RNA: 18
tRNA_pseudogene: 18
SRP_RNA: 6
nontranslating_CDS: 6
RNase_MRP_RNA: 3
pseudogene: 3
-- Count of biotypes kept --
protein_coding: 1186982
----------------------------


In [86]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [87]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Oryza_sativa/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Oryza_sativa/IRGSP-1.0/IRGSP-1.0.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Oryza_sativa/Oryza_sativa/Oryza_sativa.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 15:08:10 ..... started STAR run

Feb 01 15:08:10 ... starting to generate Genome files
Feb 01 15:08:15 ..... processing annotations GTF
Feb 01 15:08:18 ... starting to sort Suffix Array. This may take a long time...
Feb 01 15:08:19 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 15:08:59 ... loading chunks from disk, packing SA...
Feb 01 15:09:07 ... finished generating suffix array
Feb 01 15:09:07 ... generating Suffix Array index
Feb 01 15:10:18 ... completed Suffi

# Solanum lycopersicum

In [89]:
organism = "Solanum lycopersicum"

In [90]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [91]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mSL2.40               NCBI     GCF_000188115.2     4081     [32m✓[39m      Solanum lycopersicum                     Solanaceae Genomics Project             [0m
[0mSL2.50               NCBI     GCF_000188115.3     4081     [32m✓[39m      Solanum lycopersicum                     Solanaceae Genomics Project             [0m
[0mSL3.0                NCBI     GCF_000188115.4     4081     [32m✓[39m      Solanum lycopersicum                     Solanaceae Genomics Project             [0m
[0mSLYcer_r1.1          NCBI     GCA_016860505.2     4081     [31m✗[39m      Solanum lycopersicum var. cerasiforme    Kazusa DNA Research Institute           [0m
[0mASM3651221v1         NCBI     GCA_036512215.1     4081     [31m✗[39m      Solanum lycopersicum                     Kazusa DNA Research Institute           [0m
[0mSolanum   

In [92]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mSL3.0                Ensembl  GCA_000188115.3     4081     [32m✓[39m      Solanum lycopersicum                     2017-02-SOL/2018-10                     [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "SL3.0"

In [91]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m15:17:35[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/plants/fasta/solanum_lycopersicum/dna/Solanum_lycopersicum.SL3.0.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 229M/229M [00:08<00:00, 30.0MB/s][0m[0m
[0m[32m15:17:44[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m15:17:47[0m [1m|[0m [34mINFO[0m [1m|[0m name: SL3.0
[32m15:17:47[0m [1m|[0m [34mINFO[0m [1m|[0m local name: SL3.0
[32m15:17:47[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Solanum_lycopersicum/SL3.0/SL3.0.fa
Filtering Fasta: 13.8M lines [00:03, 4.02M lines/s][0m[0m
[0m[32m15:18:03[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/plants/gtf/solanum_lycopersicum/Solanum_lycopersicum.SL3.0.60.gtf.gz...
Download: 10

In [92]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Solanum_lycopersicum/SL3.0/SL3.0.fa'

In [93]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Solanum_lycopersicum/SL3.0/SL3.0.annotation.gtf'

In [94]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: SL3.0.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Solanum_lycopersicum/Solanum_lycopersicum/Solanum_lycopersicum.gtf
Total records in GTF: 526279
Filtered 4997 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
nontranslating_CDS: 1496
tRNA: 1356
snRNA: 666
rRNA: 588
snoRNA: 498
pre_miRNA: 168
SRP_RNA: 126
sense_intronic: 78
antisense_RNA: 12
RNase_MRP_RNA: 6
ncRNA: 3
-- Count of biotypes kept --
protein_coding: 1008135
----------------------------


In [95]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [96]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Solanum_lycopersicum/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Solanum_lycopersicum/SL3.0/SL3.0.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Solanum_lycopersicum/Solanum_lycopersicum/Solanum_lycopersicum.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 15:18:12 ..... started STAR run

Feb 01 15:18:12 ... starting to generate Genome files
Feb 01 15:18:26 ..... processing annotations GTF
Feb 01 15:18:32 ... starting to sort Suffix Array. This may take a long time...
Feb 01 15:18:35 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 15:20:01 ... loading chunks from disk, packing SA...
Feb 01 15:20:18 ... finished generating suffix array
Feb 01 15:20:18 ... generating Suffix Array index
Feb

# Zea mays

In [93]:
organism = "Zea mays"

In [94]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [95]:
# show available assemblies
!genomepy search --provider NCBI {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mB73_RefGen_v3        NCBI     GCF_000005005.1     4577     [32m✓[39m      Zea mays                                 Maize Genome Sequencing Project         [0m
[0mZm-W22-REFERENCE-NRGENE-1.0 NCBI     GCA_001644905.1     4577     [32m✓[39m      Zea mays subsp. mays                     W22 Sequencing Consortium               [0m
[0mCIMBL55_1.0          NCBI     GCA_027171705.1     4577     [31m✗[39m      Zea mays subsp. mays                     China Agricultural University           [0m
[0mASM2717170v2         NCBI     GCA_027171705.2     4577     [31m✗[39m      Zea mays subsp. mays                     China Agricultural University           [0m
[0mB73_RefGen_v4        NCBI     GCA_000005005.6     4577     [32m✓[39m      Zea mays                                 maizesequence                           [0m
[0mZea

In [96]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mZm-B73-REFERENCE-NAM-5.0 Ensembl  GCA_902167145.1     4577     [32m✓[39m      Zea mays                                 2019-12-CSHL/2019-12                    [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "Zm-B73-REFERENCE-NAM-5.0"

In [100]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m15:23:10[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/plants/fasta/zea_mays/dna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna_sm.toplevel.fa.gz...
Download: 100%|██████████████████████████████| 641M/641M [00:21<00:00, 31.9MB/s][0m[0m
[0m[32m15:23:31[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m15:23:41[0m [1m|[0m [34mINFO[0m [1m|[0m name: Zm-B73-REFERENCE-NAM-5.0
[32m15:23:41[0m [1m|[0m [34mINFO[0m [1m|[0m local name: Zm-B73-REFERENCE-NAM-5.0
[32m15:23:41[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Zea_mays/Zm-B73-REFERENCE-NAM-5.0/Zm-B73-REFERENCE-NAM-5.0.fa
Filtering Fasta: 36.4M lines [00:08, 4.19M lines/s][0m[0m
[0m[32m15:24:22[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensemblgenomes.org/pub/release-60/plants/gtf/zea_mays/Zea_m

In [101]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Zea_mays/Zm-B73-REFERENCE-NAM-5.0/Zm-B73-REFERENCE-NAM-5.0.fa'

In [102]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Zea_mays/Zm-B73-REFERENCE-NAM-5.0/Zm-B73-REFERENCE-NAM-5.0.annotation.gtf'

In [103]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: Zm-B73-REFERENCE-NAM-5.0.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Zea_mays/Zea_mays/Zea_mays.gtf
Total records in GTF: 1302218
Filtered 18078 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
misc_non_coding: 18078
-- Count of biotypes kept --
protein_coding: 2528524
----------------------------


In [104]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [105]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Zea_mays/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Zea_mays/Zm-B73-REFERENCE-NAM-5.0/Zm-B73-REFERENCE-NAM-5.0.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Zea_mays/Zea_mays/Zea_mays.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 15:24:44 ..... started STAR run

Feb 01 15:24:44 ... starting to generate Genome files
Feb 01 15:25:13 ..... processing annotations GTF
Feb 01 15:25:23 ... starting to sort Suffix Array. This may take a long time...
Feb 01 15:25:30 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 15:33:25 ... loading chunks from disk, packing SA...
Feb 01 15:34:19 ... finished generating suffix array
Feb 01 15:34:19 ... generating Suffix Array index
Feb 01 15:37:23 ... compl

# Saccharomyces cerevisiae

In [97]:
organism = "Saccharomyces cerevisiae"

In [98]:
# format
org_str = organism.replace(" ", "_")
work_dir = os.path.join(base_dir, org_str)
os.makedirs(work_dir, exist_ok=True)

In [100]:
# show available assemblies
!genomepy search --provider NCBI {organism} | head -n 100

name	provider	accession	tax_id	annotation	species	other_info
ASM9106v1	NCBI	GCA_000091065.1	4932	True	Saccharomyces cerevisiae	Tsukuba Life Science Center
M22	NCBI	GCA_000182075.1	4932	False	Saccharomyces cerevisiae M22	Washington University School of Medicine (WashU)
YPS163	NCBI	GCA_000182095.1	4932	False	Saccharomyces cerevisiae YPS163	Washington University School of Medicine (WashU)
K7_1.0	NCBI	GCA_000260735.2	4932	True	Saccharomyces cerevisiae Kyokai no. 7	National Research Institute of Brewing
ScNAM34-4C_assembly01	NCBI	GCA_000508805.2	4932	False	Saccharomyces cerevisiae NAM34-4C	National Institute of Advanced Industrial Science and Technology
ScIR-2_assembly01	NCBI	GCA_000508825.2	4932	False	Saccharomyces cerevisiae IR-2	National Institute of Advanced Industrial Science and Technology
Sc_YJM993_v1	NCBI	GCA_000662435.2	4932	True	Saccharomyces cerevisiae YJM993	Duke University
ASM73323v1	NCBI	GCA_000733235.1	4932	True	Saccharomyces cerevisiae UFMG A-905	UFMG
ASM73323v2	NCBI	GCA_000

In [108]:
# show available assemblies
!genomepy search --provider Ensembl {organism}

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0mR64-1-1              Ensembl  GCA_000146045.2   559292     [32m✓[39m      Saccharomyces cerevisiae S288c           2018-08-SGD                             [0m
[0m[32m ^[0m
[0m[32m Use name for [36mgenomepy install[0m
[0m[0m

In [None]:
# set the genome name
genome_name = "R64-1-1"

In [109]:
# download genome
!genomepy install --provider Ensembl --annotation --genomes_dir {work_dir} {genome_name}

[32m15:43:35[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_cerevisiae.R64-1-1.dna_sm.toplevel.fa.gz...
Download: 100%|████████████████████████████| 3.69M/3.69M [00:00<00:00, 5.33MB/s][0m[0m
[0m[32m15:43:36[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m15:43:36[0m [1m|[0m [34mINFO[0m [1m|[0m name: R64-1-1
[32m15:43:36[0m [1m|[0m [34mINFO[0m [1m|[0m local name: R64-1-1
[32m15:43:36[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /scratch/multiomics/nickyoungblut/star_refs/Saccharomyces_cerevisiae/R64-1-1/R64-1-1.fa
Filtering Fasta: 203k lines [00:00, 3.30M lines/s][0m
[0m[32m15:43:37[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-113/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.113.gtf.gz...
Download: 

In [110]:
# get genome fasta
fasta_file = glob(f"{work_dir}/{genome_name}/*.fa")[0]
fasta_file

'/scratch/multiomics/nickyoungblut/star_refs/Saccharomyces_cerevisiae/R64-1-1/R64-1-1.fa'

In [111]:
# get gtf file
gtf_file = glob(f"{work_dir}/{genome_name}/*.gtf")[0]
gtf_file

'/scratch/multiomics/nickyoungblut/star_refs/Saccharomyces_cerevisiae/R64-1-1/R64-1-1.annotation.gtf'

In [112]:
# format the reference
exe = os.path.join(script_dir, "format-star-ref.py")
!{exe} --organism "{organism}"  --output-dir {work_dir} {gtf_file} 

Processing GTF: R64-1-1.annotation.gtf
Output GTF: /scratch/multiomics/nickyoungblut/star_refs/Saccharomyces_cerevisiae/Saccharomyces_cerevisiae/Saccharomyces_cerevisiae.gtf
Total records in GTF: 41879
Filtered 1594 records by biotype
Filtered 0 records by tag
-- Count of biotypes filtered --
tRNA: 957
transposable_element: 273
snoRNA: 233
rRNA: 77
pseudogene: 36
snRNA: 18
-- Count of biotypes kept --
protein_coding: 73862
ncRNA: 90
----------------------------


In [113]:
# set the filtered gtf file
gtf_filt_file = os.path.join(work_dir, org_str, f"{org_str}.gtf")
os.path.exists(gtf_filt_file)

True

In [114]:
# create the star index
star_dir = os.path.join(work_dir, "star")
os.makedirs(star_dir, exist_ok=True)

!STAR --runThreadN 16 \
    --runMode genomeGenerate \
    --genomeDir {star_dir} \
    --genomeFastaFiles {fasta_file} \
    --sjdbGTFfile {gtf_filt_file} \
    --sjdbOverhang 100

	/home/nickyoungblut/miniforge3/envs/asmbl/bin/STAR-avx2 --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch/multiomics/nickyoungblut/star_refs/Saccharomyces_cerevisiae/star --genomeFastaFiles /scratch/multiomics/nickyoungblut/star_refs/Saccharomyces_cerevisiae/R64-1-1/R64-1-1.fa --sjdbGTFfile /scratch/multiomics/nickyoungblut/star_refs/Saccharomyces_cerevisiae/Saccharomyces_cerevisiae/Saccharomyces_cerevisiae.gtf --sjdbOverhang 100
	STAR version: 2.7.11b   compiled: 2025-01-09T12:32:06+0000 :/opt/conda/conda-bld/star_1736425784849/work/source
Feb 01 15:43:47 ..... started STAR run

Feb 01 15:43:47 ... starting to generate Genome files
Feb 01 15:43:47 ..... processing annotations GTF
Feb 01 15:43:47 ... starting to sort Suffix Array. This may take a long time...
Feb 01 15:43:47 ... sorting Suffix Array chunks and saving them to disk...
Feb 01 15:43:48 ... loading chunks from disk, packing SA...
Feb 01 15:43:48 ... finished generating suffix array
Feb 01 15:43:48 ... generatin

# sessionInfo

In [116]:
!mamba list

# packages in environment at /home/nickyoungblut/miniforge3/envs/asmbl:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
alsa-lib                  1.2.11               hd590300_1    conda-forge
anyio                     4.8.0              pyhd8ed1ab_0    conda-forge
appdirs                   1.4.4              pyhd8ed1ab_1    conda-forge
asttokens                 2.4.1              pyhd8ed1ab_0    conda-forge
bash                      5.2.21               h7f99829_0    conda-forge
bash_kernel               0.9.3              pyh4f82c71_0    conda-forge
bedtools                  2.31.1               hf5e1c6e_2    bioconda
biopython                 1.85            py312h66e93f0_1    conda-forge
biothings_client          0.4.1              pyh29332c3_0    conda-forge
blast                     2.16.0               hc155240_3  