## Download the FASTA reference

In [None]:
import os
from tqdm.notebook import tqdm

In [None]:
# Working directories, anchored at the notebook's current working directory.
_cwd = os.getcwd()
fasta_ref_dir = f'{_cwd}/data/mouse_ref'      # reference FASTA files and the BLAST database
magicblast_dir = f'{_cwd}/data/magic_blast'   # alignment outputs

# Create both directories (and any missing parents); existing ones are left untouched.
for _directory in (fasta_ref_dir, magicblast_dir):
    os.makedirs(_directory, exist_ok=True)

In [None]:
# Fetch the GENCODE mouse release M27 transcript sequences (gzipped FASTA) into fasta_ref_dir.
!cd {fasta_ref_dir} && wget http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/gencode.vM27.transcripts.fa.gz

In [None]:
# Decompress in place: gunzip replaces the .gz archive with gencode.vM27.transcripts.fa.
!cd {fasta_ref_dir} && gunzip gencode.vM27.transcripts.fa.gz

Linearize the FASTA (one record per line: the header, a tab, then the full sequence)

In [None]:
%%bash -s {fasta_ref_dir}

# $1 is fasta_ref_dir (passed to the script through `-s`).
# The awk program puts every record on a single line: the header line, a tab,
# then all of that record's sequence lines concatenated. N counts headers so that
# a newline is emitted before every record except the first.
cd $1 && awk '/^>/ {printf("%s%s\t",(N>0?"\n":""),$0);N++;next;} {printf("%s",$0);} END {printf("\n");}' < gencode.vM27.transcripts.fa > gencode.vM27.transcripts.linear.fa

Select the transcripts of the genes of interest (Col9a2 and Smim41)

In [None]:
# Keep only the linearized records whose line matches Col9a2 or Smim41 (extended-regex alternation).
!cd {fasta_ref_dir} && grep -E 'Col9a2|Smim41' gencode.vM27.transcripts.linear.fa > gencode.vM27.transcripts.selected.linear.fa

Revert to the standard multi-line FASTA layout (lines wrapped at 130 characters)

In [None]:
%%bash -s {fasta_ref_dir}

# Turn each "header<TAB>sequence" record back into multi-line FASTA.
# Only the sequence is wrapped (130 characters per line); the header is printed whole.
# Piping everything through `fold -w 130` also wrapped headers longer than 130 characters,
# and the leftover header fragment then landed in the output as if it were sequence.
# A %%bash cell is used (as in the linearization cell) so that IPython does not try to
# expand the awk braces; $1 is fasta_ref_dir.
cd $1 && awk -F '\t' '{ print $1; for (i = 1; i <= length($2); i += 130) print substr($2, i, 130) }' gencode.vM27.transcripts.selected.linear.fa > gencode.vM27.transcripts.selected.fa

In [None]:
# Cut the header to show only the gene isoform
fin = f'{fasta_ref_dir}/gencode.vM27.transcripts.selected.fa'
fout = f'{fasta_ref_dir}/gencode.vM27.genes.selected.fa'

# `with` guarantees both files are closed (and the output flushed) even if a
# header has fewer than five '|'-separated fields and the indexing below raises.
with open(fin, 'r') as file_object_in, open(fout, 'w') as file_object_out:
    for line in tqdm(file_object_in):
        if line.startswith('>'):
            # Headers are '|'-separated: keep field 0 (">" + first identifier)
            # and field 4, which carries the isoform name.
            contents = line.split('|')
            file_object_out.write('|'.join([contents[0], contents[4]]) + '\n')
        else:
            # Sequence lines are copied through unchanged.
            file_object_out.write(line)

## Magicblast

In [None]:
# Install the magicblast package from the bioconda channel into the active environment.
!mamba install -c bioconda magicblast

### Create the BLAST database for the reference

In [None]:
# Build a nucleotide BLAST database (file prefix mouse_ref_gencode_magicblast) from the
# selected, header-trimmed transcripts; -parse_seqids makes makeblastdb parse the
# sequence IDs out of the FASTA headers.
!makeblastdb -in {fasta_ref_dir}/gencode.vM27.genes.selected.fa -out {fasta_ref_dir}/mouse_ref_gencode_magicblast -parse_seqids -dbtype nucl

# Downloading FASTQ files

In [None]:
# Thread count handed to parallel-fastq-dump (-t) and magicblast (-num_threads) in the cells below.
n_procs = 16

## Oprescu

In [None]:
# Download SRA run SRR10275413 into data/oprescu as gzipped FASTQ, one file per read
# (--split-files), using n_procs threads. The data/oprescu directory must already exist.
!cd data/oprescu && parallel-fastq-dump -s SRR10275413 -t {n_procs} --split-files --gzip

## De Micheli

In [None]:
# Download SRA run SRR10870296 into data/demicheli_mouse as gzipped FASTQ, one file per read
# (--split-files), using n_procs threads. The target directory must already exist.
!cd data/demicheli_mouse && parallel-fastq-dump -s SRR10870296 -t {n_procs} --split-files --gzip

In [None]:
# Same download for the second De Micheli run, SRR10870297.
!cd data/demicheli_mouse && parallel-fastq-dump -s SRR10870297 -t {n_procs} --split-files --gzip

## Giordani

In [None]:
# Download the submitted BAM for run SRR8352706 into data/giordani
# (-x 16: up to 16 connections to the server). The target directory must already exist.
!cd data/giordani && aria2c -x 16 https://sra-pub-src-1.s3.amazonaws.com/SRR8352706/20180917_uninjured_wt_possorted_genome_bam.bam.1

In [None]:
# Same download for run SRR8352705.
!cd data/giordani && aria2c -x 16 https://sra-pub-src-1.s3.amazonaws.com/SRR8352705/20171018_uninjured_wt_possorted_genome_bam.bam.1

In [None]:
# Directory the bamtofastq cells below `cd` into before running the tool.
# NOTE(review): machine-specific absolute path — adjust when running on another host.
program_dir = '/media/seth/SETH_DATA/SETH_Alex/Programs/'

In [None]:
# Convert the downloaded 20171018 BAM back into FASTQ files under data/giordani.
# Absolute paths are built from os.getcwd() because the command first changes into program_dir.
# NOTE(review): bamtofastq-1.3.2 is invoked without "./", so it is presumably resolved via PATH — confirm.
!cd {program_dir} && bamtofastq-1.3.2 {os.getcwd()}/data/giordani/20171018_uninjured_wt_possorted_genome_bam.bam.1 {os.getcwd()}/data/giordani

In [None]:
# Same conversion for the 20180917 BAM.
!cd {program_dir} && bamtofastq-1.3.2 {os.getcwd()}/data/giordani/20180917_uninjured_wt_possorted_genome_bam.bam.1 {os.getcwd()}/data/giordani

In [None]:
# Concatenate every R2 FASTQ chunk of the 20171018 sample into one file named after its
# SRA run (SRR8352705). Gzip files can be concatenated directly, so no re-compression is needed.
!cat data/giordani/bam_20171018/count_MissingLibrary_1_HNMVKBGX2/*_R2_*.fastq.gz > data/giordani/SRR8352705.fastq.gz

In [None]:
# Same for the 20180917 sample (SRR8352706).
!cat data/giordani/bam_20180917/count_MissingLibrary_1_H7HGTBGX9/*_R2_*.fastq.gz > data/giordani/SRR8352706.fastq.gz

## Dell'Orso

In [None]:
# Download SRA run SRR8602275 into data/dellorso as gzipped FASTQ, one file per read
# (--split-files), using n_procs threads. The target directory must already exist.
!cd data/dellorso && parallel-fastq-dump -s SRR8602275 -t {n_procs} --split-files --gzip

In [None]:
# Same download for the second Dell'Orso run, SRR8602276.
!cd data/dellorso && parallel-fastq-dump -s SRR8602276 -t {n_procs} --split-files --gzip

# Using magicblast

## Oprescu

In [None]:
# Align the Oprescu reads as pairs (-query / -query_mate) against the selected-transcript
# database with n_procs threads; -no_unaligned leaves unaligned reads out of the output,
# which is redirected to oprescu.sam in magicblast_dir.
!cd data/oprescu && magicblast -query SRR10275413_1.fastq.gz -query_mate SRR10275413_2.fastq.gz -db {fasta_ref_dir}/mouse_ref_gencode_magicblast -infmt fastq -num_threads {n_procs} -no_unaligned > {magicblast_dir}/oprescu.sam

In [None]:
# Convert the SAM output to BAM, sort it, then index the sorted BAM
# (indexing requires the sorted file produced by the previous line).
!samtools view -b -o {magicblast_dir}/oprescu.bam {magicblast_dir}/oprescu.sam
!samtools sort {magicblast_dir}/oprescu.bam > {magicblast_dir}/oprescu.sorted.bam
!samtools index {magicblast_dir}/oprescu.sorted.bam

## De Micheli

In [None]:
# Align the third split file (_3) of run SRR10870296 as single-end reads (no -query_mate)
# against the selected-transcript database; unaligned reads are omitted (-no_unaligned).
# NOTE(review): _3 is presumably the cDNA read of this library — confirm.
!cd data/demicheli_mouse && magicblast -query SRR10870296_3.fastq.gz -db {fasta_ref_dir}/mouse_ref_gencode_magicblast -infmt fastq -num_threads {n_procs} -no_unaligned > {magicblast_dir}/demicheli_1.sam

In [None]:
# Same alignment for run SRR10870297.
!cd data/demicheli_mouse && magicblast -query SRR10870297_3.fastq.gz -db {fasta_ref_dir}/mouse_ref_gencode_magicblast -infmt fastq -num_threads {n_procs} -no_unaligned > {magicblast_dir}/demicheli_2.sam

In [None]:
# Convert both per-run SAM files to BAM, merge them into one BAM, sort it,
# then index the sorted BAM.
!samtools view -b -o {magicblast_dir}/demicheli_1.bam {magicblast_dir}/demicheli_1.sam
!samtools view -b -o {magicblast_dir}/demicheli_2.bam {magicblast_dir}/demicheli_2.sam
!samtools merge -o {magicblast_dir}/demicheli.bam {magicblast_dir}/demicheli_1.bam {magicblast_dir}/demicheli_2.bam
!samtools sort {magicblast_dir}/demicheli.bam > {magicblast_dir}/demicheli.sorted.bam
!samtools index {magicblast_dir}/demicheli.sorted.bam

## Giordani

In [None]:
# Align the concatenated R2 reads of SRR8352705 as single-end reads (no -query_mate)
# against the selected-transcript database; unaligned reads are omitted (-no_unaligned).
!cd data/giordani && magicblast -query SRR8352705.fastq.gz -db {fasta_ref_dir}/mouse_ref_gencode_magicblast -infmt fastq -num_threads {n_procs} -no_unaligned > {magicblast_dir}/giordani_1.sam

In [None]:
# Same alignment for SRR8352706.
!cd data/giordani && magicblast -query SRR8352706.fastq.gz -db {fasta_ref_dir}/mouse_ref_gencode_magicblast -infmt fastq -num_threads {n_procs} -no_unaligned  > {magicblast_dir}/giordani_2.sam

In [None]:
# Convert both per-run SAM files to BAM, merge them into one BAM, sort it,
# then index the sorted BAM.
!samtools view -b -o {magicblast_dir}/giordani_1.bam {magicblast_dir}/giordani_1.sam
!samtools view -b -o {magicblast_dir}/giordani_2.bam {magicblast_dir}/giordani_2.sam
!samtools merge -o {magicblast_dir}/giordani.bam {magicblast_dir}/giordani_1.bam {magicblast_dir}/giordani_2.bam
# The sorted output must go to giordani.sorted.bam, the file indexed on the next line.
# Writing it to demicheli.sorted.bam overwrote the De Micheli result and left nothing to index.
!samtools sort {magicblast_dir}/giordani.bam > {magicblast_dir}/giordani.sorted.bam
!samtools index {magicblast_dir}/giordani.sorted.bam

## Dell'Orso

In [None]:
# Align the third split file (_3) of run SRR8602275 as single-end reads (no -query_mate)
# against the selected-transcript database; unaligned reads are omitted (-no_unaligned).
# NOTE(review): _3 is presumably the cDNA read of this library — confirm.
!cd data/dellorso && magicblast -query SRR8602275_3.fastq.gz -db {fasta_ref_dir}/mouse_ref_gencode_magicblast -infmt fastq -num_threads {n_procs} -no_unaligned > {magicblast_dir}/dellorso_1.sam

In [None]:
# Same alignment for run SRR8602276.
!cd data/dellorso && magicblast -query SRR8602276_3.fastq.gz -db {fasta_ref_dir}/mouse_ref_gencode_magicblast -infmt fastq -num_threads {n_procs} -no_unaligned > {magicblast_dir}/dellorso_2.sam

In [None]:
# Convert both per-run SAM files to BAM, merge them into one BAM, sort it,
# then index the sorted BAM.
!samtools view -b -o {magicblast_dir}/dellorso_1.bam {magicblast_dir}/dellorso_1.sam
!samtools view -b -o {magicblast_dir}/dellorso_2.bam {magicblast_dir}/dellorso_2.sam
!samtools merge -o {magicblast_dir}/dellorso.bam {magicblast_dir}/dellorso_1.bam {magicblast_dir}/dellorso_2.bam
!samtools sort {magicblast_dir}/dellorso.bam > {magicblast_dir}/dellorso.sorted.bam
!samtools index {magicblast_dir}/dellorso.sorted.bam

# Filtering reads from files

In [None]:
# Extended-regex alternation of the genes of interest. The surrounding single quotes are
# part of the value so that the shell sees the pattern as one quoted argument when it is
# interpolated into the `grep -E {genes}` commands.
_gene_names = ('Col9a2', 'Smim41')
genes = "'" + '|'.join(_gene_names) + "'"

In [None]:
# Decompress oprescu.txt.gz and keep only the lines matching one of the genes of interest.
# NOTE(review): oprescu.txt.gz is not produced by any cell visible in this notebook
# (the cells above write .sam/.bam files) — confirm where it comes from.
!cd {magicblast_dir} && zcat oprescu.txt.gz | grep -E {genes} > oprescu_filtered.txt

In [None]:
# Preview of the Dell'Orso filter command with `genes` interpolated.
# `-E` is included so the preview matches the commands that are actually run:
# without it grep treats the `|` in the pattern as a literal character, not alternation.
print(f'zcat dellorso_1.txt.gz dellorso_2.txt.gz | grep -E {genes} > dellorso_filtered.txt')

In [None]:
# Decompress both De Micheli files, concatenate them, and keep only the lines matching
# one of the genes of interest.
# NOTE(review): the *_1.txt.gz / *_2.txt.gz inputs used in these cells are not produced by
# any cell visible in this notebook — confirm where they come from.
!cd {magicblast_dir} && zcat demicheli_1.txt.gz demicheli_2.txt.gz | grep -E {genes} > demicheli_filtered.txt

In [None]:
# Same filter for the two Giordani files.
!cd {magicblast_dir} && zcat giordani_1.txt.gz giordani_2.txt.gz | grep -E {genes} > giordani_filtered.txt

In [None]:
# Same filter for the two Dell'Orso files.
!cd {magicblast_dir} && zcat dellorso_1.txt.gz dellorso_2.txt.gz | grep -E {genes} > dellorso_filtered.txt