### Installing Pysradb

In [None]:
!pip install pysradb

### Downloading Data

In [None]:
from pysradb import SRAweb

# Ten consecutive run accessions: SRR12303145 through SRR12303154.
data_name = ['SRR123031%s' % suffix for suffix in range(45, 55)]

db = SRAweb()

# Only the first run is fetched here; pass data_name[0] (or another entry)
# instead of the literal to drive the download from the accession list.
selected_df = db.sra_metadata('SRR12303145', detailed=True)
db.download(df=selected_df, skip_confirmation=True)

#### Downloading Ref Genome

In [None]:
# Reference downloads. The genome FASTA downloads (GRCh38 / GRCh37) are
# currently disabled; only the GRCh38 RefSeq annotation (GTF) is fetched and
# decompressed.
# !wget -c https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz -O ./ref_genome.fna.gz
# !wget -c https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.fna.gz -O ./GRCh37.fna.gz
!wget -c https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz -O ./ref_genome_annotate.gtf.gz
# !gzip -d ./GRCh37.fna.gz
# !gzip -d ./ref_genome.fna.gz
# The download above is written relative to the working directory while this
# path is absolute, so the two only agree when the cell runs from /content.
!gzip -d /content/ref_genome_annotate.gtf.gz

# Disabled: placing the GRCh37 FASTA next to a bowtie index.
# !mkdir -p ./bowtie_index/GRCh37
# !mv ./GRCh37.fna ./bowtie_index/GRCh37/GRCh37.fa

### Installing STAR

In [None]:
import os

!git clone https://github.com/alexdobin/STAR.git
%cd ./STAR/source
!make STAR
%cd /content
os.environ['PATH'] += ':/content/STAR/source/'
!mkdir ./star_index
!wget -c https://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/SA -P /content/star_index/
!wget -c https://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/Homo_sapiens.GRCh38.99.gtf -P /content/star_index/
!wget -c https://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/SAindex -P /content/star_index/

#### Creating Genome Indices

In [None]:
!mkdir -p /content/genome_index
!STAR --runMode genomeGenerate --limitGenomeGenerateRAM 12500000000 --genomeDir /content/genome-index --genomeFastaFiles /content/ref_genome.fna --sjdbGTFfile /content/ref_genome_annotate.gtf

### Converting SRA to Fastq

In [None]:
# Download the Anaconda 5.1.0 installer and install it into /usr/local so that
# `conda` is available to the shell commands below.
!wget -c https://repo.continuum.io/archive/Anaconda3-5.1.0-Linux-x86_64.sh
!chmod +x Anaconda3-5.1.0-Linux-x86_64.sh
!bash ./Anaconda3-5.1.0-Linux-x86_64.sh -b -f -p /usr/local

import sys

# Add the Anaconda site-packages directory to this kernel's import path.
# NOTE(review): the path is specific to Python 3.6; confirm it matches the
# Python version the kernel is actually running.
sys.path.append("/usr/local/lib/python3.6/site-packages/")

# Register the channels conda should search for packages.
!conda config --add channels defaults
!conda config --add channels bioconda
!conda config --add channels conda-forge

# Installing parallel-fastq-dump
!conda install -y parallel-fastq-dump

#### parallel fastq dump

In [None]:
import glob
import subprocess


paths = glob.glob("/content/pysradb_downloads/SRP273256/*/*.sra")
command = 'parallel-fastq-dump --threads 4 --outdir sratofastq/ --split-files --tmpdir tmpdir --gzip -s '
!mkdir -p sratofastq && mkdir -p tmpdir
commands = []
for i in range(len(paths)):
  commands.append(command + paths[i])

### Downloading Fastqc

In [None]:
import os

# Fetch the FastQC 0.11.9 archive, unpack it into the working directory
# (expected to be /content, given the absolute paths used below) and remove
# the zip afterwards.
!wget -c https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.9.zip
!unzip /content/fastqc_v0.11.9.zip
!rm /content/fastqc_v0.11.9.zip
# Make the launcher script executable and add its directory to PATH so later
# cells can invoke `fastqc` by name.
!chmod 777 /content/FastQC/fastqc
os.environ['PATH'] += ':/content/FastQC/'

### Trimming

In [None]:
# Fetch Trimmomatic 0.39, unpack it into the working directory and remove the
# archive. The pipeline cell refers to the jar and adapter files under
# /content/Trimmomatic-0.39/, so this is expected to run from /content.
!wget -c http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.39.zip
!unzip /content/Trimmomatic-0.39.zip
!rm /content/Trimmomatic-0.39.zip

### HISAT2 alignment

In [None]:
import os

# The share link saves the archive under the literal file name "download";
# unpack it, delete it, and put the HISAT2 directory on PATH.
!wget -c https://cloud.biohpc.swmed.edu/index.php/s/oTtGWbWjaxsQ2Ho/download
!unzip ./download
!rm ./download
os.environ['PATH'] += ':/content/hisat2-2.2.1/'

# Prebuilt GRCh38 HISAT2 index; the pipeline cell reads it as ./grch38/genome.
!wget https://genome-idx.s3.amazonaws.com/hisat/grch38_genome.tar.gz
!tar -xf ./grch38_genome.tar.gz
!rm ./grch38_genome.tar.gz

### Tophat alignment

In [None]:
import os

# Bowtie2 2.4.5 binaries: download into ./bowtie, unpack there, drop the
# archive and add the binaries to PATH.
!wget --content-disposition -c https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.4.5/bowtie2-2.4.5-linux-x86_64.zip/download -P ./bowtie
!unzip ./bowtie/bowtie2-2.4.5-linux-x86_64.zip -d ./bowtie
!rm ./bowtie/bowtie2-2.4.5-linux-x86_64.zip
os.environ['PATH'] += ':/content/bowtie/bowtie2-2.4.5-linux-x86_64/'

# TopHat 2.1.1 binaries: download, unpack, drop the archive and add to PATH.
!wget -c http://ccb.jhu.edu/software/tophat/downloads/tophat-2.1.1.Linux_x86_64.tar.gz
!tar -xf ./tophat-2.1.1.Linux_x86_64.tar.gz
!rm ./tophat-2.1.1.Linux_x86_64.tar.gz
os.environ['PATH'] += ':/content/tophat-2.1.1.Linux_x86_64'

# Downloading pre-built index (GRCh37 is active; the GRCh38_no_alt alternative
# is kept disabled below).
!wget -c https://genome-idx.s3.amazonaws.com/bt/GRCh37.zip
# !wget -c ftp://ftp.ccb.jhu.edu/pub/data/bowtie_indexes/GRCh38_no_alt.zip
!unzip ./GRCh37.zip -d ./bowtie_index
# !unzip ./GRCh38_no_alt.zip -d ./bowtie_index
!rm ./GRCh37.zip
# !rm ./GRCh38_no_alt.zip

### Samtools

In [None]:
import os

# Download and unpack the samtools 1.16.1 source release.
!wget -c https://github.com/samtools/samtools/releases/download/1.16.1/samtools-1.16.1.tar.bz2
!tar -xjf ./samtools-1.16.1.tar.bz2
!rm ./samtools-1.16.1.tar.bz2
# Build inside the source tree and install into /content/samtools/.
%cd ./samtools-1.16.1
!./configure --prefix=/content/samtools/
!make
!make install
# Return to the parent directory, expose the installed binaries on PATH and
# remove the source tree, which is no longer needed after installation.
%cd ../
os.environ['PATH'] += ':/content/samtools/bin/'
!rm -r ./samtools-1.16.1

### DESeq2

In [None]:
!pip install matplotlib
!pip install Cython
!pip install pysam
!pip install HTSeq
!apt-get install r-base
!pip install rpy2
!R -e 'install.packages("BiocManager")'
!R -e 'BiocManager::install("DESeq2")'

# Creating count matrix for sample
!htseq-count -f bam ./hisat_out/sorted.bam ./ref_genome_annotate.gtf > ./count_matrix.txt

In [None]:
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri, Formula
from rpy2.robjects.packages import importr

# Let rpy2 convert pandas objects to their R equivalents automatically.
pandas2ri.activate()

deseq = importr('DESeq2')

# htseq-count writes a header-less, tab-separated "feature<TAB>count" table.
# The column names therefore have to be supplied here: letting pandas infer a
# header would consume the first gene row as column labels and drop it.
data = pd.read_table("/content/count_matrix.txt", header=None, names=['genes', 'sample1'])

# The trailing rows whose id starts with "__" (e.g. __no_feature, __ambiguous)
# are htseq-count summary counters, not genes, and must not reach DESeq2.
data = data[~data['genes'].astype(str).str.startswith('__')].reset_index(drop=True)

# Sample sheet: one row per count column, using the column name as its group.
gr = [x for x in data.columns[1:]]
gr = pd.DataFrame(gr, columns=["Group"])

design_formula = Formula("~ Group")

# NOTE(review): with a single sample there is only one level of Group and no
# replicates; confirm DESeq2 can fit this design before relying on the result.
cds = deseq.DESeqDataSetFromMatrix(data.iloc[:,1:], colData=gr, design=design_formula)
cds = deseq.DESeq(cds)

### Running code

In [None]:
import os
import subprocess

path_to_fq = '/content/sratofastq/'
trim = '/content/Trimmomatic-0.39/trimmomatic-0.39.jar'


def run_and_report(command):
  """Run a shell command and print everything it wrote to stdout and stderr.

  Trimmomatic, HISAT2 and TopHat report their run summaries on stderr, so
  printing stdout alone leaves the log empty. A non-zero exit status is
  reported but does not stop the pipeline.

  Returns the combined log text.
  """
  finished = subprocess.run(command, shell=True, capture_output=True, text=True)
  log = (finished.stdout + finished.stderr).strip('\n')
  print(log)
  if finished.returncode != 0:
    print('[exit status %d] %s' % (finished.returncode, command))
  print()
  return log


# Per-run pipeline: SRA -> FASTQ -> FastQC -> Trimmomatic -> HISAT2 / TopHat
# -> samtools -> htseq-count. Relative paths assume the working directory is
# /content.
for c in commands:
  # Convert this run's .sra archive to gzipped, split FASTQ files.
  subprocess.run(c, shell=True)

  # Each command ends with ".../<run>.sra"; keep only "<run>".
  file_name = c[c.rfind('/')+1:-4]
  os.makedirs('/content/QC/' + file_name, exist_ok=True)

  # Quality control on the raw read pairs.
  fq1 = file_name + '_1.fastq.gz'
  fq2 = file_name + '_2.fastq.gz'
  fastqc_command = 'fastqc --outdir=/content/QC/' + file_name + '/ ' + path_to_fq + fq1 + ' ' + path_to_fq + fq2
  subprocess.run(fastqc_command, shell=True)

  # Adapter and quality trimming. The output paths carry a trailing space so
  # they can be concatenated straight into the command line.
  os.makedirs('/content/TrimmedFq/', exist_ok=True)
  tr_fq1_p = '/content/TrimmedFq/' + file_name + '_1_paired.fastq.gz '
  tr_fq1_unp = '/content/TrimmedFq/' + file_name + '_1_unpaired.fastq.gz '
  tr_fq2_p = '/content/TrimmedFq/' + file_name + '_2_paired.fastq.gz '
  tr_fq2_unp = '/content/TrimmedFq/' + file_name + '_2_unpaired.fastq.gz '
  trimming_command = 'java -jar ' + trim + ' PE ' + path_to_fq + fq1 + ' ' + path_to_fq + fq2 + ' ' + tr_fq1_p + tr_fq1_unp + tr_fq2_p + tr_fq2_unp + 'ILLUMINACLIP:'+ '/content/Trimmomatic-0.39/adapters/TruSeq3-PE-2.fa' + ':2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:36'
  trimm_log = run_and_report(trimming_command)

  # HISAT2 alignment of the trimmed reads (paired plus unpaired leftovers).
  index = './grch38/genome '
  fq1 = './TrimmedFq/' + file_name + '_1_paired.fastq.gz '
  fq2 = './TrimmedFq/' + file_name + '_2_paired.fastq.gz '
  unpaired = './TrimmedFq/' + file_name + '_1_unpaired.fastq.gz,./TrimmedFq/' + file_name + '_2_unpaired.fastq.gz '
  output = './hisat_out/' + file_name + '/output.sam'
  os.makedirs('./hisat_out/' + file_name, exist_ok=True)
  hisat_command = 'hisat2 -p 4 -x ' + index + '-1 ' + fq1 + '-2 ' + fq2 + '-U ' + unpaired + '-S ' + output
  hisat_log = run_and_report(hisat_command)

  # long process not suitable for colab
  index = './bowtie_index/GRCh37/GRCh37 '
  tophat_command = 'tophat --no-convert-bam --b2-fast -p 2 ' + index + fq1 + fq2
  tophat_log = run_and_report(tophat_command)
  os.makedirs('./output_sam', exist_ok=True)
  subprocess.run('cp ./tophat_out/accepted_hits.sam ./output_sam/accepted_hits.sam', shell=True)

  # hisat2 sam location
  sam_location = './hisat_out/' + file_name + '/output.sam '
  bam_location = './hisat_out/' + file_name + '/output.bam '
  sorted_bam_location = './hisat_out/' + file_name + '/sorted.bam '

  # tophat sam location
  # sam_location = './output_sam/accepted_hits.sam '
  # bam_location = './output_sam/accepted_hits.bam '
  # sorted_bam_location = './output_sam/sorted.bam '

  # SAM -> BAM -> coordinate-sorted BAM -> BAM index.
  sam_to_bam = 'samtools view -S -b ' + sam_location + '> ' + bam_location
  sort_bam = 'samtools sort ' + bam_location + '-o ' + sorted_bam_location
  index_bam = 'samtools index ' + sorted_bam_location
  subprocess.run(sam_to_bam, shell=True)
  subprocess.run(sort_bam, shell=True)
  subprocess.run(index_bam, shell=True)

  # Count reads per annotated feature.
  # NOTE(review): every run writes to the same ./count_matrix.txt, so with
  # more than one run only the last one survives; confirm this is intended.
  annotate_file = './ref_genome_annotate.gtf '
  output_name = './count_matrix.txt'
  htseq = 'htseq-count -f bam ' + sorted_bam_location + annotate_file + '> ' + output_name
  subprocess.run(htseq, shell=True)

  # Optional cleanup of intermediates (disabled).
  # subprocess.run('rm /content/hisat_out/*', shell=True)
  # subprocess.run('rm /content/sratofastq/*', shell=True)