## 00-1. Mount Google Drive

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the mount point below.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


## 00-2. Open terminal

In [None]:
!pip install kora
from kora import console
console.start()

## 01. Installation

Install Java, BWA, GATK, Picard, and samtools.

In [None]:
%%bash
# Download and unpack GATK 4.2.2.0, then expose its launcher on PATH.
# Abort the cell on the first failing command instead of carrying on silently.
set -euo pipefail

GATK_VERSION=4.2.2.0
GATK_ZIP="gatk-${GATK_VERSION}.zip"

# -nc: skip the download if the archive is already present, so the cell can be re-run.
wget -nc "https://github.com/broadinstitute/gatk/releases/download/${GATK_VERSION}/${GATK_ZIP}"

# -o: overwrite without prompting (a prompt would stall a non-interactive cell); -q: quiet.
unzip -o -q "${GATK_ZIP}"

# Later cells invoke `gatk` by bare name and every %%bash cell is a fresh shell,
# so a PATH export here would not survive. Link the launcher into a directory
# that is already on PATH instead.
# NOTE(review): assumes the archive unpacks to gatk-${GATK_VERSION}/ and that the
# cell runs with write access to /usr/local/bin (root on Colab) - confirm.
ln -sf "${PWD}/gatk-${GATK_VERSION}/gatk" /usr/local/bin/gatk


In [None]:
%%bash
# Download, unpack and build BWA 0.7.17, then expose the binary on PATH.
set -euo pipefail

BWA_VERSION=0.7.17
BWA_TARBALL="bwa-${BWA_VERSION}.tar.bz2"

# The SourceForge URL ends in /download, so wget would save the file as
# "download". Name the output explicitly, and fetch to a temporary name first
# so an interrupted transfer is never mistaken for a complete archive.
if [ ! -f "${BWA_TARBALL}" ]; then
  wget -O "${BWA_TARBALL}.part" "https://sourceforge.net/projects/bio-bwa/files/${BWA_TARBALL}/download"
  mv "${BWA_TARBALL}.part" "${BWA_TARBALL}"
fi

# Extract the archive that was just downloaded into the working directory.
tar -xjf "${BWA_TARBALL}"

# `make` must run inside the extracted source directory.
cd "bwa-${BWA_VERSION}"

# NOTE(review): bwa 0.7.17 is reported to fail at link time with GCC >= 10
# (duplicate definition of `rle_auxtab`); -fcommon restores the older default
# and is harmless on older compilers. The other flags mirror the Makefile's own.
make CFLAGS="-g -Wall -Wno-unused-function -O2 -fcommon"

# Later cells invoke `bwa` by bare name; link it into a directory already on PATH.
# NOTE(review): assumes write access to /usr/local/bin (root on Colab) - confirm.
ln -sf "${PWD}/bwa" /usr/local/bin/bwa

In [None]:
import os       #importing os to set environment variable
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

In [None]:
%%bash
# Download and unpack Picard tools 1.73.
# Abort the cell on the first failing command instead of carrying on silently.
set -euo pipefail

PICARD_ZIP=picard-tools-1.73.zip

# -nc: skip the download if the archive is already present, so the cell can be re-run.
wget -nc "https://sourceforge.net/projects/picard/files/picard-tools/1.73/${PICARD_ZIP}"

# -o: overwrite without prompting (a prompt would stall a non-interactive cell); -q: quiet.
unzip -o -q "${PICARD_ZIP}"

# NOTE(review): the deduplication cell runs `java -jar picard.jar`; confirm this
# archive actually provides a file of that name in the working directory
# (older Picard releases are believed to ship one jar per tool).


In [None]:
%%bash
# Download, unpack, build and install samtools 1.3.1.
# The release tarball contains source code only, so extracting it is not
# enough: later cells call `samtools`, which must be compiled and installed.
set -euo pipefail

SAMTOOLS_VERSION=1.3.1
SAMTOOLS_TARBALL="samtools-${SAMTOOLS_VERSION}.tar.bz2"

# -nc: skip the download if the archive is already present, so the cell can be re-run.
wget -nc "https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/${SAMTOOLS_TARBALL}"
tar -xf "${SAMTOOLS_TARBALL}"

cd "samtools-${SAMTOOLS_VERSION}"

# NOTE(review): --without-curses drops the interactive `tview` viewer so the
# build does not need ncurses headers; zlib development headers are still
# assumed to be installed - confirm on the target runtime.
./configure --without-curses
make

# Installs under the default prefix so `samtools` resolves by bare name.
# NOTE(review): assumes write access to the install prefix (root on Colab) - confirm.
make install

## 02. Initial Mapping (global alignment)

In [None]:
%%bash
# Align the reads to hg19 with BWA-MEM, then convert, sort and index the result.
# SAM: Sequence Alignment/Map format; BAM: the binary version of SAM.
set -euo pipefail

# Paths are case-sensitive: a Drive mounted at /content/drive exposes the
# user's files under "MyDrive", not "mydrive".
export Drive=/content/drive/MyDrive
REF="$Drive/ucsc.hg19.fasta"

# bwa mem needs a BWA index next to the reference; build it only if missing
# (indexing a whole genome is slow, so never repeat it needlessly).
[ -f "${REF}.bwt" ] || bwa index "${REF}"

# -t: number of threads.
# -R: read-group header line. The GATK steps later in this notebook reject
#     reads without a read group, so attach one at alignment time.
#     NOTE(review): the ID/SM/LB values are placeholders - set them to the real
#     sample and library names.
bwa mem -t 8 \
  -R '@RG\tID:illumina\tSM:illumina\tPL:ILLUMINA\tLB:illumina' \
  "${REF}" "$Drive/illumina.fastq" > illumina.sam

# SAM -> BAM (keeping the header), coordinate-sort, then index.
samtools view -bhS illumina.sam > illumina_raw.bam
samtools sort illumina_raw.bam -o illumina_sorted.bam
samtools index illumina_sorted.bam

## 02-1. Deduplication

In [None]:
%%bash
# Remove duplicate reads from the sorted BAM with Picard MarkDuplicates.
set -euo pipefail

# Defaults to the jar name the notebook has always used; set PICARD_JAR in the
# environment to point at a different location.
# NOTE(review): confirm that the installed Picard release really provides a
# single picard.jar in the working directory.
PICARD_JAR="${PICARD_JAR:-picard.jar}"

# Picard options are single KEY=VALUE tokens: there must be no spaces around
# "=", otherwise the shell passes the key, "=" and the value as three
# separate arguments.
java -jar "${PICARD_JAR}" MarkDuplicates \
  I=illumina_sorted.bam \
  O=illumina_dedup_sorted.bam \
  M=Marked_dup_metrics.txt \
  REMOVE_DUPLICATES=true

## 03. Base Quality Score Recalibration (BQSR)

In [None]:
%%bash
# Base Quality Score Recalibration: build the recalibration table from the
# deduplicated BAM, then apply it to produce the recalibrated BAM.
set -euo pipefail

# Every %%bash cell runs in its own shell, so variables exported in an earlier
# cell are not visible here; define the data directory again.
# Paths are case-sensitive: the mounted Drive folder is "MyDrive".
Drive=/content/drive/MyDrive

# Step 1: model systematic base-quality errors against known variant sites.
gatk --java-options "-Xmx8G" BaseRecalibrator \
  -I illumina_dedup_sorted.bam \
  -R "$Drive/ucsc.hg19.fasta" \
  --known-sites "$Drive/dbsnp_138.hg19.vcf" \
  -O illumina_recal.table

# Step 2: rewrite base qualities using the table from step 1.
# Each continuation backslash needs a space before it; without one the next
# option is glued onto the previous argument.
gatk --java-options "-Xmx8G" ApplyBQSR \
  -I illumina_dedup_sorted.bam \
  --bqsr-recal-file illumina_recal.table \
  -O illumina_postrecal.bam




## 04. Variant Calling with HaplotypeCaller

In [None]:
%%bash
# Call variants on the recalibrated BAM with GATK HaplotypeCaller.
set -euo pipefail

# Every %%bash cell runs in its own shell, so variables exported in an earlier
# cell are not visible here; define the data directory again.
# Paths are case-sensitive: the mounted Drive folder is "MyDrive".
Drive=/content/drive/MyDrive

# A continuation backslash must be the very last character on its line: any
# trailing space after it ends the command early. The final line takes no backslash.
gatk --java-options "-Xmx8G" HaplotypeCaller \
  -I illumina_postrecal.bam \
  -R "$Drive/ucsc.hg19.fasta" \
  -O illumina.vcf.gz


## 05. Filtering SNV/INDEL

In [None]:
%%bash
# Split the raw calls into SNVs and INDELs, then tag each set with hard filters.
#
# The shell only understands ASCII punctuation: options start with "--" (two
# hyphens, not an en-dash) and expressions are wrapped in straight double
# quotes (not typographic quotes). A continuation backslash must be the last
# character on its line, and every line of a command except the last needs one.
set -euo pipefail

# Every %%bash cell runs in its own shell, so variables exported in an earlier
# cell are not visible here; define the data directory again.
# Paths are case-sensitive: the mounted Drive folder is "MyDrive".
Drive=/content/drive/MyDrive
TMP_DIR="$Drive/tmp"

# Make sure the scratch directory handed to GATK exists.
mkdir -p "${TMP_DIR}"

# 1) Keep only SNPs.
gatk --java-options "-Xmx8G" SelectVariants \
  --tmp-dir "${TMP_DIR}" \
  -V illumina.vcf.gz \
  -select-type SNP \
  -O SNV_illumina.vcf.gz

# 2) Keep only INDELs.
gatk --java-options "-Xmx8G" SelectVariants \
  --tmp-dir "${TMP_DIR}" \
  -V illumina.vcf.gz \
  -select-type INDEL \
  -O INDEL_illumina.vcf.gz

# 3) Hard-filter SNVs: each -filter expression is paired with the name that is
#    written to the FILTER column of records matching it.
gatk --java-options "-Xmx8G" VariantFiltration \
  --tmp-dir "${TMP_DIR}" \
  -V SNV_illumina.vcf.gz \
  -filter "QD < 2.0" --filter-name "QD2" \
  -filter "DP < 10.0" --filter-name "LowDP10" \
  -filter "QUAL < 30.0" --filter-name "QUAL30" \
  -filter "SOR > 3.0" --filter-name "SOR3" \
  -filter "FS > 60.0" --filter-name "FS60" \
  -filter "MQ < 40.0" --filter-name "MQ40" \
  -filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \
  -filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \
  -O Filtered_SNV_illumina.vcf.gz

# 4) Hard-filter INDELs (thresholds differ from the SNV set).
gatk --java-options "-Xmx8G" VariantFiltration \
  --tmp-dir "${TMP_DIR}" \
  -V INDEL_illumina.vcf.gz \
  -filter "QD < 2.0" --filter-name "QD2" \
  -filter "DP < 10.0" --filter-name "LowDP10" \
  -filter "QUAL < 30.0" --filter-name "QUAL30" \
  -filter "FS > 200.0" --filter-name "FS200" \
  -filter "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20" \
  -O Filtered_INDEL_illumina.vcf.gz