**UNZIP data**
```
soar
cd /n/scratch2/ajit/duv_pdx/rawdata
find . -name "*.gz" | while read filename; do gunzip "`dirname "$filename"`" "$filename"; done;
```
**Create a sample description file**
```
alignment.csv
```

**Download Reference genome**

```
mkdir reference
wget ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
wget ftp://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz
gunzip *
```

**Create YAML File**

```
vim o2.yaml
details:
  - analysis: RNA-seq
    genome_build: hg38
    algorithm:
      transcriptome_fasta: /n/scratch2/ajit/duv_pdx/reference/Homo_sapiens.GRCh38.cdna.all.fa
      transcriptome_gtf: /n/scratch2/ajit/duv_pdx/reference/Homo_sapiens.GRCh38.96.gtf
      aligner: hisat2
      strandedness: unstranded
      tools_on: [bcbiornaseq]
      bcbiornaseq:
          organism: homo sapiens
          interesting_groups: conditions
upload:
  dir: ../final
```

**Initiate bcBio**

```bash
module load bcbio/latest
unset PYTHONPATH
bcbio_nextgen.py -w template o2.yaml alignment.csv rawdata/ --separator '_'
```

**Run bcBio**

```bash
cd alignment/work
vim submit_bcbio.sh

#!/bin/sh
#SBATCH -p medium
#SBATCH -J bcbio_O2              
#SBATCH -o run.o
#SBATCH -e run.e
#SBATCH -t 1-00:00
#SBATCH --cpus-per-task=3
#SBATCH --mem=100G
#SBATCH --mail-type=END         # Type of email notification- BEGIN,END,FAIL,ALL
#SBATCH --mail-user=ajitj_nirmal@dfci.harvard.edu   # Email to which notifications will be sent

export PATH=/n/app/bcbio/tools/bin:$PATH
bcbio_nextgen.py ../config/alignment.yaml \
    -n 24 -t ipython -s slurm -q medium -r t=1-00:00 --timeout 2000

sbatch submit_bcbio.sh
```

---

**Processing EXOME sequencing data**

```
srun --pty -p interactive --mem 70G -t 0-06:00 /bin/bash
find . -name "*.gz" | while read filename; do gunzip "`dirname "$filename"`" "$filename"; done;
```

**Move all the files from inside a folder to outside**

```
find . -name '*.fq' -exec mv {} . \;
find . -depth -type d -empty -exec rmdir {} \;
```

**Move extra smaller files into a directory**

```
du -bsh *
```

**Make a copy of the vehicle samples and rename them**

```
mkdir vechile_1
cp V* vechile_1
mkdir vechile_2
cp V* vechile_2/

cd vechile_1
rename 'USD16092608' comb *
rename 'USD16092609' comb *
rename 'USD16092610' comb *

cd vechile_2
rename 'USD16092608' romi *
rename 'USD16092609' romi *
rename 'USD16092610' romi *
```

**Rename the files**

```
for file in *; do mv "${file}" "${file/_HMGWWDSXX_L4/}"; done # Remove defined string from name
for file in *; do mv "${file}" "${file/s47880_/}"; done # Remove defined string from name
```

**Create a sample description file**


```
(echo 'samplename,description'; for f in raw_files/*fq*; do readlink -f $f | perl -pe 's/(.*?_(S[0-9]+)_.*)/\1,\2/'; done) > alignment.csv
```

Edit alignment.csv remotely to include phenotype, batch and drug details.

**Create the YAML file**

```
details:
- analysis: variant2
  genome_build: hg38
  algorithm:
    aligner: bwa
    disambiguate: [mm10]
    tools_on: [gemini]
    remove_lcr: true
    variantcaller: [mutect2]
    variant_regions: ../input/S07604514_Regions.bed
    svcaller: [cnvkit, lumpy, delly]
upload:
  dir: ../final
```

**Download the variant region file**

Download the BED files from https://earray.chem.agilent.com/suredesign/index.htm and save them inside a folder named input

**Create the submission script**

```
#!/bin/sh
#SBATCH -p priority
#SBATCH -J exome
#SBATCH -o run.o
#SBATCH -e run.e
#SBATCH -t 7-00:00
#SBATCH --cpus-per-task=20
#SBATCH --mem=80G
#SBATCH --mail-type=END         # Type of email notification- BEGIN,END,FAIL,ALL
#SBATCH --mail-user=ajitj_nirmal@dfci.harvard.edu   # Email to which notifications will be sent

export PATH=/n/app/bcbio/tools/bin:$PATH
bcbio_nextgen.py ../config/alignment.yaml \
    -n 24 -t local
```

**Run bcbio**

```bash
module load bcbio/latest
unset PYTHONPATH
bcbio_nextgen.py -w template O2.yaml alignment.csv raw_files/ --separator '_'
```

**Submit Job**

```
cp submit_bcbio.sh alignment/work/
cd alignment/work
sbatch submit_bcbio.sh
```

## Re-running Exome sequencing data

**Merge files: multiple lanes and all vehicle samples into a single entity**

**Create tomerge file**

```
(echo 'samplename,description'; for f in raw_files/*; do readlink -f $f | perl -pe 's/(.?(S[0-9]+).)/\1,\2/'; done) > tomerge.csv
```

**Merge multiple files into a single file**

```
#!/bin/sh
#SBATCH -p priority
#SBATCH -J merge
#SBATCH -o run.o
#SBATCH -e run.e
#SBATCH -t 0-12:00
#SBATCH --cpus-per-task=20
#SBATCH --mail-type=END         # Type of email notification- BEGIN,END,FAIL,ALL
#SBATCH --mail-user=ajitj_nirmal@dfci.harvard.edu   # Email to which notifications will be sent

module load bcbio/latest
cd raw_files
bcbio_prepare_samples.py --memory-per-job 2G --out merged --csv tomerge.csv
```

**Create a sample description file**

```
(echo 'samplename,description'; for f in raw_files/*; do readlink -f $f | perl -pe 's/(.*?_(S[0-9]+)_.*)/\1,\2/'; done) > alignment.csv
```

Edit alignment.csv remotely to include phenotype, batch and drug details.

**Create the YAML file**

```
details:
- analysis: variant2
  genome_build: hg38
  algorithm:
    aligner: bwa
    disambiguate: [mm10]
    tools_on: [gemini]
    remove_lcr: true
    variantcaller: [mutect2]
    variant_regions: ../input/S07604514_Regions.bed
    svcaller: [cnvkit, lumpy, delly]
upload:
  dir: ../final
```

**Download the variant region file**

Download the BED files from https://earray.chem.agilent.com/suredesign/index.htm and save them inside a folder named input

**Create the submission script**

```
vim submit_bcbio.sh


#!/bin/sh
#SBATCH -p priority
#SBATCH -J exome
#SBATCH -o run.o
#SBATCH -e run.e
#SBATCH -t 7-00:00
#SBATCH --cpus-per-task=20
#SBATCH --mem=80G
#SBATCH --mail-type=END         # Type of email notification- BEGIN,END,FAIL,ALL
#SBATCH --mail-user=ajitj_nirmal@dfci.harvard.edu   # Email to which notifications will be sent

export PATH=/n/app/bcbio/tools/bin:$PATH
bcbio_nextgen.py ../config/alignment.yaml \
    -n 24 -t local
```

**Run bcbio**

```bash
module load bcbio/latest
unset PYTHONPATH
bcbio_nextgen.py -w template O2.yaml alignment.csv raw_files/ --separator '_'
```

**Submit Job**

```
cp submit_bcbio.sh alignment/work/
cd alignment/work
sbatch submit_bcbio.sh
```


#### Working with the VCF files

```
# Merge common entries
module load gcc/6.2.0 bcftools/1.9

# D1 and D2
bcftools isec -p isec_output -Oz 4-mutect2-annotated.vcf.gz 5-mutect2-annotated.vcf.gz
bcftools merge --merge all 0002.vcf.gz 0003.vcf.gz > D12merged.vcf

# D2 and D3
bcftools isec -p isec_output -Oz 5-mutect2-annotated.vcf.gz 6-mutect2-annotated.vcf.gz
bcftools merge --merge all 0002.vcf.gz 0003.vcf.gz > D23merged.vcf

# D1 and D3
bcftools isec -p isec_output -Oz 4-mutect2-annotated.vcf.gz 6-mutect2-annotated.vcf.gz
bcftools merge --merge all 0002.vcf.gz 0003.vcf.gz > D13merged.vcf

# D1, D2 and D3
bcftools isec -n +3 4-mutect2-annotated.vcf.gz 5-mutect2-annotated.vcf.gz 6-mutect2-annotated.vcf.gz -p D123

# Variants in two or more VCF files
bcftools isec -n +2 4-mutect2-annotated.vcf.gz 5-mutect2-annotated.vcf.gz 6-mutect2-annotated.vcf.gz -p Dtwo


```

## Exome analysis (12th Dec 2020)

#### Create Alignment file

```
(echo 'samplename,description'; for f in raw/*.gz; do readlink -f $f | perl -pe 's/(.*?(S[0-9]+)_.*)/\1,\2/'; done) > alignment.csv
```

#### Edit the alignment file as follows:

```
samplename,description,batch,phenotype
/n/scratch3/users/a/ajn16/pdx_exome/raw/D1_1.fq.gz,D1,tumor
/n/scratch3/users/a/ajn16/pdx_exome/raw/D2_1.fq.gz,D2,tumor
/n/scratch3/users/a/ajn16/pdx_exome/raw/D3_1.fq.gz,D3,tumor
/n/scratch3/users/a/ajn16/pdx_exome/raw/V_1.fastq.gz,D1;D2;D3,normal
```

#### Yaml file

```
vim O2.yaml

details:
- analysis: variant2
  genome_build: hg38
  algorithm:
    aligner: bwa
    disambiguate: [mm10]
    tools_on: [gemini]
    remove_lcr: true
    variantcaller: [mutect2]
    variant_regions: /n/scratch3/users/a/ajn16/pdx_exome/input/S07604514_Regions.bed
    svcaller: [cnvkit, lumpy, delly]
upload:
  dir: ../final
  
```

#### Sbatch script

```
#!/bin/sh
#SBATCH -p priority
#SBATCH -J exome_1
#SBATCH -o run.o
#SBATCH -e run.e
#SBATCH -t 2-00:00
#SBATCH --cpus-per-task=20
#SBATCH --mem=80G
#SBATCH --mail-type=END         # Type of email notification- BEGIN,END,FAIL,ALL
#SBATCH --mail-user=ajitj_nirmal@dfci.harvard.edu   # Email to which notifications will be sent

export PATH=/n/app/bcbio/tools/bin:$PATH
bcbio_nextgen.py ../config/alignment.yaml \
    -n 24 -t local
```

#### Run bcbio

```
module load bcbio/latest
unset PYTHONPATH
bcbio_nextgen.py -w template O2.yaml alignment.csv raw/*
```

#### Submit Job

```
cp submit_sbatch.sh alignment/work/
cd alignment/work
sbatch submit_sbatch.sh
```

# CNV analysis




**Collect PON samples**

```
mkdir normal
cd normal
mkdir input config
```

**Add Bed file to config and bam files to input**

```
cd config
cp /n/scratch3/users/a/ajn16/pdx_exome/input/S07604514_Regions.bed .

cd input
cp /n/scratch3/users/a/ajn16/pdx_exome/alignment/final/V1/V1-ready.bam .
cp /n/scratch3/users/a/ajn16/pdx_exome/alignment/final/D1/D1-ready.bam .
mv V1-ready.bam V1.bam
mv D1-ready.bam D1.bam
```

**Create pon.csv**

```
cd ..

vim pon.csv

samplename,description,svclass,batch
V1.bam,V1,control,pon_build
D1.bam,D1,tumor,pon_build

```

**Create pon_template.yaml**

```
vim pon_template.yaml

details:
  - analysis: variant2
    genome_build: hg38
    algorithm:
      svcaller: [gatk-cnv]
      variant_regions: /n/scratch3/users/a/ajn16/pdx_exome/cnv/normal/config/S07604514_Regions.bed

```

**Run bcbio**

```
module load bcbio/latest
bcbio_nextgen.py -w template pon_template.yaml pon.csv input/*.bam
```

**Submit an sbatch script to create PON files**

```
vim pon_submit.sh

#!/bin/sh
#SBATCH -p priority
#SBATCH -J pon_1
#SBATCH -o run.o
#SBATCH -e run.e
#SBATCH -t 1-00:00
#SBATCH --cpus-per-task=20
#SBATCH --mem=80G
#SBATCH --mail-type=END         # Type of email notification- BEGIN,END,FAIL,ALL
#SBATCH --mail-user=ajitj_nirmal@dfci.harvard.edu   # Email to which notifications will be sent

export PATH=/n/app/bcbio/tools/bin:$PATH
bcbio_nextgen.py ../config/pon.yaml \
    -n 24 -t local

cp pon_submit.sh pon/work
cd pon/work
sbatch pon_submit.sh

```

**TUMOR**
**Create pon_tn project structure**

```
mkdir pon_tn
cd pon_tn
mkdir config input
cd config

cp /n/scratch3/users/a/ajn16/pdx_exome/cnv/normal/pon/final/2020-12-17_pon/gatkcnv-pon.hdf5 .
cp /n/scratch3/users/a/ajn16/pdx_exome/input/S07604514_Regions.bed .

```

**Copy the tumor and normal samples into the input folder**

```
# We have one normal and three tumor samples

cp /n/scratch3/users/a/ajn16/pdx_exome/alignment/final/V1/V1-ready.bam .
cp /n/scratch3/users/a/ajn16/pdx_exome/alignment/final/D1/D1-ready.bam .
cp /n/scratch3/users/a/ajn16/pdx_exome/alignment/final/D2/D2-ready.bam .
cp /n/scratch3/users/a/ajn16/pdx_exome/alignment/final/D3/D3-ready.bam .

mv V1-ready.bam V1.bam
mv D1-ready.bam D1.bam
mv D2-ready.bam D2.bam
mv D3-ready.bam D3.bam

```

**Prepare a sample sheet pon_tn.csv**

```
cd ..

vim pon_tn.csv

samplename,description,batch,phenotype
V1.bam,V1,Da;Db;Dc,normal
D1.bam,D1,Da,tumor
D2.bam,D2,Db,tumor
D3.bam,D3,Dc,tumor

```

**Prepare a YAML template: pon_tn_template.yaml**
    
```

vim pon_tn_template.yaml


details:
  - analysis: variant2
    genome_build: hg38
    algorithm:
      svcaller: [gatk-cnv]
      variant_regions: /n/scratch3/users/a/ajn16/pdx_exome/cnv/pon_tn/config/S07604514_Regions.bed
      coverage_interval: regional
      background:
        cnv_reference:
          gatk-cnv: /n/scratch3/users/a/ajn16/pdx_exome/cnv/pon_tn/config/gatkcnv-pon.hdf5

```

**Configure pon_tn project**

```
module load bcbio/latest
bcbio_nextgen.py -w template pon_tn_template.yaml pon_tn.csv input/*.bam

```

**Run bcbio pon_tn project**

```
vim cnv_submit.sh

#!/bin/sh
#SBATCH -p priority
#SBATCH -J cnv_2
#SBATCH -o run.o
#SBATCH -e run.e
#SBATCH -t 1-00:00
#SBATCH --cpus-per-task=20
#SBATCH --mem=80G
#SBATCH --mail-type=END         # Type of email notification- BEGIN,END,FAIL,ALL
#SBATCH --mail-user=ajitj_nirmal@dfci.harvard.edu   # Email to which notifications will be sent

export PATH=/n/app/bcbio/tools/bin:$PATH
bcbio_nextgen.py ../config/pon_tn.yaml \
    -n 24 -t local

cp cnv_submit.sh pon_tn/work
cd pon_tn/work
sbatch cnv_submit.sh

```


### Working with VCF files
#### Find the common variants across all three samples

```
bcftools isec -n +3 Da-mutect2-annotated.vcf.gz Db-mutect2-annotated.vcf.gz Dc-mutect2-annotated.vcf.gz -p D123
# It will produce 3 files and all are identical (use any one)
```

#### Find variants that are present in two or more samples

```
bcftools isec -n +2 Da-mutect2-annotated.vcf.gz Db-mutect2-annotated.vcf.gz Dc-mutect2-annotated.vcf.gz -p Dtwo
# It will again produce 3 files but the variants will be different. Merge the three files to identify the union

bcftools view 0000.vcf -Oz -o 0000.vcf.gz
bcftools view 0001.vcf -Oz -o 0001.vcf.gz
bcftools view 0002.vcf -Oz -o 0002.vcf.gz

# index them
bcftools index 0000.vcf.gz
bcftools index 0001.vcf.gz
bcftools index 0002.vcf.gz


# Merge (did not work): some problem with the VCF files due to GATK processing
bcftools merge 0000.vcf.gz  0001.vcf.gz  0002.vcf.gz --force-samples -O v -o merged.vcf
```

#### Finding common CNVs

```
bcftools isec -n +3 Da-gatk-cnv.vcf.gz Db-gatk-cnv.vcf.gz Dc-gatk-cnv.vcf.gz -p D123
bcftools isec -n +2 Da-gatk-cnv.vcf.gz Db-gatk-cnv.vcf.gz Dc-gatk-cnv.vcf.gz -p Dtwo

```

#### From Novogene

```
#SNP
bcftools index s47880_D1.GATK.snp.vcf.gz 
bcftools index s47880_D2.GATK.snp.vcf.gz
bcftools index s47880_D3.GATK.snp.vcf.gz
bcftools isec -n +3 s47880_D1.GATK.snp.vcf.gz s47880_D2.GATK.snp.vcf.gz s47880_D3.GATK.snp.vcf.gz -p D123

#INDEL
bcftools index s47880_D1.GATK.indel.vcf.gz
bcftools index s47880_D2.GATK.indel.vcf.gz
bcftools index s47880_D3.GATK.indel.vcf.gz
bcftools isec -n +3 s47880_D1.GATK.indel.vcf.gz s47880_D2.GATK.indel.vcf.gz s47880_D3.GATK.indel.vcf.gz -p D123

```