# TCGA RNA-seq hgr1 alignments -- Error follow-up
```
pi:ababaian
files: ~/Crown/data2/tcga_3_general
start: 2018 09 13
complete : 2018 09 14
```

## Introduction

In `TCGA_RNA-seq_Generic_1` ~100 runs had failed. The most likely explanation is that the pipeline crashed due to memory over-run (the files which failed appear to be larger than normal).

Re-optimize the pipeline slightly and run it using a larger EC2 instance with more threads/memory.


In [1]:
# Project directory for this notebook
WORKDIR='/home/artem/Crown/data2/tcga_3_general'

# Quoted so the path is always passed as one word; a failed cd is reported
# instead of silently leaving the session in the previous directory.
cd "$WORKDIR" || echo "Cannot cd to '$WORKDIR'" >&2



## Materials and Methods

#### TCGA Data Input

From the `logs/run1_bamlist.alh.txt` containing the `ls -alh` of all the hgr1 bam files, all entries which were <10 kbytes were flagged as "failed runs" and saved into
`logs/failed_runs_1.txt`


In [5]:
# List of failed hgr1 alignment runs
cat logs/failed_runs_1.txt # 93 entries
printf '\n\n'

# Extract input lines for all failed runs into a separate file.
# The third tab-separated column of the failed-run list is looked up in
# each run sheet, one value per line (no word-splitting of the values).
#   -F : treat the value as a literal string, not a regular expression
#   -h : never prefix hits with "filename:" when a glob matches several files
#   -- : end of options, so a value can never be parsed as a flag
while IFS= read -r FILE
do
    # An empty pattern would match every line of every run sheet
    [ -n "$FILE" ] || continue

    grep -hF -- "$FILE" logs/tcga_run_all.txt
    grep -hF -- "$FILE" ../tcga_2_lusc/*.txt
    grep -hF -- "$FILE" ../tcga_1_coad/*.txt
done < <(cut -f3 logs/failed_runs_1.txt) >> tcga_failed_runs.txt

# output file manually cleaned up, duplicates removed -- 93 entries

TCGA-BLCA/TCGA-BL-A13J-01B.hgr1.bam	TCGA-BLCA	TCGA-BL-A13J-01B
TCGA-BLCA/TCGA-BT-A20R-01A.hgr1.bam	TCGA-BLCA	TCGA-BT-A20R-01A
TCGA-BRCA/TCGA-A7-A0DC-01A.hgr1.bam	TCGA-BRCA	TCGA-A7-A0DC-01A
TCGA-BRCA/TCGA-A7-A13E-01A.hgr1.bam	TCGA-BRCA	TCGA-A7-A13E-01A
TCGA-BRCA/TCGA-E9-A1RI-11A.hgr1.bam	TCGA-BRCA	TCGA-E9-A1RI-11A
TCGA-COAD/TCGA-A6-2684-01C.hgr1.bam	TCGA-COAD	TCGA-A6-2684-01C
TCGA-COAD/TCGA-AA-3697-01A.hgr1.bam	TCGA-COAD	TCGA-AA-3697-01A
TCGA-ESCA/TCGA-L5-A43C-01A.hgr1.bam	TCGA-ESCA	TCGA-L5-A43C-01A
TCGA-ESCA/TCGA-L5-A43C-11A.hgr1.bam	TCGA-ESCA	TCGA-L5-A43C-11A
TCGA-ESCA/TCGA-L5-A4OG-11A.hgr1.bam	TCGA-ESCA	TCGA-L5-A4OG-11A
TCGA-ESCA/TCGA-L5-A4OJ-11A.hgr1.bam	TCGA-ESCA	TCGA-L5-A4OJ-11A
TCGA-ESCA/TCGA-L5-A4OO-11A.hgr1.bam	TCGA-ESCA	TCGA-L5-A4OO-11A
TCGA-KIRC/TCGA-CZ-5456-01A.hgr1.bam	TCGA-KIRC	TCGA-CZ-5456-01A
TCGA-LIHC/TCGA-BC-A10Q-11A.hgr1.bam	TCGA-LIHC	TCGA-BC-A10Q-11A
TCGA-LIHC/TCGA-BC-A216-01A.hgr1.bam	TCGA-LIHC	TCGA-BC-A216-01A
TCGA-LIHC/TCGA-DD-A113-01A.hgr1.bam	TCGA

#### Scripts

In [7]:
# An unquoted `cd $WORKDIR` with WORKDIR unset is a bare `cd` (jump to $HOME),
# so check the variable and the cd explicitly and report any problem.
if [ -z "$WORKDIR" ] || ! cd "$WORKDIR"
then
    echo "WORKDIR is unset or not accessible: '$WORKDIR'" >&2
fi
# First versions of scripts copied to ./scripts_run1/
# edited for run 2

# Print each run-2 script, followed by two blank lines as a separator
for SCRIPT in hgr1_align_v2.tcga.sh queenB.sh droneB.sh
do
    cat "$SCRIPT"
    printf '\n\n'
done

#!/bin/bash
# 1kg_align_v2.tcga.sh
# rDNA alignment pipeline
# 180831 build -- TCGA
# AMI: crown-180813 - ami-0031fd61f932bdef9
# EC2: c4.2xlarge (8cpu / 15 gb)
# EC2: c4.xlarge  (4cpu / 8  gb)
# Storage: 200 Gb
#

# Input Requirements --------------------------

# $1 : Library name and file-output name
# $2 : Library population/analysis set
# $3 : Library UUID

# Control Panel -------------------------------
# CPU
	THREADS='7'

# Sequencing Data
	LIBRARY=$1 # Library/ File name

# TCGA FILE UUID
  UUID=$3

 # FastQ File-names
    FQ0="$LIBRARY.tmp.sort.0.fq"
    FQ1="$LIBRARY.tmp.sort.1.fq"
    FQ2="$LIBRARY.tmp.sort.2.fq"
    
# Read Group Data
# Extract from downloaded BAM file / input
	RGPO=$2 # Patient Population

	#RGSM= # Sample. Patient Identifer
	#RGID= # Read Group ID. Accession Number
    
	RGLB=$LIBRARY # Library Name. Accession Number
	RGPL='ILLUMINA'  # Sequencing Platform.
    
	# Extract Sequencing Run Info
	#  RGPU=$(gzip -dc $

In [8]:
# Differences between versions only
# (current script on the left, its scripts_run1/ copy on the right;
#  two blank lines separate the two reports)
SEP=''
for SCRIPT in hgr1_align_v2.tcga.sh queenB.sh
do
    printf '%s' "$SEP"
    diff "$SCRIPT" "scripts_run1/$SCRIPT"
    SEP=$'\n\n'
done

19c19
< 	THREADS='7'
---
> 	THREADS='3'
169a170,191
>       # Extract just the 45S unit
>       #aws s3 cp s3://crownproject/resources/rDNA_45s.bed ./
>       #~/bin/samtools view -b -L rDNA_45s.bed align.F4.bam > align.F4.45s.bam
> 
>       # What are the mapped readnames
>       ~/bin/samtools view align.F4.bam | cut -f1 - > read.names.tmp
> 
>       # Extract mapped reads
>       ~/bin/samtools view align.F4.bam | grep -Ff read.names.tmp - > align.F4.tmp.sam
> 
> 
>       # Extract cases of read pairs mapped on edge of region of interest
>       # read:                  ====---====
>       ~/bin/samtools view align.F4.bam | grep -Ff read.names.tmp - > align.F4.tmp.sam
> 
>       # Complete mapped reads list
>       #cut -f1 align.F4.tmp.sam > read.names.45s.long.tmp
> 
>       # Extract unmapped reads with a mapped pair
>       ~/bin/samtools view align.f4F8.bam | grep -Ff read.names.tmp - > align.f4F8.tmp.sam
> 
171c193
<       ~/bin/samtools cat -h align

## Pilot Run


In [12]:
# LOCAL:
# Stage the pilot run: the first two failed runs become the pilot input,
# then the pipeline scripts and the pilot input are copied to S3.
# Nothing is written or uploaded unless the cd into WORKDIR succeeded
# (an unquoted `cd $WORKDIR` with WORKDIR unset is a bare `cd`, i.e. $HOME).
if [ -n "$WORKDIR" ] && cd "$WORKDIR"
then
    head -n2 tcga_failed_runs.txt > tcga_failed_pilot.txt

    # run...
    printf '\n'; cat tcga_failed_pilot.txt; printf '\n'

    for UPLOAD in queenB.sh droneB.sh hgr1_align_v2.tcga.sh tcga_failed_pilot.txt
    do
        aws s3 cp "$UPLOAD" s3://crownproject/tcga/scripts/
    done
else
    echo "WORKDIR is unset or not accessible: '$WORKDIR'" >&2
fi


TCGA-BL-A13J-01B TCGA-BLCA 459ce800-7bab-428d-aeff-323609e11707
TCGA-BT-A20R-01A TCGA-BLCA 295826fd-0ff5-4982-a80f-0e49c2478acc

Completed 3.8 KiB/3.8 KiB with 1 file(s) remainingupload: ./queenB.sh to s3://crownproject/tcga/scripts/queenB.sh
Completed 657 Bytes/657 Bytes with 1 file(s) remainingupload: ./droneB.sh to s3://crownproject/tcga/scripts/droneB.sh
Completed 6.4 KiB/6.4 KiB with 1 file(s) remainingupload: ./hgr1_align_v2.tcga.sh to s3://crownproject/tcga/scripts/hgr1_align_v2.tcga.sh
Completed 128 Bytes/128 Bytes with 1 file(s) remainingupload: ./tcga_failed_pilot.txt to s3://crownproject/tcga/scripts/tcga_failed_pilot.txt


In [13]:
# Remote:
# Manually open an Amazon Linux 2 AMI
# ami-6cd6f714
# t2.micro
#
# ssh login:
# ssh -i "crown.pem" ec2-user@PUBLICDNS
#

# Commands on EC2 machine to set-up AWS
# enter personal login info:

# REMOTE:
#aws configure
  # AWS Key ID
  # AWS Secret Key ID
  # Region: us-west-2
  
# Copy local run files to S3 and download them on EC2

# REMOTE:
# aws s3 cp --recursive s3://crownproject/tcga/scripts/ ./
#
# mv <KEY>.pem ~/.ssh/
# chmod 400 ~/.ssh/<KEY>.pem

# REMOTE:
# Open a logging screen session and begin launching EC2 instances
# screen -L
# 
# bash queenB.sh tcga_failed_pilot.txt

# Fetch the pilot-run log from S3 into logs/ and display it
aws s3 cp s3://crownproject/tcga/logs/tcga_3_failed_pilot.log logs/

cat logs/tcga_3_failed_pilot.log

Completed 1.3 KiB/1.3 KiB with 1 file(s) remainingdownload: s3://crownproject/tcga/logs/tcga_3_failed_pilot.log to logs/tcga_3_failed_pilot.log
kec2-user@ip-172-31-24-91:~\[?1034h[ec2-user@ip-172-31-24-91 ~]$ bash queenB.sh tcga_failed_pilot.txt [K
Launch instance # 1
Thu Sep 13 17:55:17 UTC 2018
Instance Type: c4.2xlarge
AMI Image: ami-0031fd61f932bdef9
Run Script: s3://crownproject/tcga/scripts/hgr1_align_v2.tcga.sh
Parameters: TCGA-BL-A13J-01B TCGA-BLCA 459ce800-7bab-428d-aeff-323609e11707
Instance ID: i-0eb6322c4d24d59f5
Public DNS: ec2-54-213-221-34.us-west-2.compute.amazonaws.com
download: s3://crownproject/tcga/scripts/hgr1_align_v2.tcga.sh to ./hgr1_align_v2.tcga.sh


Launch instance # 2
Thu Sep 13 17:58:24 UTC 2018
Instance Type: c4.2xlarge
AMI Image: ami-0031fd61f932bdef9
Run Script: s3://crownproject/tcga/scripts/hgr1_align_v2.tcga.sh
Parameters: TCGA-BT-A20R-01A TCGA-BLCA 295826fd-0ff5-4982-a80f-0e49c2478acc
Instance ID: i-048c7eb6

## hgr1 re-run

Pilot run was successful; files were 400 + 700 Mb respectively!

Run entire re-run set.

In [15]:
# LOCAL:
# Upload the edited alignment script and the full failed-run list to S3.
# Nothing is uploaded unless the cd into WORKDIR succeeded
# (an unquoted `cd $WORKDIR` with WORKDIR unset is a bare `cd`, i.e. $HOME).
if [ -n "$WORKDIR" ] && cd "$WORKDIR"
then
    # Turn off AWS EC2 self-shutdown
    # uncomment line 209
    aws s3 cp hgr1_align_v2.tcga.sh s3://crownproject/tcga/scripts/

    # removed two entries above from pilot run
    printf '\n'; cat tcga_failed_runs.txt; printf '\n'

    aws s3 cp tcga_failed_runs.txt s3://crownproject/tcga/scripts/
else
    echo "WORKDIR is unset or not accessible: '$WORKDIR'" >&2
fi

Completed 6.4 KiB/6.4 KiB with 1 file(s) remainingupload: ./hgr1_align_v2.tcga.sh to s3://crownproject/tcga/scripts/hgr1_align_v2.tcga.sh

TCGA-A7-A0DC-01A TCGA-BRCA d04aa38e-221c-44f4-ba52-1e2f66ba2bbe
TCGA-A7-A13E-01A TCGA-BRCA 29fdc300-034c-49bf-81c3-e1ec822b2f78
TCGA-E9-A1RI-11A TCGA-BRCA e4265883-5495-4b2f-aa1e-032210e391c9
TCGA-A6-2684-01C TCGA-COAD a1da668f-e62b-4b39-b1de-be6df71496ad
TCGA-AA-3697-01A TCGA-COAD 8714c72b-5800-4ee7-91d2-848742701d15
TCGA-L5-A43C-01A TCGA-ESCA 5feae052-9067-4070-a5bb-e3f247b8a2a6
TCGA-L5-A43C-11A TCGA-ESCA ffd9843a-4043-425c-9ecf-eb6111125de3
TCGA-L5-A4OG-11A TCGA-ESCA 76d356fe-5de9-408c-a9d3-afde9671e1b4
TCGA-L5-A4OJ-11A TCGA-ESCA d647ec8a-6b27-42b0-b568-04b135fd6c2e
TCGA-L5-A4OO-11A TCGA-ESCA 646fbfbe-c3f5-486d-9842-119a28bc0c53
TCGA-CZ-5456-01A TCGA-KIRC f56d27a8-7d77-4078-a2a1-4158d8917e80
TCGA-BC-A10Q-11A TCGA-LIHC 5b7028a0-9435-40f4-a54f-f9696ad4780f
TCGA-BC-A216-01A TCGA-LIHC ad8506c7-ecf3-4e33-b577-54a67a8330e8
TCGA-DD-A113-

In [None]:
# Remote:
# aws s3 cp screenlog.1 s3://crownproject/tcga/logs/tcga_failed_runs.log

## Crown AMI Update



In [None]:
## Launched 180906 Crown AMI

## Delete old TCGA data and re-download all TCGA data
#rm -r ~/tcga

#aws s3 cp s3://crownproject/tcga ./tcga --recursive

#cd tcga
#mv tcga-coad-1/* TCGA-COAD/
#rmdir tcga-coad-1/

#mv tcga-lusc/* TCGA-LUSC/
#rmdir  tcga-lusc/

#rm -r ../logs/; mv logs/ ../

#mv scripts/* ../scripts/; rmdir scripts

#rm -r tcga-coad0
#rm -r vcf/ vcf_1248/

Sanity Check that new alignment files are complete
```
ls -alh */*.bam > bamlist.alh.txt
aws s3 cp bamlist.alh.txt s3://crownproject/tmp/
```

Bam files which remained failed... (Note the old dates; I accidentally overwrote the TCGA-COAD and TCGA-LUSC files with the tcga-coad-1 and tcga-lusc files, respectively.)

```
-rw-rw-r--	1	ubuntu	ubuntu	353	Sep	5	6:29	TCGA-STAD/TCGA-BR-7851-01A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	364	Aug	31	16:14	TCGA-LUSC/TCGA-22-5471-01A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	364	Aug	31	16:14	TCGA-LUSC/TCGA-22-5482-01A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	364	Aug	31	16:15	TCGA-LUSC/TCGA-22-5491-01A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	364	Aug	31	16:15	TCGA-LUSC/TCGA-33-6737-01A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	368	Aug	31	16:15	TCGA-LUSC/TCGA-33-4587-11A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	368	Aug	31	16:15	TCGA-LUSC/TCGA-56-7730-01A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	369	Aug	31	16:15	TCGA-LUSC/TCGA-43-7657-01A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	370	Aug	31	16:15	TCGA-LUSC/TCGA-56-8083-11A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	370	Aug	31	16:15	TCGA-LUSC/TCGA-58-8386-11A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	371	Aug	23	21:34	TCGA-COAD/TCGA-A6-2684-01C.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	373	Aug	31	16:15	TCGA-LUSC/TCGA-51-4079-11A.hgr1.bam
-rw-rw-r--	1	ubuntu	ubuntu	373	Aug	31	16:15	TCGA-LUSC/TCGA-51-4081-11A.hgr1.bam
```


In [None]:
# aws s3 cp s3://crownproject/tcga/TCGA-COAD/ TCGA-COAD/ --recursive
# aws s3 cp s3://crownproject/tcga/TCGA-LUSC/ TCGA-LUSC/ --recursive

Failed bam files...
```
-rw-rw-r--	1	ubuntu	ubuntu	353	Sep	5	6:29	TCGA-STAD/TCGA-BR-7851-01A.hgr1.bam
```

One failure is OK for ~1360 samples. Will delete this bam file and its respective normal from further analysis.

```
ubuntu@ip-172-31-18-211:~/tcga$ ls TCGA-STAD/TCGA-BR-7851-*
TCGA-STAD/TCGA-BR-7851-01A.flagstat       TCGA-STAD/TCGA-BR-7851-11A.flagstat
TCGA-STAD/TCGA-BR-7851-01A.hgr1.bam       TCGA-STAD/TCGA-BR-7851-11A.hgr1.bam
TCGA-STAD/TCGA-BR-7851-01A.hgr1.bam.bai   TCGA-STAD/TCGA-BR-7851-11A.hgr1.bam.bai
TCGA-STAD/TCGA-BR-7851-01A.hgr1.flagstat  TCGA-STAD/TCGA-BR-7851-11A.hgr1.flagstat
```

```
rm TCGA-STAD/TCGA-BR-7851-*
```


In [None]:
## Save bamlist file to vcf output folder

# cd ~/tcga/
# mkdir ~/vcf_1248
# ls -alh */*.bam > ~/vcf_1248/bamlist.alh.txt


## Re-run VCF 1248 analysis (see below)
# screen -L
# cd ~
# bash ~/scripts/ADcalc.sh

# mv *.vcf ../vcf_1248/
# mv 18S_1248macp.vcf.bamlist ../vcf_1248/
# mv ../screenlog.0 ../vcf_1248/18S_1248macp.vcf.log

# cd ~/tcga/

# cd ~/vcf_1248

# for FILE in $(ls *vcf)
# do
#     cat $FILE |\
#       sed 's/^[A-Za-z0-9\t .,]*:AD\t//g' - |\
#       sed 's/\t/\n/g' - \
#       >> TCGA-18S_1248.tsv
# done
# 
# cd ..

# aws s3 cp --recursive ./ s3://crownproject/tcga/180914_1248macp_vcf/
# mv ~/vcf_1248 data/180914_1248macp_vcf

In [None]:
#!/bin/bash
# ADcalc.sh
# Allelic Depth Calculator
# for a position
#
# For each cancer-type directory in ~/tcga/, runs bcftools mpileup over all
# of its *.bam files at REGION and appends the header-less VCF records to
# ~/tcga/<TYPE>.<OUTPUT>. The bam file names are appended, in the same
# order, to ~/tcga/<OUTPUT>.bamlist.

# Without this check a failed cd would let the loop run on the caller's directory
cd ~/tcga/ || { echo "ADcalc.sh: cannot cd to ~/tcga/" >&2; exit 1; }

# Controls -----------------
REGION='chr13:1004908'     # passed to mpileup -r
OUTPUT='18S_1248macp.vcf'  # suffix of the per-type output files
DEPTH='100000'             # passed to mpileup --max-depth
BAMLIST='bam.list.tmp'     # per-directory scratch list handed to mpileup -b

# Unmatched globs expand to nothing instead of staying as a literal pattern
shopt -s nullglob

# Iterate through every TCGA Cancer Type
# Only directories are visited: plain files in ~/tcga/ (this script's own
# output files, for instance) are not cancer types and cannot be cd'ed into.
for TYPE_DIR in */
do
    TYPE=${TYPE_DIR%/}
    echo "Analyzing $TYPE..."

    # Subshell: the cd is undone automatically, so one bad directory can
    # never shift the working directory for the directories that follow.
    (
        cd "$TYPE" || exit 1

        BAMS=( *.bam )
        if [ "${#BAMS[@]}" -eq 0 ]
        then
            # Nothing to pile up; do not hand mpileup an empty list
            echo "No bam files in $TYPE, skipping" >&2
            exit 0
        fi

        printf '%s\n' "${BAMS[@]}" > "$BAMLIST"
        cat "$BAMLIST" >> "../$OUTPUT.bamlist"

        # Look up the position across all listed bam files and return VCF
        bcftools mpileup -f ~/resources/hgr1/hgr1.fa \
          --max-depth "$DEPTH" -A --min-BQ 30 \
          -a FORMAT/DP,AD \
          -r "$REGION" \
          --ignore-RG \
          -b "$BAMLIST" | \
          bcftools annotate -x INFO,FORMAT/PL - | \
          bcftools view -O v -H - \
          >> "../$TYPE.$OUTPUT"

        rm -- "$BAMLIST"
    )
done


In [None]:
# AMI SNAPSHOT TAKEN AT THIS POINT

# crown-180914
# ami-096bcb9d18c32d4d5
# 180 Gb

