## Genome: cov3m reference
```
Lead     : ababaian / RCE
Issue    : n/a
start    : 2020 05 24
complete : 2020 05 24
files    : s3://serratus-public/seq/cov3m/
```

# Boot-up / Installation for Genome-Builder

In [None]:
# EC2 host bootstrap:
# install docker + git, build the serratus containers from source and
# open an interactive shell in the `serratus-align` image for indexing.
for pkg in docker git; do
  sudo yum install -y "$pkg"
done
sudo service docker start

# Optional DockerHub login (left disabled)
#export DOCKERHUB_USER='serratusbio'
#sudo docker login

git clone https://github.com/ababaian/serratus.git
cd serratus/containers
./build_containers

# Interactive bash in the freshly built image (container is removed on exit).
# NOTE(review): the following cells are presumably run inside this shell.
sudo docker run --rm --entrypoint /bin/bash -it serratus-align:latest

In [None]:
# Dev tools
# wget/tar/gzip/unzip are needed by the download + unpack steps in the
# install cells below; less and vim are for interactive inspection.
yum install -y wget tar gzip less vim unzip

In [None]:
# Pre-compiled bowtie2 binary
#
# BOWTIEVERSION is not defined anywhere in this notebook, so it has to come
# from the environment. Refuse to run without it: with an empty value the
# globs below degrade to `bowtie2-*` and the `rm -rf` could remove
# unrelated paths in the working directory.
if [ -z "${BOWTIEVERSION:-}" ]; then
  echo "BOWTIEVERSION is not set; export it (e.g. BOWTIEVERSION=x.y.z) and re-run this cell" >&2
else
  BT2_ZIP="bowtie2-${BOWTIEVERSION}-linux-x86_64.zip"

  # Download, unpack, and drop the archive (each step only if the previous succeeded)
  wget --quiet "https://downloads.sourceforge.net/project/bowtie-bio/bowtie2/${BOWTIEVERSION}/${BT2_ZIP}" &&\
    unzip "$BT2_ZIP" &&\
    rm    "$BT2_ZIP"

  # Install every bowtie2* executable, then remove the unpacked release directory
  mv bowtie2-"$BOWTIEVERSION"*/bowtie2* /usr/local/bin/ &&\
    rm -rf bowtie2-"$BOWTIEVERSION"*
fi

In [None]:
# Python 3 interpreter + headers, pip (via the PyPA bootstrap script)
# and Biopython.
yum install -y python3 python3-devel

# Fetch the pip bootstrap script, run it once, then discard it
PIP_BOOTSTRAP='get-pip.py'
curl -O "https://bootstrap.pypa.io/${PIP_BOOTSTRAP}"
python3 "$PIP_BOOTSTRAP"
rm "$PIP_BOOTSTRAP"

pip3 install biopython

In [None]:
# SeqKit v0.12.0 (linux amd64 release tarball)
# Download -> unpack -> install to /usr/local/bin; the tarball is removed
# only when every previous step succeeded.
SEQKIT_URL='https://github.com/shenwei356/seqkit/releases/download/v0.12.0/seqkit_linux_amd64.tar.gz'

if wget "$SEQKIT_URL" && tar -xvf seqkit* && mv seqkit /usr/local/bin/; then
  rm seqkit_linux*
fi

In [None]:
# bedtools v2.29.2 (static release binary), installed as /usr/bin/bedtools
BEDTOOLS_BIN='bedtools.static.binary'

wget "https://github.com/arq5x/bedtools2/releases/download/v2.29.2/${BEDTOOLS_BIN}"

# Rename to the conventional command name, make it executable, install
mv "$BEDTOOLS_BIN" bedtools
chmod 755 bedtools
mv bedtools /usr/bin/

In [None]:
# usearch 11.0.667 (i86linux32 build), installed as /usr/bin/usearch
# The clustered database was made with usearch.
USEARCH_GZ='usearch11.0.667_i86linux32.gz'

wget "https://drive5.com/downloads/${USEARCH_GZ}"

# Decompress to a plain `usearch` binary (the .gz is left in place)
gzip -dc "$USEARCH_GZ" > usearch
chmod 755 usearch
mv usearch /usr/bin/usearch


In [None]:
# Local Dustmasker install (ships with NCBI BLAST+)
#
# The tarball name is pinned to 2.10.0, so fetch it from the versioned
# release directory. The LATEST/ directory only holds the newest release
# and stops containing this file as soon as NCBI publishes a new version.
cd /home/serratus/

BLAST_VERSION='2.10.0'
BLAST_TGZ="ncbi-blast-${BLAST_VERSION}+-x64-linux.tar.gz"

wget "ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/${BLAST_VERSION}/${BLAST_TGZ}"
tar -xvf "$BLAST_TGZ"

# Install every BLAST+ executable (dustmasker included) system-wide
cp "ncbi-blast-${BLAST_VERSION}+/bin/"* /usr/bin/


In [None]:
# Install EDirect
# Dependency Hell: Perl modules required by the EDirect scripts
yum install -y cpanminus expat-devel
for module in \
  IO::Socket::SSL \
  LWP::Protocol::https \
  JSON::PP \
  HTML::Entities \
  XML::Simple
do
  sudo cpanm --force "$module"
done

# Fetch the EDirect tarball over passive FTP.
# The Perl variable is written as plain `$ftp`: the previous `$$ftp` form is
# the Makefile/Dockerfile-escaped spelling and, pasted into a shell, only
# worked by accident through a Perl symbolic reference.
perl -MNet::FTP -e \
  '$ftp = new Net::FTP("ftp.ncbi.nlm.nih.gov", Passive => 1);
  $ftp->login; $ftp->binary;
  $ftp->get("/entrez/entrezdirect/edirect.tar.gz");'

tar -xvf edirect.tar.gz; rm edirect.tar.gz

# NOTE(review): PATH assumes the tarball was unpacked in /home/serratus
# (the working directory set by the dustmasker cell) - confirm.
export PATH=${PATH}:/home/serratus/edirect

# Answer "y" to every prompt of the EDirect setup script
yes y | ./edirect/setup.sh

## CovRef Beta Build

In [None]:
# CoV Fragments and offsets from RCE
#
# Fetch the inputs of the covref3 build into a dedicated working directory:
#   flom2.fa / flom2.sumzer.tsv      RefSeq (FLOM) sequences + summary, from S3
#   covref3.fa / covref3.sumzer.tsv  CovRef sequences + metadata, from RCE
#
# -p makes the cell re-runnable; && avoids downloading into the wrong
# directory if the working directory cannot be created.
mkdir -p /home/serratus/cov3a && cd /home/serratus/cov3a

# Download RefSeq (FLOM) Sequences
aws s3 cp s3://serratus-public/seq/flom2/flom2.fa ./
aws s3 cp s3://serratus-public/seq/flom2/flom2.sumzer.tsv ./
samtools faidx flom2.fa

# Download CovRef from RCE.
# The source is a /tmp/ URL, so the decompressed files are checked against
# the md5 sums recorded for this build instead of trusting the download.
wget https://drive5.com/tmp/covref.fa.gz
gzip -dc covref.fa.gz > covref3.fa
echo "4477d8655defe69b1e847427a8c6529d  covref3.fa" | md5sum -c -
samtools faidx covref3.fa

wget https://drive5.com/tmp/covref_meta.tsv.gz
gzip -dc covref_meta.tsv.gz > covref3.sumzer.tsv
echo "cb6bebcc97aecc8b9b9ab1c7eafeb054  covref3.sumzer.tsv" | md5sum -c -

# The downloaded archives are not part of the build products (they do not
# appear in the recorded directory listing), so drop them here rather than
# letting a later `aws s3 sync` upload them.
rm -f covref.fa.gz covref_meta.tsv.gz

md5sum *
wc -l *

```
bash-4.2# md5sum *
4477d8655defe69b1e847427a8c6529d  covref3.fa
2b60eb7e508f9214149c693e88223bdc  covref3.fa.fai
cb6bebcc97aecc8b9b9ab1c7eafeb054  covref3.sumzer.tsv
57f452445b65be1f37ba54ab8dc24bc1  flom2.fa
bc92ecb5578ba62831c65355001236da  flom2.fa.fai
d47dbf3bc78ae4dc72c7f45f9a0aa7e2  flom2.sumzer.tsv

bash-4.2# wc -l *
  483781 covref3.fa
   10101 covref3.fa.fai
   10101 covref3.sumzer.tsv
  732745 flom2.fa
    2704 flom2.fa.fai
    2704 flom2.sumzer.tsv
 1242136 total
```

In [None]:
# Keep a gzipped copy of the original covref3 FASTA headers, then rewrite
# the FASTA in place without duplicates / short records.
NAME='covref3'

# Snapshot of the untouched header lines -> $NAME.headers.gz
grep "^>" "${NAME}.fa" > "${NAME}.headers"
gzip "${NAME}.headers"

# seqkit rmdup -s -i : drop records with duplicate sequence (case-insensitive);
#                      removed entries are listed in $NAME.duplicates
# seqkit seq -i -m 200 : keep only the ID part of each header (accession
#                      retained, ScientificName removed) and discard
#                      sequences shorter than 200 nt
seqkit rmdup -s -i -D "${NAME}.duplicates" "${NAME}.fa" \
  | seqkit seq -i -m 200 - \
  > "${NAME}.tmp"

# Recorded result of this run: 1 duplicate entry removed

# Replace the input with the filtered FASTA
mv "${NAME}.tmp" "${NAME}.fa"

In [None]:
# Manual Blacklisting of bad entries (if they exist)
#
# Produces:
#   blacklist.bed  one "accession <tab> 1 <tab> length" row per blacklisted
#                  accession found in the cov0 index
#   $NAME.bl.fa    $INPUTFA with every blacklisted accession removed
INPUTFA='covref3.fa'
NAME="covref3"

# Single source of truth for the blacklisted accessions; both the BED file
# and the seqkit filter below are derived from this list.
# NOTE(review): the original list carried FV537213 twice - confirm that no
# other accession was intended for the second slot.
BLACKLIST=(
  AF209745
  AX191447
  AX191449
  AY204704
  AY204705
  CS382036
  DL231478
  FB764528
  FV537211
  FV537213
  HV449436
  KC786228
)

# Generate blacklist via Bed Format
aws s3 cp s3://serratus-public/seq/cov0/cov0.fa.fai ./

# Written with '>' so a re-run after a failed attempt starts from a clean list
printf '%s\n' "${BLACKLIST[@]}" > blacklist.tmp

# name,length from the fasta index -> name <tab> 1 <tab> length
grep -f blacklist.tmp cov0.fa.fai \
  | cut -f1,2 \
  | sed 's/\t/\t1\t/g' > blacklist.bed

# Build one "-p <accession>" pair per blacklisted entry
SEQKIT_PATTERNS=()
for acc in "${BLACKLIST[@]}"; do
  SEQKIT_PATTERNS+=(-p "$acc")
done

# -r: treat patterns as regular expressions (accessions match without their
#     version suffix), -i: ignore case, -v: keep the records that do NOT match
seqkit grep "$INPUTFA" -i -r -v "${SEQKIT_PATTERNS[@]}" > "$NAME.bl.fa"

rm cov0.fa.fai *tmp

In [None]:
# Manually set blacklisted regions
#
# Appends one tab-separated "accession start end" row per region to
# blacklist.bed (which already holds the whole-entry blacklist).
# Rows are re-emitted through printf so stray whitespace in the list can
# never end up inside a BED field (the KC786228.1 row used to carry a
# trailing space after its end coordinate).
while read -r acc start end; do
  printf '%s\t%s\t%s\n' "$acc" "$start" "$end"
done >> blacklist.bed <<'EOF'
CS460762.1 30166 30243
CS460762.1 37177 37211
CS480537.1 30130 30332
CS480537.1 30166 30241
CS480537.1 37170 37220
DI086074.1 1 168
DL231478.1 1 50
DL231478.1 43 2296
DL231478.1 8568 8687
JB181528.1 1 200
JB181528.1 3111 3307
JB181528.1 3650 4300
JB181528.1 3675 4258
KC786228.1 1 200
KF530130.1 4852 4896
KY967725.1 17220 17250
LQ338105.1 1 60
LR721664.1 31112 31189
MG600026.1 3926 3981
MH002340.1 30202 30242
MK204388.1 1 64
MK562374.1 472 561
MK562374.1 474 542
EOF


In [None]:
# SimpleRepeat Mask Annotation
INPUTFA='covref3.fa'
NAME="covref3"

# dust_to_bed WINDOW
#   Run dustmasker on $INPUTFA with the given window size, writing its
#   interval report to $NAME.dust<WINDOW>, then convert that report to the
#   3-column file dust<WINDOW>.bed (sequence name, start, end).
#
#   The interval report is parsed as: a ">name" line opens a sequence, every
#   following "start - end" line is one low-complexity interval of it.
#   The bed file is rewritten from scratch on every call, so re-running the
#   cell does not duplicate rows; blank lines are skipped.
#   NOTE(review): coordinates are copied verbatim from the report - confirm
#   dustmasker's interval convention against BED's 0-based half-open one.
dust_to_bed() {
  local window="$1"
  local intervals="$NAME.dust${window}"
  local bed="dust${window}.bed"

  dustmasker -in "$INPUTFA" \
    -window "$window" -outfmt interval \
    -out "$intervals"

  awk -v OFS='\t' '
    /^>/ { hdr = $0; gsub(/>/, "", hdr); next }   # sequence name without ">"
    NF   { print hdr, $1, $3 }                    # fields: start, "-", end
  ' "$intervals" > "$bed"
}

# Short Window Dust Masking ---------------------
dust_to_bed 30

# Long Window Dust Masking ---------------------
dust_to_bed 64

echo ''

In [None]:
# Poly-NT Mask Annotation
#
# Locate exact (0 mismatch, case-insensitive) 10-nt homopolymer seeds in
# $INPUTFA and merge the hits per strand into one bed file per seed, then
# concatenate both into polyNT.bed.
# FLOM seems to be missing any NT tract >7nt
# NOTE(review): the outputs are labelled AT / GC although only A and G seeds
# are searched - presumably seqkit locate also reports reverse-strand hits;
# confirm.
INPUTFA='covref3.fa'

for spec in AT:AAAAAAAAAA GC:GGGGGGGGGG; do
  label=${spec%%:*}   # AT | GC
  seed=${spec#*:}     # 10-nt homopolymer pattern

  seqkit locate --bed -i -m 0 -p "$seed" $INPUTFA > poly10.bed
  bedtools sort -chrThenSizeA -i poly10.bed > poly10.sort.bed
  bedtools merge -s -i poly10.sort.bed > "poly${label}.mask.bed"
done

cat polyAT.mask.bed polyGC.mask.bed > polyNT.bed

# Drop the per-seed intermediates
rm polyAT.mask.bed polyGC.mask.bed poly10.bed poly10.sort.bed

In [None]:
# Combine blacklist, nt mask and dustmask
#
# Input : polyNT.bed blacklist.bed dust30.bed dust64.bed
# Output: mask.regions.bed (sorted, overlapping regions merged)
#
# Known input glitches are repaired per field before sorting:
#   - stray spaces inside a row are removed
#   - a negative coordinate (the "-1" starts) is clamped to 0
# The clamp only touches the start/end columns. A global "-1" -> "0" text
# substitution would also rewrite sequence names or larger numbers that
# merely contain "-1", leaving rows that no longer match the FASTA.
cat polyNT.bed blacklist.bed dust30.bed dust64.bed \
  | awk 'BEGIN { FS = OFS = "\t" }
         { gsub(/ /, "") }
         {
           if (NF >= 3) {
             if ($2 < 0) $2 = 0
             if ($3 < 0) $3 = 0
           }
           print
         }' \
  | sort -k1,1 -k2,2n \
  > mask.sort.clean.tmp

# Merge BED file (input is already clean, so the output needs no second pass)
bedtools merge -i mask.sort.clean.tmp > mask.regions.bed

rm *tmp

wc -l *.bed

# The individual annotations are now folded into mask.regions.bed
rm polyNT.bed blacklist.bed dust30.bed dust64.bed

```
    32 blacklist.bed
  2344 dust30.bed
  3223 dust64.bed
  2921 mask.regions.bed
   491 polyNT.bed
  9011 total
```

In [None]:
# Hard and Soft Mask the Genome
#
# Applies mask.regions.bed to the blacklist-filtered FASTA twice:
#   $NAME.softmasked.fa  (bedtools maskfasta -soft)
#   $NAME.hardmasked.fa  (bedtools maskfasta default)
INPUTFA='covref3.bl.fa'
NAME="covref3"
MASK_BED='mask.regions.bed'

# Had to manually remove line 8447 which started with "-1"
# There's a bug in there somewhere, likely a 1-base / 0-base

# cov2 pan-genome
# Soft-masked pan-genome
bedtools maskfasta -soft \
  -fi "$INPUTFA" -bed "$MASK_BED" -fo "${NAME}.softmasked.fa"

# Hard-masked pan-genome
bedtools maskfasta \
  -fi "$INPUTFA" -bed "$MASK_BED" -fo "${NAME}.hardmasked.fa"

In [None]:
# Manual sanity check of the masking, then move the outputs to their
# final names.
NAME="covref3"

# First 20 diff lines of the input FASTA against each masked variant
for variant in softmasked hardmasked; do
  diff "${NAME}.fa" "${NAME}.${variant}.fa" | head -n20 -
done

# Final layout:
#   $NAME.raw.fa       copy of the pre-masking FASTA
#   $NAME.fa           hard-masked FASTA (takes over the main name)
#   $NAME.unmasked.fa  blacklist-filtered, unmasked FASTA
cp "${NAME}.fa" "${NAME}.raw.fa"
mv "${NAME}.hardmasked.fa" "${NAME}.fa"
mv "${NAME}.bl.fa" "${NAME}.unmasked.fa"

# Count each fasta file
wc -l *.fa

```
   636347 covref3.fa
   637498 covref3.raw.fa
   636347 covref3.softmasked.fa
   636347 covref3.unmasked.fa
   732745 flom2.fa
  3279284 total
```

In [None]:
# Remove intermediates and non deployment files
# NAME is set here explicitly so the cell does not depend on a variable
# left over from a previously executed cell.
NAME="covref3"

# -f: a pattern without any match (e.g. no *.gb present) is not an error
rm -f *.dust* *.gb *.fai

# Compress stuff we don't need immediately
gzip "$NAME.softmasked.fa" "$NAME.unmasked.fa" \
     "$NAME.raw.fa" mask.regions.bed

In [None]:
# Build faidx index for covref3.fa (the bowtie2 build is left disabled)
# NAME is set here explicitly so the cell does not depend on a variable
# left over from a previously executed cell.
NAME="covref3"

#bowtie2-build --threads 4 --seed 1337 $NAME.fa $NAME
samtools faidx "$NAME.fa"

# Drop a checksum file from an earlier run first; otherwise it would be
# matched by '*' and listed inside the new checksum file.
rm -f "$NAME.md5sum"

md5sum *
md5sum * > "$NAME.md5sum"

In [None]:
# Upload the contents of the current working directory to the covref3 prefix
aws s3 sync ./ s3://serratus-public/seq/covref3/

```
3748280af0e85afaf42dd6e62626bc53  covref3.duplicates
289083fefaf1eef20417d01f2096e545  covref3.fa
bf130c649583de011259d8d2add21723  covref3.fa.fai
748c8a00cea792c59faa9917bd734e12  covref3.headers.gz
5c054bd941953ac7b40acd8ef9abafc1  covref3.raw.fa.gz
76a6316734dda93f03bf4063b566f7d0  covref3.softmasked.fa.gz
cb6bebcc97aecc8b9b9ab1c7eafeb054  covref3.sumzer.tsv
120a9a767e87f5cf9af16c67875b59e0  covref3.unmasked.fa.gz
57f452445b65be1f37ba54ab8dc24bc1  flom2.fa
d47dbf3bc78ae4dc72c7f45f9a0aa7e2  flom2.sumzer.tsv
0fbd53f641f3e5b0742fc6635b88d5ae  mask.regions.bed.gz
```

## Hotfix update to FLOM2 Sumzer

In [None]:
# flom2 sumzer update

# Fasta index for flom2
samtools faidx flom2.fa

# Column 4 of the sumzer table is the family; write each distinct value
# once, sorted, to family.unq.tmp
cut -f 4 flom2.sumzer.tsv | sort -k1,1 | uniq > family.unq.tmp
  
# Jacked summary statistics
# Sum Count Average Median Min Max
function statSummary {
    # Summary statistics of a column of numbers.
    #
    # stdin : one value per line; only the first whitespace-separated field
    #         is used and lines whose first field is not a number (blank
    #         lines, text) are ignored.
    # stdout: one tab-separated row
    #           Sum  Count  Average  Median  Min  Max
    #         When no numeric value was read the row is
    #           0  0  NA  NA  NA  NA
    #         instead of an awk division-by-zero abort.
    #
    # Values reach awk in ascending numeric order (sort -n), so the minimum
    # is the first stored value, the maximum the last, and the median the
    # middle one (mean of the two middle ones for an even count).
    sort -n | awk '
      BEGIN {
        n = 0;
        total = 0;
      }
      # A number needs at least one digit: 12, -3, 4.5, 4. or .5
      $1 ~ /^-?([0-9]+(\.[0-9]*)?|\.[0-9]+)$/ {
        vals[n++] = $1;
        total += $1;
      }
      END {
        OFS = "\t";
        if (n == 0) {
          print 0, 0, "NA", "NA", "NA", "NA";
          exit;
        }
        ave = total / n;
        if ((n % 2) == 1) {
          median = vals[int(n / 2)];
        } else {
          median = (vals[n / 2] + vals[n / 2 - 1]) / 2;
        }
        print total, n, ave, median, vals[0], vals[n - 1];
      }
    '
}

# Per-family length statistics -> flom2.stats (tab-separated)
# Columns: Family Sum Count Average Median Min Max
printf 'Family\tSum\tCount\tAverage\tMedian\tMin\tMax\n' > flom2.stats

while read -r family; do
  # Select the rows whose family column (4) is exactly this family and keep
  # their length column (2). An exact field comparison is used because a
  # plain grep would also pick up rows that merely contain the family name
  # somewhere else on the line.
  awk -F'\t' -v fam="$family" '$4 == fam { print $2 }' flom2.sumzer.tsv \
  > fam.tmp

  # statSummary emits a tab-separated row; quoting keeps the tabs intact
  printf '%s\t%s\n' "$family" "$(statSummary < fam.tmp)" >> flom2.stats

done < family.unq.tmp

# Round down: strip the fractional part of every number
# (the replacement part of the s command must be present, even if empty)
sed -i 's/\.[0-9]*//g' flom2.stats

md5sum flom2.stats
cat flom2.stats

rm *tmp

`8f6ce8bc6ab4d3982aab2fe58d330818  flom2.stats`

```
Family  Sum     Count   Average Median  Min     Max
Adenoviridae    2623213 83      31605   33501   415     45781
Adomaviridae    40802   2       20401   20401   19275   21527
Alloherpesviridae       2131357 13      163951  207914  451     295146
Amnoonviridae   10323   10      1032    1071    465     1641
Anelloviridae   245431  85      2887    2878    2002    3907
Arenaviridae    473096  92      5142    3534    2016    7325
Arteriviridae   424643  28      15165   15269   12704   18410
Asfarviridae    3689242 20      184462  185164  170101  193886
Astroviridae    347331  58      5988    6450    2403    7722
Birnaviridae    34799   12      2899    2863    2605    3429
Bornaviridae    148002  17      8706    8908    5572    9006
Caliciviridae   365637  48      7617    7531    6434    8513
Circoviridae    202925  100     2029    1866    648     3925
Filoviridae     170795  9       18977   18940   18875   19114
Flaviviridae    964141  99      9738    10290   1035    12983
Geminiviridae   2383    1       2383    2383    2383    2383
Genomoviridae   145903  66      2210    2200    1998    2826
Hantaviridae    422604  117     3612    3630    324     6588
Hepadnaviridae  66925   21      3186    3182    3018    3542
Hepeviridae     42089   6       7014    7070    6654    7310
Herpesviridae   12584059        93      135312  148687  603     241087
Iridoviridae    1609039 14      114931  113436  1392    208501
Matonaviridae   9762    1       9762    9762    9762    9762
Nairoviridae    167562  27      6206    4598    1590    12184
Nodaviridae     18127   8       2265    2267    1421    3107
Orthomyxoviridae        179991  105     1714    1760    519     2427
Papillomaviridae        1511956 199     7597    7583    5748    8809
Paramyxoviridae 1078540 67      16097   15522   14904   19212
Parvoviridae    515968  103     5009    5075    3780    6148
Peribunyaviridae        527880  130     4060    4420    754     7428
Phenuiviridae   170044  42      4048    4151    1628    6504
Picobirnaviridae        26988   13      2076    1745    1688    2666
Picornaviridae  1244709 160     7779    7591    2086    10101
Pneumoviridae   132411  9       14712   15140   13350   15225
Polyomaviridae  643494  123     5231    5112    3962    14334
Poxviridae      7611778 42      181233  181133  296     359853
Reoviridae      887108  440     2016    1726    528     14939
Rhabdoviridae   854565  72      11869   11510   6552    16133
Smacoviridae    118374  46      2573    2571    2403    3028
Sunviridae      17187   1       17187   17187   17187   17187
Tobaniviridae   354967  15      23664   27318   799     33452
Togaviridae     244333  22      11106   11573   381     11964
Totiviridae     32772   5       6554    6688    4647    7788
Unknown 631637  81      7797    4668    501     158250
```

In [None]:
# Merge 0 offset and median genome size into sumzer
#
# Appends two columns to every row of flom2.sumzer.tsv:
#   offset  constant 0
#   median  the Median column (5) of flom2.stats for the row's family
#           (column 4 of the sumzer table)
# NOTE: running this cell again appends the two columns a second time.

# Family of every sumzer row, in row order
cut -f4 flom2.sumzer.tsv > fam.tmp

# Offset: one 0 per row
yes 0 \
  | head -n "$(wc -l < fam.tmp)" \
  > offset.tmp

# Median: exact lookup family -> median, exactly one output line per input
# row so the columns stay aligned for paste. A regex/substring grep could
# emit zero or several lines for one row and silently shift every row after
# it. Families missing from flom2.stats get "NA" and are reported.
awk -F'\t' '
  FILENAME == ARGV[1] { median[$1] = $5; next }
  {
    if ($1 in median) {
      out = median[$1]
    } else {
      out = "NA"
      missing++
    }
    print out
  }
  END {
    if (missing > 0)
      print "WARNING: " missing " row(s) without a family median" > "/dev/stderr"
  }
' flom2.stats fam.tmp > median.tmp

mv flom2.sumzer.tsv flom2.sumzer.tmp

# All four line counts must be identical
wc -l *tmp

paste flom2.sumzer.tmp offset.tmp median.tmp > flom2.sumzer.tsv

rm *tmp
md5sum flom2.sumzer.tsv

`d47dbf3bc78ae4dc72c7f45f9a0aa7e2  flom2.sumzer.tsv`

```
  2704 fam.tmp
  2704 flom2.sumzer.tmp
  2704 median.tmp
  2704 offset.tmp
```

# cov3m COV Pan-Genome + Mega-Genome

In [None]:
# Inputs of the cov3m build, fetched into a dedicated working directory:
#   flom2   - RefSeq (FLOM) sequences + sumzer table
#   covref3 - CovRef sequences + sumzer table (from RCE)
mkdir -p /home/serratus/cov3m; cd /home/serratus/cov3m

for key in \
  flom2/flom2.fa \
  flom2/flom2.sumzer.tsv \
  covref3/covref3.fa \
  covref3/covref3.sumzer.tsv
do
  aws s3 cp "s3://serratus-public/seq/${key}" ./
done

md5sum *

```
289083fefaf1eef20417d01f2096e545  covref3.fa
cb6bebcc97aecc8b9b9ab1c7eafeb054  covref3.sumzer.tsv
57f452445b65be1f37ba54ab8dc24bc1  flom2.fa
d47dbf3bc78ae4dc72c7f45f9a0aa7e2  flom2.sumzer.tsv
```

In [None]:
# Merge covref3 and flom2 into cov3m
# For both the FASTA and the sumzer table: covref3 first, flom2 second.
NAME='cov3m'

for ext in fa sumzer.tsv; do
  cat "covref3.${ext}" "flom2.${ext}" > "${NAME}.${ext}"
done

# The inputs are no longer needed once merged
rm covref3* flom2*

In [None]:
# bowtie2 index + fasta index for the merged cov3m reference, followed by
# a checksum listing of every file in the directory.
NAME='cov3m'
BT2_THREADS=4
BT2_SEED=1337

bowtie2-build --threads "$BT2_THREADS" --seed "$BT2_SEED" "${NAME}.fa" "${NAME}"
samtools faidx "${NAME}.fa"

# Once to the screen, once into the checksum file
md5sum *
md5sum * > cov3m.md5sum

```
f77ca3cb853a88ff8909f7f08a1e67de  cov3m.1.bt2
b16e0bdff349a6c2ccf8b7fe863345e5  cov3m.2.bt2
afae28a945b6b8609b9926b45cfe9261  cov3m.3.bt2
16e6e789d34cd51db47774e581e0bf1e  cov3m.4.bt2
67489492ead93dfa4f49809f82bf0cfb  cov3m.fa
85d9b961d1a79de1116090392b73b48d  cov3m.fa.fai
8ea0b9543a1d9e08d4fda02b95e06437  cov3m.rev.1.bt2
953d4699bde8feb7366614a72ecd7d45  cov3m.rev.2.bt2
22a90f6b7649171b4ed83e4cd035a351  cov3m.sumzer.tsv
```

In [None]:
# Upload the contents of the current working directory to the cov3m prefix
aws s3 sync ./ s3://serratus-public/seq/cov3m/