In [0]:
import os
import sys
import pandas as pd
import numpy as np
import scipy as sp
import scandir
import glob
import shutil
from ipyparallel import Client

In [0]:
# Absolute path to the dDocent pipeline script.
ddocent = "/home/cfriedline/g/src/dDocent_run/dDocent"

In [0]:
# Read the list of FASTQ paths, one path per line. The context manager closes
# the file handle as soon as the list is built, and blank lines are skipped so
# a stray empty line cannot turn into an empty path further down.
with open("/home/cfriedline/eckertlab/Mitra/SWWP_seq2/fastq_files.txt") as fastq_list:
    fastq_files = [line.strip() for line in fastq_list if line.strip()]

In [0]:
# Show the loaded paths as a quick visual check.
fastq_files

In [0]:
# Every path must be listed exactly once: destination names are derived from
# the file's base name, so a repeated entry would be copied onto the same target.
n_duplicates = len(fastq_files) - len(set(fastq_files))
assert n_duplicates == 0, "fastq_files contains {} duplicate path(s)".format(n_duplicates)

In [0]:
# Working directory of the dDocent run: the renamed FASTQ copies and
# dDocent.config are written here.
analysis_dir = "/home/cfriedline/eckertlab/Mitra/dDocent"

In [0]:
# Connect to the ipyparallel cluster started under the "sge" profile
# (presumably the Sun Grid Engine setup also used by the qrsh step -- confirm).
rc = Client(profile="sge")

In [0]:
# dv: view over all engines of the client (used to push names/imports to each one).
dv = rc[:]
# lv: load-balanced view, used to submit the individual copy tasks.
lv = rc.load_balanced_view()
# Number of engines reachable through the direct view.
len(dv)

In [0]:
# Imports made inside sync_imports() are performed on the engines behind `dv`
# as well as locally; copy_file needs shutil when it runs on an engine.
with dv.sync_imports():
    import os
    import shutil

In [0]:
def copy_file(args):
    """Copy a single file and return the destination path.

    ``args`` is a ``(src, dst)`` pair packed into one argument so the function
    can be submitted with a single positional value. The destination is
    returned exactly as it was passed in.
    """
    source_path, target_path = args
    shutil.copy(source_path, target_path)
    return target_path

In [0]:
# Push copy_file into the namespace of every engine in the direct view.
dv['copy_file'] = copy_file

In [0]:
def extract_pop_samp(name):
    """Split a sample name into its ``(population, sample)`` parts.

    Names starting with "LP" carry a two-character population prefix; every
    other name is split after its first three characters. The remainder of
    the name is returned as the sample part.
    """
    prefix_len = 2 if name.startswith("LP") else 3
    return name[:prefix_len], name[prefix_len:]

# Queue one copy task per FASTQ file on the load-balanced view.
# Each copy is written to analysis_dir as "<population>_<name>.F.fq", where
# <name> is the source file name up to its first "." and <population> is the
# prefix extract_pop_samp derives from it. The sample half of that split is
# not needed: the destination keeps the full base name.
jobs = []
for fastq_path in fastq_files:
    fname = os.path.basename(fastq_path).split(".")[0]
    popname, _ = extract_pop_samp(fname)
    dst = os.path.join(analysis_dir, "{}_{}.F.fq".format(popname, fname))
    # copy_file takes the (src, dst) pair as a single tuple argument.
    jobs.append(lv.apply_async(copy_file, (fastq_path, dst)))

In [0]:
# Progress check: (number of copy tasks reporting ready, number submitted).
np.sum([x.ready() for x in jobs]), len(jobs)

In [0]:
%%writefile /home/cfriedline/eckertlab/Mitra/dDocent/dDocent.config
Number of Processors
40
Maximum Memory
30g
Trimming
yes
Assembly?
yes
Type_of_Assembly
SE
Clustering_Similarity%
0.86
Mapping_Reads?
yes
Mapping_Match_Value
1
Mapping_MisMatch_Value
4
Mapping_GapOpen_Penalty
6
Calling_SNPs?
yes
Email
cfriedline@vcu.edu

### Compress fastq files

```
cd /home/cfriedline/eckertlab/Mitra/dDocent
ls *.fq | parallel --bar bgzip {}
```

### Use TruSeq2-SE

`sed -i 's/TruSeq2-PE/TruSeq2-SE/g' /home/cfriedline/g/src/dDocent_run/dDocent`

### Choices during dDocent

* Number of Unique Sequences with More than X Coverage (Counted within individuals): `3`
* Number of Unique Sequences present in more than X Individuals: `15 (~5%)`

### makefile

```
>cat /home/cfriedline/eckertlab/Mitra/dDocent/makefile

default:
    unset -f module; \
    OLDPATH=$$PATH; \
    export PATH=~/g/src/dDocent_run:~/g/src/dDocent_run/bwa:~/g/src/dDocent_run/seqtk; \
    export PATH=~/g/src/vcftools_0.1.11/bin:~/g/src/dDocent_run/freebayes/bin:$$PATH; \
    export PATH=$$PATH:$$OLDPATH; \
    ~/g/src/dDocent_run/dDocent dDocent.config

```

### Run

```
qrsh -N dD_Mitra -pe smp 40 -q godel199@godel96
cd /home/cfriedline/eckertlab/Mitra/dDocent
make
```

## Optimize %sim

### In `./refopt`:

```
~/g/src/dDocent/scripts/ReferenceOpt.sh 4 8 10 15 SE 32
```



In [0]:
# Load the results of the similarity optimisation: a space-delimited file
# without a header row, so the four column names are supplied here.
opt_df = pd.read_csv(
    "~/eckertlab/Mitra/dDocent/opt/kopt.data",
    sep=" ",
    header=None,
    names=["k1", "k2", "sim", "seqs"],
)

In [0]:
# Plotting setup for the optimisation results.
import matplotlib.pyplot as plt
import seaborn as sns
# Render figures inline in the notebook output.
%matplotlib inline
from IPython.display import display
# Use seaborn's "talk" plotting context for the figures that follow.
sns.set_context("talk")

In [0]:
# One box per `sim` value showing the spread of the `seqs` column.
# The positional-values + `groupby=` form belongs to the pre-0.6 seaborn API
# and is rejected by current releases; x / y / data is the equivalent call.
sns.boxplot(x="sim", y="seqs", data=opt_df)

## Optimize k1, k2

### In `./mapopt`

```
ln -s *F*.gz .
ln -s *R1*.gz .
wget https://raw.githubusercontent.com/jpuritz/WinterSchool.2016/master/Exercises/Day%201/RefMapOpt.sh
# replace TruSeq*PE* with TruSeq*SE*
# change `zcat $k.R.fq.gz | head -2 | tail -1 >> lengths.txt` to `zcat $k.F.fq.gz | head -2 | tail -1 >> lengths.txt`
# change bwa mem string to:
# bwa mem reference.fasta $k.R1.fq.gz -L 20,5 -t 32 -a -M -T 10 -A 1 -B 4 -O 6 -R "@RG\tID:$k\tSM:$k\tPL:Illumina" 2> bwa.$i.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@32 -q 1 -SbT reference.fasta - > $k.bam
# change MM line to `MM=$(samtools flagstat $k.bam | grep -E 'mapped \(' | cut -f1 -d '+' | tr -d '\n')` to fix regex with samtools flagstat
```

### Setup makefile

```
default:
        unset -f module; \
        OLDPATH=$$PATH; \
        export PATH=~/g/src/dDocent_run:~/g/src/dDocent_run/bwa:~/g/src/dDocent_run/seqtk; \
        export PATH=~/g/src/vcftools_0.1.11/bin:~/g/src/dDocent_run/freebayes/bin:$$PATH; \
        export PATH=/home/cfriedline/g/src/samtools-0.1.19:$$PATH; \
        export PATH=$$PATH:$$OLDPATH; \
        bash RefMapOpt.sh 2 6 6 10 0.86 50 SE
```



In [0]:
# Load the tab-separated mapping results from the mapopt directory
# (presumably written by the RefMapOpt.sh run -- confirm) and display the table.
map_df = pd.read_csv("/home/cfriedline/eckertlab/Mitra/dDocent/mapopt/mapping.results", sep="\t")
map_df