In [0]:
import os, sys
sys.path.append("/home/cfriedline/ipynb/include_utils")
import include_utils as u
import pandas as pd
import numpy as np
from IPython.display import display

In [0]:
# Working directory for the GBSX demultiplexing run; the relative paths used by
# later cells (I1/, I3/, gbsx_vs_orig.txt) resolve against it.
file_dir = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult_gbsx"
# makedirs(exist_ok=True) also creates missing parent directories and is a
# no-op when the directory already exists, so the cell can be re-run safely.
os.makedirs(file_dir, exist_ok=True)
os.chdir(file_dir)

In [0]:
# Undetermined R1 FASTQ files for lane 1 and lane 2. Both sit in the same
# HiSeq run directory, so the list is assembled from that directory plus the
# two file names.
hiseq_run_dir = '/gpfs_fs/home/eckertlab/gypsy_indiv/HiSeq_140425'
proc_files = [
    '{}/{}'.format(hiseq_run_dir, fastq_name)
    for fastq_name in (
        'lane1_Undetermined_L001_R1_001.fastq',
        'lane2_Undetermined_L002_R1_001.fastq',
    )
]

In [0]:
# Echo the list so the two lane paths can be checked by eye.
proc_files

In [0]:
# Show the current working directory. os.getcwd() replaces the bare `pwd`
# automagic so the cell does not depend on IPython's automagic setting or on
# `pwd` not being shadowed by a Python variable.
os.getcwd()

In [0]:
# One sub-directory per lane (I1 receives lane 1, I3 receives lane 2).
# exist_ok=True keeps the cell re-runnable: a plain `mkdir` reports an error
# once the directories already exist.
for lane_dir in ("I1", "I3"):
    os.makedirs(lane_dir, exist_ok=True)

In [0]:
# Copy each lane's barcode table into its directory under the common name
# `barcodes`; the barcode-conversion cell reads I1/barcodes and I3/barcodes.
!cp /home/cfriedline/ipynb/gypsy_moth/barcodes_i1.txt I1/barcodes
!cp /home/cfriedline/ipynb/gypsy_moth/barcodes_i3.txt I3/barcodes

In [0]:
# Symlink (rather than copy) the lane 1 FASTQ into I1/ so the later search of
# the working directory finds it next to I1's barcode file.
!ln -s /gpfs_fs/home/eckertlab/gypsy_indiv/HiSeq_140425/lane1_Undetermined_L001_R1_001.fastq I1/lane1_Undetermined_L001_R1_001.fastq

In [0]:
# Symlink (rather than copy) the lane 2 FASTQ into I3/ so the later search of
# the working directory finds it next to I3's barcode file.
!ln -s /gpfs_fs/home/eckertlab/gypsy_indiv/HiSeq_140425/lane2_Undetermined_L002_R1_001.fastq I3/lane2_Undetermined_L002_R1_001.fastq

In [0]:
# Rewrite each lane's barcode table as the header-less, tab-separated,
# three-column file (sample name, barcode, enzyme) that is later handed to
# GBSX through its -i option.

def _trim_barcode(seq):
    """Remove the fixed adapter sequence from a barcode string and upper-case the rest."""
    return seq.replace("CTCTTTCCCTACACGACGCTCTTCCGATCT", "").upper()


def _sample_label(row):
    """Join a row's location, sample and trimmed barcode with underscores."""
    return "{}_{}_{}".format(row['location'], row['sample'], row['bc'])


bcs = ["I1/barcodes", "I3/barcodes"]
for b in bcs:
    d = pd.read_csv(b, sep="\t")
    d['bc'] = d.barcode2.apply(_trim_barcode)
    # Row-wise apply: the label needs three columns of the same row.
    d['sample_name'] = d.apply(_sample_label, axis=1)
    # Every sample is tagged with the same enzyme name.
    d['enz'] = "EcoRI"
    # The converted table is written beside the table it was derived from.
    out = os.path.join(os.path.dirname(b), "barcodes_gbsx.txt")
    d[['sample_name', 'bc', 'enz']].to_csv(out, header=False, index=False, sep="\t")

In [0]:
# Echo the working-directory path.
file_dir

In [0]:
# Collect every *.fastq file (symlinks included) below the current directory as
# "./sub/dir/name.fastq" paths. A literal suffix test replaces the shell
# pipeline `find . | grep '.fastq$'`, whose unescaped "." matches any character
# (so e.g. "reads_xfastq" would also be selected). Sorting makes the order
# reproducible between runs.
proc_files = sorted(
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(".")
    for name in names
    if name.endswith(".fastq")
)

In [0]:
# Turn every discovered path into an absolute one, then echo the list.
proc_files = list(map(os.path.abspath, proc_files))
proc_files

In [0]:
def build_gbsx_cmd(fastq, bc, enz):
    """Build the GBSX Demultiplexer command line for one FASTQ file.

    Parameters
    ----------
    fastq : str
        Path substituted after ``-f1``.
    bc : str
        Path substituted after ``-i``.
    enz : str
        Path substituted after ``-ea``.

    Returns
    -------
    tuple of (str, str)
        The complete command string and the fixed label ``"gbsx"``.
    """
    cmd = "/home/cfriedline/g/src/jdk1.8.0_92/bin/java -jar /home/cfriedline/g/src/GBSX/GBSX_v1.2.jar --Demultiplexer"
    # Keep a space between "-gzip false" and "-rad true": without it the two
    # options fuse into the single token "false-rad" and -rad is never passed
    # as an option.
    return "{} -f1 {} -i {} -gzip false -rad true -mb 2 -me 1 -ea {}".format(cmd, fastq, bc, enz), "gbsx"

def write_qsub(workdir, cmd, label, run, cmd_label):
    """Write an SGE job script named ``run_<cmd_label>.sh`` into ``workdir``.

    The script carries ``#$`` directives (job name ``<cmd_label><label>``,
    stderr/stdout files ``<cmd_label>_<run>_<label>.err`` / ``.out``), changes
    into ``workdir`` and then runs ``cmd``. An existing script of the same
    name is overwritten. Returns ``None``.
    """
    script_path = os.path.join(workdir, "run_{}.sh".format(cmd_label))
    # Positional fields: 0=label, 1=workdir, 2=cmd, 3=run, 4=cmd_label.
    template = (
        "#!/bin/bash\n"
        "#$ -N {4}{0}\n"
        "#$ -cwd\n"
        "#$ -V\n"
        "#$ -S /bin/bash\n"
        "#$ -e {4}_{3}_{0}.err\n"
        "#$ -o {4}_{3}_{0}.out\n"
        "cd {1}\n"
        "{2}\n"
    )
    script_text = template.format(label, workdir, cmd, run, cmd_label)
    with open(script_path, "w") as handle:
        handle.write(script_text)

# Write one GBSX job script next to every FASTQ file found above. The lane
# directory that holds the FASTQ is both the job's working directory and,
# through its base name (e.g. "I1"), the run tag in the log-file names.
for s in proc_files:
    workdir = os.path.dirname(s)
    run = os.path.basename(workdir)
    label = 1
    # Per-lane barcode table written by the barcode-conversion cell.
    bc_file = os.path.join(workdir, "barcodes_gbsx.txt")
    # NOTE(review): no visible cell creates ecori.txt; it presumably has to be
    # placed in file_dir by hand before the jobs run. Confirm.
    enz_file = os.path.join(file_dir, "ecori.txt")
    gbsx_cmd, gbsx_label = build_gbsx_cmd(s, bc_file, enz_file)
    write_qsub(workdir, gbsx_cmd, label, run, gbsx_label)

In [0]:
# Make sure the working directory is file_dir before the remaining cells run.
os.chdir(file_dir)

In [0]:
# Show the current working directory. os.getcwd() replaces the bare `pwd`
# automagic so the cell does not depend on IPython's automagic setting or on
# `pwd` not being shadowed by a Python variable.
os.getcwd()

### Submit jobs to SGE
```
cd /gpfs_fs/home/eckertlab/gypsy_indiv/raw_demult_gbsx
find . -name "run_gbsx.sh" | xargs chmod +x
find . -name "run_gbsx.sh" -exec qsub {} \;
```

In [0]:
# Locations of the per-sample read counts: the original demultiplexing run
# ("orig") and the GBSX statistics files for the two lane directories.
counts = {"orig": "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/sample_counts.txt",
         "gbsx_I1": "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult_gbsx/I1/gbsDemultiplex.stats",
         "gbsx_I3": "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult_gbsx/I3/gbsDemultiplex.stats"}


def _read_gbsx_stats(stats_path, lane):
    """Read one gbsDemultiplex.stats table and normalise its row labels.

    Labels starting with "undeter" are prefixed with ``<lane>_`` so they stay
    distinct after the two lanes are concatenated; every other label loses its
    last character.
    NOTE(review): presumably that trailing character does not occur in the
    original sample IDs. Confirm against the stats files.
    """
    stats = pd.read_csv(stats_path, sep="\t", index_col=0)
    stats.index = [lane + "_" + x if x.startswith("undeter") else x[:-1] for x in stats.index]
    return stats


# Original counts file: two tab-separated columns without a header row.
orig = pd.read_csv(counts["orig"], sep="\t", header=None, names=['sample', 'count'])

i1 = _read_gbsx_stats(counts["gbsx_I1"], "I1")
i3 = _read_gbsx_stats(counts["gbsx_I3"], "I3")

# Stack both lanes into one table ordered by row label.
gbsx = pd.concat([i1, i3]).sort_index()

In [0]:
# Derive a sample ID from each path in `sample`: the file's base name up to its
# first "." is stored in a new column and also used as the row index, so `orig`
# can be joined to the GBSX table by index.
sample_ids = orig['sample'].apply(lambda path: os.path.basename(path).split(".")[0])
orig["sampleID"] = sample_ids
orig.index = orig["sampleID"]

In [0]:
# Left-join the original per-sample counts onto the GBSX statistics by index;
# GBSX rows with no match in `orig` get NaN in the columns that come from it.
joined = gbsx.join(orig, how="left")
joined = joined.sort_index()

In [0]:
# Keep only the two read-count columns and give them short names:
#   'total.count' (from the GBSX stats tables)          -> 'gbsx'
#   'count'       (from the original sample_counts.txt) -> 'cjf'
counts_df = pd.DataFrame(joined[['total.count', 'count']])
counts_df.columns = ['gbsx', 'cjf']
# Name the index so the written table gets a header for its first column.
counts_df.index.name = "sampleID"

In [0]:
def get_perc_diff(row):
    """Return the GBSX count's percent difference from the original count.

    ``row`` is one row of ``counts_df``: ``row.name`` is the sample ID and the
    row has ``gbsx`` and ``cjf`` entries. Rows whose ID contains "undet" have
    no counterpart to compare against and yield ``None``; every other row
    yields ``(gbsx - cjf) * 100 / cjf``.
    """
    if "undet" in row.name:
        return None
    difference = row.gbsx - row.cjf
    return (difference * 100) / row.cjf

# Evaluate the percent difference row by row (axis=1) and store it as a column.
counts_df["perc_diff"] = counts_df.apply(get_perc_diff, axis=1)

In [0]:
# Save the comparison as a tab-separated table; the relative path resolves
# against the current working directory.
counts_df.to_csv("gbsx_vs_orig.txt", sep="\t")

In [0]:
# Display the GBSX statistics table for the I1 lane directory.
i1