In [2]:
import io
from IPython.nbformat import current
def execute_notebook(nbfile):
    """Run every code cell of the notebook file ``nbfile`` in the current IPython shell.

    The notebook is parsed as plain JSON rather than through
    ``IPython.nbformat.current`` (deprecated in IPython 3 and removed in later
    releases), so both on-disk layouts are accepted:

    * nbformat <= 3: cells live in ``worksheets[0]["cells"]`` and code is
      stored under ``"input"``;
    * nbformat 4: cells live in the top-level ``"cells"`` list and code is
      stored under ``"source"``.

    Parameters
    ----------
    nbfile : str
        Path to the ``.ipynb`` file to execute.

    Notes
    -----
    Cells are executed with ``get_ipython().run_cell``, so names they define
    land in the calling notebook's interactive namespace. Non-code cells are
    skipped. Only the first worksheet of a v3 notebook is executed.
    """
    import json  # function-scope import keeps this cell self-contained

    # Notebook files are UTF-8 encoded JSON; do not depend on the locale default.
    with io.open(nbfile, encoding='utf-8') as f:
        nb = json.load(f)

    if 'worksheets' in nb:
        worksheets = nb['worksheets']
        cells = worksheets[0]['cells'] if worksheets else []
    else:
        cells = nb.get('cells', [])

    ip = get_ipython()
    for cell in cells:
        if cell.get('cell_type') != 'code':
            continue
        code = cell.get('input', cell.get('source', ''))
        # On disk, multi-line text is stored as a list of lines that already
        # carry their trailing newlines.
        if isinstance(code, (list, tuple)):
            code = ''.join(code)
        ip.run_cell(code)
# Run the project's shared notebooks so the names they define are available
# in this kernel before the cells below execute.
for shared_notebook in (
    "/cellar/users/ramarty/Projects/kir/KIR_development/bin/imports.ipynb",
    "/cellar/users/ramarty/Projects/kir/KIR_development/bin/samples.ipynb",
):
    execute_notebook(shared_notebook)

Populating the interactive namespace from numpy and matplotlib
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


## Getting the k-mer information for KhoeSan individuals

In [3]:
# Run identifiers of the KhoeSan samples to process; each one becomes one task
# of the cluster array job and gets its own output directory.
#
# NOTE(review): 'SRR1258307' used to be listed twice (first and last row).
# A repeated entry makes two array tasks work in the same output directory at
# the same time, so the repeat is dropped here.
# TODO confirm the second occurrence was not meant to be a different accession.
samples = [
    'SRR1171049', 'SRR1171058', 'SRR1175182', 'SRR1258307',
    'SRR1171017', 'SRR1171051', 'SRR1175179', 'SRR1175183', 'SRR1265483',
    'SRR1171020', 'SRR1171052', 'SRR1175180', 'SRR1175184', 'SRR1265484',
    'SRR1171021', 'SRR1171056', 'SRR1175181',
]

In [4]:
# One output directory per entry of `samples`, in the same order, so that
# position i of `out_dirs` belongs to position i of `samples`.
out_dirs = [
    '/nrnb/users/ramarty/kir/PING/KhoeSan/sra/{0}'.format(sample_id)
    for sample_id in samples
]

In [8]:
def create_cluster_script_gather(samples, out_dirs,
                                 script_path='/cellar/users/ramarty/Projects/kir/KIR_development/data_gathering/KhoeSan/exon_data_pull.sh'):
    """Write a csh Grid Engine array-job script that handles one sample per task.

    Each array task selects its sample and output directory through
    ``$SGE_TASK_ID`` and then, inside that directory:

    1. runs ``map_to_reference.py`` on ``full_exome.fastq`` against the
       ``all_alleles`` KIR reference, producing ``full_exome_kir.bam``;
    2. runs ``convert_to_fastq.py`` to turn that BAM into
       ``full_exome_kir.fastq``;
    3. runs ``component_collection.py`` twice (KIR reads, then whole exome),
       writing into ``features/``;
    4. removes the KIR fastq and the ``full_exome_sorted*`` / ``full_exome*bam``
       intermediates. ``full_exome.fastq`` itself is kept.

    Parameters
    ----------
    samples : sequence of str
        Sample identifiers, one per array task.
    out_dirs : sequence of str
        Output directory of each sample, in the same order as ``samples``.
        Each directory is expected to already contain ``full_exome.fastq``.
    script_path : str, optional
        Where to write the script. Defaults to the project's
        ``exon_data_pull.sh`` location.

    Raises
    ------
    ValueError
        If ``samples`` is empty (``#$ -t 1-0`` is not a valid task range) or
        if ``samples`` and ``out_dirs`` differ in length (some task would index
        past the end of ``$outs``).

    Notes
    -----
    The function only writes the file; it does not submit the job.
    The PING steps (``runPING_extractor.R`` / ``runPING_gc_caller.R``) that
    were present as disabled lines in earlier revisions are not emitted.
    """
    samples = list(samples)
    out_dirs = list(out_dirs)
    if not samples:
        raise ValueError("samples is empty: cannot build an array job with zero tasks")
    if len(samples) != len(out_dirs):
        raise ValueError(
            "samples and out_dirs must have the same length "
            "(got {0} and {1})".format(len(samples), len(out_dirs)))

    bin_dir = '/cellar/users/ramarty/Projects/kir/KIR_development/data_gathering/bin'
    log_dir = '/cellar/users/ramarty/Data/kir/sge-system_files'
    kir_reference = '/cellar/users/ramarty/Data/kir/ref/all_alleles'
    kmer_groups = '/cellar/users/ramarty/Data/kir/kmers/kmer_groups/component_and_four_mers.txt'

    lines = [
        # --- scheduler directives ---
        "#! /bin/csh",
        "#$ -V",
        "#$ -S /bin/csh",
        "#$ -o {0}".format(log_dir),
        "#$ -e {0}".format(log_dir),
        "#$ -cwd",
        "#$ -t 1-{0}".format(len(samples)),   # one task per sample
        "#$ -l h_vmem=30G",
        "#$ -tc 30",                          # at most 30 tasks at once
        "#$ -l long",

        # --- per-task lookup of sample / output directory ---
        "set samples=({0})".format(" ".join(samples)),
        "set outs=({0})".format(" ".join(out_dirs)),
        "",
        "set sample=$samples[$SGE_TASK_ID]",  # csh arrays are 1-based, like SGE_TASK_ID
        "set out=$outs[$SGE_TASK_ID]",
        "",

        "date",
        "hostname",

        # $out already exists (it holds full_exome.fastq), so a plain
        # `mkdir $out` always printed an error; -p creates whatever is
        # missing and stays quiet on reruns.
        "mkdir -p $out/features",

        # --- map to the KIR reference and pull the mapped reads back out ---
        "python {0}/map_to_reference.py $out/full_exome.fastq $out/full_exome_kir.bam {1} cellar".format(bin_dir, kir_reference),
        "echo Mapped to KIR.",
        "python {0}/convert_to_fastq.py $out/full_exome_kir.bam $out/full_exome_kir.fastq".format(bin_dir),
        "echo Stripped reads.",

        # --- collect components: KIR-mapped reads first, then the whole exome ---
        "python {0}/component_collection.py $out/full_exome_kir.fastq $out/features {1} components_four_kir".format(bin_dir, kmer_groups),
        "echo Components gathered - KIR.",
        "python {0}/component_collection.py $out/full_exome.fastq $out/features {1} components_four".format(bin_dir, kmer_groups),
        "echo Components gathered - whole exome.",

        # --- clean up intermediates (the input full_exome.fastq is kept) ---
        "rm $out/full_exome_kir.fastq",
        "rm $out/full_exome_sorted*",
        "rm $out/full_exome*bam",
        "date",
    ]

    with open(script_path, 'w') as out_file:
        out_file.write("\n".join(lines) + "\n")

In [9]:
# Write the array-job script for the samples and output directories defined
# above; this only creates the file, it does not submit anything.
create_cluster_script_gather(samples=samples, out_dirs=out_dirs)