In [11]:
#!/usr/bin/env python

import os, sys, time, shutil, re
import datetime
from datetime import date
from time import time as _time
import glob
import smtplib, ssl
from smtplib import SMTP_SSL as smtp
from pathlib import Path
import csv
import pandas as pd

print('Number of args: ', len(sys.argv))
print('Arg list: ', str(sys.argv))

NPROCS=16
bcl_human_template=os.path.expanduser('~') + "/seq_proc_dev/bcl_human_template.sh"
print(bcl_human_template)

#scan_dir will normally be provided by cron job scanning this location
scan_dir = '/Users/jdereus/Desktop/seq_proc_dev/'

data_path = scan_dir

# RTAComplete.txt is final file written by sequencers
check_file = "RTAComplete.txt"

def get_new_directories(scan_dir):
    """Get any new sequencing raw data locations"""

    new_dirs = []
    today = date.today()
    print(today)
    print(scan_dir)

    ### allow for 1 day to scan back in time
    os.chdir(scan_dir)
    print("current working dir " + os.getcwd())
    for root, dirs, files in os.walk(scan_dir):
        for name in dirs:
            filedate = date.fromtimestamp(os.path.getmtime(os.path.join(root,name)))
            if (today - filedate).days < 1:
                new_dirs.append(name)
        break

    #print(new_dirs)

    for dir in new_dirs:
        timestamp_str = time.strftime ( '%m/%d/%Y :: %H:%M:%S',
                                       time.gmtime(os.path.getmtime(scan_dir + '/' + dir)))
        print(timestamp_str, ' -->', (scan_dir + '/' + dir))

    #print("found " + str(len(new_dirs)) + " new directories")

    for dir in (new_dirs):
        #print(dir)
        if os.path.isfile(os.path.join(scan_dir, dir, "RTAComplete.txt")):
            print(dir + " exists")

    return new_dirs

def prep_data_location(run_directory, scan_dir):
    data_path = os.path.join(scan_dir,run_directory)
    mk_path = os.path.join(data_path, "Data/Fastq")

    print("new data path = " + data_path)
    if (os.path.isfile(os.path.join(data_path, "RTAComplete.txt"))) and (os.path.isfile(os.path.join(data_path, "*.csv"))):
        ("###continue")
    elif (os.path.isfile(os.path.join(data_path, "RTAComplete.txt"))) and (not os.path.isfile(os.path.join(data_path, "*.csv"))):
        print("RTAComplete but no sample sheets")
        smtp = smtplib.SMTP()
        ###smtp.connect("localhost", 25)
        ###smtp.sendmail(from_addr="test_email@noreply.com", to_addrs="jdereus@gmail.com", msg="something is wrong")

    if os.path.isfile(os.path.join(data_path, "alockfile")):
        print("data is processing for " + data_path)
    elif os.path.isfile(os.path.join(data_path, "processed")):
        print("data processing is complete " + data_path)
    elif (not os.path.exists(os.path.join(data_path, "alockfile"))) or (not os.path.exists(os.path.join(data_path, "processed"))):
        try:
            Path(os.path.join(data_path, "alockfile")).touch()
        except not FileExistsError:
            print("unable to get lockfile")

    if not os.path.isfile(os.path.join(data_path, "alockfile")):
        try:
                ### make data output location for bcl conversion
            ###mk_path = os.path.join(data_path, "Data/Fastq")
            os.makedirs(mk_path, mode=750, exist_ok=True)
        except OSError as error:
            print(error)

    return mk_path

def parse_csv(directory):
    print("inside test_csv ")
    os.chdir(os.path.join(scan_dir, directory))
    for csvfile in glob.glob('*.{}'.format("csv")):
        print(csvfile)
        sections = [lines.index(line) for line in csvfile if "[" in line]
        print(line)
        print(sections)
        for i in range(len(sections)):
            header = lines[sections[i]]
            df = pd.DataFrame(lines[sections[i]+1:sections[i+1]],
                              columns=header)
            print("head items " + df.head())



def process_data(directory, data_path, mk_path):
    contact_df = pd.DataFrame({'':[]})
    read_df = pd.DataFrame({'':[]})
    bioinfo_df = pd.DataFrame({'':[]})
    #info_dict = []
    csvfile=''
    base_mask=''
    bclconvert_template=''
    dependent_job=False
    experiment_name=''

    extension = 'csv'
    print("directory = ", directory)
    print("dir=",directory)
    os.chdir(os.path.join(scan_dir, directory))
    print("some text " + directory)

    samplesheet_copies = "orig_sample_sheets"

    data_path = os.path.join(data_path, directory)

    print("data path = ", data_path)
    ### create directory for original sample sheets
    sample_sheet_storage=os.path.join(data_path, samplesheet_copies)
    if not os.path.exists(sample_sheet_storage):
        os.mkdir(sample_sheet_storage)

    for csvfile in glob.glob('*.{}'.format(extension)):
        N_count=0
        job_index_val="I12"

        #with open(csvfile, 'r') as csvfile:
        print("sample sheets found : " + csvfile)
        df = pd.read_csv(csvfile, header=None)#,
                        #na_values=['NaN'])

        ### copy orig sample sheet to separate directory.  this does NOT create numerical increments of file copies
        ### one copy only of latest file
        sample_sheet = os.path.join(data_path, csvfile)
        if not os.path.isfile(os.path.join(sample_sheet_storage, csvfile+".bak")):
            shutil.copyfile(sample_sheet, os.path.join(sample_sheet_storage, csvfile+".bak"))
            #df.head()

        # drop all lines with no data (all "NaN")
        ### effectively compresses sample sheet to no blank lines.
        df = df.dropna(how='all')

        ### search for Bioinformatics does not work unless we replace NaN with .
        ### TODO: figure this out
        newdf = df.fillna(".")

        info_df = pd.DataFrame()

        ### initialize dictionary for sample sheet info
        ### experiment, chemistry, etc.
        info_dict = {}

        ### initialize contact_index in case no contact information on sample sheet.
        contact_index=int()
        ### intialize bio_index in case no bioinformatics information
        bio_index=int()

        for index in range(len(newdf)):
            if 'Bioinformatics' in newdf.iloc[index,0]: ##,0]:
                bio_index = index
                #### bio_index=i ###newdf.iloc[i,0]
                print("bio_index=", index)
            if '[Contact' in newdf.iloc[index,0]: ##,0]:
                print("contact_index=", index)
                ### remove [Contact] label.  reference maillist by index == 0
                contact_index = index+1 #newdf.iloc[i,0]
            else:
                contact_index=len(newdf)
            if '[Reads]' in newdf.iloc[index,0]:
                read_index_1 = index
                read_index_2 = index+3

            ### assemble dictionary for csv variables
            if 'Experiment' in newdf.iloc[index,0]:
                info_df = info_df.append(newdf.iloc[index])
                experiment_name = newdf.iloc[index,1]
                info_dict["ExperimentName"] = experiment_name
                print("experiment name =", experiment_name)
            if 'Assay' in newdf.iloc[index,0]:
                info_df = info_df.append(newdf.iloc[index])
                assay_type = newdf.iloc[index,1]
                info_dict['Assay'] = assay_type
            if 'Chemistry' in newdf.iloc[index,0]:
                info_df = info_df.append(newdf.iloc[index])
                chemistry_type = newdf.iloc[index,1]
                info_dict['Chemistry'] = chemistry_type
            if 'ReverseComplement' in newdf.iloc[index,0]:
                info_df = info_df.append(newdf.iloc[index])
                reverse_comp = newdf.iloc[index,1]
                info_dict['ReverseComplement'] = reverse_comp

            #print(info_df.T)
        for key,value in info_dict.items():
            print(key,value)

        if info_dict["Chemistry"] == "Amplicon":
            print("Amplicon chemistry true. Removing false barcodes")
        #else: print("False")
            NNN_list=["NNNNNNNN", "NNNNNNNNNNNN"]
            for line in csvfile:
                for entry in NNN_list:
                    if entry in line:
                        print(N_count=len(entry))

            if N_count == "12":
                job_index_val = "I12"
            elif N_count == "8":
                job_index_val="I8"
            elif N_count == 0:
                job_index_val="I12"

            base_mask="--use-bases-mask Y150" + "," + job_index_val + "," + job_index_val +"," + "Y150"
            print(base_mask)

            ### check to see if both Read values are present at 150/151
            ### if both, direction==2
            ### else direction==1
            ### used for bases-mask in bcl2fastq
            ### pull from read_df or develop other method


            ### replace N strings if Amplicon Sample sheets with blank for processing.
            ### can be moved to where we parse chemistry variable.
            ### we have already copied original to backup location.
            ### create new sample sheet with same name as original, removing false barcodes
            csv_1 = open(csvfile, 'r')


            ### should check to see if NNNNNNNN is present but this is historical
            csv_1 = ''.join([i for i in csv_1]).replace("NNNNNNNNNNNN", "")
            csv_2 = open(csvfile, 'w')
            csv_2.writelines(csv_1)
            csv_2.close()

            #for line in csvfile:
            #    for entry in NNN_list:
            #        if entry in line:
            #            line = line.replace(entry,"")
            #            print(N_count=len(entry))

        for key,value in info_dict.items():
            print(key,value)


        bioinfo_df=pd.DataFrame(newdf.iloc[bio_index:contact_index-1])
        #bioinfo_df=bioinfo_df.dropna()
        #header = bioinfo_df.iloc[]
        #bioinfo_df.columns = header
        bioinfo_df

        print("contact_index == ", contact_index)

        contact_df=pd.DataFrame(newdf.iloc[contact_index:])
        if contact_df.empty:
            contact_df = pd.DataFrame({"some_email@gmail.com"})
        print(contact_df.empty)


        print("after empty df check")
        ### access read count by column index == 0
        read_df=pd.DataFrame(newdf.iloc[read_index_1:read_index_2])
        #read_df.columns = ["Reads"]
        #read_df = pd.DataFrame(read_df.reshape(3,-1))

        print(bioinfo_df)
        print(contact_df)

        print(read_df.T)
        #read_df = read_df.T
        read_df = read_df.dropna(how='all')
        print(read_df)

    ### job submission building
    ### mk_path contains fastq output location.

    if os.path.isdir(mk_path):
        print("directory exists", mk_path)

    sbatch_submit(mk_path, contact_df, read_df, bioinfo_df, info_dict, csvfile, base_mask, bclconvert_template, dependent_job, experiment_name)


def sbatch_submit(mk_path, contact_df, read_df, bioinfo_df, info_dict, csvfile, base_mask, bclconvert_template, dependent_job, experiment_name ):
    scl_cmd_proc = []
    """ need to work on job script output format for below write/append contact_index
    #!/bin/bash
    #SBATCH --job-name=RKL0072
    #SBATCH --ntasks=16
    #SBATCH --ntasks-per-node=16
    #SBATCH --export=ALL
    #SBATCH --time=72:00:00
    #SBATCH --mem-per-cpu=6G
    #SBATCH --output=_output_path/%x_%A_%a.out
    #SBATCH --error=output_path/%x_%A_%a.err
    #SBATCH --mail-type=e,TIME_LIMIT_80
    #SBATCH --mail-user=someemail@gmail.com
    source ~/.bash_profile
    ['sbatch ', '--parsable ', '--qos=seq_proc ', '--export=seqdir=<user_location>/seq_proc_dev,outputdir=,csvfile=metasample.csv,base_mask=--use-bases-mask Y150,I12,I12,Y150 ', '--job-name=RKL0072 ', ' ']
    """

    job_file = open(os.path.join(mk_path, experiment_name + "_sbatch.sh"), 'a')
    job_file.write("#!/bin/bash\n")
    job_file.write("#SBATCH --job-name=%s\n" % experiment_name)
    job_file.write("#SBATCH --ntasks=16\n")
    job_file.write("#SBATCH --ntasks-per-node=16\n")
    job_file.write("#SBATCH --export=ALL\n")
    job_file.write("#SBATCH --time=72:00:00\n")
    job_file.write("#SBATCH --mem-per-cpu=6G\n")
    job_file.write("#SBATCH --output=_output_path/%x_%A_%a.out\n")
    job_file.write("#SBATCH --error=output_path/%x_%A_%a.err\n")
    job_file.write("#SBATCH --mail-type=e,TIME_LIMIT_80\n")
    job_file.write("#SBATCH --mail-user=someemail@gmail.com\n")

    job_file.write("source ~/.bash_profile\n")
    scl_cmd_proc.append('sbatch ')

    if dependent_job:
        scl_cmd_proc.append("--dependency=afterok:%s " % dependency_job_id)

    scl_cmd_proc.append("--parsable ")
    scl_cmd_proc.append("--qos=seq_proc ")
    scl_cmd_proc.append("--export=seqdir=%s,outputdir=%s,csvfile=%s,base_mask=%s " % (data_path,mk_path,csvfile,base_mask))
    scl_cmd_proc.append("--job-name=%s " % experiment_name)
    scl_cmd_proc.append("%s " % bclconvert_template)

    job_file.write(str(scl_cmd_proc))

def human_filter():
    for dir in new_directories:
        print(dir)

def run_param():
    print("whatup")


new_directories = get_new_directories(scan_dir)
print("new dirs values " + str(new_directories))

### loop over all new unprocessed directories
###mk_path=str()

for directory in new_directories:
    prep_data_location(directory, scan_dir)
    process_data(directory, data_path, mk_path)
    ###parse_csv(directory)


Number of args:  3
Arg list:  ['/Users/jdereus/miniconda3/envs/seq_proc/lib/python3.9/site-packages/ipykernel_launcher.py', '-f', '/Users/jdereus/Library/Jupyter/runtime/kernel-d3b2badc-87bc-490f-8800-e3877905046c.json']
/Users/jdereus/seq_proc_dev/bcl_human_template.sh
2021-08-20
/Users/jdereus/Desktop/seq_proc_dev/
current working dir /Users/jdereus/Desktop/seq_proc_dev
08/20/2021 :: 16:10:35  --> /Users/jdereus/Desktop/seq_proc_dev//sequencing_test
sequencing_test exists
new dirs values ['sequencing_test']
new data path = /Users/jdereus/Desktop/seq_proc_dev/sequencing_test
RTAComplete but no sample sheets
data is processing for /Users/jdereus/Desktop/seq_proc_dev/sequencing_test
directory =  sequencing_test
dir= sequencing_test
some text sequencing_test
data path =  /Users/jdereus/Desktop/seq_proc_dev/sequencing_test
sample sheets found : amplicon_sample.csv
bio_index= 18
Assay TruSeq HT
Chemistry Amplicon
ReverseComplement 0
Amplicon chemistry true. Removing false barcodes
--use-ba