In [2]:
import pandas as pd
import csv, warnings, os, sys, hashlib, subprocess
from zipfile import ZipFile
from io import StringIO 

warnings.filterwarnings("ignore")

In [49]:
def get_inputs():
    username = input("Enter the username provided by WalkUp:")
    password = input("Enter the password provided by WalkUp:")
    url = input("Enter the URL provided by WalkUp:")
    outputname = input("Enter the name of the output folder to save demultiplexed files in:")
    outputpath = input("Path to output folder to save demultiplexed files in (Do not include the name of the folder itself)'):")
    return username, password, url, outputname, outputpath

def download_locally():
    username, password, url, outputname, outputpath = get_inputs()
    dir_cmd = 'mkdir '+outputpath+'/'+outputname
    print('Creating directory...')
    os.system(dir_cmd)
    cmd = 'wget --tries=10 --continue --mirror --user '+ username +' --password '+ password +\
    ' --no-check-certificate '+ url +' -P '+outputpath+'/'+outputname
    try:
        sys.path.append('/usr/local/bin/')
        print('Downloading files...')
        os.system(cmd)
        print('Files downloaded...')
    except:
        sys.exit()
    return outputname, outputpath, username

def upload_Gdrive(in_path, outputname, username):
    out_path = "R&D/Sequencing_Backup/"
    year_folder = input('Please enter the year of sequencing:')
    out_path = out_path+year_folder+'/'
    cmd = 'rclone copy \"'+in_path+outputname+'\" remote:\"'+out_path+outputname+'\"'
    try:
        print('Backing up files...')
        os.system(cmd)
        val = check_md5(in_path,outputname,'/Volumes/GoogleDrive/Shared drives/GPP Cloud /'+out_path,username)
        print('Files moved and MD5 checksums verified')
        #Removing local copy
        cmd = 'rm -r \"'+in_path+outputname+'\"'
        os.system(cmd)
    except:
        sys.exit('Error transferring files')
    return

def check_md5(in_path, foldername, out_path, username):
    in_folder = in_path+'/'+foldername+'/get.broadinstitute.org/pkgs/'+username
    out_folder = out_path+'/'+foldername+'/get.broadinstitute.org/pkgs/'+username
    in_files = os.listdir(in_folder)
    in_files = [x for x in in_files if 'fastq' in x]
    print(in_files)
    #out_files = os.listdir(out_folder)
    for f in in_files:
        print(f)
        #out_file = [x for x in out_files if x==f][0]
        try:
            assert(hashlib.md5(open(in_folder+'/'+f,'rb').read()).hexdigest() == hashlib.md5(open(out_folder+'/'+f,'rb').read()).hexdigest())
        except AssertionError:
            print('MD5 checksums do not match;'+f)
    return 

def run_dplex():
    #dplex_path = '/Volumes/GoogleDrive/Shared drives/GPP Cloud /Informatics/Tools/'
    dplex_path = '/Volumes/rnai_software/bin/dplex-0.1.0'
    cond_file = input('Please enter the path to the .csv conditions file:')
    inputfolder = input('Please enter path to the inputfolder:')
    username = input('Please enter the username:')
    outputfolder = inputfolder+'/demultiplexed/'
    print(outputfolder)
    #cmd = 'mkdir '+outputfolder
    #os.system(cmd)
    file_folder = inputfolder+'/get.broadinstitute.org/pkgs/'+username+'/'
    print(file_folder)
    files = os.listdir(file_folder)
    barcode_file = [x for x in files if 'barcode' in x][0]
    print(barcode_file)
    construct_file = [x for x in files if ('fastq' in x) and ('barcode' not in x)][0]
    print(construct_file)
    cmd = 'java -jar \"'+dplex_path+'\" -c '+cond_file+' -1 \"'+file_folder+'/'+barcode_file+'\" -2 \"'+file_folder+'/'+construct_file+'\" -z -o \"'+outputfolder+'\"'
    #os.system(cmd)
    subprocess.run([cmd],capture_output=True)
    return

def run_fastqc():
    fastqc_in_folder = input('Please enter the path to the folder with files for FASTQC:')
    fastqc_out_folder = input('Please enter the name of the output folder:')
    year_folder = input('Please enter year of sequencing:')
    check_year = os.listdir("/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/FASTQC_outputs/")
    if year_folder not in check_year:
        os.system('mkdir "/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/FASTQC_outputs/"'+year_folder)
    fastqc_in_files = os.listdir(fastqc_in_folder)
    fastqc_in_files = [x for x in fastqc_in_files if ('fastq' in x)&('barcode' not in x)]
    print(fastqc_in_files)
    for f in fastqc_in_files:
        cmd = 'mkdir "/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/FASTQC_outputs/'+year_folder+'/'+fastqc_out_folder+'"'
        os.system(cmd)
        cmd = 'fastqc "'+fastqc_in_folder+'/'+f+'" -o "/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/FASTQC_outputs/'+year_folder+'/'+fastqc_out_folder+'"'
        os.system(cmd)
    return fastqc_out_folder

def agg_fastqc(fastqc_out_folder):
    outputfile = fastqc_out_folder+'/quality_summary.txt'
    fastqc_outputs = [x for x in os.listdir(fastqc_out_folder) if 'fastqc.zip' in x]
    with open(outputfile, 'w') as o:
        w = csv.writer(o, delimiter='\t')
        colnames = ['Sample','Average read quality','Median read quality']
        i=0
        for fo in fastqc_outputs:
            z = ZipFile(fastqc_out_folder+'/'+fo,'r')
            for f in z.namelist():
                if 'fastqc_data.txt' in f:
                    df = z.read(f).decode("utf-8")
                    for d in df.split('>>'):
                        if d.startswith('Per base sequence quality'):
                            StringData = StringIO(d)
                            df = pd.read_csv(StringData, sep ="\t",header=1)
                            sub_df = df[['#Base','Mean']].set_index('#Base').T
                            avg = df.Mean.mean()
                            med = df.Median.median()
                            row = [f.split('/')[0],avg,med]
                            row.extend(sub_df.loc['Mean',:].to_list())
                            if i == 0:
                                colnames.extend(sub_df.columns.to_list())
                                i+=1
                                w.writerow(colnames)
                            w.writerow(row)
                        else:
                            continue
        

This notebook will help set up input files for running CRISPResso. If you are running this notebook for the first time,
before running this notebook, please make sure you:
<ol>
    <li>Install Homebrew on your system by pasting the following command on your terminal:
    <b>/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"</b></li>
    <li>Once Homebrew is installed, install wget by typing <b>brew install wget</b> on the terminal. wget helps download files from the WalkUp sequencing server.</li>
</ol>

## Downloading sequencing files

We will first download files on to our machine. 

In [5]:
outputname, outputpath, username = download_locally()

Enter the username provided by WalkUp:SN0214918
Enter the password provided by WalkUp:ziy9YStJGOc3
Enter the URL provided by WalkUp:https://get.broadinstitute.org/pkgs/SN0214918/
Enter the name of the output folder to save demultiplexed files inZMS_BCV_20210205
Path to output folder to save demultiplexed files in (Do not include the name of the folder itself)')/Users/mhegde/Desktop/test_download/
Creating directory...
Downloading files...
Files downloaded...


## Backing up on GDrive

In [6]:
upload_Gdrive(outputpath, outputname, username)





Backing up files...
['1_JF3CL.1.barcode_1.fastq.gz', '1_JF3CL.1.1.fastq.gz']
1_JF3CL.1.barcode_1.fastq.gz
1_JF3CL.1.1.fastq.gz
Files moved and MD5 checksums verified


## Demultiplexing

If the sequencing files have not been demultiplexed, run this.

In [43]:
run_dplex()

Please enter the path to the .csv conditions file:/Users/mhegde/Desktop/2021-02-09_CTNNB1_Conditions_File.csv
Please enter path to the inputfolder:/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/Sequencing_Backup/2021/ZMS_BCV_20210205
Please enter the username:SN0214918
/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/Sequencing_Backup/2021/ZMS_BCV_20210205/demultiplexed/
/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/Sequencing_Backup/2021/ZMS_BCV_20210205/get.broadinstitute.org/pkgs/SN0214918/
1_JF3CL.1.barcode_1.fastq.gz
1_JF3CL.1.1.fastq.gz


FileNotFoundError: [Errno 2] No such file or directory: 'java -jar "/Volumes/rnai_software/bin/dplex-0.1.0" -c /Users/mhegde/Desktop/2021-02-09_CTNNB1_Conditions_File.csv -1 "/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/Sequencing_Backup/2021/ZMS_BCV_20210205/get.broadinstitute.org/pkgs/SN0214918//1_JF3CL.1.barcode_1.fastq.gz" -2 "/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/Sequencing_Backup/2021/ZMS_BCV_20210205/get.broadinstitute.org/pkgs/SN0214918//1_JF3CL.1.1.fastq.gz" -z -o "/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/Sequencing_Backup/2021/ZMS_BCV_20210205/demultiplexed/"'

## Running FASTQC

In [50]:
run_fastqc()

Please enter the path to the folder with files for FASTQC:/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/Sequencing_Backup/2021/ZMS_BCV_20210205/demultiplexed/
Please enter the name of the output folder:ZMS_BCV_20210205
Please enter year of sequencing:2021
['HT29_RDA414_B_14_Dropout.construct.fastq.gz', 'HT29_RDA409_A_14_Dropout.construct.fastq.gz', 'HT29_RDA410_B_14_Dropout.construct.fastq.gz', 'HT29_RDA409_B_14_Dropout.construct.fastq.gz', 'HT29_RDA414_A_14_Dropout.construct.fastq.gz', 'HT29_RDA414_B_7_Dropout.construct.fastq.gz', 'HT29_RDA415_B_14_Dropout.construct.fastq.gz', 'HT29_RDA414_A_7_Dropout.construct.fastq.gz', 'HT29_RDA415_A_14_Dropout.construct.fastq.gz', 'HT29_RDA410_A_14_Dropout.construct.fastq.gz']


'ZMS_BCV_20210205'

In [51]:
agg_fastqc('/Volumes/GoogleDrive/Shared drives/GPP Cloud /R&D/FASTQC_outputs/2021/ZMS_BCV_20210205')