In [2]:
"""
Module for working with scATAC-seq data.

Defines ``makeSimulatedCell``, which samples reads from a bulk BAM/SAM file
to build simulated single-cell BAM files.
"""
import os
import numpy
import pandas as pd
import pysam

# Show the contents of the current working directory.
os.listdir(".")

['rik_kernel-ffc70f98-573e-480e-8220-a36b7e7bb0a0.json',
 'rik_kernel-a7aac536-249f-45df-a8a9-60d6e10deea1.json',
 'rik_kernel-90464393-b627-4c00-aef2-3622c95fb9cb.json',
 'Median_Unique_Fragments_vs_Mean_Reads_P11.tsv',
 'rik_kernel-dfe84e2c-a18c-4b64-9d33-031d0eafa00a.json',
 'rik_kernel-05fbc854-db60-4158-9eaa-5000d87e5a29.json',
 'rik_kernel-e70268af-81fc-476d-99f4-963ee1222dd0.json',
 'rik_kernel-bb0d892c-a418-4582-a4be-8d1ef34af002.json',
 'rik_kernel-14edb403-475e-4db4-b0ea-05f8ce6e9ae6.json',
 'rik_kernel-3eae174b-acd7-4008-ba33-0953973170ff.json',
 'rik_kernel-c7941a21-d8da-45cf-9c2d-e525e72b1825.json',
 'rik_kernel-dc9ed0f5-67de-4a64-b0ad-b136c02baa89.json',
 'rik_kernel-310f0072-246f-40ad-9214-7518f40122a5.json',
 'rik_kernel-8e849afc-7e45-4d72-a925-1cf8492346d1.json',
 'rik_kernel-2e56e435-040b-4b05-97ea-21c05a35956d.json',
 'rik_kernel-7c188cf3-254e-4d1a-ae12-b171bc555844.json',
 'rik_kernel-19a17257-29ed-4bdf-879e-ea4488f13e32.json',
 'rik_kernel-69b50ede-5421-448e-9e18-9

In [3]:
def makeSimulatedCell(numReads, bulkFile, name, directory, debug=False):
    """
    Create one or more simulated cells by subsampling a bulk BAM/SAM file.

    For every entry in ``name`` a random sample (without replacement) of
    reads is drawn from the bulk file, written to a BAM file, coordinate
    sorted with ``samtools sort`` and indexed. The unsorted intermediate BAM
    is deleted once sorting has finished. The RNG seed is the concatenation
    of the number of reads and the name of the cell, so calling the function
    again with the same parameters reproduces the same sample.

    Parameters
    ----------
    numReads : int or sequence of int
        Number of reads to draw for each simulated cell. ``numReads[i]`` is
        used for ``name[i]``; entries beyond ``len(name)`` are ignored.
        A single integer (Python or numpy) is treated as a one-element list.
    bulkFile : string
        Path to the BAM/SAM file of the bulk sample to draw reads from.
    name : string or list of strings
        Name(s) of the simulated cell(s). Each output automatically receives
        the ".sorted.bam" suffix and an index is generated for it.
    directory : string
        Directory to put the simulated cell BAM files in.
    debug : bool, optional
        Currently unused; kept so existing callers keep working.

    Raises
    ------
    IndexError
        If ``numReads`` has fewer entries than ``name``.
    TypeError
        If an entry of ``numReads`` is not an integer.
    ValueError
        If more reads are requested than the bulk file contains.
    """
    import numbers
    import operator
    import random

    # A scalar count / single name is shorthand for a one-element list.
    # numbers.Integral also matches numpy integer scalars, which an exact
    # ``type(x) == int`` comparison would not.
    if isinstance(numReads, numbers.Integral):
        numReads = [numReads]

    if isinstance(name, str):
        name = [name]

    # The context manager guarantees the bulk file handle is closed, even
    # when sampling, sorting or indexing raises.
    with pysam.AlignmentFile(bulkFile, "rb") as bulk:
        # random.sample needs a sequence, so all reads are held in memory.
        bulkReads = list(bulk)

        for num, sample in enumerate(name):
            # Normalise to a plain int (rejects floats with TypeError); the
            # string form, and therefore the seed, is unchanged.
            readCount = operator.index(numReads[num])

            unsortedPath = os.path.join(directory, sample + ".bam")
            sortedPath = os.path.join(directory, sample + ".sorted.bam")

            # A private generator seeded exactly like random.seed(...) would
            # be: same sample, but the global RNG state is left untouched.
            rng = random.Random(str(readCount) + sample)

            with pysam.AlignmentFile(unsortedPath, "wb",
                                     header=bulk.header) as simCell:
                for read in rng.sample(bulkReads, readCount):
                    simCell.write(read)

            # '-T' and its value must be separate arguments; passed as the
            # single string '-T tmp' the prefix becomes " tmp" (with a
            # leading space) in the current directory for every cell.
            pysam.sort(unsortedPath,
                       '-T', os.path.join(directory, sample + ".tmp"),
                       '-o', sortedPath,
                       catch_stdout=False)
            os.remove(unsortedPath)
            pysam.index(sortedPath, catch_stdout=False)

In [None]:
# File names of the bulk BAM files that simulated cells are sampled from.
# They are bare file names, so they are resolved relative to the working
# directory when opened.

# "DFB__" files
GMR_GAL4_UAS_Sp1 = "DFB__04e55c__OmniATAC_GMR-Gal4_uas-sp1_L3_EAD_S92_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_ttk88 = "DFB__4babeb__OmniATAC_GMR-Gal4_uasttk88_L3_EAD_S89_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_psq = "DFB__7cdabb__OmniATAC_GMR-Gal4_uaspsq_L3_EAD_S90_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_lov = "DFB__a510e9__OmniATAC_GMR-Gal4_uaslov_L3_EAD_S97_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_fru9280 = "DFB__b9bbec__OmniATAC_GMR-Gal4_uasfru9280_L3_EAD_S95_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_l3neo38 = "DFB__bbe8fb__OmniATAC_GMR-Gal4_uasl_3_neo38_L3_EAD_S91_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_fru2366 = "DFB__ecbc98__OmniATAC_GMR-Gal4_uas2366_L3_EAD_S96_R1_001_q4_sorted.bam"

# "DGRP_" file
DG2_WT = "DGRP_55026_q4_sorted.bam"

# "EAD__" files
GMR_GAL4_UAS_nerfinHA = "EAD__17b679__OmniATAC_GMR_Gal4_cross_UAS_nerfin_HA_L3_EAD_S7_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_pros = "EAD__7431f0__OmniATAC_6_GMR-Gal4_uas-pros_L_eye_disc_S2_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_ttk69 = "EAD__acd73b__OmniATAC_9_GMR-Gal4_uas-ttk_69_eye_disc_S3_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_lolaT = "EAD__c1c9f1__OmniATAC_11_GMR-Gal4_uas-lola_T_eye_disc_S4_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_lola_L = "EAD__dbf214__OmniATAC_4_GMR-Gal4_uas-lola_L_eye_disc_S1_R1_001_q4_sorted.bam"
GMR_GAL4_UAS_lz = "EAD__e7d69d__OmniATAC_GMR_Gal4_cross_UAS_lz_L3_EAD_S4_R1_001_q4_sorted.bam"
GMR_GAL4_nerfin_CE = "EAD__fa6722__OmniATAC_GMR_Gal4_cross_UAS_nerfin_CE_L3_EAD_S6_R1_001_q4_sorted.bam"

In [None]:
# Generate the bootstrapped (simulated) cells: for every bulk sample, draw
# CELLS_PER_SAMPLE cells of READS_PER_CELL reads each and write them as
# "<prefix>_<i>.sorted.bam" (plus index) into `directory`.

directory = '/Figure_6/Bootstrapped_BAM'

CELLS_PER_SAMPLE = 50
# Every cell gets exactly this many reads. The earlier
# numpy.random.randint(20000, high=20001, ...) call could only ever return
# 20000 because `high` is exclusive, so a plain constant is equivalent.
READS_PER_CELL = 20000

# (cell-name prefix, bulk BAM file) pairs, in the order they are processed.
bulkSamples = [
    ("GMR_GAL4_UAS_Sp1", GMR_GAL4_UAS_Sp1),
    ("GMR_GAL4_UAS_ttk88", GMR_GAL4_UAS_ttk88),
    ("GMR_GAL4_UAS_psq", GMR_GAL4_UAS_psq),
    ("GMR_GAL4_UAS_lov", GMR_GAL4_UAS_lov),
    ("GMR_GAL4_UAS_fru9280", GMR_GAL4_UAS_fru9280),
    ("GMR_GAL4_UAS_l3neo38", GMR_GAL4_UAS_l3neo38),
    ("GMR_GAL4_UAS_fru2366", GMR_GAL4_UAS_fru2366),
    ("DG2_WT", DG2_WT),
    ("GMR_GAL4_UAS_nerfinHA", GMR_GAL4_UAS_nerfinHA),
    ("GMR_GAL4_UAS_pros", GMR_GAL4_UAS_pros),
    ("GMR_GAL4_UAS_ttk69", GMR_GAL4_UAS_ttk69),
    ("GMR_GAL4_UAS_lolaT", GMR_GAL4_UAS_lolaT),
    ("GMR_GAL4_nerfin_CE", GMR_GAL4_nerfin_CE),
    ("GMR_GAL4_UAS_lola_L", GMR_GAL4_UAS_lola_L),
    ("GMR_GAL4_UAS_lz", GMR_GAL4_UAS_lz),
]

for prefix, bulkFile in bulkSamples:
    # Cell names are 1-based: <prefix>_1 ... <prefix>_50.
    name = [prefix + "_" + str(i) for i in range(1, CELLS_PER_SAMPLE + 1)]
    # One read count per cell name, so the two lists always line up.
    numReads = [READS_PER_CELL] * len(name)
    makeSimulatedCell(numReads, bulkFile, name, directory, debug=False)