In [1]:
import pandas as pd

%load_ext autoreload
%autoreload 2

# Base directory; every other path in this cell is derived from it.
folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms'
csv_folder, bed_folder = (
    '{}/csvs_for_paper'.format(folder),
    '{}/bed'.format(folder),
)
# folder2 = '/projects/ps-yeolab2/obotvinnik/singlecell_pnm'

# Splicing feature data sits under the CSV folder, with one sub-folder per
# exon class (alternative / constitutive).
splicing_feature_folder = '{}/splicing_feature_data'.format(csv_folder)
alternative_feature_folder, constitutive_feature_folder = (
    '{}/alternative'.format(splicing_feature_folder),
    '{}/constitutive'.format(splicing_feature_folder),
)

# Each exon-class folder holds an `exons.bed` file.
alt_exons_bedfile, constitutive_bedfile = (
    '{}/exons.bed'.format(feature_folder)
    for feature_folder in (alternative_feature_folder, constitutive_feature_folder)
)


# splicing_feature_data = pd.read_csv('{}/splicing_feature_data.csv'.format(csv_folder), index_col=0)
# print(splicing_feature_data.shape)
# splicing_feature_data.head()

  _get_xdg_config_dir())


In [2]:
import pybedtools

In [3]:
# Wrap the alternative and constitutive exon BED files in BedTool objects.
alt_exons, constitutive_exons = (
    pybedtools.BedTool(bedfile)
    for bedfile in (alt_exons_bedfile, constitutive_bedfile)
)

In [4]:
# Sanity check: show the length of each BedTool (alternative first, then constitutive).
print(len(alt_exons), len(constitutive_exons))

(43903, 5893)


## Calculate upstream nucleotide fraction

In [5]:
# Genome fasta later passed as `fasta=` to calculate_nucleotide_content.
hg19_fasta = '/projects/ps-yeolab/genomes/hg19/chromosomes/all.fa'

In [18]:
from Bio import SeqIO

def calculate_nucleotide_content(bed, window, nt, genome, fasta, nucleotide, context, intersect=None):
    """Measure the fraction of one nucleotide in successive windows flanking a bed file (e.g. exons)

    The flanking region of ``nt`` bases is walked in ``nt // window`` steps.
    Each step takes the strand-aware flank of the previous step, so the windows
    tile outwards from the original intervals.

    Parameters
    ----------
    bed : pybedtools.BedTool
        BedTool object of exons whose flanking nucleotide content you want to measure
    window : int
        Size of the windows to use
    nt : int
        Size of the flanking region. Only ``nt // window`` whole windows are used.
    genome : str
        Unique name of the genome to look up chromosome sizes, or the chromosome sizes themselves
    fasta : str
        Location of the genome sequences as a fasta file
    nucleotide : str
        Which nucleotide you're counting (a single character, matched case-sensitively)
    context : 'upstream' | 'downstream'
        Relative to the strand, whether you want the upstream or downstream sequences.
        Anything starting with 'up' is treated as upstream, anything starting with 'd'
        as downstream.
    intersect : pybedtools.BedTool or str, optional
        Another bed file to intersect each window with if you're only interested in a
        subset of regions (e.g. only conserved elements)

    Returns
    -------
    nucleotide_fraction : pandas.DataFrame
        One pair of columns per window, concatenated side by side: the interval
        names (from ``reset_index``) and the nucleotide count divided by ``window``.
        Column labels therefore repeat across windows.

    Raises
    ------
    ValueError
        If ``context`` is not recognised, or if ``nt`` is too small to hold a
        single window.
    """
    # Floor division keeps this an int on Python 3 as well as Python 2;
    # range() rejects the float that true division would produce.
    n_windows = nt // window
    if n_windows < 1:
        raise ValueError(
            'nt ({}) must be at least as large as window ({}) to form one window'.format(nt, window))

    if context.startswith('up'):
        # Upstream - go left (upstream) the window size, and right zero
        flank_kws = dict(l=window, r=0, s=True, genome=genome)
    elif context.startswith('d'):
        # Downstream - go right (downstream) the window size, and left zero
        flank_kws = dict(l=0, r=window, s=True, genome=genome)
    else:
        # Previously this fell through and failed later with a NameError
        raise ValueError(
            "context must start with 'up' or 'd' (upstream/downstream), got {!r}".format(context))

    index = [x.name for x in bed]

    nucleotide_fractions = []
    flank = bed
    for _ in range(n_windows):
        # Always step from the previous *unintersected* window. Flanking the
        # intersected fragments instead would shift every later window off
        # the regular tiling.
        flank = flank.flank(**flank_kws)

        if intersect is None:
            subset, window_index = flank, index
        else:
            subset = flank.intersect(intersect)
            # Intersection can drop or split intervals, so re-read the names
            window_index = subset.to_dataframe()['name']

        seqs = subset.sequence(fi=fasta, s=True)
        with open(seqs.seqfn) as f:
            records = list(SeqIO.parse(f, 'fasta'))

        # NOTE(review): the match is case-sensitive; if the fasta is soft-masked
        # (lowercase bases), those bases are not counted - confirm this is intended.
        # NOTE(review): the denominator is `window`, not the sequence length, so
        # intervals shortened by `intersect` give proportionally smaller values.
        nucleotide_fraction = [str(record.seq).count(nucleotide) / float(window)
                               for record in records]
        nucleotide_fraction = pd.Series(nucleotide_fraction, index=window_index).reset_index()
        nucleotide_fractions.append(nucleotide_fraction)

    nucleotide_fraction_df = pd.concat(nucleotide_fractions, axis=1)
    return nucleotide_fraction_df

In [19]:
# Evaluates to the same path as the csv_folder derived from `folder` in the first cell.
csv_folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper'

In [20]:
import six

In [21]:
# Same expression as in the first cell; nucleotide-content CSVs are written beneath this folder.
splicing_feature_folder = '{}/splicing_feature_data'.format(csv_folder)

In [22]:
# NOTE(review): toggles IPython's automatic pdb calling (the output reports it turned ON);
# presumably leftover from debugging - consider removing before sharing.
%pdb

Automatic pdb calling has been turned ON


In [None]:
placental = '/projects/ps-yeolab/genomes/hg19/database/phastConsElements46wayPlacental.bed'

nucleotides = 'ACGT'
contexts = 'upstream', 'downstream'

alt_exons = pybedtools.BedTool(alt_exons_bedfile)
constitutive_exons = pybedtools.BedTool(constitutive_bedfile)
print(len(alt_exons), len(constitutive_exons))

beds = {'alternative': alt_exons, 'constitutive': constitutive_exons}

for nucleotide in nucleotides:
    for context in contexts:
        for name, bed in beds.items():
            six.print_(nucleotide, context, name)
            nucleotide_folder = '{}/{}/nucleotide_content'.format(splicing_feature_folder, name)
            ! mkdir $nucleotide_folder
            nucleotide_content = calculate_nucleotide_content(bed, window=10, nt=400, genome='hg19', fasta=hg19_fasta, 
                                                             nucleotide=nucleotide, context=context, intersect=placental)
            nucleotide_content.to_csv('{}/{}400_nucleotide_content_{}_placental_mammal.csv'.format(
                    nucleotide_folder, context, nucleotide), index=False)

(43903, 5893)
A upstream alternative
mkdir: cannot create directory `/projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data/alternative/nucleotide_content': File exists


In [None]:
nucleotides = 'ACGT'
contexts = 'upstream', 'downstream'

alt_exons = pybedtools.BedTool(alt_exons_bedfile)
constitutive_exons = pybedtools.BedTool(constitutive_bedfile)
print(len(alt_exons), len(constitutive_exons))

beds = {'alternative': alt_exons, 'constitutive': constitutive_exons}

for nucleotide in nucleotides:
    for context in contexts:
        for name, bed in beds.items():
            six.print_(nucleotide, context, name)
            nucleotide_folder = '{}/{}/nucleotide_content'.format(splicing_feature_folder, name)
            ! mkdir $nucleotide_folder
            nucleotide_content = calculate_nucleotide_content(bed, window=10, nt=400, genome='hg19', fasta=hg19_fasta, 
                                                             nucleotide=nucleotide, context=context)
            nucleotide_content.to_csv('{}/{}400_nucleotide_content_{}.csv'.format(
                    nucleotide_folder, context, nucleotide))