# We need to count the edits across 3'UTRs to compare bulk data to single cell.
- Just like for single cell and bulk, we're converting (aggr) edit BED files to BAM files and using those as inputs to featureCounts. 
- Using 10X three_prime_utr annotations, generate a SAF file using just 3'UTRs and count edits along these regions only.
- Just so we have everything, we're counting edits in every single dataset, resulting in a single counts.txt file.

In [1]:
%matplotlib inline

import glob
import os
import pandas as pd
import gffutils
import pysam
import pybedtools
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from collections import OrderedDict
from tqdm import tnrange, tqdm_notebook

pd.set_option('display.max_columns', 50)

# Important! Filter the edit file for conf score
- Since we're counting the number of edits found, we'll need to set a cutoff at which we call sites "edited." 

In [2]:
conf=0.9

In [3]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/edit_featurecounts{}'.format(conf)
tmp_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/edit_featurecounts{}/tmp'.format(conf)

In [4]:
# Just make sure the path exists...
if not os.path.exists(tmp_dir):
    ! mkdir -p $tmp_dir
print(os.path.exists(tmp_dir))

True


# Convert BED to BAM
- every 'alignment' is an edit site. 
- .fx means that the 'chr' is affixed to the ensembl-style chromosome. We don't actually want that (actually grabbing the 'nonfx' files), but I can't figure out a way to glob while excluding these fx bedfiles. So I'm going to glob these and assume a specific naming schema. Dumb, I know.

In [5]:
all_bed_files = []
_ = sorted(glob.glob(os.path.join(input_dir, '*.fx.bed'))) 
for b in _:
    all_bed_files.append(b.replace('.fx.bed','.bed'))
    assert os.path.exists(b.replace('.fx.bed','.bed'))
print(len(all_bed_files))
all_bed_files[:3]

14


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info/RPS2-STAMP_possorted_genome_bam-RPS2_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info/RPS2-STAMP_possorted_genome_bam_MD-RPS2_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt.downsampled50M.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info/RPS2-STAMP_possorted_genome_bam_MD-RPS2_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt.bed']

In [6]:
def filter_bed(input_bed, output_bed, conf):
    """
    Filters the BED file 
    """
    edit_head = ['chrom','start','end','conf','frac','strand']
    df = pd.read_table(input_bed, names=edit_head)
    df = df[df['conf']>=conf]
    df.to_csv(output_bed, sep='\t', header=False, index=False)

# Basically filter for conf, perform bedToBam and samtools sort on these guys.

In [7]:
genome = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/star/chrNameLength.txt'
progress = tnrange(len(all_bed_files))
for bed in all_bed_files:
    output_bam = os.path.join(tmp_dir, os.path.basename(bed).replace('.bed','.bam'))
    output_sorted_bam = os.path.join(tmp_dir, os.path.basename(bed).replace('.bed','.sorted.bam'))
    filter_fn = os.path.join(tmp_dir, os.path.basename(bed) + ".{}.bed".format(conf))
    filter_bed(bed, filter_fn, conf)
    cmd = 'bedToBam '
    cmd += '-i {} '.format(filter_fn)
    cmd += '-g {} '.format(genome)
    cmd += '> {}'.format(output_bam)
    # print(cmd)  # debug
    ! $cmd
    sort_cmd = 'samtools sort {} > {}'.format(output_bam, output_sorted_bam)
    ! $sort_cmd
    # print(sort_cmd)
    progress.update(1)

HBox(children=(IntProgress(value=0, max=14), HTML(value='')))

  
  if (await self.run_code(code, result,  async_=asy)):


# Run featureCounts to get the number of edits assigned to each gene.

In [8]:
gtf = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/genes/genes.gtf'

In [9]:
cmd = 'module load subreadfeaturecounts;featureCounts '
cmd += '-a {} '.format(gtf)
cmd += '-s 1 '
# cmd += '-O '
cmd += '-o {}'.format(os.path.join(output_dir, 'counts_at_conf_{}.txt '.format(conf)))
cmd += '-R CORE '
cmd += os.path.join(tmp_dir, '*.sorted.bam')
cmd += ' > counts.log 2>&1'

print("Command: [{}]".format(cmd))
print("Writing to: {}".format(os.path.join(output_dir, 'counts_at_conf_{}.txt '.format(conf))))

! $cmd

Command: [module load subreadfeaturecounts;featureCounts -a /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/genes/genes.gtf -s 1 -o /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/edit_featurecounts0.9/counts_at_conf_0.9.txt -R CORE /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/edit_featurecounts0.9/tmp/*.sorted.bam > counts.log 2>&1]
Writing to: /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/edit_featurecounts0.9/counts_at_conf_0.9.txt 


# Let's run featureCounts using just the CDS, in addition to the whole gene.
- Now we're using 10X annotations so it'll be easier to compare to sc data.
- Just so I don't have to keep checking this, the sc notebook pulls data from the 'lenti-common' gtf, which is different from the below reference by only one line (the lenti common contig). The 3'UTRs are exactly the same.
- this is in rebuttal for RPS STAMP sc vs bulk comparisons. Compare the CDS

#### Simply grep for 'CDS' tab from GTF file to get all CDS annotations.

In [10]:
gtf_cds_file = os.path.join(tmp_dir, 'genes.cds.gtf')

! grep --color -P '\tCDS\t' $gtf > $gtf_cds_file

#### Now re-format gtf-formatted file into an SAF-formatted file. 

In [11]:
gtf_cds = pd.read_csv(gtf_cds_file, names=['chrom','src','region','start','end','.','strand','.','attr'], sep='\t')
gtf_cds['geneid'] = gtf_cds['attr'].str.extract("gene_id \"([\w\d\.]+)\"")
# gtf_cds['chromfx'] = 'chr' + gtf_cds['chrom'].astype(str)
gtf_cds = gtf_cds[['geneid','chromfx','start','end','strand']]
gtf_cds.head()

  return _read(filepath_or_buffer, kwds)
  interactivity=interactivity, compiler=compiler, result=result)


KeyError: "['chromfx'] not in index"

In [None]:
gtf_cds.to_csv(
    os.path.join(input_dir, 'genes.cds_only.SAF'),
    sep='\t',
    index=False,
    header=False
)

In [None]:
saf = os.path.join(input_dir, 'genes.cds_only.SAF')

#### Run featureCounts using the SAF file that was creating from just the CDS.

In [None]:
cmd = 'module load subreadfeaturecounts;featureCounts '
cmd += '-a {} '.format(saf)
cmd += '-F SAF '
cmd += '-s 1 '
cmd += '-o {}'.format(os.path.join(output_dir, 'counts_at_conf_{}.cds.txt '.format(conf)))
cmd += '-R CORE '
cmd += os.path.join(tmp_dir, '*.sorted.bam')

print("writing to: {}".format(os.path.join(output_dir, 'counts_at_conf_{}.cds.txt '.format(conf))))

! $cmd

# Let's run featureCounts using 3utrs also, in addition to the whole gene.

### First make an annotation SAF file from our GTF file by simply grepping for three_prime_utr regions.

In [None]:
gtf_3utr_file = os.path.join(tmp_dir, 'genes.three_prime_utr_only.gtf')

! grep --color 'three_prime_utr' $gtf > $gtf_3utr_file

In [None]:
gtf_3utr = pd.read_csv(gtf_3utr_file, names=['chrom','src','region','start','end','.','strand','.','attr'], sep='\t')
gtf_3utr['geneid'] = gtf_3utr['attr'].str.extract("gene_id \"([\w\d\.]+)\"")
gtf_3utr = gtf_3utr[['geneid','chrom','start','end','strand']]
gtf_3utr.head()

In [None]:
gtf_3utr.to_csv(
    os.path.join(input_dir, 'genes.three_prime_utr_only.SAF'),
    sep='\t',
    index=False,
    header=False
)

In [None]:
saf = os.path.join(input_dir, 'genes.three_prime_utr_only.SAF')

### Run featureCounts

In [None]:
cmd = 'module load subreadfeaturecounts;featureCounts '
cmd += '-a {} '.format(saf)
cmd += '-F SAF '
cmd += '-s 1 '
cmd += '-o {}'.format(os.path.join(output_dir, 'counts_at_conf_{}.three_prime_utr.txt '.format(conf)))
cmd += '-R CORE '
cmd += os.path.join(tmp_dir, '*.sorted.bam')
cmd += ' > three_prime_utr_counts.log 2>&1'

print("Command is [{}]".format(cmd))
print("Writing to: {}".format(os.path.join(output_dir, 'counts_at_conf_{}.three_prime_utr.txt '.format(conf))))

! $cmd

### One of these groups had a really low gene assignment %, let's check it out
- The offending sample is the 10X Apo aggregate of all cells
- Looking at the alignments on IGV, mostly seems to be: annotated as 'intronic', on opposite strand, or possibly in an unannotated UTR/exon

In [None]:
df = pd.read_csv(os.path.join(output_dir, 'counts_at_conf_{}.txt.summary'.format(conf)), sep='\t')
# df.columns = [c.replace(tmp_dir, '') for c in df.columns]
df

In [None]:
for c in df.columns:
    print(c)