In [1]:
import collections
import itertools
import os
import sys
import time

In [2]:
import HTSeq
import pandas as pd

In [3]:
def invert_strand(iv):
    """Return a copy of ``iv`` whose strand is flipped.

    The input interval is left untouched: the flip is applied to the object
    returned by ``iv.copy()``.

    Raises:
        ValueError: if the strand is neither "+" nor "-".
    """
    flipped = iv.copy()
    current = flipped.strand

    # Guard clause: only the two real strands can be inverted.
    if current != "+" and current != "-":
        raise ValueError("Illegal strand")

    flipped.strand = "-" if current == "+" else "+"
    return flipped

In [4]:
# Annotation file (GFF3) read through an HTSeq `GFF_Reader` object
annotation_path = "/gcm-lfs1/pablo/data/RNAdeg/annotation/gff/schizosaccharomyces_pombe.chr.extended.gff3"
gtf_file = HTSeq.GFF_Reader(annotation_path)

In [5]:
## Ids of the heterochromatic genes; used further down to pick their rows
## out of the count tables with `isin(htc_genes)`.
htc_genes = ('dg1', 'dh1', 'after_tlh', 'MAT2', 'MAT3', 'MAT1')

# Counting ungapped single-end reads: **ChIP-Seq data**

- Build a `GenomicArrayOfSets` holding the `gene` **features** of the `GFF` **file** (**unstranded**, because this is `DNA` data)

In [6]:
# Build two lookups from the annotation:
#   gene_features : unstranded `GenomicArrayOfSets` (DNA data) mapping every
#                   genomic position to the set of gene ids covering it
#   genes_dict    : gene id -> list of the annotation features carrying that id
gene_features = HTSeq.GenomicArrayOfSets("auto", stranded=False)

genes_dict = {}

## loop over all features in the annotation file
for feature in gtf_file:

    ## for DNA we use every feature whose type contains 'gene'
    ## (e.g. gene, pseudogene, ncRNA_gene, tRNA_gene, rRNA_gene, snRNA_gene, snoRNA_gene)
    if 'gene' not in feature.type:
        continue

    ## identify each feature by its `gene_id` attribute
    try:
        gene_id = feature.attr["gene_id"]
    except KeyError:
        ## sub-set of pseudogenes that behave as transcripts: they carry no
        ## `gene_id`, so the id is taken from the part after ':' in `Parent`
        assert feature.type == 'pseudogene', (
            "feature of type {!r} has no 'gene_id' attribute".format(feature.type)
        )
        gene_id = feature.attr["Parent"].split(':')[1]

    ## add the gene id to every position covered by the feature
    gene_features[feature.iv] += gene_id

    ## register the feature under its gene id (a new list on first sight)
    genes_dict.setdefault(gene_id, []).append(feature)

In [7]:
# number of distinct gene ids collected from the annotation
len(genes_dict)
#genes_dict

6992

![img_overlap](https://htseq.readthedocs.io/en/release_0.11.1/_images/count_modes.png)

- Parse `BAM` **File** to **count reads** falling into each `gene` **feature** 

In [8]:
# Sample to process: selects the BAM file to count and the column that is
# read from the reference count matrix.
#sample_name = '1022_INPUT'
sample_name = '1168_S2ChIP'

In [9]:
#os.path.join('/gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bam', sample_name, sample_name + '.Aligned.sortedByCoord.out.bam')

In [10]:
# Count, for every alignment record of the sample's BAM file, which gene
# feature(s) it falls into. Results are accumulated in `counts`: gene ids map
# to read counts, keys starting with '_' are summary counters.

## -------
## Options
## -------

## Minimum alignment quality. NOTE: no aQual filter is applied in this cell,
## so this value currently has no effect on the counts.
minaqual = 10

## How to deal with overlap of READ and FEATURE:
## "union", "intersection-strict" or "intersection-nonempty"
overlap_mode = "intersection-nonempty"

## How to deal with overlapping FEATURES and multimapped reads:
##   'all'  -> count the read once for every feature it is assigned to
##   'none' -> skip alignments with NH > 1 and only count reads assigned
##             to exactly one feature
multimapped_mode = 'all'

## How to deal with multimapped reads (secondary alignments!):
##   'ignore' -> skip non-primary alignments of multimapped reads
##   any other value -> keep them
secondary_alignment_mode = 'none'

## Validate the options once, before reading any data, instead of stopping
## half-way through the BAM file.
if overlap_mode not in ("union", "intersection-strict", "intersection-nonempty"):
    raise ValueError("Illegal overlap mode.")
if multimapped_mode not in ('all', 'none'):
    raise ValueError("Illegal multimap mode.")

counts = collections.Counter()

start_time = time.time()

# select bam file: instantiate BAM_Reader Object (ChIP-seq)
bam_file = HTSeq.BAM_Reader(os.path.join('/gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bam', sample_name, sample_name + '.Aligned.sortedByCoord.out.bam'))

i = 0

for aln in bam_file:

    ## progress report every 100000 records
    if i > 0 and i % 100000 == 0:
        sys.stderr.write("{} alignment records processed. {} s\n".format(i,  time.time() - start_time))
        sys.stderr.flush()
    i += 1

    ## ----------------------
    ## Inspect read alignment
    ## ----------------------

    counts["_total"] += 1

    ## _mapped or _unmapped (our BAM files only contain _mapped)
    if not aln.aligned:
        counts["_unmapped"] += 1
        continue

    ## Multimapped reads are contained as separate entries in the BAM file.
    ## Only the tag lookup is guarded: a record without NH tag is treated as
    ## uniquely aligned.
    try:
        is_multimapped = aln.optional_field("NH") > 1
    except KeyError:
        is_multimapped = False

    if is_multimapped:
        counts["_alignment_not_unique"] += 1
        if multimapped_mode == 'none':
            continue
        if secondary_alignment_mode == 'ignore' and aln.not_primary_alignment:
            counts["_not_primary_alignment"] += 1
            continue

    ## -----------------------------
    ## Read and Feature Overlap Mode
    ## -----------------------------

    ## A. Union: the union of all the sets S(i).
    if overlap_mode == "union":
        gene_ids = set()
        for iv, fs in gene_features[ aln.iv ].steps():
            gene_ids |= fs

    ## B. Intersection-strict: the intersection of all the sets S(i).
    ## C. Intersection-nonempty: the intersection of all non-empty sets S(i).
    else:
        gene_ids = None
        for iv, fs in gene_features[ aln.iv ].steps():
            if fs or overlap_mode == "intersection-strict":
                gene_ids = fs.copy() if gene_ids is None else gene_ids & fs

    ## --------------
    ## Count Features
    ## --------------

    ## A. Mapped to unknown feature (the set is missing or empty)
    if not gene_ids:
        counts["_no_feature"] += 1
        continue

    ## B. Mapped to a region with overlapping features (more than one element)
    if len(gene_ids) > 1:
        counts["_ambiguous"] += 1
        ## ambiguous reads are only assigned to genes in 'all' mode
        if multimapped_mode == 'none':
            continue

    ## C. Exactly one feature, or 'all' mode: one count per assigned gene id
    counts.update(gene_ids)


print('Elapsed Time (Counting reads):', time.time() - start_time)

100000 alignment records processed. 4.783107280731201 s
200000 alignment records processed. 9.3605797290802 s
300000 alignment records processed. 13.950458288192749 s
400000 alignment records processed. 18.52493667602539 s
500000 alignment records processed. 23.137054920196533 s
600000 alignment records processed. 27.737872838974 s
700000 alignment records processed. 32.317275047302246 s
800000 alignment records processed. 36.915855884552 s
900000 alignment records processed. 41.47606015205383 s
1000000 alignment records processed. 46.062005043029785 s
1100000 alignment records processed. 50.65654182434082 s
1200000 alignment records processed. 55.23732233047485 s
1300000 alignment records processed. 59.828253746032715 s
1400000 alignment records processed. 64.47795486450195 s
1500000 alignment records processed. 68.90028405189514 s
1600000 alignment records processed. 73.50031232833862 s
1700000 alignment records processed. 78.17464709281921 s
1800000 alignment records processed. 82.8

Elapsed Time (Counting reads): 131.55697774887085


In [11]:
#for gene_id in counts:
#    print(gene_id, counts[gene_id])

In [12]:
#{k: v for k, v in sorted(counts.items(), key=lambda item: item[1], reverse=True)}
#{k: v for k, v in counts.items()}

- Convert the `Counter` to a **DataFrame**

In [13]:
# One row per counter key: key -> `gene-id` column, value -> `count` column
counts_df = (
    pd.DataFrame.from_dict(counts, orient='index')
    .reset_index()
    .rename(columns={'index':'gene-id', 0:'count'})
)
counts_df.head(20)

Unnamed: 0,gene-id,count
0,_total,2854303
1,_alignment_not_unique,371550
2,_no_feature,359218
3,SPAC212.11,1096
4,SPAC212.10,137
5,SPAC212.09c,372
6,SPNCRNA.70,32
7,SPAC212.08c,145
8,SPAC212.07c,50
9,SPAC212.12,80


In [14]:
# (rows, columns): one row per counter key (gene ids plus the '_' summary keys)
counts_df.shape

(6808, 2)

- Summary of counts

In [15]:
# keys starting with '_' are the summary counters (e.g. `_total`), not gene ids
counts_df[counts_df['gene-id'].str.startswith('_')]

Unnamed: 0,gene-id,count
0,_total,2854303
1,_alignment_not_unique,371550
2,_no_feature,359218
17,_ambiguous,343992


In [16]:
## Sum over the gene rows only. This now contains multiple counting: in 'all'
## mode a read assigned to several genes adds one count to each of them.
is_summary_row = counts_df['gene-id'].str.startswith('_')
counts_df.loc[~is_summary_row, 'count'].sum()

2856050

- Show counts for genes of interest: **Heterochromatic Genes**

In [17]:
# rows of the heterochromatic genes listed in `htc_genes`
counts_df[counts_df['gene-id'].isin(htc_genes)]

Unnamed: 0,gene-id,count
2072,dh1,1340
2075,dg1,1668
4212,MAT1,24
4224,MAT2,172
4226,MAT3,224
5545,after_tlh,622


## Compare results with **Parastou's counting**

- Results using **Parastou's counting script**

In [18]:
# path of the reference gene count matrix used for the comparison
xp_chip = '/gcm-lfs1/pablo/data/RNAdeg/data/ChIP/xp_data/chip_pombe_gene_count_matrix.csv'

In [19]:
# The matrix is tab-separated despite its `.csv` extension.
xp_chip_df = pd.read_csv(xp_chip, sep='\t')

# Keep the annotation columns plus the counts of the current sample (as integers).
xp_columns = ['gene-id', 'length', 'type', 'category', 'bio_type', sample_name]
xp_chip_df = xp_chip_df.loc[:, xp_columns].astype({sample_name: 'int64'})
xp_chip_df.head()

FileNotFoundError: [Errno 2] File b'/gcm-lfs1/pablo/data/RNAdeg/data/ChIP/xp_data/chip_pombe_gene_count_matrix.csv' does not exist: b'/gcm-lfs1/pablo/data/RNAdeg/data/ChIP/xp_data/chip_pombe_gene_count_matrix.csv'

In [None]:
# total of the reference counts for the current sample
total = xp_chip_df[sample_name].sum()
total

- **Merge** both counts DataFrames

In [None]:
# Outer join on `gene-id`: ids present in only one of the two tables are kept.
merged_xp_chip = counts_df.merge(xp_chip_df, on='gene-id', how='outer')
merged_xp_chip

- Largest differences between both counts

In [None]:
# absolute difference between our count and the reference count of the sample
merged_xp_chip['diff_count'] = (merged_xp_chip['count'] - merged_xp_chip[sample_name]).abs()

In [None]:
# ascending by difference; rows whose `diff_count` is missing are listed first
merged_xp_chip.sort_values('diff_count', na_position='first')

In [None]:
# histogram of the count differences, x axis limited to 0-11000
merged_xp_chip['diff_count'].plot.hist(xlim=(0, 11000))

In [None]:
# our counts (x) against the reference counts (y), both axes limited to 0-1000
merged_xp_chip.plot.scatter(x = 'count', y = sample_name, xlim=(0, 1000), ylim=(0, 1000))

- Show counts for genes of interest: **Heterochromatic Genes**

In [None]:
# merged rows of the heterochromatic genes listed in `htc_genes`
merged_xp_chip[merged_xp_chip['gene-id'].isin(htc_genes)]

## Tests

In [None]:
# gene ids inspected individually in the merged table
select_genes = ['SPCC569.04', 'SPCC569.05c']

In [None]:
# merged rows of the selected genes, re-indexed from 0
merged_xp_chip[merged_xp_chip['gene-id'].isin(select_genes)].reset_index(drop=True)
#merged_xp_chip[merged_xp_chip['gene-id'] == 'SPCC569.05c'].reset_index(drop=True)

- Check which gene features overlap a given (read) interval

In [None]:
# test interval on "MT" with strand "."
iv_test = HTSeq.GenomicInterval("MT", 11190, 12500, ".")

## collect the gene ids of every step overlapping the test interval (union)
gene_ids = set()
for step_iv, step_ids in gene_features[ iv_test ].steps():
    gene_ids.update(step_ids)

gene_ids