In [2]:
import os

import pysam
import pandas as pd

---

In [6]:
in_dir = '/data/parastou/RNAdeg/results/RipChip/bams/'

In [7]:
out_dir = '/data/parastou/RNAdeg/CoveragePlots/ChIP/'

In [32]:
def update_coverage_dict(coverage_dict, ch, start, end, inc):
    
    for i in range(start, end):
        key = ch + ':' + str(i)
        if key in coverage_dict:
            coverage_dict[key] += inc
        else:
            coverage_dict[key] = inc

In [33]:
def generate_coverage_dict(alignmentFile):

    coverage_dict = {}

    for read in alignmentFile.fetch(until_eof=True):

        nh = read.get_tag('NH')
        inc = 1.0 / nh
        ch = read.reference_name
        start = read.reference_start
        end = read.reference_end

        update_coverage_dict(coverage_dict, ch, start, end, inc)
        
    return coverage_dict

In [34]:
def bed_coverage_dict(in_bed):
    
    cov_dict = {}
    with open(in_total) as itc:
        for line in itc:
            ls = line.split('\t')
            ch = ls[0].strip(' ')
            pos = ls[1].strip(' ')
            val = ls[2].strip(' \n')
            key = ch + ':' + pos
            cov_dict[key] = val
            
    return cov_dict

### Step1 : create the scaffold coverage file with bedtools

In [30]:
for file in os.listdir(in_dir):
    
    if file.endswith('.bam'):
        
        base_name = file.split('.')[0]
        in_path = os.path.join(in_dir, file)
        out_path = os.path.join(out_dir, base_name + '.bed')
        ! bedtools genomecov -d -ibam $in_path > $out_path

### Step 2 : calculate coverages for bam files

In [35]:
for file in os.listdir(in_dir):
    
    if file.endswith('.bam'):
        
        base_name = file.split('.')[0]
        in_path = os.path.join(in_dir, file)
        bed_path = os.path.join(out_dir, base_name + '.bed')
        out_path = os.path.join(out_dir, base_name + '.coverage.txt')
        
        print('bam file : %s ...' % in_path)
        ## Load a bam file and create coverage information for mapped regions
        st = pysam.AlignmentFile(open(in_path, 'rb'))      
        coverage_dict = generate_coverage_dict(st)
        
        ## Load bedtools coverage file as scafold, create coverage dictionary
        bed_cov_dict = bed_coverage_dict(bed_path)
        
        assert len(bed_cov_dict) >= len(coverage_dict)
        
        ## Merge coverage information into a new file
        with open(out_path, 'w+') as out:
    
            for coord, item in bed_cov_dict.items():
                coverage = item
                if coord in coverage_dict:
                    coverage = coverage_dict[coord]
                ch = coord.split(':')[0]
                pos = coord.split(':')[1]
                line = ch + '\t' + str(pos) + '\t' + str(coverage) + '\n'
                out.write(line)
        print('Coverage file save in %s.' % out_path)

bam file : /data/parastou/RNAdeg/results/RipChip/bams/63_S2ChIPp.ztr.Aligned.sortedByCoord.out.bam ...
Coverage file save in /data/parastou/RNAdeg/CoveragePlots/ChIP/63_S2ChIPp.coverage.txt.
bam file : /data/parastou/RNAdeg/results/RipChip/bams/63_S2ChIP_2.ztr.Aligned.sortedByCoord.out.bam ...
Coverage file save in /data/parastou/RNAdeg/CoveragePlots/ChIP/63_S2ChIP_2.coverage.txt.
bam file : /data/parastou/RNAdeg/results/RipChip/bams/65_S2ChIP.Aligned.sortedByCoord.out.bam ...
Coverage file save in /data/parastou/RNAdeg/CoveragePlots/ChIP/65_S2ChIP.coverage.txt.
bam file : /data/parastou/RNAdeg/results/RipChip/bams/80_S2ChIP_2.Aligned.sortedByCoord.out.bam ...
Coverage file save in /data/parastou/RNAdeg/CoveragePlots/ChIP/80_S2ChIP_2.coverage.txt.
bam file : /data/parastou/RNAdeg/results/RipChip/bams/80_S2ChIP.Aligned.sortedByCoord.out.bam ...
Coverage file save in /data/parastou/RNAdeg/CoveragePlots/ChIP/80_S2ChIP.coverage.txt.
bam file : /data/parastou/RNAdeg/results/RipChip/bams/80_

### Step 3 : Generate Read-per-million normalized coverages

In [8]:
for file in os.listdir(out_dir):
    
    if file.endswith('.coverage.txt'):
        
        in_path = os.path.join(out_dir, file)
        print(in_path)
        base_name = file.split('.')[0]
        out_path = os.path.join(out_dir, base_name + '.norm.coverage.txt')
        
        df = df = pd.read_csv(in_path, sep='\t', names=range(3))
        total = sum(df[2]) / 50
        print(total)
        scaling_factor = (total / 1000000)
        print(scaling_factor)
        df[2] = round(df[2] / scaling_factor, 2)
        df.to_csv(out_path, sep='\t', index=None, header=None)
        print(out_path)

/data/parastou/RNAdeg/CoveragePlots/ChIP/1168_S2ChIP_1.coverage.txt
12140573.19616848
12.14057319616848
/data/parastou/RNAdeg/CoveragePlots/ChIP/1168_S2ChIP_1.norm.coverage.txt
/data/parastou/RNAdeg/CoveragePlots/ChIP/1168_S2ChIP.coverage.txt
4811017.73666634
4.81101773666634
/data/parastou/RNAdeg/CoveragePlots/ChIP/1168_S2ChIP.norm.coverage.txt
/data/parastou/RNAdeg/CoveragePlots/ChIP/301_S2_ChIP.coverage.txt
4286852.040619181
4.286852040619181
/data/parastou/RNAdeg/CoveragePlots/ChIP/301_S2_ChIP.norm.coverage.txt
/data/parastou/RNAdeg/CoveragePlots/ChIP/301_S2ChIP.coverage.txt
5087370.131110993
5.087370131110993
/data/parastou/RNAdeg/CoveragePlots/ChIP/301_S2ChIP.norm.coverage.txt
/data/parastou/RNAdeg/CoveragePlots/ChIP/302_S2_ChIP.coverage.txt
4968244.299999734
4.968244299999735
/data/parastou/RNAdeg/CoveragePlots/ChIP/302_S2_ChIP.norm.coverage.txt
/data/parastou/RNAdeg/CoveragePlots/ChIP/302_S2ChIP.coverage.txt
6028582.592165854
6.028582592165854
/data/parastou/RNAdeg/CoveragePlot

-----

#### Sanity check for library depth

In [10]:
for file in os.listdir(in_dir):
    
    if file.endswith('.bam'):
        
        base_name = file.split('.')[0]
        in_path = os.path.join(in_dir, file)       
        print(base_name)
        
        ## Load a bam file and create coverage information for mapped regions
        st = pysam.AlignmentFile(open(in_path, 'rb'))
        total = 0
        for r in st.fetch(until_eof=True):
            nh = r.get_tag('NH')
            inc = 1.0 / nh
            total += inc
        print(total/1000000)

63_S2ChIPp
5.356333999892295
63_S2ChIP_2
5.356333999892295
65_S2ChIP
6.267980999970094
80_S2ChIP_2
6.328341999773746
80_S2ChIP
5.357875999907185
80_S2Ph_ChIP
1.7535309999972273
80S2ChIP_1
9.338956999841269
301_S2ChIP
5.060596999987656
302_S2ChIP
6.005917999931659
324_S2ChIP
6.132901999953194
491_S2ChIP
6.6667069999678406
504S2ChIP_1
4.112042000080235
504S2ChIP_2
4.489328000043439
530ChIP_1
6.190077999752234
530S2ChIP_2
3.334843000082868
591_S2PChIP
9.78366099994927
638ChIP_1
7.90067499979194
63_S2ChIP
5.344684999903266
80_S2_ChIP
6.328341999773746
491_S2_ChIP
4.098884000109946
324_S2_ChIP
3.8086730001036035
302_S2_ChIP
4.960510000015301
301_S2_ChIP
4.256788000142784
638_S2_ChIP
6.007343999866427
1168_S2ChIP_1
12.099033000172284
1168_S2ChIP
2.5741050000027546
510_S2ChIP
13.202423000104327
