In [66]:
import sys
import pandas as pd
import numpy as np
import math

# Input: tab-separated, headerless methylation calls with six columns
# (chromosome, start, end, read_name, log_lik_ratio, strand).
infile = "test.methylation_calls.nocgi.bed"
# Output: one tab-separated summary line per read.
outfile = "test.methylation_by_read.tsv"

# A call is counted as methylated when log_lik_ratio > threshold and as
# unmethylated when log_lik_ratio < -threshold; calls in between are not counted.
log_lik_threshold = 2.5

def iter_chunk_by_read_name(file, chunksize=1000):
    """Stream a methylation-call table and yield one DataFrame per read.

    The table is parsed ``chunksize`` rows at a time so it never has to fit in
    memory. The rows of the last read name seen in a chunk are held back and
    prepended to the next chunk, so a read whose rows straddle a chunk
    boundary is still yielded in one piece.

    Rows of a read are expected to be contiguous in the file; a read name that
    reappears in a later chunk is yielded again as a separate group. Within a
    chunk, groups are yielded sorted by read name (the ``groupby`` default).

    Parameters
    ----------
    file : str, path or file-like
        Anything ``pandas.read_csv`` accepts. Must be tab-separated, without a
        header row, with the columns chromosome, start, end, read_name,
        log_lik_ratio, strand.
    chunksize : int, default 1000
        Number of rows parsed per chunk.

    Yields
    ------
    pandas.DataFrame
        All rows sharing one ``read_name``.
    """
    columns = ["chromosome", "start", "end", "read_name", "log_lik_ratio", "strand"]
    csv_reader = pd.read_csv(file, iterator=True, chunksize=chunksize, sep='\t',
                             header=None, names=columns)
    # Rows of the trailing read of the previous chunk; None until a chunk is seen.
    last_read_in_chunk = None
    for chunk in csv_reader:
        if last_read_in_chunk is not None:
            chunk = pd.concat([last_read_in_chunk, chunk])
        # Set aside rows corresponding to the last read in the chunk so that
        # they can be grouped with the next chunk.
        last_read_name_in_chunk = chunk["read_name"].iloc[-1]
        is_last_read = chunk["read_name"] == last_read_name_in_chunk
        last_read_in_chunk = chunk.loc[is_last_read]
        # Group by a scalar key and filter the held-back rows beforehand:
        # grouping by a one-element list yields tuple keys on pandas >= 2.0,
        # which never compare equal to the plain read name.
        for _, cpgs in chunk.loc[~is_last_read].groupby("read_name"):
            yield cpgs
    # Flush the final read; skip when nothing was read so consumers never
    # receive an empty frame.
    if last_read_in_chunk is not None and not last_read_in_chunk.empty:
        yield last_read_in_chunk



In [67]:
# Summarise every read as one tab-separated line:
# chromosome, min start, max end, read name, methylated count, unmethylated count.
read_iter = iter_chunk_by_read_name(infile)

# The context manager guarantees the output file is flushed and closed, even
# if processing a read raises part-way through.
with open(outfile, 'w') as of:
    for read in read_iter:
        chromosome = read["chromosome"].iloc[0]
        read_name = read["read_name"].iloc[0]
        read_start_pos = read["start"].min()
        read_end_pos = read["end"].max()
        # Vectorised counts of confident calls; calls whose log-likelihood
        # ratio lies within [-threshold, threshold] are counted in neither.
        m = int((read["log_lik_ratio"] > log_lik_threshold).sum())
        u = int((read["log_lik_ratio"] < -log_lik_threshold).sum())
        record = '\t'.join([str(chromosome), str(read_start_pos), str(read_end_pos), str(read_name), str(m), str(u)]) + "\n"
        # print() appends its own newline, so each record is followed by a
        # blank line on stdout (unchanged from the previous behaviour).
        print(record)
        of.write(record)



1	120152678	149361987	1e26cc42-8250-4fae-9afa-f8258edb4437	52	16

1	224945802	224962927	5b055a71-e465-4eb6-8f71-e35afe814c5f	52	3

1	202774911	202784280	60122057-ff3a-4087-9558-ef9f0d16af03	57	4

1	228221963	228233379	68047932-829d-402e-b18b-81e46f279177	52	4

1	162494838	162536994	8c41616a-7b43-4074-8516-e56d916eb690	204	8

1	186858767	186862762	a912c2e1-6971-42e6-b0a1-3921345a6298	8	0

1	153392272	153409072	ab6f41c6-7456-458b-8791-3e3d66cfed2a	116	7

1	174313800	174321045	b8821331-62de-43f9-9229-72d5e44f8cca	30	0

1	247520693	247553044	0b352958-a219-42d1-bdbe-ea736109473c	130	6

1	204040014	204077719	198710c6-74dd-4fce-8653-98486171d8ed	157	21

1	209289716	209315431	98d3bed1-cc72-4a58-995d-172f4041fdf2	69	3

10	25647494	25686867	667cb729-d67b-4589-b55d-f2bdf72babdb	147	5

10	59965607	59965607	b4582729-23c9-41a4-aef5-3ee31fd8620b	0	1

11	19388129	19388318	c869d2a9-db97-4e33-9881-10e528bd1187	4	0

10	45932856	49902055	d341097e-af8d-4c98-b684-85880515db5a	1	0

11	66940256	66946456	59b82