In [1]:
import os
import glob
import pandas as pd

In [2]:
def collect(bedfiles, condition_file, my_conditions) : 
    """
    Average piRNA read counts per sequence within each selected condition.

    Parameters
    ----------
    bedfiles : iterable of str
        Tab-separated files, one per sample. The sample name is the file's
        basename up to the first ".". Lines starting with "chrom" are treated
        as headers. Columns read (0-based): 3 = sequence, 9 = feature,
        10 = gene, 13 = count. Only rows whose feature is "piRNA" are kept.
    condition_file : str
        Comma-separated file whose first two fields are sample and condition.
        Lines starting with "simple_" are treated as headers.
    my_conditions : container of str
        Only samples whose condition is in this container are read.

    Returns
    -------
    pandas.DataFrame
        Columns seq, condition, gene, seqlen, M, N: one row per
        (seq, condition, gene, seqlen) group, where M is the mean of the
        counts in the group and N is the number of rows in the group.
        M averages only the rows that exist; a sample in which the sequence
        is absent does not contribute a zero. An empty frame with the same
        columns is returned when no row is selected.

    Raises
    ------
    KeyError
        If a bed file's sample name is not listed in condition_file.
    """
    group_keys = ['seq', 'condition', 'gene', 'seqlen']

    # sample -> condition lookup
    conditions = {}
    with open(condition_file, 'r') as f : 
        for line in f : 
            # skip the header and blank lines (e.g. a trailing newline)
            if line.startswith("simple_") or not line.strip() : 
                continue
            info = line.strip().split(",")
            conditions[info[0]] = info[1]

    # Accumulate plain tuples and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows on
    # every iteration.
    rows = []
    for b in bedfiles : 
        sample = os.path.basename(b).split(".")[0]
        if sample not in conditions : 
            raise KeyError(f"sample '{sample}' (from {b}) is not listed in {condition_file}")
        condition = conditions[sample]

        if condition not in my_conditions : 
            continue

        with open(b, 'r') as f : 
            for line in f : 
                # NOTE: header detection is by prefix only, so a data row
                # whose first field starts with "chrom" is skipped as well.
                if line.startswith("chrom") or not line.strip() : 
                    continue
                info = line.strip().split("\t")
                if info[9] != "piRNA" : 
                    continue
                seq = info[3]
                rows.append((seq, condition, info[10], len(seq), float(info[13])))

    if not rows : 
        # Nothing selected (no files, no matching condition, or no piRNA
        # rows): return an empty result with the usual output columns
        # instead of failing inside groupby.
        return pd.DataFrame(columns = group_keys + ['M', 'N'])

    res = pd.DataFrame(rows, columns = group_keys + ['count'])

    res_grouped = res.groupby(group_keys).agg( M = ('count', 'mean'), N = ('count', 'size')).reset_index()

    return res_grouped
        
                    
    

In [3]:
# Per-sequence mean piRNA counts for the four conditions of interest,
# read from every .tsv file in the bed directory.
my_df = collect(
    bedfiles = glob.glob(
        "/users/PAS1473/benpasto1/PCON0160/ben/projects/2024_disl2/smRNA_seq_mutants_remove_rRNA/20250115/bed/*.tsv"
    ),
    condition_file = "/users/PAS1473/benpasto1/PCON0160/ben/projects/2024_disl2/smRNA_seq_mutants_remove_rRNA/20250115/samples/replicates.csv",
    my_conditions = ['disl2_control', 'disl2_how27', 'old_pup1', 'pup1how27'],
)

In [4]:
def collect_tailor(tailor_file, condition_file, my_conditions) : 
    """
    Average tailed-read counts per (sequence, tail) within each selected condition.

    Parameters
    ----------
    tailor_file : str
        Tab-separated file. Lines starting with "gene" are treated as
        headers. Columns read (0-based): 0 = gene, 1 = sequence, 2 = tail,
        3 = count, 6 = condition.
    condition_file : str
        Not used: the condition of each row is taken from the tailor file
        itself. The parameter is kept so existing calls keep working.
    my_conditions : container of str
        Only rows whose condition is in this container are kept.

    Returns
    -------
    pandas.DataFrame
        Columns seq, tail, condition, gene, M, N: one row per
        (seq, tail, condition, gene) group, where M is the mean of the
        counts in the group and N is the number of rows in the group.
        An empty frame with the same columns is returned when no row is
        selected.
    """
    group_keys = ['seq', 'tail', 'condition', 'gene']

    rows = []
    with open(tailor_file, 'r') as f : 
        for line in f : 
            # NOTE: header detection is by prefix only, so a data row whose
            # gene field starts with "gene" is skipped as well.
            if line.startswith("gene") or not line.strip() : 
                continue
            info = line.strip().split("\t")
            condition = info[6]
            if condition not in my_conditions : 
                continue
            # tuple order matches group_keys + ['count']
            rows.append((info[1], info[2], condition, info[0], float(info[3])))

    if not rows : 
        # Nothing selected: return an empty result with the usual output
        # columns rather than aggregating an untyped empty frame.
        return pd.DataFrame(columns = group_keys + ['M', 'N'])

    res = pd.DataFrame(rows, columns = group_keys + ['count'])

    res_grouped = res.groupby(group_keys).agg( M = ('count', 'mean'), N = ('count', 'size')).reset_index()

    return res_grouped
        
                    
    

In [5]:
# Per-(sequence, tail) mean counts for the four conditions of interest,
# read from the tailing table in the working directory.
my_tailor_df = collect_tailor(
    tailor_file = "./piRNA_tailing_raw_data.tsv",
    condition_file = "/users/PAS1473/benpasto1/PCON0160/ben/projects/2024_disl2/smRNA_seq_mutants_remove_rRNA/20250115/samples/replicates.csv",
    my_conditions = ['disl2_control', 'disl2_how27', 'old_pup1', 'pup1how27'],
)

In [14]:
# Rows of my_tailor_df whose gene contains WBGene00048233, ordered by condition.
(
    my_tailor_df
    .query('gene.str.contains("WBGene00048233")')
    .loc[:, ['seq', 'tail', 'M', 'condition']]
    .sort_values(by = ['condition'])
)

Unnamed: 0,seq,tail,M,condition


In [17]:
# Rows of my_df whose gene contains WBGene00048233, ordered by condition and
# then by sequence length.
(
    my_df
    .query('gene.str.contains("WBGene00048233")')
    .loc[:, ['seq', 'M', 'condition', 'seqlen']]
    .sort_values(by = ['condition', 'seqlen'])
)

Unnamed: 0,seq,M,condition,seqlen
50589,TCACATAGGTGTTTCTTTT,0.045348,disl2_control,19
50591,TCACATAGGTGTTTCTTTTT,0.090696,disl2_control,20
50594,TCACATAGGTGTTTCTTTTTT,0.303379,disl2_control,21
50600,TCACATAGGTGTTTCTTTTTTTTTAAATAGAA,0.045348,disl2_control,32
50588,TCACATAGGTGTTTCT,0.039396,disl2_how27,16
50592,TCACATAGGTGTTTCTTTTT,0.048108,disl2_how27,20
50595,TCACATAGGTGTTTCTTTTTT,1.470524,disl2_how27,21
50598,TCACATAGGTGTTTCTTTTTTT,0.068565,disl2_how27,22
50599,TCACATAGGTGTTTCTTTTTTTTTAA,0.029168,disl2_how27,26
50590,TCACATAGGTGTTTCTTTT,0.078952,old_pup1,19
