In [1]:
import csv
from collections import Counter
from itertools import compress
from os import listdir
from os.path import join

import numpy as np
import pandas as pd

In [2]:
def readin_fastq(core_path,filename):
    """Read a FASTQ file and return only its sequence lines.

    A FASTQ record is four lines long (header, sequence, '+' separator,
    quality string), so the sequence is the second line of every record.

    Parameters
    ----------
    core_path : str
        Directory that contains the file.
    filename : str
        Name of the FASTQ file; compression is inferred by pandas from the
        file extension (e.g. ``.gz``).

    Returns
    -------
    pandas.DataFrame
        The rows at positions 1, 5, 9, ... of the file (original row labels
        kept), i.e. one row per read holding its sequence.
    """
    fastq_lines = pd.read_csv(
        join(core_path, filename),
        sep="\t",                 # FASTQ lines contain no tabs, so each line lands in a single column
        header=None,
        dtype=str,                # never let pandas guess numeric types for a line
        quoting=csv.QUOTE_NONE,   # '"' is a legal quality character, not a CSV quote
        na_filter=False,          # keep reads such as "NA" as text instead of NaN
        skip_blank_lines=False,   # zero-length reads must not shift the 4-line record frame
    )
    # Take every 4th line starting from the second one: the sequence line of each record.
    seqs = fastq_lines.iloc[range(1, fastq_lines.shape[0], 4), :]

    return seqs

In [3]:
# Directory that is listed and read from in the cells below.
# NOTE(review): this is an absolute path — it has to be adjusted to run the notebook on another machine.
data_path = '/project2/kribelba_1515/data/EXTRA_seq/2024_data/endoMPRA_lib5p4'

In [5]:
# os.listdir returns entries in arbitrary order; sort them so that the replicate
# order (and therefore the positional *_rep1..N column labels built later) is
# deterministic across machines and runs.
files = sorted(listdir(data_path))
files

['gDNA_rep1_19_8_7_5p4_minP_MPRA4p75_S3_R1_001.fastq.gz',
 'gDNA_rep2_19_8_7_5p4_minP_MPRA4p75_S4_R1_001.fastq.gz',
 'gDNA_rep3_19_8_7_5p4_minP_MPRA4p75_S7_R1_001.fastq.gz',
 'gDNA_rep4_19_8_7_5p4_minP_MPRA4p75_S8_R1_001.fastq.gz',
 'mRNA_rep1_19_8_7_5p4_minP_MPRA4p75_S5_R1_001.fastq.gz',
 'mRNA_rep2_19_8_7_5p4_minP_MPRA4p75_S6_R1_001.fastq.gz',
 'mRNA_rep3_19_8_7_5p4_minP_MPRA4p75_S9_R1_001.fastq.gz',
 'mRNA_rep4_19_8_7_5p4_minP_MPRA4p75_S10_R1_001.fastq.gz',
 'mapping_r2_to_BC_5p4_plDNA_lib_total_BCcount_55bp_from_R2end_ofhublib4p75tab_unique_BC_read_mappings_only_2023_09_07.csv',
 'mapping_r2_to_BC_5p4_plDNA_lib_total_BCcount_55bp_from_R2end_ofhublib4p75tab_unique_BC_read_mappings_only_plus_inferred_TFsyntax_2025_04_04.csv']

In [6]:
# Split the directory listing by library type, using the tag contained in each file name
files_gDNA = [name for name in files if "gDNA" in name]
files_mRNA = [name for name in files if "mRNA" in name]

print("gDNA Files:", files_gDNA)
print("mRNA Files:", files_mRNA)

gDNA Files: ['gDNA_rep1_19_8_7_5p4_minP_MPRA4p75_S3_R1_001.fastq.gz', 'gDNA_rep2_19_8_7_5p4_minP_MPRA4p75_S4_R1_001.fastq.gz', 'gDNA_rep3_19_8_7_5p4_minP_MPRA4p75_S7_R1_001.fastq.gz', 'gDNA_rep4_19_8_7_5p4_minP_MPRA4p75_S8_R1_001.fastq.gz']
mRNA Files: ['mRNA_rep1_19_8_7_5p4_minP_MPRA4p75_S5_R1_001.fastq.gz', 'mRNA_rep2_19_8_7_5p4_minP_MPRA4p75_S6_R1_001.fastq.gz', 'mRNA_rep3_19_8_7_5p4_minP_MPRA4p75_S9_R1_001.fastq.gz', 'mRNA_rep4_19_8_7_5p4_minP_MPRA4p75_S10_R1_001.fastq.gz']


In [7]:
# Load every gDNA file with readin_fastq; the resulting DataFrames are appended
# in the same order as files_gDNA, printing a progress line before and after each file.
gDNA_sequences = []
for file in files_gDNA:
    print(f"Reading gDNA file: {file} ...")
    gDNA_sequences.append(readin_fastq(data_path, file))
    print(f"Finished reading {file} ✅")

Reading gDNA file: gDNA_rep1_19_8_7_5p4_minP_MPRA4p75_S3_R1_001.fastq.gz ...
Finished reading gDNA_rep1_19_8_7_5p4_minP_MPRA4p75_S3_R1_001.fastq.gz ✅
Reading gDNA file: gDNA_rep2_19_8_7_5p4_minP_MPRA4p75_S4_R1_001.fastq.gz ...
Finished reading gDNA_rep2_19_8_7_5p4_minP_MPRA4p75_S4_R1_001.fastq.gz ✅
Reading gDNA file: gDNA_rep3_19_8_7_5p4_minP_MPRA4p75_S7_R1_001.fastq.gz ...
Finished reading gDNA_rep3_19_8_7_5p4_minP_MPRA4p75_S7_R1_001.fastq.gz ✅
Reading gDNA file: gDNA_rep4_19_8_7_5p4_minP_MPRA4p75_S8_R1_001.fastq.gz ...
Finished reading gDNA_rep4_19_8_7_5p4_minP_MPRA4p75_S8_R1_001.fastq.gz ✅


In [8]:
# Display the list of loaded gDNA DataFrames (one single-column frame of read sequences per file).
gDNA_sequences

[                                                          0
 1         CTCAGNATGACGGACAGACCTAGTCTGATTCCCATATGGAAGATCC...
 5         CTCAGNATGACGGACAGACCTCCGTTTTCCGTCATATGGAAGATCC...
 9         CTCAGNATGACGGACAGACCTTGCCGTCGTAGCATATGGAAGATCC...
 13        CTCAGNATGACGGACAGACCTGCTAAACCATGCATATGGAAGATCC...
 17        CTCAGNATGACGGACAGACCTTCGCACCATATCATATGGAAGATCC...
 ...                                                     ...
 15112825  CTCAGCATGACGGACAGACCGTAGTCGCTCAGCATATGGAAGATCC...
 15112829  CTCAGCATGACGGACAGACCCTATAAGCGTGCCATATGGAAGATCC...
 15112833  CTCAGCATGACGGACAGACCTTACGTAAAGTTCATATGGAAGATCC...
 15112837  CTCAGCATGACGGACAGACCGGATTGTCCTGCCATATGGAAGATCC...
 15112841  CTCAGCATGACGGACAGACCGGGGAGATACGTCATATGGAAGATCC...
 
 [3778211 rows x 1 columns],
                                                           0
 1         CTCAGNATGACGGACAGACCTGCGTGAGGCGTCATATGGAAGATCC...
 5         CTCGGNATGACGGACAGACCCCGCTACATCAACATATGGAAGATCC...
 9         CTCAGNATGACGGACAGACCCCGGCCCCCAAACATATGGAAGA

In [9]:
# Load every mRNA file with readin_fastq; the resulting DataFrames are appended
# in the same order as files_mRNA, printing a progress line before and after each file.
mRNA_sequences = []
for file in files_mRNA:
    print(f"Reading mRNA file: {file} ...")
    mRNA_sequences.append(readin_fastq(data_path, file))
    print(f"Finished reading {file} ✅")

Reading mRNA file: mRNA_rep1_19_8_7_5p4_minP_MPRA4p75_S5_R1_001.fastq.gz ...
Finished reading mRNA_rep1_19_8_7_5p4_minP_MPRA4p75_S5_R1_001.fastq.gz ✅
Reading mRNA file: mRNA_rep2_19_8_7_5p4_minP_MPRA4p75_S6_R1_001.fastq.gz ...
Finished reading mRNA_rep2_19_8_7_5p4_minP_MPRA4p75_S6_R1_001.fastq.gz ✅
Reading mRNA file: mRNA_rep3_19_8_7_5p4_minP_MPRA4p75_S9_R1_001.fastq.gz ...
Finished reading mRNA_rep3_19_8_7_5p4_minP_MPRA4p75_S9_R1_001.fastq.gz ✅
Reading mRNA file: mRNA_rep4_19_8_7_5p4_minP_MPRA4p75_S10_R1_001.fastq.gz ...
Finished reading mRNA_rep4_19_8_7_5p4_minP_MPRA4p75_S10_R1_001.fastq.gz ✅


In [38]:
# Display the list of loaded mRNA DataFrames (one single-column frame of read sequences per file).
mRNA_sequences

[                                                          0
 1         CTAGCNTGACGGACAGACCCGTAGAACTCTGCATATGGAAGATCCT...
 5         CTCAGNATGACGGACAGACCCCCCACCCAAGCCATATGGAAGATCC...
 9         CTCAGNATGACGGACAGACCTCTATGCATGCACATATGGAAGATCC...
 13        CTCAGNATGACGGACAGACCATCACGGAAAACCATATGGAAGATCC...
 17        CTCAGNATGACGGACAGACCAGTGTGGCTGTTCATATGGAAGATCC...
 ...                                                     ...
 18280625  CTCAGCATGACGGACAGACCCACTGCCGCTCGCATATGGAAGATCC...
 18280629  CTCAGCATGACGGACAGACCGACCAAGTTGCTCATATGGAAGATCC...
 18280633  CTCAGCATGACGGACAGACCGAGCCTCGAGGCCATATGGAAGATCC...
 18280637  CTCAGCATGACGGACAGACCCAGCGACCAGTACATATGGAAGATCC...
 18280641  CTCAGCATGACGGACAGACCTTTACCCCAACCCATATGGAAGATCC...
 
 [4570161 rows x 1 columns],
                                                           0
 1         CTCAGNATGACGGACAGACCACTCTCAGTCGGCATATGGAAGATCC...
 5         CTCAGNATGACGGACAGACCTATGGGGATATGCATATGGAAGATCC...
 9         CTCAGNATGACGGACAGACCCGTGGGAGCTGCCATATGGAAGA

In [13]:
# Load gDNA replicate 1 through readin_fastq instead of repeating its parsing logic
# inline, so the sanity checks below look at reads parsed exactly like the main data.
f = readin_fastq(data_path, 'gDNA_rep1_19_8_7_5p4_minP_MPRA4p75_S3_R1_001.fastq.gz')

In [14]:
# Sanity check: every read is expected to be exactly 75 nucleotides long.
# Vectorized length computation; a missing value yields NaN here and is therefore
# reported as a failed check instead of raising a TypeError.
read_lengths = f.iloc[:,0].str.len()
count = int((read_lengths == 75).sum())  # number of reads with the expected length

print(count == len(f.iloc[:,0]))

True


In [40]:
# Tally the reads that begin with one of the two 20-nt primer variants below
# (the variants differ only at the 6th base: N vs. C); NOT every read does.
primer_variants = ('CTCAGNATGACGGACAGACC', 'CTCAGCATGACGGACAGACC')
count = 0
count2 = 0
for seq in f.iloc[:,0]:
    if not seq.startswith(primer_variants):
        count2 += 1
        continue
    count += 1
print('number of sequences with the same starting sequence: ', count, ' | number of sequences with a different starting sequence: ', count2)

number of sequences with the same starting sequence:  3475038  | number of sequences with a different starting sequence:  303173


In [51]:
def extract_barcode_with_flanks(sequence, start, end, le_length=8, ri_length=8):
    """Slice a barcode and its two flanking regions out of a read.

    Parameters
    ----------
    sequence : str
        The read.
    start, end : int
        Barcode boundaries; the barcode is ``sequence[start:end]``.
    le_length, ri_length : int, default 8
        Number of bases to take immediately before ``start`` and immediately
        after ``end``.

    Returns
    -------
    tuple of (str, str, str)
        ``(left_flank, barcode, right_flank)``. A flank is shorter than
        requested when the read does not extend far enough on that side.
    """
    barcode = sequence[start:end]
    # Clamp at 0: a negative slice start would wrap around to the end of the read
    # and silently return bases that are not adjacent to the barcode.
    left_flank = sequence[max(start - le_length, 0):start]
    right_flank = sequence[end:end + ri_length]
    return left_flank, barcode, right_flank

def process_sequences(sequences, le, ri, start, end, seq_type='gDNA'):
    """Count barcodes per replicate, keeping only reads with the expected flanks.

    Parameters
    ----------
    sequences : list of pandas.DataFrame
        One DataFrame per replicate; the reads are taken from the first column.
    le, ri : str
        Sequences that must appear directly before ``start`` and directly after
        ``end`` for a read to be counted. The compared window on each side is
        as long as the given string.
    start, end : int
        Barcode boundaries; the barcode is ``read[start:end]``.
    seq_type : str, default 'gDNA'
        Prefix used for the per-replicate count columns.

    Returns
    -------
    pandas.DataFrame
        Column ``sequence`` (the barcode) followed by one count column
        ``{seq_type}_rep{k}`` per replicate, with 0 where a barcode was not
        seen. Rows are in order of first appearance of each barcode.
    """
    n_replicates = len(sequences)

    # Size the flank windows from the expected flanks themselves, so flanks of any
    # length can match. Clamp the left edge at 0 so a negative index cannot wrap
    # around to the end of the read.
    left_from = max(start - len(le), 0)
    right_to = end + len(ri)

    counts_by_barcode = {}  # key: barcode, value: list with one count per replicate

    for replicate_num, df in enumerate(sequences):
        print(f'Processing replicate {replicate_num + 1}...') # just for visualization

        # Single pass over the reads: keep the barcode of every read whose flanks
        # both match; non-string entries (e.g. missing values) are skipped.
        barcode_counts = Counter(
            seq[start:end]
            for seq in df.iloc[:, 0]
            if isinstance(seq, str)
            and seq[left_from:start] == le
            and seq[end:right_to] == ri
        )

        # A barcode seen for the first time starts with zeros for all replicates.
        for barcode, count in barcode_counts.items():
            counts_by_barcode.setdefault(barcode, [0] * n_replicates)[replicate_num] = count

    # convert dict to dataframe
    seq_data = [[barcode] + counts for barcode, counts in counts_by_barcode.items()]
    seq_columns = ['sequence'] + [f'{seq_type}_rep{i+1}' for i in range(n_replicates)]
    seq_df = pd.DataFrame(seq_data, columns=seq_columns)

    print("✅ Finished processing!")
    return seq_df

In [52]:
# Count barcodes (read slice [20:32]) in every gDNA replicate, keeping only reads
# flanked by 'GACAGACC' on the left and 'CATATGGA' on the right, then display the table.
gDNA_df = process_sequences(gDNA_sequences, le='GACAGACC', ri='CATATGGA', start=20, end=32, seq_type='gDNA')
gDNA_df

Processing replicate 1...
Processing replicate 2...
Processing replicate 3...
Processing replicate 4...
✅ Finished processing!


Unnamed: 0,sequence,gDNA_rep1,gDNA_rep2,gDNA_rep3,gDNA_rep4
0,TAGTCTGATTCC,66,0,7,113
1,TCCGTTTTCCGT,96,0,0,0
2,TTGCCGTCGTAG,45,99,42,0
3,TGCTAAACCATG,442,201,146,184
4,TTCGCACCATAT,236,0,406,487
...,...,...,...,...,...
394863,ACAAGGCAGCGA,0,0,0,1
394864,CGTCGTTTAACA,0,0,0,1
394865,ACGGAATGAGCC,0,0,0,1
394866,GTCATTAAAGGT,0,0,0,1


In [53]:
# Write the gDNA barcode counts as a tab-separated file (relative path, row index omitted).
gDNA_df.to_csv('barcode_gDNA_counts_upstream.tsv', sep='\t', index=False)

In [54]:
# Count barcodes (read slice [20:32]) in every mRNA replicate, keeping only reads
# flanked by 'GACAGACC' on the left and 'CATATGGA' on the right, then display the table.
mRNA_df = process_sequences(mRNA_sequences, le='GACAGACC', ri='CATATGGA', start=20, end=32, seq_type='mRNA')
mRNA_df

Processing replicate 1...
Processing replicate 2...
Processing replicate 3...
Processing replicate 4...
✅ Finished processing!


Unnamed: 0,sequence,mRNA_rep1,mRNA_rep2,mRNA_rep3,mRNA_rep4
0,CCCCACCCAAGC,2664,2831,3139,2462
1,TCTATGCATGCA,86,31,0,99
2,ATCACGGAAAAC,1576,2110,1725,2124
3,AGTGTGGCTGTT,1094,990,883,1171
4,TTCAGCACGAAT,1014,836,882,876
...,...,...,...,...,...
306492,GTGTACACAGTT,0,0,0,1
306493,CGGGGCAACAAG,0,0,0,1
306494,TGTGAGCAGTCA,0,0,0,1
306495,GGGTACAAATGC,0,0,0,1


In [55]:
# Write the mRNA barcode counts as a tab-separated file (relative path, row index omitted).
mRNA_df.to_csv('barcode_mRNA_counts_upstream.tsv', sep='\t', index=False)