In [2]:
import csv
from collections import Counter
from itertools import compress
from os import listdir
from os.path import join

import numpy as np
import pandas as pd

In [3]:
def readin_fastq(core_path, filename):
    """Read the nucleotide sequences out of a FASTQ file.

    A FASTQ record spans four lines (header, sequence, '+' separator,
    quality string); only the sequence line of each record is kept.

    Parameters
    ----------
    core_path : str
        Directory that contains the file.
    filename : str
        Name of the FASTQ file inside ``core_path``. pandas infers the
        compression from the extension, so ``.fastq.gz`` files work directly.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) with one sequence per row.
        Row labels are the original line numbers 1, 5, 9, ...
    """
    seqsraw = pd.read_csv(
        join(core_path, filename),
        sep="\t",        # FASTQ lines contain no tabs, so every line becomes one field
        header=None,
        # Quality strings may legitimately start with '"' (Phred+33 score 1); with the
        # default quoting that would open a quoted field spanning several lines and
        # shift the 4-line record framing.
        quoting=csv.QUOTE_NONE,
        # Keep every line as literal text: a read such as "NA" must not become NaN.
        na_filter=False,
    )
    # Rows 1, 5, 9, ... are the sequence lines. Indexing with a range (rather than a
    # slice) returns a copy, so the headers/quality lines can be garbage-collected.
    seqs = seqsraw.iloc[range(1, seqsraw.shape[0], 4), :]

    return seqs

In [4]:
# Directory whose contents are listed below to find the gDNA / mRNA FASTQ files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_path = '/project2/kribelba_1515/data/EXTRA_seq/2024_data/endoSTARR_lib5p3'

In [5]:
# os.listdir() returns entries in arbitrary order, but the replicate columns built
# later are labelled by list position (rep1, rep2, ...). Sorting the names makes the
# file -> replicate assignment deterministic (plain lexicographic order).
files = sorted(listdir(data_path))
files

['gDNA_rep1_19_10cyc_5p3_endoSTARR_4p75_S7_R1_001.fastq.gz',
 'gDNA_rep2_19_10cyc_5p3_endoSTARR_4p75_S8_R1_001.fastq.gz',
 'gDNA_rep3_20_10cyc_5p3_endoSTARR_4p75_S11_R1_001.fastq.gz',
 'mRNA_rep1_21_10cyc_5p3_endoSTARR_4p75_S9_R1_001.fastq.gz',
 'mRNA_rep2_21_10cyc_5p3_endoSTARR_4p75_S10_R1_001.fastq.gz',
 'mRNA_rep3_22_10cyc_5p3_endoSTARR_4p75_S12_R1_001.fastq.gz',
 'mapping']

In [6]:
# Split the directory listing by the sample-type tag contained in each file name
files_gDNA = [name for name in files if "gDNA" in name]
files_mRNA = [name for name in files if "mRNA" in name]

print("gDNA Files:", files_gDNA)
print("mRNA Files:", files_mRNA)

gDNA Files: ['gDNA_rep1_19_10cyc_5p3_endoSTARR_4p75_S7_R1_001.fastq.gz', 'gDNA_rep2_19_10cyc_5p3_endoSTARR_4p75_S8_R1_001.fastq.gz', 'gDNA_rep3_20_10cyc_5p3_endoSTARR_4p75_S11_R1_001.fastq.gz']
mRNA Files: ['mRNA_rep1_21_10cyc_5p3_endoSTARR_4p75_S9_R1_001.fastq.gz', 'mRNA_rep2_21_10cyc_5p3_endoSTARR_4p75_S10_R1_001.fastq.gz', 'mRNA_rep3_22_10cyc_5p3_endoSTARR_4p75_S12_R1_001.fastq.gz']


In [7]:
# Load the sequence lines of every gDNA file: one DataFrame per file, in list order
gDNA_sequences = []
for fastq_name in files_gDNA:
    print(f"Reading gDNA file: {fastq_name} ...")
    replicate_reads = readin_fastq(data_path, fastq_name)
    gDNA_sequences.append(replicate_reads)
    print(f"Finished reading {fastq_name} ✅")

Reading gDNA file: gDNA_rep1_19_10cyc_5p3_endoSTARR_4p75_S7_R1_001.fastq.gz ...
Finished reading gDNA_rep1_19_10cyc_5p3_endoSTARR_4p75_S7_R1_001.fastq.gz ✅
Reading gDNA file: gDNA_rep2_19_10cyc_5p3_endoSTARR_4p75_S8_R1_001.fastq.gz ...
Finished reading gDNA_rep2_19_10cyc_5p3_endoSTARR_4p75_S8_R1_001.fastq.gz ✅
Reading gDNA file: gDNA_rep3_20_10cyc_5p3_endoSTARR_4p75_S11_R1_001.fastq.gz ...
Finished reading gDNA_rep3_20_10cyc_5p3_endoSTARR_4p75_S11_R1_001.fastq.gz ✅


In [8]:
# Inspect the loaded gDNA data: a list holding one DataFrame of reads per file
gDNA_sequences

[                                                          0
 1         GTGGCNTAACCAGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
 5         TCTCTNACATGCGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
 9         CGTTCNGATCAGGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
 13        GTAAANCTGGTCGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
 17        AGGGANCGGCATGCTGTCGGATCCGTAAAAGTGAAGAACGGCGCCC...
 ...                                                     ...
 32342449  TTCAGGCATTCCGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
 32342453  ATTCGTGCGCCGGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
 32342457  GGTCTCTCTATGGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
 32342461  CATCCTGATATGGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
 32342465  GCCCCCCATATTGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
 
 [8085617 rows x 1 columns],
                                                           0
 1         GCGAGNCTCACAGCTGTCGGATCCGTAAAAGTGAAGAACGGCGCCC...
 5         GAGTGNATCCGGGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
 9         TGAAANACCACAGCTGTCGGATCCGTCATGAAACTATCGTCCG

In [9]:
# Load the sequence lines of every mRNA file: one DataFrame per file, in list order
mRNA_sequences = []
for fastq_name in files_mRNA:
    print(f"Reading mRNA file: {fastq_name} ...")
    replicate_reads = readin_fastq(data_path, fastq_name)
    mRNA_sequences.append(replicate_reads)
    print(f"Finished reading {fastq_name} ✅")

Reading mRNA file: mRNA_rep1_21_10cyc_5p3_endoSTARR_4p75_S9_R1_001.fastq.gz ...
Finished reading mRNA_rep1_21_10cyc_5p3_endoSTARR_4p75_S9_R1_001.fastq.gz ✅
Reading mRNA file: mRNA_rep2_21_10cyc_5p3_endoSTARR_4p75_S10_R1_001.fastq.gz ...
Finished reading mRNA_rep2_21_10cyc_5p3_endoSTARR_4p75_S10_R1_001.fastq.gz ✅
Reading mRNA file: mRNA_rep3_22_10cyc_5p3_endoSTARR_4p75_S12_R1_001.fastq.gz ...
Finished reading mRNA_rep3_22_10cyc_5p3_endoSTARR_4p75_S12_R1_001.fastq.gz ✅


In [10]:
# Inspect the loaded mRNA data: a list holding one DataFrame of reads per file
mRNA_sequences

[                                                          0
 1         GCAACNATAATAGCTGTCGGATCCGTAAAAGTGAAGAACGGCGCCC...
 5         TAAGGNACTGTGGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
 9         TATATNTTTAAAGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
 13        GCAACNATAATAGCTGTCGGATCCGTAAAAGTGAAGAACGGCGCCC...
 17        CCCAGNCAGCATGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
 ...                                                     ...
 33731997  GCAACAATAATAGCTGTCGGATCCGTAAAAGTGAAGAACGGCGCCC...
 33732001  GAGAATAGCTTTGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
 33732005  GATGCCCGCCGGGCTGTCGGATCCGTAAAAGCGAAGAACGGCGCCC...
 33732009  GATATACCTGTCGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
 33732013  TGGCTTCTGCATGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
 
 [8433004 rows x 1 columns],
                                                           0
 1         GCAACNATAATAGCTGTCGGATCCGTAAAAGTGAAGAACGGCGCCC...
 5         ACACTNAACCACGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
 9         TGCTANAGCTTAGCTGTCGGATCCGTTGGGGTGCACGTCCCCT

In [11]:
# Load gDNA replicate 1 on its own for the sanity checks below. Go through the shared
# reader instead of repeating its read_csv / every-4th-line logic inline, so both code
# paths always parse FASTQ files the same way.
f = readin_fastq(data_path, 'gDNA_rep1_19_10cyc_5p3_endoSTARR_4p75_S7_R1_001.fastq.gz')
f

Unnamed: 0,0
1,GTGGCNTAACCAGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
5,TCTCTNACATGCGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
9,CGTTCNGATCAGGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
13,GTAAANCTGGTCGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
17,AGGGANCGGCATGCTGTCGGATCCGTAAAAGTGAAGAACGGCGCCC...
...,...
32342449,TTCAGGCATTCCGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...
32342453,ATTCGTGCGCCGGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
32342457,GGTCTCTCTATGGCTGTCGGATCCGTCATGAAACTATCGTCCGAGC...
32342461,CATCCTGATATGGCTGTCGGATCCGTTGGGGTGCACGTCCCCTATG...


In [12]:
# Sanity check: every read is expected to be exactly 75 nucleotides long
count = sum(1 for seq in f.iloc[:, 0] if len(seq) == 75)

print(count == len(f.iloc[:, 0]))

True


In [63]:
# Not every read carries the same constant sequence right after the barcode:
# tally reads whose positions 12-25 match the expected flank versus those that do not
count = sum(seq[12:26] == 'GCTGTCGGATCCGT' for seq in f.iloc[:, 0])
count2 = len(f.iloc[:, 0]) - count
print('number of sequences with the same flanking sequence: ', count, ' | number of sequences with a different flanking sequence: ', count2)

number of sequences with the same flanking sequence:  7935062  | number of sequences with a different flanking sequence:  150555


In [72]:
def extract_barcode_with_flanks(sequence, start, end, ri_length=14):
    """Split a read into its barcode and the stretch directly following it.

    Parameters
    ----------
    sequence : str
        The read to slice.
    start, end : int
        Slice bounds of the barcode (``sequence[start:end]``).
    ri_length : int, optional
        Number of characters taken after the barcode as the right flank.

    Returns
    -------
    tuple
        ``(barcode, right_flank)``; either part is shorter than requested when
        the read ends early.
    """
    flank_stop = end + ri_length
    return sequence[start:end], sequence[end:flank_stop]

def process_sequences(sequences, ri, start, end, seq_type='gDNA'):
    """Count barcodes per replicate, keeping only reads with the expected right flank.

    For every read the barcode is ``read[start:end]`` and the right flank is the
    ``len(ri)`` characters that follow it. Reads whose right flank differs from
    ``ri`` are discarded before counting.

    Parameters
    ----------
    sequences : list of pandas.DataFrame
        One frame per replicate; reads are taken from the first column.
    ri : str
        Sequence that must directly follow the barcode for a read to be counted.
    start, end : int
        Slice bounds of the barcode within each read.
    seq_type : str, optional
        Prefix of the count columns (``'{seq_type}_rep1'``, ``'{seq_type}_rep2'``, ...).

    Returns
    -------
    pandas.DataFrame
        Column ``'sequence'`` (the barcode) plus one count column per replicate.
        Rows appear in order of first occurrence across the replicates; a barcode
        that is absent from a replicate has count 0 there.
    """
    n_replicates = len(sequences)
    # Derive the flank window from ``ri`` itself so a flank of any length is compared
    # against a slice of the same length (a fixed 14-character window would make every
    # comparison fail for a shorter or longer ``ri``).
    flank_end = end + len(ri)

    sequences_dict = {}  # key: barcode, value: list with one count per replicate

    for replicate_num, df in enumerate(sequences):
        print(f'Processing replicate {replicate_num + 1}...')  # progress indicator

        # count each barcode among the reads whose right flank matches ``ri``
        barcode_counts = Counter(
            seq[start:end]
            for seq in df.iloc[:, 0]
            if seq[end:flank_end] == ri
        )

        # store the counts; barcodes seen for the first time start with zeros everywhere
        for barcode, count in barcode_counts.items():
            if barcode not in sequences_dict:
                sequences_dict[barcode] = [0] * n_replicates
            sequences_dict[barcode][replicate_num] = count

    # convert dict to dataframe
    seq_columns = ['sequence'] + [f'{seq_type}_rep{i + 1}' for i in range(n_replicates)]
    seq_data = [[barcode, *counts] for barcode, counts in sequences_dict.items()]
    seq_df = pd.DataFrame(seq_data, columns=seq_columns)

    print("✅ Finished processing!")
    return seq_df

In [83]:
# Per-replicate gDNA barcode counts: barcode = read[0:12], counted only when it is
# directly followed by the constant flank
gDNA_df = process_sequences(
    gDNA_sequences,
    ri='GCTGTCGGATCCGT',
    start=0,
    end=12,
    seq_type='gDNA',
)
gDNA_df

Unnamed: 0,sequence,gDNA_rep1,gDNA_rep2,gDNA_rep3
0,GTGGCNTAACCA,1,0,0
1,TCTCTNACATGC,1,0,0
2,CGTTCNGATCAG,1,0,0
3,GTAAANCTGGTC,1,0,0
4,AGGGANCGGCAT,2,0,0
...,...,...,...,...
378319,CAAACCACGGAC,0,0,1
378320,AGACGCTATCAG,0,0,1
378321,CATAGCAAGAGC,0,0,1
378322,TGGCCGGGAAGC,0,0,1


In [84]:
# Save the gDNA count table as a tab-separated file, without the row index
gDNA_df.to_csv(
    'barcode_gDNA_counts_dataset2.tsv',
    sep='\t',
    index=False,
)

In [85]:
# Per-replicate mRNA barcode counts: barcode = read[0:12], counted only when it is
# directly followed by the constant flank
mRNA_df = process_sequences(
    mRNA_sequences,
    ri='GCTGTCGGATCCGT',
    start=0,
    end=12,
    seq_type='mRNA',
)
mRNA_df

Processing replicate 1...
Processing replicate 2...
Processing replicate 3...
✅ Finished processing!


Unnamed: 0,sequence,mRNA_rep1,mRNA_rep2,mRNA_rep3
0,GCAACNATAATA,770,1403,631
1,TAAGGNACTGTG,19,26,8
2,TATATNTTTAAA,3,0,0
3,CCCAGNCAGCAT,8,12,5
4,CACGCNAAGTAA,1,0,1
...,...,...,...,...
205857,TTACAATCTAAG,0,0,1
205858,CGCTCGCGGAGT,0,0,1
205859,ACTTTATTCACG,0,0,1
205860,TCTAGTACCTTC,0,0,1


In [86]:
# Save the mRNA count table as a tab-separated file, without the row index
mRNA_df.to_csv(
    'barcode_mRNA_counts_dataset2.tsv',
    sep='\t',
    index=False,
)