In [None]:
import pandas as pd
import re
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Global variables -- edit every line marked UPDATE before running the notebook
Lib_Name = 'Lib_1' #UPDATE with library name or base name you want for your files
Fig_Format = 'jpeg' #UPDATE default graph file format

fastq_file_path = 'myseqdata.fastq'#UPDATE with file path to your paired seq reads (fastq)

design_file_txt = 'list_of_my_Designed_seq.txt'#UPDATE with design file containing your list of designed DNA seq or tiles as a txt file (one sequence per line)

Threshold = 5 #UPDATE minimum number of reads a Cat (BC1-BC2-BC3-Designed combination) needs in order to be kept

Output_Directory = f'{Lib_Name}_Maps_and_Graphs' #UPDATE name of the output directory that receives every file written by this notebook
os.makedirs(Output_Directory, exist_ok=True)

# Running summary table: later cells append a label to 'Category' and the matching value to 'Read Count'
summary_dict = {'Category': [], 'Read Count': []}

In [None]:
def find_designed(des, left_primer="CCCAGCTTAAGCCACCATG", right_primer="GGATCCGAGCTCG"):
    """Create a lookup dictionary of all designed tiles listed in a text file.

    Each line of the file is treated as one designed sequence. Every
    occurrence of ``left_primer`` is deleted from the line, everything from
    the first occurrence of ``right_primer`` onwards is discarded, and the
    remainder is stripped of surrounding whitespace.

    Lines that are empty after trimming (for example blank lines or a
    trailing newline at the end of the file) are skipped, so the empty
    string is never registered as a designed tile.

    Parameters
    ----------
    des : str
        Path to the design file, one sequence per line.
    left_primer : str, optional
        Sequence removed from every line. #UPDATE with the left sequence you
        want to trim from all tiles, usually the primer/adapter part that is
        not part of the unique design.
    right_primer : str, optional
        Sequence that marks the end of the tile; it AND everything after it
        are removed. #UPDATE with the right sequence you want to trim.

    Returns
    -------
    dict
        Maps each trimmed tile to ``1`` (used only for membership tests).
    """
    designed_tiles = {}
    with open(des, 'r') as f_des:
        for line in f_des:
            # Remove the left primer wherever it occurs
            left_trimmed = line.replace(left_primer, "")

            # Keep only what precedes the right sequence
            tile = left_trimmed.split(right_primer)[0].strip()

            # A blank line would otherwise add "" as a "designed" tile
            if tile:
                designed_tiles[tile] = 1
    return designed_tiles


In [None]:
def getmid(seq, pre, post):
    """Extract the part of ``seq`` that lies between the flanks ``pre`` and ``post``.

    Both flanks are matched as literal substrings: they are escaped before
    being placed in the regular expression, so characters such as ``.``,
    ``+`` or ``(`` in a flank cannot change the pattern or make it invalid.

    The capture is greedy: when ``post`` occurs more than once after
    ``pre``, the returned text runs up to the LAST occurrence of ``post``.

    Returns the captured text, or the sentinel ``"X"`` when the flanks are
    not found in this order.
    """
    pattern = f"{re.escape(pre)}(.*){re.escape(post)}"
    match = re.search(pattern, seq)
    return match.group(1) if match else "X"

def _iter_fastq_sequences(handle):
    """Yield the stripped sequence line of every FASTQ record in ``handle``."""
    lines = iter(handle)
    line = next(lines, None)
    while line is not None:
        if not line.startswith('@'):
            line = next(lines, None)
            continue
        seq_line = next(lines, None)
        if seq_line is None:
            # Header without a sequence line (truncated file): stop cleanly
            break
        yield seq_line.strip()
        line = next(lines, None)
        if line is not None and line.startswith('+'):
            # Skip the quality line without inspecting it: quality strings
            # may legitimately begin with '@' and must not be read as headers
            next(lines, None)
            line = next(lines, None)


def tilebc_mapper(readfile, dtd, t_len=6, bc1_len=9, bc3_len=16, designed_len=162, #UPDATE change to all the appropriate flanking seq and lengths 
                  bc1_pre="GAGCTCGCTAGC", bc1_post="CTCGAGATAA",
                  bc2_pre="CTCGAGATAACTTCGTATAATGTATGCTAT", bc2_post="GGCCGGCCATAGGGCCCC",
                  bc3_pre="GGCCGGCCATAGGGCCCC", bc3_post="GCGGTCCA",
                  designed_pre="CACCATG", designed_post="GGATCCG"): 
    """Map BC1, BC2, BC3 and the designed sequence of every read in a FASTQ file.

    For each record, the text between each pair of flanking sequences is
    extracted with ``getmid`` (which returns ``"X"`` when the flanks are
    missing). A barcode gets quality 1 when its length equals the expected
    length, otherwise 0. The designed sequence gets quality 1 when it is a
    key of ``dtd``, otherwise 0.

    Parameters
    ----------
    readfile : str
        Path to the FASTQ file.
    dtd : dict
        Lookup of designed tiles; only membership is tested.
    t_len, bc1_len, bc3_len : int
        Expected lengths of BC2, BC1 and BC3 respectively.
    designed_len : int
        Currently unused; kept so existing calls keep working.
    *_pre, *_post : str
        Flanking sequences on the left / right of each element.

    Returns
    -------
    pandas.DataFrame
        One row per read with the columns Reads, BC1, BC1 Len, BC1 Qual,
        BC2, BC2 Len, BC2 Qual, BC3, BC3 Len, BC3 Qual, Designed,
        Designed Len and Designed Qual.
    """
    # (output column prefix, left flank, right flank, expected length)
    barcode_specs = (
        ("BC1", bc1_pre, bc1_post, bc1_len),
        ("BC2", bc2_pre, bc2_post, t_len),
        ("BC3", bc3_pre, bc3_post, bc3_len),
    )

    # Insertion order of this dict is the column order of the result
    columns = {"Reads": []}
    for name, _pre, _post, _expected in barcode_specs:
        columns[name] = []
        columns[f"{name} Len"] = []
        columns[f"{name} Qual"] = []
    columns["Designed"] = []
    columns["Designed Len"] = []
    columns["Designed Qual"] = []

    with open(readfile, 'r') as fin:
        for seq in _iter_fastq_sequences(fin):
            columns["Reads"].append(seq)

            for name, pre, post, expected_len in barcode_specs:
                barcode = getmid(seq, pre, post)
                columns[name].append(barcode)
                columns[f"{name} Len"].append(len(barcode))
                columns[f"{name} Qual"].append(1 if len(barcode) == expected_len else 0)

            designed = getmid(seq, designed_pre, designed_post)
            columns["Designed"].append(designed)
            columns["Designed Len"].append(len(designed))
            columns["Designed Qual"].append(1 if designed in dtd else 0)

    return pd.DataFrame(columns)



In [None]:
def process_maps(input_file, design_file):
    """Build the designed-tile lookup and map every read of ``input_file``.

    Returns a tuple ``(read map DataFrame, designed-tile lookup dict)``.
    """
    lookup = find_designed(design_file)
    return tilebc_mapper(input_file, lookup), lookup

In [None]:
# Run the mapping: map1 holds one row per read, designed_tile_dict is the designed-tile lookup
map1, designed_tile_dict = process_maps(fastq_file_path, design_file_txt)

### Output df has all reads from the seqfile

`map1` is the original, unfiltered df: one row per read in the FASTQ file, including reads that fail the quality checks.

In [None]:
# Export the map1 look-up table (one row per read)
map1.to_csv(os.path.join(Output_Directory, f'{Lib_Name}_map1.csv'), index=False)

# An .xlsx sheet holds at most 1,048,576 rows (one of them is the header) and
# pandas raises ValueError beyond that, so very large read sets are exported as CSV only
if len(map1) < 1_048_576:
    map1.to_excel(os.path.join(Output_Directory, f'{Lib_Name}_map1.xlsx'), index=False)
else:
    print(f'map1 has {len(map1)} rows, too many for one Excel sheet; skipped the .xlsx export (use the CSV).')

In [None]:
# Display map1
map1

In [None]:
# The cells below run a series of analyses on the look-up table (map1) that was created above

In [None]:
# Count the reads whose BC1 has the expected length (BC1 Qual == 1)
count_rows_a = map1['BC1 Qual'].eq(1).sum()

summary_dict['Category'].append('Reads with Correct BC1 Length')
summary_dict['Read Count'].append(count_rows_a)

print("Number of rows with 1 in BC1 Qual :", count_rows_a)

In [None]:
# Count the reads whose BC2 has the expected length (BC2 Qual == 1)
count_rows_t = map1['BC2 Qual'].eq(1).sum()

summary_dict['Category'].append('Reads with Correct BC2 Length')
summary_dict['Read Count'].append(count_rows_t)

print("Number of rows with 1 in BC2 Qual :", count_rows_t)

In [None]:
# Count the reads whose BC3 has the expected length (BC3 Qual == 1)
count_rows_r = map1['BC3 Qual'].eq(1).sum()

summary_dict['Category'].append('Reads with Correct BC3 Length')
summary_dict['Read Count'].append(count_rows_r)

print("Number of rows with 1 in BC3 Qual :", count_rows_r)

In [None]:
# Count the reads where both BC1 and BC2 have the expected length
count_rows_one = int((map1['BC1 Qual'].eq(1) & map1['BC2 Qual'].eq(1)).sum())

summary_dict['Category'].append('Rows with correct BC2 length and BC1BC length')
summary_dict['Read Count'].append(count_rows_one)

print("Number of rows with 1 in BC1 Qual and BC2 Qual:", count_rows_one)

In [None]:
# Count the reads where both BC2 and BC3 have the expected length
count_rows_2 = int((map1['BC2 Qual'].eq(1) & map1['BC3 Qual'].eq(1)).sum())

summary_dict['Category'].append('Rows with correct BC2 length and BC3 length')
summary_dict['Read Count'].append(count_rows_2)

print("Number of rows with 1 in BC2 Qual and BC3 Qual:", count_rows_2)

In [None]:
# Count the reads where both BC1 and BC3 have the expected length
count_rows_3 = int((map1['BC1 Qual'].eq(1) & map1['BC3 Qual'].eq(1)).sum())

summary_dict['Category'].append('Rows with correct BC1 length and BC3 length')
summary_dict['Read Count'].append(count_rows_3)

print("Number of rows with 1 in BC1 Qual and BC3 Qual:", count_rows_3)

In [None]:
# Count the reads where BC1, BC2 and BC3 all have the expected length
count_rows_4 = int(map1[['BC1 Qual', 'BC2 Qual', 'BC3 Qual']].eq(1).all(axis=1).sum())

summary_dict['Category'].append('Rows with correct BC1 length, BC2 length, and BC3 length')
summary_dict['Read Count'].append(count_rows_4)

print("Number of rows with 1 in BC1 Qual, BC2 Qual, and BC3 Qual:", count_rows_4)

In [None]:
# Count the reads where BC1, BC2 and BC3 have the expected length AND the designed sequence is in the design file
count_rows_5 = int(map1[['BC1 Qual', 'BC3 Qual', 'BC2 Qual', 'Designed Qual']].eq(1).all(axis=1).sum())

summary_dict['Category'].append('Rows with correct BC1 length, BC2 length, AD BC3 length AND in Designed')
summary_dict['Read Count'].append(count_rows_5)

print("Number of rows with 1 in BC1 Qual, BC2 Qual, BC3 Qual, and Designed Qual:", count_rows_5)

In [None]:
# Histogram of the extracted BC1 lengths
fig, ax = plt.subplots()
ax.hist(map1['BC1 Len'], bins=100)
ax.set_xlim(0, 12) #UPDATE if your bc1 length won't fit in this range
ax.set_title(f'BC1 Length Frequency {Lib_Name}')
ax.set_xlabel('BC1 Length')
ax.set_ylabel('Frequency')
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC1_length.{Fig_Format}'))
plt.show()

In [None]:
# Histogram of the BC1 quality flag (0 = unexpected length, 1 = expected length)
fig, ax = plt.subplots()
ax.hist(map1['BC1 Qual'])
ax.set_title(f'BC1 Quality Frequency {Lib_Name}')
ax.set_xlabel('BC1 Qual')
ax.set_ylabel('Frequency')
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC1_Quality.{Fig_Format}'))
plt.show()

In [None]:
# Histogram of the extracted BC2 lengths
fig, ax = plt.subplots()
ax.hist(map1['BC2 Len'])
ax.set_xlim(0, 10) #UPDATE if your tile length won't fit in this range
ax.set_title(f'BC2 Length Frequency {Lib_Name}')
ax.set_xlabel('BC2 Length')
ax.set_ylabel('Frequency')
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC2_Length.{Fig_Format}'))
plt.show()


In [None]:
# Histogram of the BC2 quality flag (0 = unexpected length, 1 = expected length)
fig, ax = plt.subplots()
ax.hist(map1['BC2 Qual'])
ax.set_title(f'BC2 Quality Frequency {Lib_Name}')
ax.set_xlabel('BC2 Qual')
ax.set_ylabel('Frequency')
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC2_Quality.{Fig_Format}'))
plt.show()

In [None]:
# Histogram of the BC3 quality flag (0 = unexpected length, 1 = expected length)
fig, ax = plt.subplots()
ax.hist(map1['BC3 Qual'])
ax.set_title(f'BC3 Quality Frequency {Lib_Name}')
ax.set_xlabel('BC3 Qual')
ax.set_ylabel('Frequency')
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC3_Quality.{Fig_Format}'))
plt.show()

In [None]:
# Histogram of the extracted BC3 lengths
fig, ax = plt.subplots()
ax.hist(map1['BC3 Len'], bins=100)
ax.set_xlim(0, 20) #UPDATE if your barcode length won't fit in this range
ax.set_title(f'BC3 Length Frequency {Lib_Name}')
ax.set_xlabel('BC3 Length')
ax.set_ylabel('Frequency')
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC3_length.{Fig_Format}'))
plt.show()

### Filtering out reads that either do not have the consensus sequences to find the tiles/BCs OR have tiles/BCs of unexpected lengths

map2 is the df with only tiles that match designed and where BC lengths are as expected. All length and quality columns are dropped, as well as the column that stored the full read strings.

In [None]:

# Replace every 0 in map1 with NaN and drop the rows that contain one. This removes
# the reads with any Qual flag of 0 (and any read with a zero-length extraction).
map1_nans = map1.replace(0, np.nan)
map2 = map1_nans.dropna().reset_index(drop=True)

# Keep only the sequence columns (BC1, BC2, BC3, Designed); the read strings and the
# length / quality columns are no longer needed
clabels = ['Reads', 'BC2 Len','BC2 Qual', 'BC1 Len','BC1 Qual', 'BC3 Qual', 'BC3 Len', 'Designed Len', 'Designed Qual']
map2 = map2.drop(clabels, axis = 1)


# how many reads are lost? rc = readcount
rcmap1 = map1.shape[0]
rcmap2 = map2.shape[0]
# Guard against an empty map1 (no reads parsed), which would divide by zero
diffpct = ((rcmap1 - rcmap2) / rcmap1)*100 if rcmap1 else 0.0
print("% Reads lost:")
print(diffpct)

# Record the size of map1 (all reads) before the filtered section starts;
# the filtered row count is recorded later as 'Map3 Shape'
summary_dict['Category'].append('Map1 Shape')
summary_dict['Read Count'].append(rcmap1)

summary_dict['Category'].append('New Section')
summary_dict['Read Count'].append('Quality = 0 rows removed')

### Coverage
map3 is a df with tiles, BCs, and combos of tiles and BCs

In [None]:
# Display map2 (the quality-filtered reads)
map2

In [None]:
# Work on a copy so that the columns added to map3 below do not modify map2
map3 = map2.copy()
map3.head()

In [None]:
# Add '-'-joined combination columns to map3:
#   Cat = BC1-BC2-BC3-Designed, HA = BC1-BC2, HAR = BC1-BC2-BC3
cat_parts = {
    'Cat': ['BC1', 'BC2', 'BC3', 'Designed'],
    'HA': ['BC1', 'BC2'],
    'HAR': ['BC1', 'BC2', 'BC3'],
}
for new_col, (first, *rest) in cat_parts.items():
    map3[new_col] = map3[first].str.cat([map3[col] for col in rest], sep='-')

summary_dict['Category'].append('Map3 Shape')
summary_dict['Read Count'].append(len(map3))


In [None]:
# Display map3 (map2 plus the Cat / HA / HAR combination columns)
map3

In [None]:
# Read count of every unique Cat (BC1-BC2-BC3-Designed combination); one row per unique Cat
tbcov = map3['Cat'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC1 + BC2 + BC3 + Designed Tile coverage')
summary_dict['Read Count'].append(len(tbcov))

print(f'number unique BC1 + BC2 + BC3+ Designed Tile combos:{len(tbcov)}')



In [None]:
# Histogram of reads per unique Cat (BC1-BC2-BC3-Designed combination).
# The counts come straight from value_counts(): after .to_frame().reset_index() the
# column named 'Cat' holds the counts only on pandas < 2.0; on pandas >= 2.0 it holds
# the Cat strings and the counts are in a 'count' column.
cat_read_counts = map3['Cat'].value_counts()
plt.figure()
plt.title(f'Unique BC1 + BC2 + BC3 + Designed Tile Read Coverage Frequency {Lib_Name}')
plt.hist(cat_read_counts.to_numpy(), bins=75)
plt.xlabel('Coverage')
plt.ylabel('Counts')
plt.savefig(os.path.join(Output_Directory, f'{Lib_Name}_Cat_cov.{Fig_Format}'))
plt.show()

In [None]:
# Histogram of reads per unique Cat on log-log axes.
# The counts come straight from value_counts(): after .to_frame().reset_index() the
# column named 'Cat' holds the counts only on pandas < 2.0; on pandas >= 2.0 it holds
# the Cat strings and the counts are in a 'count' column.
cat_read_counts = map3['Cat'].value_counts()
plt.figure()
plt.title(f'Unique BC1 + BC2 + BC3 + Designed Tile Read Coverage Frequency log scale {Lib_Name}')
plt.hist(cat_read_counts.to_numpy(), bins=75)
plt.xlabel('Coverage')
plt.ylabel('Counts')
plt.xscale('log')
plt.yscale('log')
plt.savefig(os.path.join(Output_Directory, f'{Lib_Name}_Cat_log_cov.{Fig_Format}'))
plt.show()

In [None]:
# Read count of every unique BC1; one row per unique BC1
abcov = map3['BC1'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC1')
summary_dict['Read Count'].append(len(abcov))

print(f'Unique BC1: {len(abcov)}')


In [None]:
# Histogram of reads per unique BC1.
# The counts come straight from value_counts(): after .to_frame().reset_index() the
# column named 'BC1' holds the counts only on pandas < 2.0; on pandas >= 2.0 it holds
# the barcode strings and the counts are in a 'count' column.
bc1_read_counts = map3['BC1'].value_counts()
plt.hist(bc1_read_counts.to_numpy(), bins=100)
plt.title(f'Unique BC1BC Read Coverage Frequency {Lib_Name}')
plt.xlabel('Coverage')
plt.ylabel('Counts')
plt.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC1_cov.{Fig_Format}'))
plt.show()

In [None]:
# Read count of every unique BC2; one row per unique BC2
tcov = map2['BC2'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC2 coverage')
summary_dict['Read Count'].append(len(tcov))

print (f'Unique BC2s: {len(tcov)}')

In [None]:
# Histogram of reads per unique BC2.
# The counts come straight from value_counts(): after .to_frame().reset_index() the
# column named 'BC2' holds the counts only on pandas < 2.0; on pandas >= 2.0 it holds
# the barcode strings and the counts are in a 'count' column.
bc2_read_counts = map2['BC2'].value_counts()
plt.hist(bc2_read_counts.to_numpy(), bins=100)
plt.title(f'Unique BC2 Coverage Frequency {Lib_Name}')
plt.xlabel('Coverage')
plt.ylabel('Counts')
plt.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC2_cov.{Fig_Format}'))
plt.show()

In [None]:
# Histogram of reads per unique BC2 on log-log axes.
# The counts come straight from value_counts(): after .to_frame().reset_index() the
# column named 'BC2' holds the counts only on pandas < 2.0; on pandas >= 2.0 it holds
# the barcode strings and the counts are in a 'count' column.
bc2_read_counts = map2['BC2'].value_counts()
plt.hist(bc2_read_counts.to_numpy(), bins=100)
plt.title(f'Unique BC2 Coverage Frequency LOG {Lib_Name}')
plt.xlabel('Coverage')
plt.ylabel('Counts')
plt.xscale('log')
plt.yscale('log')
plt.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC2_log_cov.{Fig_Format}'))
plt.show()

In [None]:
# Read count of every unique BC3; one row per unique BC3
Rbcov = map3['BC3'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC3')
summary_dict['Read Count'].append(len(Rbcov))

print(f'Unique BC3s: {len(Rbcov)}')


In [None]:
# Read count of every unique designed tile in map3; one row per unique tile
tcovArd3t = map3['Designed'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Map3 Unique Tiles ')
summary_dict['Read Count'].append(len(tcovArd3t))

print (f'Unique Tile Map3: {len(tcovArd3t)}')

In [None]:
# Histogram of reads per unique BC3.
# The counts come straight from value_counts(): after .to_frame().reset_index() the
# column named 'BC3' holds the counts only on pandas < 2.0; on pandas >= 2.0 it holds
# the barcode strings and the counts are in a 'count' column.
bc3_read_counts = map3['BC3'].value_counts()
plt.hist(bc3_read_counts.to_numpy(), bins=100)
plt.title(f'Unique BC3 Read Coverage Frequency {Lib_Name}')
plt.xlabel('Coverage')
plt.ylabel('Counts')
plt.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC3_cov.{Fig_Format}'))
plt.show()

In [None]:
# Histogram of reads per unique BC3 on log-log axes.
# The counts come straight from value_counts(): after .to_frame().reset_index() the
# column named 'BC3' holds the counts only on pandas < 2.0; on pandas >= 2.0 it holds
# the barcode strings and the counts are in a 'count' column.
bc3_read_counts = map3['BC3'].value_counts()
plt.hist(bc3_read_counts.to_numpy(), bins=100)
plt.title(f'Unique BC3 Read Coverage Frequency LOG {Lib_Name}')
plt.xlabel('Coverage')
plt.ylabel('Counts')
plt.xscale('log')
plt.yscale('log')
plt.savefig(os.path.join(Output_Directory, f'{Lib_Name}_BC3_cov_log.{Fig_Format}'))
plt.show()

In [None]:
# Number of distinct BC2s observed together with each BC1
adbc_per_bc1 = map2.groupby('BC1')['BC2'].nunique()

# Histogram with one bin per integer count
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(adbc_per_bc1, bins=range(1, adbc_per_bc1.max() + 2), edgecolor='black')
ax.set_title(f'Histogram of Unique BC2s per BC1 {Lib_Name}')
ax.set_xlabel('Number of Unique BC2s')
ax.set_ylabel('Number of BC1')
ax.grid(True)
fig.tight_layout()
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_uniBC2perBC1_cov.{Fig_Format}'))
plt.show()

# Summary statistics of the per-BC1 counts
print(f"Min: {adbc_per_bc1.min()}")
print(f"Max: {adbc_per_bc1.max()}")
print(f"Median: {adbc_per_bc1.median()}")
print(f"Mean: {adbc_per_bc1.mean():.2f}")

# The ten BC1s paired with the most distinct BC2s
top10_adbc_per_bc1 = adbc_per_bc1.sort_values(ascending=False).iloc[:10]
print("\nTop 10 BC1 with highest number of unique BC2s:")
print(top10_adbc_per_bc1)

In [None]:
# Number of distinct BC3s observed together with each HA (BC1-BC2 pair)
bc3_per_ha = map3.groupby('HA')['BC3'].nunique()

# Histogram with one bin per integer count
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(bc3_per_ha, bins=range(1, bc3_per_ha.max() + 2), edgecolor='black')
ax.set_title(f'Histogram of Unique BC3s per HA {Lib_Name}')
ax.set_xlabel('Number of Unique BC3s')
ax.set_ylabel('Number of HA Entries')
ax.grid(True)
fig.tight_layout()
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_uniBC3perHA_cov.{Fig_Format}'))
plt.show()

# Summary statistics of the per-HA counts
print(f"Min: {bc3_per_ha.min()}")
print(f"Max: {bc3_per_ha.max()}")
print(f"Median: {bc3_per_ha.median()}")
print(f"Mean: {bc3_per_ha.mean():.2f}")
# The ten HA entries paired with the most distinct BC3s
top10_bc3_per_ha = bc3_per_ha.sort_values(ascending=False).iloc[:10]
print("\nTop 10 HA entries with highest number of unique BC3s:")
print(top10_bc3_per_ha)


In [None]:
# Number of distinct HA values (BC1-BC2 pairs) observed together with each BC3
ha_per_bc3 = map3.groupby('BC3')['HA'].nunique()

# Histogram with one bin per integer count
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(ha_per_bc3, bins=range(1, ha_per_bc3.max() + 2), edgecolor='black')
ax.set_title(f'Histogram of Unique HA per BC3 {Lib_Name}')
ax.set_xlabel('Number of Unique HA Values')
ax.set_ylabel('Number of BC3 Entries')
ax.grid(True)
fig.tight_layout()
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_uniHAperBC3_cov.{Fig_Format}'))
plt.show()

# Summary statistics of the per-BC3 counts
print(f"Min: {ha_per_bc3.min()}")
print(f"Max: {ha_per_bc3.max()}")
print(f"Median: {ha_per_bc3.median()}")
print(f"Mean: {ha_per_bc3.mean():.2f}")

# The ten BC3s paired with the most distinct HA values
top10_ha_per_bc3 = ha_per_bc3.sort_values(ascending=False).iloc[:10]
print("\nTop 10 BC3s with highest number of unique HA values:")
print(top10_ha_per_bc3)


In [None]:
# Read count of every unique HA (BC1-BC2 pair) in map3; one row per unique HA
tcovA = map3['HA'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC1+BC2 (HA) combos coverage')
summary_dict['Read Count'].append(len(tcovA))

print (f'Unique BC1+BC2 combos: {len(tcovA)}')

In [None]:
# Read count of every unique HAR (BC1-BC2-BC3 triple) in map3; one row per unique HAR
tcovAr = map3['HAR'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC1+BC2+ BC3 (HAR) combos coverage')
summary_dict['Read Count'].append(len(tcovAr))

print (f'Unique BC1+BC2+ BC3 combos: {len(tcovAr)}')

In [None]:
# Export map3 (one row per quality-filtered read)
map3.to_csv(os.path.join(Output_Directory, f'{Lib_Name}_map3.csv'), index=False)

# An .xlsx sheet holds at most 1,048,576 rows (one of them is the header) and
# pandas raises ValueError beyond that, so very large read sets are exported as CSV only
if len(map3) < 1_048_576:
    map3.to_excel(os.path.join(Output_Directory, f'{Lib_Name}_map3.xlsx'), index=False)
else:
    print(f'map3 has {len(map3)} rows, too many for one Excel sheet; skipped the .xlsx export (use the CSV).')

In [None]:
# Section marker in the summary table: the rows after it describe the read-threshold filter
summary_dict['Category'].append('New Section')
summary_dict['Read Count'].append(f'Filter Map3 to keep rows that have {Threshold} minimum # of Reads per Cat')

In [None]:
# Keep only the reads whose Cat occurs at least Threshold times in map3

# Reads per Cat
cat_counts = map3['Cat'].value_counts()

# Cats that reach the threshold
valid_cats = cat_counts.loc[cat_counts.ge(Threshold)].index

# Rows of map3 that belong to one of those Cats
keep_mask = map3['Cat'].isin(valid_cats)
map3min = map3.loc[keep_mask].copy()

print(f"Threshold: {Threshold}")
print(f"Original map3 rows: {len(map3)}")
print(f"Rows kept in map3min: {len(map3min)}")
print(f"Unique Cats kept: {map3min['Cat'].nunique()}")

In [None]:
# Record the number of reads that survive the read-threshold filter
summary_dict['Category'].append('Map3 Filtered by read threshold shape')
summary_dict['Read Count'].append(len(map3min))

In [None]:
# Read count of every unique designed tile left after the read-threshold filter
tcovArd3 = map3min['Designed'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Map3 Filtered by read threshold shape unique Designed Tile ')
summary_dict['Read Count'].append(len(tcovArd3))

print (f'Unique Tile combos Map3: {len(tcovArd3)}')

In [None]:
# Record the number of unique Cats left after the read-threshold filter
summary_dict['Category'].append('Map3 Filtered by read threshold shape unique Cat')
summary_dict['Read Count'].append(map3min['Cat'].nunique())

In [None]:
# Record the number of unique BC1s left after the read-threshold filter
summary_dict['Category'].append('Map3 Filtered by read threshold shape unique BC1')
summary_dict['Read Count'].append(map3min['BC1'].nunique())

In [None]:
# Record the number of unique BC2s left after the read-threshold filter
summary_dict['Category'].append('Map3 Filtered by read threshold shape unique BC2')
summary_dict['Read Count'].append(map3min['BC2'].nunique())

In [None]:
# Record the number of unique HA (BC1-BC2) pairs left after the read-threshold filter
summary_dict['Category'].append('Map3 Filtered by read threshold shape unique HA')
summary_dict['Read Count'].append(map3min['HA'].nunique())

In [None]:
# Record the number of unique HAR (BC1-BC2-BC3) triples left after the read-threshold filter
summary_dict['Category'].append('Map3 Filtered by read threshold shape unique HAR')
summary_dict['Read Count'].append(map3min['HAR'].nunique())

In [None]:
# Section marker in the summary table: the rows after it describe the ambiguous-BC3 filter
summary_dict['Category'].append('New Section')
summary_dict['Read Count'].append('Map3 filtered by read coverage and by removing BC3 that map to multiple HA')

In [None]:
# Remove ambiguous barcodes: every BC3 seen with more than one HA, plus every row
# whose HA appears together with such a BC3

# Distinct HA values per BC3 in the threshold-filtered map
ha_per_bc3 = map3min.groupby('BC3')['HA'].nunique()

# BC3s linked to more than one HA
bad_bc3 = ha_per_bc3.loc[ha_per_bc3.gt(1)].index

# Every HA that occurs with one of those BC3s
bad_ha = map3min.loc[map3min['BC3'].isin(bad_bc3), 'HA'].unique()

# Drop a row when its BC3 is bad or its HA is bad
is_ambiguous = map3min['BC3'].isin(bad_bc3) | map3min['HA'].isin(bad_ha)
map3_filtered = map3min[~is_ambiguous]

# Distinct HA values per BC3 after filtering
ha_per_bc3_filtered = map3_filtered.groupby('BC3')['HA'].nunique()

#NOTE every value should equal 1 because the graph is made after filtering, so each BC3 should map to a single HA
# Histogram with one bin per integer count
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(ha_per_bc3_filtered, bins=range(1, ha_per_bc3_filtered.max() + 2), edgecolor='black')
ax.set_title(f'Histogram of Unique HA per BC3 (Filtered) {Lib_Name}')
ax.set_xlabel('Number of Unique HA Values')
ax.set_ylabel('Number of BC3 Entries')
ax.grid(True)
fig.tight_layout()
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_uniHAperBC3_cov_filtered.{Fig_Format}'))
plt.show()

# Summary statistics of the filtered per-BC3 counts
print(f"Min: {ha_per_bc3_filtered.min()}")
print(f"Max: {ha_per_bc3_filtered.max()}")
print(f"Median: {ha_per_bc3_filtered.median()}")
print(f"Mean: {ha_per_bc3_filtered.mean():.2f}")

# The ten BC3s with the most distinct HA values after filtering
top10_ha_per_bc3 = ha_per_bc3_filtered.sort_values(ascending=False).iloc[:10]
print("\nTop 10 BC3s with highest number of unique HA values (Filtered):")
print(top10_ha_per_bc3)

In [None]:
# Collect the rows that were removed for mapping to multiple things and export them
# to their own df and file so that they can be analyzed further

# Mask for removed rows (bad BC3s or bad HAs)
removed_mask = map3min['BC3'].isin(bad_bc3) | map3min['HA'].isin(bad_ha)

# DataFrame of removed rows
removed_rows = map3min[removed_mask].copy()

# Save the removed rows for inspection. removed_fp has to be defined here:
# the message below reports it and would otherwise raise NameError.
removed_fp = os.path.join(Output_Directory, f'{Lib_Name}_removed_badBC3_HA.csv')
removed_rows.to_csv(removed_fp, index=False)

print(f"Removed rows saved to: {removed_fp}")
print(f"Number of removed rows: {len(removed_rows)}")
print(removed_rows.head(10))  # peek at first 10


In [None]:
# Number of removed reads per Cat (one row per Cat, count in 'Cat_Count')
removed_cats = removed_rows.groupby('Cat').size().reset_index(name='Cat_Count')

# Save the per-Cat summary as CSV and Excel
removed_cats_fp = os.path.join(Output_Directory, f'{Lib_Name}_removed_CatCounts.csv')
removed_cats_fpp = os.path.join(Output_Directory, f'{Lib_Name}_removed_CatCounts.xlsx')
removed_cats.to_csv(removed_cats_fp, index=False)
removed_cats.to_excel(removed_cats_fpp, index=False)

print(f"Removed Cat summary saved to: {removed_cats_fp}")
print(removed_cats.head(10))  # preview first 10
print(len(removed_cats))


In [None]:
# --- Removed Cats ordered from most to fewest reads ---
removed_cats_sorted = removed_cats.sort_values('Cat_Count', ascending=False)

# Save the sorted table
removed_cats_sorted_fp = os.path.join(Output_Directory, f'{Lib_Name}_removed_CatCounts_sorted.csv')
removed_cats_sorted.to_csv(removed_cats_sorted_fp, index=False)

print(f"Sorted removed Cat summary saved to: {removed_cats_sorted_fp}")
print(removed_cats_sorted.head(10))  # preview top 10

# --- Histogram of the read count per removed Cat ---
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(removed_cats_sorted['Cat_Count'], bins=75, edgecolor='black')
ax.set_title(f'Histogram of Cat_Count (Removed Rows) {Lib_Name}')
ax.set_xlabel('Cat_Count')
ax.set_ylabel('Number of Cats')
ax.grid(True)
fig.tight_layout()
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_removed_CatCount_hist.{Fig_Format}'))
plt.show()


In [None]:
# Number of distinct BC3s per HA in the filtered map. This can be larger than 1; all BC3s
# of one HA will have to be collapsed in the final analysis so that they count as one rep.
bc3_per_hat = map3_filtered.groupby('HA')['BC3'].nunique()

# Histogram with one bin per integer count
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(bc3_per_hat, bins=range(1, bc3_per_hat.max() + 2), edgecolor='black')
ax.set_title(f'Histogram of Unique BC3s per HA map3filtereed {Lib_Name}')
ax.set_xlabel('Number of Unique BC3s')
ax.set_ylabel('Number of HA Entries')
ax.grid(True)
fig.tight_layout()
fig.savefig(os.path.join(Output_Directory, f'{Lib_Name}_uniBC3perHA_cov_map3filtered.{Fig_Format}'))
plt.show()

# Summary statistics of the per-HA counts
print(f"Min: {bc3_per_hat.min()}")
print(f"Max: {bc3_per_hat.max()}")
print(f"Median: {bc3_per_hat.median()}")
print(f"Mean: {bc3_per_hat.mean():.2f}")



In [None]:
# Reads per Cat in the filtered map
cat_countss = map3_filtered['Cat'].value_counts()

# One row per unique Cat (first occurrence kept), with its read count in 'Cat_Counts'
map4 = (
    map3_filtered
    .drop_duplicates(subset='Cat')
    .assign(Cat_Counts=lambda frame: frame['Cat'].map(cat_countss))
)
map4

In [None]:
# Order map4 from the most-read Cat to the least-read Cat
map4 = map4.sort_values('Cat_Counts', ascending=False)
map4

In [None]:
# Record the number of reads left after both filters, and the number of unique Cats in map4
summary_dict['Category'].append(f'Map3 read coverage >= {Threshold} and no BC3 that map to multiple HA')
summary_dict['Read Count'].append(len(map3_filtered))

summary_dict['Category'].append('map4 shape unique Cat')
summary_dict['Read Count'].append(len(map4))


In [None]:
# Export map4 (one row per unique Cat) as Excel and CSV under the same base name
map4_base = os.path.join(Output_Directory, f'{Lib_Name}_map4_unique_Cat_min_{Threshold}_reads')
map4.to_excel(f'{map4_base}.xlsx', index=False)
map4.to_csv(f'{map4_base}.csv', index=False)

In [None]:
# Unique Cats in map4; one row per unique Cat
abcovtavc = map4['Cat'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique Cat map4')
summary_dict['Read Count'].append(len(abcovtavc))

print(f'Unique Cat: {len(abcovtavc)}') # number of unique Cats

In [None]:
# Unique BC1s in map4; one row per unique BC1
abcovt = map4['BC1'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC1BC map4')
summary_dict['Read Count'].append(len(abcovt))

print(f'Unique BC1: {len(abcovt)}') # number of unique BC1s

In [None]:
# Unique BC2s in map4; one row per unique BC2
abcovta = map4['BC2'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC2 map4')
summary_dict['Read Count'].append(len(abcovta))

print(f'Unique BC2: {len(abcovta)}') # number of unique BC2s

In [None]:
# Unique BC3s in map4; one row per unique BC3
abcovtav = map4['BC3'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC3 map4')
summary_dict['Read Count'].append(len(abcovtav))

print(f'Unique BC3: {len(abcovtav)}') # number of unique BC3s

In [None]:
# Unique designed tiles in map4; one row per unique tile
tcovArd = map4['Designed'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique Tile coverage Map4')
summary_dict['Read Count'].append(len(tcovArd))

print (f'Unique Tile combos Map4: {len(tcovArd)}')

In [None]:
# Unique HA (BC1-BC2) pairs in map4; one row per unique HA
# NOTE: this reuses the name tcovAr, which an earlier cell assigned to the HAR counts of map3
tcovAr = map4['HA'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC1+BC2 combos coverage Map4')
summary_dict['Read Count'].append(len(tcovAr))

print (f'Unique BC1+BC2 combos Map4: {len(tcovAr)}')

In [None]:
# Unique HAR (BC1-BC2-BC3) triples in map4; one row per unique HAR
tcovArr = map4['HAR'].value_counts().to_frame().reset_index()

summary_dict['Category'].append('Unique BC1+BC2+ BC3 combos coverage Map4')
summary_dict['Read Count'].append(len(tcovArr))

print (f'Unique BC1+BC2+ BC3 combos map4: {len(tcovArr)}')

In [None]:
# Read count of every unique Cat in map3_filtered; one row per unique Cat
tbcov4 = map3_filtered['Cat'].value_counts().to_frame().reset_index()


In [None]:
# Histogram of reads per unique Cat in map3_filtered on log-log axes.
# The counts come straight from value_counts(): after .to_frame().reset_index() the
# column named 'Cat' holds the counts only on pandas < 2.0; on pandas >= 2.0 it holds
# the Cat strings and the counts are in a 'count' column.
filtered_cat_read_counts = map3_filtered['Cat'].value_counts()
plt.figure()
plt.title(f'map3filtered Unique BC1 + BC2 + BC3 + Designed Tile Read Coverage Frequency log scale {Lib_Name}')
plt.hist(filtered_cat_read_counts.to_numpy(), bins=75)
plt.xlabel('Coverage')
plt.ylabel('Counts')
plt.xscale('log')
plt.yscale('log')
plt.savefig(os.path.join(Output_Directory, f'{Lib_Name}_Cat_map3filtered_log_cov.{Fig_Format}'))
plt.show()

In [None]:
# Turn the collected summary into a table and export it as CSV and Excel
summary_dict_df = pd.DataFrame(summary_dict)
summary_base = os.path.join(Output_Directory, f'{Lib_Name}_Step1_Summary')
summary_dict_df.to_csv(f'{summary_base}.csv', index=False)
summary_dict_df.to_excel(f'{summary_base}.xlsx', index=False)