In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# ---- User settings: review every line tagged UPDATE before running ----
Lib_Name = 'Lib_1'  # UPDATE: library name, used as the base name of the output files
Fig_Format = 'jpeg'  # UPDATE: default file format for the saved graphs

# UPDATE: paths to the Map3 files of sequencing replicate 1 and replicate 2
Seq_Rep_1, Seq_Rep_2 = 'path/to/rep1_Map3', 'path/to/rep2_Map3'

# UPDATE: labels for the two replicates (defaults are Rep1/Rep2; the
# sequencing index number is a handy alternative)
Name_Rep1, Name_Rep2 = 'Rep1', 'Rep2'

# Summary table accumulator: one label per entry in 'Category' and the
# matching number in 'Read Count'
summary_dict = {column: [] for column in ('Category', 'Read Count')}

In [None]:
# Load the Map3 table of each sequencing replicate and keep only the
# tile + bc1 combinations ('Cat' values) that appear in both replicates.

df1 = pd.read_csv(Seq_Rep_1)
df2 = pd.read_csv(Seq_Rep_2)

# Record and report the number of rows in each input table
summary_dict['Category'].extend([
    f'{Name_Rep1} Map3 Shape',
    f'{Name_Rep2} Map3 Shape',
])
summary_dict['Read Count'].extend([len(df1), len(df2)])

print(f'{Name_Rep1} Map3 Shape: {len(df1)}')
print(f'{Name_Rep2} Map3 Shape: {len(df2)}')

# Tag every row with the replicate it came from
df1['Source'] = Name_Rep1
df2['Source'] = Name_Rep2

# Number of distinct tile + bc1 combinations in each replicate before intersecting
unique_sequences_r1 = df1['Cat'].nunique()
unique_sequences_r2 = df2['Cat'].nunique()

summary_dict['Category'].extend([
    f'{Name_Rep1} Map3 unique tile + bc1 combinations',
    f'{Name_Rep2} Map3 unique tile + bc1 combinations',
])
summary_dict['Read Count'].extend([unique_sequences_r1, unique_sequences_r2])

print(f"Number of unique tile + bc1 in {Name_Rep1} df: {unique_sequences_r1}")
print(f"Number of unique tile + bc1 in {Name_Rep2} df: {unique_sequences_r2}")

# 'Cat' values shared by both replicates
# UPDATE: intersecting on 'AD BCs' instead is possible but less stringent
intersection_values = set(df1['Cat']) & set(df2['Cat'])

summary_dict['Category'].append('Intersected unique tile + bc1 combinations')
summary_dict['Read Count'].append(len(intersection_values))

print(f"Number of Intersected unique tile + bc1 combinations: {len(intersection_values)}")

# Restrict each replicate to the rows whose 'Cat' value is shared
df1_filtered = df1.loc[df1['Cat'].isin(intersection_values)]
df2_filtered = df2.loc[df2['Cat'].isin(intersection_values)]

summary_dict['Category'].extend([
    f'Filtered for Cat in both Reps, {Name_Rep1} Map3 Shape',
    f'Filtered for Cat in both Reps, {Name_Rep2} Map3 Shape',
])
summary_dict['Read Count'].extend([len(df1_filtered), len(df2_filtered)])

print(f'{Name_Rep1} Map3 Shape: {len(df1_filtered)}')
print(f'{Name_Rep2} Map3 Shape: {len(df2_filtered)}')

# Stack the two restricted tables (all columns kept) under a fresh 0..n-1 index
new_df = pd.concat([df1_filtered, df2_filtered], ignore_index=True)

unique_sequences_new = new_df['Cat'].nunique()

summary_dict['Category'].extend([
    'Intersected Map3 Shape',
    'Intersected Unique Tile + bc1 combinations',
])
summary_dict['Read Count'].extend([len(new_df), unique_sequences_new])

print(f"Intersected df Map3 Shape: {len(new_df)}")
print(f"Number of unique tile + bc1 in Intersected df: {unique_sequences_new}")



In [None]:
# Share of the rows of both replicates that were kept in the intersected table
sum_reps_tile_bc_combos = df1.shape[0] + df2.shape[0]
percent_in_both = new_df.shape[0] / sum_reps_tile_bc_combos * 100
print(f'% AD_BC in both reps: {percent_in_both}')

In [None]:
# Write the intersected (not yet filtered) table to a CSV file, leaving out the index
new_df.to_csv(path_or_buf=f'{Lib_Name}_intersected_map3_unfiltered.csv', index=False)

In [None]:
#UPDATE
#For the 2 cells below, decide which filter you want to use: the first keeps a bc1 when at least 90% of its reads map to 1 tile; the second, more stringent one requires all reads of a bc1 to map to 1 tile
#I typically use the 90% filter, but you can also raise the 90 to a higher % or use the stringent filter for 100%

In [None]:
# Keep a bc1 ('AD BCs') paired with a tile only when that tile accounts for at
# least 90% of the rows in which the bc1 appears; rows pairing the bc1 with any
# other tile are dropped by the inner merge at the end.
grouped = (
    new_df.groupby('AD BCs')['Tiles']
    .value_counts()
    .reset_index(name='count')
)
total_counts = (
    grouped.groupby('AD BCs')['count']
    .sum()
    .reset_index(name='total_count')
)
grouped = grouped.merge(total_counts, on='AD BCs')
grouped['percentage'] = grouped['count'].div(grouped['total_count'])
# The 0.9 cutoff can be raised to be more stringent; lowering it is not advised
to_keep = grouped.loc[grouped['percentage'].ge(0.9)]
filtered_df = pd.merge(
    new_df,
    to_keep[['AD BCs', 'Tiles']],
    how='inner',
    on=['AD BCs', 'Tiles'],
)

# Distinct tiles and distinct tile + bc1 combinations left after the filter
final_unique_tiles = filtered_df['Tiles'].nunique()
final_unique_tbc1 = filtered_df['Cat'].nunique()

summary_dict['Category'].extend([
    'Intersected Map3 unique Tiles after removing bc1 that map to multiple tiles <90% of the time',
    'Intersected Map3 unique Tile + bc1 combinations after removing bc1 that map to multiple tiles <90% of the time',
])
summary_dict['Read Count'].extend([final_unique_tiles, final_unique_tbc1])


print(f"Number of Tiles after >90% map to 1 bc1 filter: {final_unique_tiles}")
print(f"Number of unique Tile + bc1 combinations after >90% map to 1 bc1 filter: {final_unique_tbc1}")

#filtered_df.to_csv(f'{Lib_Name}_intersected_Map3_filtered.csv', index=False) #UPDATE uncomment if you want a filtered csv file made

In [None]:
# More stringent filter: remove every barcode ('AD BCs') that maps to more than one tile.
# The distinct tiles per barcode are counted once and rows are then selected by
# barcode membership. This picks the same rows as
# groupby('AD BCs').filter(lambda x: x['Tiles'].nunique() > 1) without calling a
# Python function for every barcode group.
tiles_per_bc = new_df.groupby('AD BCs')['Tiles'].nunique()
multi_tile_bcs = tiles_per_bc.index[tiles_per_bc > 1]
is_multi_tile = new_df['AD BCs'].isin(multi_tile_bcs)

# Rows belonging to barcodes seen with more than one tile
rows_to_remove = new_df[is_multi_tile]

# Everything else is kept (rows keep their original index labels)
stringent_filtered = new_df[~is_multi_tile]

# Count unique Tiles and unique tile + bc1 combinations after filtering
final_unique_tiles_2 = stringent_filtered['Tiles'].nunique()
final_unique_tbc1_2 = stringent_filtered['Cat'].nunique()

summary_dict['Category'].append('Intersected Map3 unique Tiles after stringent')
summary_dict['Read Count'].append(final_unique_tiles_2)

summary_dict['Category'].append('Intersected Map3 unique Tile + bc1 combinations after stringent')
summary_dict['Read Count'].append(final_unique_tbc1_2)


print(f"Number of Tiles after stringent filter: {final_unique_tiles_2}")
print(f"Number of unique Tile + bc1 combinations after stringent filter: {final_unique_tbc1_2}")

#stringent_filtered.to_csv(f'{Lib_Name}_intersected_Map3_stringent_filtered.csv', index=False) #UPDATE uncomment if you want a stringent filtered csv file made

In [None]:
# plot tile+bc1 coverage on histogram LOG
def tbc_graph(data):
    """Plot a log-log histogram of the coverage of unique tile + bc1 combinations.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'Cat' column. If the column is already numeric it is
        treated as precomputed coverage and plotted as is; otherwise the
        number of rows per unique 'Cat' value is counted and plotted.

    The figure is saved under the name
    '{Lib_Name}_Intersected_Unique_Tile_with_bc1' in the Fig_Format format
    and then shown.
    """
    cat_values = data['Cat']
    if pd.api.types.is_numeric_dtype(cat_values):
        coverage = cat_values
    else:
        # Histogramming the raw identifiers would plot the identifiers
        # themselves; coverage is how often each unique value occurs.
        coverage = cat_values.value_counts()

    plt.figure()
    plt.title(f'{Lib_Name } Intersected: Unique Tile + BC Read Coverage Frequency')
    plt.hist(coverage, bins=100)
    plt.xlabel('Coverage')
    plt.ylabel('Counts')
    plt.xscale('log')
    plt.yscale('log')
    # NOTE(review): the file name carries no extension; only `format` sets the file type
    plt.savefig(f'{Lib_Name}_Intersected_Unique_Tile_with_bc1', format=Fig_Format)
    plt.show()

# Default: plot the table produced by the 90% filter cell.
tbc_graph(filtered_df)#UPDATE input filtered_df or stringent_filtered depending on what filtered df you want to use 

In [None]:
# plot tile coverage 
def tile_graph(data):
    """Plot a histogram of the coverage of unique tiles.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'Tiles' column. If the column is already numeric it is
        treated as precomputed coverage and plotted as is; otherwise the
        number of rows per unique 'Tiles' value is counted and plotted.

    The figure is saved under the name '{Lib_Name}_Intersected_Unique_Tiles'
    in the Fig_Format format and then shown.
    """
    tile_values = data['Tiles']
    if pd.api.types.is_numeric_dtype(tile_values):
        coverage = tile_values
    else:
        # Histogramming the raw tile identifiers would plot the identifiers
        # themselves; coverage is how often each unique tile occurs.
        coverage = tile_values.value_counts()

    # Start a fresh figure so nothing is drawn onto a previously open one
    plt.figure()
    plt.hist(coverage, bins=200)
    plt.title('Intersected: Unique Tile Read Coverage Frequency')
    plt.xlabel('Coverage')
    plt.ylabel('Counts')
    # NOTE(review): the file name carries no extension; only `format` sets the file type
    plt.savefig(f'{Lib_Name}_Intersected_Unique_Tiles', format=Fig_Format)
    plt.show()

# Default: plot the table produced by the 90% filter cell.
tile_graph(filtered_df)#UPDATE input filtered_df or stringent_filtered depending on what filtered df you want to use 

In [None]:
# Turn the collected summary entries into a table and write it to a CSV file
summary_dict_df = pd.DataFrame(summary_dict)
summary_dict_df.to_csv(path_or_buf=f'{Lib_Name}_Intersected_Summary.csv', index=False)