# **USE FOR RNA DATA WITH UMIS**

In [None]:
#imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm 
from matplotlib.lines import Line2D
import os
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import sem, zscore

## **1. Decode Pre-Processed Step 1 Map**
- the step1 map is a csv file that associates barcodes to tiles from initial plasmid pool sequencing
- the step1 map should already be processed (barcodes matching multiple other barcodes and/or tiles beyond the threshold removed, and barcodes below the minimum read threshold removed)
- the initial step1 graphs show you read count distributions so you can make sure you are happy with the thresholds before proceeding 
- this first block associates the gene, mutation, and protein sequence information from the original tile design table with the step1 map containing barcodes, based on matching tile DNA sequence
- you can skip to the second code block if you already have step1 decoded 

In [None]:
#Inputs for Step1 decoding 
Lib_Name = 'TL4_test' #UPDATE name you want associated with all file naming 
step1_initial = 'TL4S1_TL4S1_czb_Min_map4_unique_cat_map4_90_match_10_read_min_filter.csv' #UPDATE processed step1 seq bc lookup table 
# NOTE: no trailing space inside the quotes; a stray space makes the path point to a different (missing) file
gene_lut = 'Lib4_Tile_LUT.csv' #UPDATE table used to create variants that associates a DNA sequence with a specific gene/mutations

# Folder that receives every Step 1 decoding output; created if it does not exist
out_s1 = f'{Lib_Name}_decoding_S1'
os.makedirs(out_s1, exist_ok=True)

In [None]:
#analysis of step1 map read coverage to ensure happy to move forward 
def analyze_cat_counts(csv_file, column, name, count_col="Cat_Count"):
    """Plot and summarise the read-count column of a step1 map CSV.

    Reads ``csv_file``, shows a linear and a log-log histogram of
    ``count_col``, then prints and returns three summaries.

    Args:
        csv_file: Path of the CSV file to read.
        column: Column that must exist in the CSV. Only its presence is
            checked; it is not used in any calculation.
        name: Label inserted into plot titles and printed headings.
        count_col: Name of the read-count column to analyse.

    Returns:
        A tuple ``(threshold_df, stats_summary, unique_counts)``:
        ``threshold_df`` has one row per threshold with the number of rows
        whose count is >= that threshold; ``stats_summary`` maps
        min/max/median/mean to integers; ``unique_counts`` maps each key
        column to its number of unique values, or to the string
        ``'Column not found'`` when the column is absent.

    Raises:
        ValueError: If ``column`` or ``count_col`` is not in the CSV.
    """
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns

    df = pd.read_csv(csv_file)

    required_present = column in df.columns and count_col in df.columns
    if not required_present:
        raise ValueError(f"Required columns '{column}' and/or '{count_col}' not found in {csv_file}")

    counts = df[count_col]

    # Histograms (normal + log scale): same plot twice, the second with both axes logged
    histogram_variants = (
        ("Normal Scale", "Frequency", {}),
        ("Log Scale", "Frequency (log)", {"log_scale": (True, True)}),
    )
    for scale_label, y_label, extra_kwargs in histogram_variants:
        plt.figure(figsize=(10, 5))
        sns.histplot(counts, bins=50, kde=False, **extra_kwargs)
        plt.title(f"Histogram of {name} {count_col} ({scale_label})")
        plt.xlabel(count_col)
        plt.ylabel(y_label)
        plt.tight_layout()
        plt.show()

    # Threshold table: how many rows survive each candidate minimum read count
    thresholds = [1, 5, 10, 25, 50, 100, 1000]
    threshold_counts = {}
    for cutoff in thresholds:
        threshold_counts[cutoff] = (counts >= cutoff).sum()
    threshold_df = pd.DataFrame.from_dict(threshold_counts, orient="index", columns=["Rows ≥ Threshold"])

    # Summary statistics (median is truncated to int, mean is rounded)
    stats_summary = {
        "min": int(counts.min()),
        "max": int(counts.max()),
        "median": int(counts.median()),
        "mean": round(counts.mean())
    }

    # Unique counts in key columns
    key_columns = ['Cat', 'HAR', 'HA', 'RTBC', 'HawkBCs', 'ADBC2', 'Designed'] #UPDATE if you have different columns 
    unique_counts = {
        key_col: (df[key_col].nunique() if key_col in df.columns else 'Column not found')
        for key_col in key_columns
    }

    # Report everything; display() is expected to come from the notebook environment
    print(f"\n{name} Threshold Counts:")
    display(threshold_df)

    print(f"\n{name} Summary Statistics for {count_col}:")
    for stat_name, stat_value in stats_summary.items():
        print(f"{stat_name.capitalize()}: {stat_value}")

    print(f"\n{name} Unique Counts in Key Columns (pre-threshold):")
    for key_col, n_unique in unique_counts.items():
        print(f"{key_col}: {n_unique}")

    return threshold_df, stats_summary, unique_counts


In [None]:
# Run the step1 read-coverage analysis on the processed step1 map
thresholds, stats, unique_count = analyze_cat_counts(
    step1_initial,
    column='Cat',
    name=f'{Lib_Name}_Step1_Initial',
)

**CHECKPOINT: IF YOU ARE UNHAPPY WITH THE STEP1 SUMMARY RESULTS, GO BACK AND REPROCESS THE STEP1 MAP WITH A DIFFERENT READ THRESHOLD**

In [None]:
#add gene information to step1 map of barcodes associated to a specific dna sequence 
import pandas as pd
import os

# Summary table: parallel lists of labels and row counts, appended to as the
# decoding proceeds
Map4_Summary_Dict = {
    'Category': [],
    'Read Count': []
}

# Read reference LUT
excel_df = pd.read_csv(gene_lut)

###### adding new seq to gene lut start #UPDATE comment out from here until specified end if you don't want to add a new sequence to your gene lut 
# --- Add new sequence to reference LUT --- #UPDATE I frequently forget to add the GS linker control to my gene look up table so this ensures it is added. Update if you use a different one or want to add additional seqs to your gene look up table before mapping to step 1
new_seq = "GGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGCGGCAGC"

# Compare against whitespace-stripped LUT sequences so padding in the file
# does not cause a duplicate row to be appended
if not (excel_df['DNA Sequence'].str.strip() == new_seq).any(): #UPDATE other info associated with your new gene variant 
    new_row = {
        'DNA Sequence': new_seq,
        'Gene Name': 'GS_only_seq',
        'Mutant Sequence': 'GSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGS',
        'Mutation': 'Gs_Only'
    }
    excel_df = pd.concat([excel_df, pd.DataFrame([new_row])], ignore_index=True)
    # Save the updated LUT permanently. Only done when a row was actually
    # added, so an unchanged LUT file is never rewritten on disk.
    excel_df.to_csv(gene_lut, index=False)
    print(f"Added new sequence to {gene_lut}")
else:
    print(f"Sequence already present in {gene_lut}")

###### adding new seq to gene lut end

# Read the experimental CSV
new_csv_df = pd.read_csv(step1_initial)

# --- CUT-OFF SEQ CLEANING FUNCTION ---
def clean_tiles_column(tiles_column, cut_off_seq='GGATCCGAGCTCGCTAGC'): #UPDATE cut_off_seq to remove part of sequence from all the sequences in step1
    """Trim each sequence at the first occurrence of ``cut_off_seq``.

    Every value is converted to ``str`` and stripped of surrounding
    whitespace. If ``cut_off_seq`` occurs in it, that occurrence and
    everything after it are removed; otherwise the stripped value is kept.

    Args:
        tiles_column: Iterable of sequences (e.g. a DataFrame column).
        cut_off_seq: Non-empty marker sequence at which to truncate.

    Returns:
        A list of cleaned strings, one per input value, in input order.

    Raises:
        ValueError: If ``cut_off_seq`` is an empty string.
    """
    # partition()[0] is the text before the first marker, or the whole
    # string when the marker is absent
    return [str(seq).strip().partition(cut_off_seq)[0] for seq in tiles_column]

# Apply cut-off sequence cleaning to the Designed (tile DNA sequence) column
new_csv_df['Designed'] = clean_tiles_column(new_csv_df['Designed'])

# Make a copy of the cleaned dataframe
new_df = new_csv_df.copy()

# Build a lookup keyed by the whitespace-stripped LUT DNA sequence, keeping the
# first LUT row for each sequence. This replaces a row-by-row scan of the whole
# LUT (one full comparison per step1 row) with a single indexed lookup.
lut_by_seq = (
    excel_df.assign(_seq_key=excel_df['DNA Sequence'].str.strip())
    .dropna(subset=['_seq_key'])
    .drop_duplicates(subset='_seq_key', keep='first')
    .set_index('_seq_key')
)

# Add the annotation columns by matching cleaned tiles against the LUT
# sequences; rows without a match are left as missing values
#UPDATE with additional columns you want or change to the way you named them 
for annotation_col in ['Gene Name', 'Mutant Sequence', 'Mutation']:
    new_df[annotation_col] = new_df['Designed'].map(lut_by_seq[annotation_col])

# Print initial shape
print("Step1 Map Initial Shape:", new_df.shape)
Map4_Summary_Dict['Category'].append("Step1 Map Initial Shape:")
Map4_Summary_Dict['Read Count'].append(new_df.shape[0])

# Identify rows with no usable match. The same rule as the drop below is used
# (Gene Name and/or Mutant Sequence missing) so the reported rows are exactly
# the rows that get removed and the summary counts add up.
match_cols = ['Gene Name', 'Mutant Sequence']
no_match_df = new_df[new_df[match_cols].isna().any(axis=1)]

# Drop unmatched rows
new_df.dropna(subset=match_cols, inplace=True)

# Print post-match shape
print("Step1 Map Shape after dropping rows with no matches:", new_df.shape)
Map4_Summary_Dict['Category'].append("Step1 Map Shape after dropping rows with no matches:")
Map4_Summary_Dict['Read Count'].append(new_df.shape[0])

# Log unmatched rows
Map4_Summary_Dict['Category'].append("Number of Rows with No Tile Match:")
Map4_Summary_Dict['Read Count'].append(no_match_df.shape[0])

print('No matches:')
display(no_match_df)

In [None]:
# Preview the first rows of new_df, the step1 map with gene info associated with barcode info
new_df.head()

In [None]:
# Write the decoded step1 map and its row-count summary, each as Excel and CSV
decoded_base = os.path.join(out_s1, f'{Lib_Name}_Step1_Decoded')
new_df.to_excel(decoded_base + '.xlsx', index=False)
new_df.to_csv(decoded_base + '.csv', index=False)

# One row per summary category collected during decoding
Summary_Dict_df = pd.DataFrame(Map4_Summary_Dict)
Summary_Dict_df.to_csv(decoded_base + '_Summary.csv', index=False)
Summary_Dict_df.to_excel(decoded_base + '_Summary.xlsx', index=False)

In [None]:
# Count how many rows of the gene look up table belong to each gene name
total_tiles_per_gene = excel_df['Gene Name'].value_counts()
display(total_tiles_per_gene)

In [None]:
# Count the number of rows in the decoded step1 map (new_df) for each gene name
gene_counts = new_df['Gene Name'].value_counts()

# Show the counts (value_counts orders them from most to least frequent)
print("Number of Unique Barcode Combinations for each Gene:")
display(gene_counts)


In [None]:
# Calculate the gene counts from the new dataframe normalized to the potential number of tiles for that gene 
gene_counts = new_df['Gene Name'].value_counts()

# Calculate total tiles per gene from the lookup table
total_tiles_per_gene = excel_df['Gene Name'].value_counts()

# Create a new dataframe to ensure we match by gene name
count_df = pd.DataFrame({'Gene Name': gene_counts.index, 'Gene Count': gene_counts.values})
total_tiles_df = pd.DataFrame({'Gene Name': total_tiles_per_gene.index, 'Total Possible Tiles': total_tiles_per_gene.values})

# Merge the two dataframes on the Gene column (inner join: only genes present in both)
merged_df = pd.merge(count_df, total_tiles_df, on='Gene Name')

# Normalize the gene counts: decoded rows per gene / LUT rows per gene, 1 decimal
merged_df['Normalized Count'] = (merged_df['Gene Count'] / merged_df['Total Possible Tiles']).round(1)

#Order by largest to smallest normalized count 
merged_df = merged_df.sort_values(by='Normalized Count', ascending=False)
display(merged_df)

# Plot the normalized counts as a bar chart (one bar per gene)
plt.figure(figsize=(16, 6))
merged_df.set_index('Gene Name')['Normalized Count'].plot(kind='bar')
plt.xlabel('Gene')
plt.ylabel('Normalized Count')
plt.title('Histogram of Normalized Gene Counts')
plt.xticks(rotation=90)
# Fit the rotated gene labels inside the figure so they are not cut off in the saved image
plt.tight_layout()
# Save into the Step 1 output folder via out_s1, like the other Step 1 outputs
plt.savefig(os.path.join(out_s1, f'{Lib_Name}_Normalized_Gene_counts.jpeg'))
plt.show()

## 2. Adds Decoded Step1 Info to RNA Data and Calculates Activity Ratios
- Adds the gene, mutation, and protein sequence information from the decoded step1 map to the RNA data, based on matching unique ADBC + RTBC combinations
- Activity ratios are calculated as RTBC UMI counts / ADBC UMI counts. A simple and a complex ratio are created, based on the way the UMIs were counted. The two ratios correlate highly in initial data, so I typically use only the complex ratio for downstream analysis and graphing
- Controls-only and Variables-only files are created in case you want to look at the data with the controls pulled out, or compare the two datasets separately 


In [None]:
# Path to the RNA UMI table that is loaded with pd.read_csv in the next cell
df_rna = 'results_with_step1_unerror_corrected_with_zeros.csv' #UPDATE path to RNA data with initial UMI processing that maps them back to step1 genes via Sanjana pipeline 

# Minimum UMI count used later to filter the per-sample DataFrames
umimin = 20 #UPDATE to change minimum UMI threshold

# Folder that receives the ratio files written in this section; created if missing
output_dir = f'{Lib_Name}_output_ratio_files' # Define the output directory for this section
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the RNA UMI table into df and preview its first rows
df = pd.read_csv(df_rna)
df.head()

In [None]:
#new_df = pd.read_csv('') #if you are skipping step1 because you already have a decoded step1 file, uncomment this line and put its path here 
# Preview the decoded step1 map that will be merged into the RNA data
new_df.head()

In [None]:
# ---- Initial length prints ----
print("Initial length of RNA Data (df):", len(df))
print("Initial length of Step1 Map Decoded (new_df):", len(new_df))

# Fail early with a clear message if the merge key is missing from either table
for frame_label, frame in (('df', df), ('new_df', new_df)):
    if 'Cat' not in frame.columns:
        raise ValueError(f"{frame_label} is missing the 'Cat' column needed for the merge")

# Clean: normalize Cat to stripped strings in both tables so keys compare equal
df['Cat'] = df['Cat'].astype(str).str.strip()
new_df['Cat'] = new_df['Cat'].astype(str).str.strip()

# Ensure new_df has only ONE row per Cat (prevents duplicate merge blow-ups).
# drop_duplicates keeps the first row of each Cat intact; groupby().first()
# takes the first non-null value per column and could combine values from
# different rows of the same Cat.
new_df = (
    new_df.drop_duplicates(subset='Cat', keep='first')
    .sort_values('Cat')
    .reset_index(drop=True)
)

# Select only the columns we want to add
cols_to_add = ['Cat', 'Mutant Sequence', 'Gene Name', 'Mutation']
missing_cols = [c for c in cols_to_add if c not in new_df.columns]
if missing_cols:
    raise ValueError(f"new_df is missing columns: {missing_cols}")

# Perform safe left merge (keeps df length the same as the original df)
df = df.merge(
    new_df[cols_to_add],
    on='Cat',
    how='left',
    validate='many_to_one'   # df = many rows per Cat, new_df = one row per Cat
)

# Save results (rows without a step1 match keep NaN in the added columns)
df.to_excel(os.path.join(output_dir, f'{Lib_Name}_UMI_RNA_results_mapped_to_S1_with_NANs.xlsx'), index=False)
df.to_csv(os.path.join(output_dir, f'{Lib_Name}_UMI_RNA_results_mapped_to_S1_with_NANs.csv'), index=False)

# OPTIONAL: drop rows where ALL 3 new columns are untouched NaN.
# This only creates df_filtered; df itself still contains the unmatched rows.
df_filtered = df.dropna(subset=['Mutant Sequence', 'Gene Name', 'Mutation'], how='all')

print("Length of df after merge:", len(df))
print("Length of df after removing unmatched rows:", len(df_filtered))

df.head()


In [None]:
# Activity ratios: RTBC UMI count divided by AD UMI count, for both counting methods
df['Ratio_Simple'] = df['RTBC_umi_count_simple'].div(df['AD_umi_count_simple'])
df['Ratio_Complex'] = df['RTBC_umi_count_complex'].div(df['AD_umi_count_complex'])

# Show each ratio next to the two counts it was computed from
preview_cols = [
    'RTBC_umi_count_simple', 'AD_umi_count_simple', 'Ratio_Simple',
    'RTBC_umi_count_complex', 'AD_umi_count_complex', 'Ratio_Complex',
]
df[preview_cols].head()


In [None]:
# Write the DataFrame with the ratio columns to the ratio output folder as Excel and CSV
ratios_base = os.path.join(output_dir, f'{Lib_Name}_UMI_RNA_results_with_ratios')
df.to_excel(ratios_base + '.xlsx', index=False)
df.to_csv(ratios_base + '.csv', index=False)


In [None]:
#make a controls only df (optional)
variable_genes = ['NKX2_2_AD'] #UPDATE to include all gene names if using a library with multiple genes 

# Controls = rows that HAVE a gene annotation which is not one of the variable
# genes. A plain `!=` comparison is True for NaN, which would count every RNA
# row without a step1 match as a control, so those rows are excluded explicitly.
has_gene = df['Gene Name'].notna()
controls_df = df[has_gene & ~df['Gene Name'].isin(variable_genes)]
print("controls_df length:", len(controls_df))

# Number of unique entries in Designed column
unique_designed = controls_df['Designed'].nunique()
print("controls_df unique Tiles values:", unique_designed)


#save files 
controls_df.to_excel(os.path.join(output_dir, f'{Lib_Name}_UMI_RNA_results_with_ratios_Controls_Only.xlsx'), index=False)
controls_df.to_csv(os.path.join(output_dir, f'{Lib_Name}_UMI_RNA_results_with_ratios_Controls_Only.csv'), index=False)

In [None]:
# Make a variants only (no controls) df (optional)
variant_gene_names = ['NKX2_2_AD'] #UPDATE to include all gene names if using a library with multiple genes
No_Controls_df = df[df['Gene Name'].isin(variant_gene_names)]
print("No_Controls_df length:", len(No_Controls_df))

# Number of unique entries in Designed column
unique_designed_nc = No_Controls_df['Designed'].nunique()
print("No_Controls_df unique Tiles values:", unique_designed_nc)

#save files (the CSV name now matches the Excel name; it was spelled "Varibles_Only")
No_Controls_df.to_excel(os.path.join(output_dir, f'{Lib_Name}_UMI_RNA_results_with_ratios_Variables_Only.xlsx'), index=False)
No_Controls_df.to_csv(os.path.join(output_dir, f'{Lib_Name}_UMI_RNA_results_with_ratios_Variables_Only.csv'), index=False)

In [None]:
# Split df into one DataFrame per value of the 'number' column.
# The values are sorted first so the dictionary keys are in ascending order.
unique_numbers = sorted(df['number'].unique())

# Keys are "df_<number>"; values are the matching rows of df
dataframes = {}
for num in unique_numbers:
    dataframes[f"df_{num}"] = df[df['number'] == num]

# Report the row count and the number of distinct 'Designed' values per subset
for key, sub_df in dataframes.items():
    print(f"{key}: length={len(sub_df)}, unique Designed={sub_df['Designed'].nunique()}")



In [None]:
# List the keys ("df_<number>") of the per-sample DataFrames
print(dataframes.keys())

In [None]:
#Filter the dfs to rows with at least umimin UMIs for BOTH the ADBC and the RTBC.
#umimin is compared as a number: comparing a numeric column against the string
#f'{umimin}' raises a TypeError instead of filtering.
filtered_dataframes = {
    name: sub_df[(sub_df['AD_umi_count_complex'] >= umimin) & (sub_df['RTBC_umi_count_complex'] >= umimin)]
    for name, sub_df in dataframes.items()
}

In [None]:
# List the keys of the UMI-filtered per-sample DataFrames
print(filtered_dataframes.keys())

## 3. Make Lots of Graphs 
- this section relies on the number column created during Sanjana's initial RNA processing (done before this file) to separate the large df into smaller dfs, one per sample
- it is critical to ensure you have the right numbers associated with the right sample type
- all analysis is done using the complex ratio and the df that contains all of the variables and controls together, filtered to >=20 UMIs (set by umimin); change this if needed, but if you kept UMI counts of zero you will need to at least remove those before graphing or it will cause errors
- histograms, bar graphs, and correlations will be made; for the correlations and histograms that involve paired dfs, you need to specify in the tuple list which dfs you want to analyse together, and the code will loop through all listed combinations 

In [None]:
#Inputs for this section
# Top-level folder for the graph outputs; created if it does not exist
os.makedirs(f'{Lib_Name}_output_graphs', exist_ok=True)
# One display name per sample, looked up by position when the per-sample plots are titled and named
names = ['R1_Notind_24hr_S1', #UPDATE files are called by number; if you want to associate a more meaningful name with the graphs/file names, list them here in number order 
         'R1_Ind_24hr_S2',
         'R2_Notind_24hr_S3', 
         'R2_Ind_24hr_S4',
         'R1_Notind_48hr_S5', 
         'R1_Ind_48hr_S6',
         'R2_Notind_48hr_S7', 
         'R2_Ind_48hr_S8']  

**HISTOGRAMS**

In [None]:
#Complex Ratio Individual Samples Histograms (normal, log10, xscale log)
# log10 seems to give the best visibility but it creates versions of all of them 

hist_dir = f'{Lib_Name}_output_graphs/{Lib_Name}_output_graphs_complex_ratio'
os.makedirs(hist_dir, exist_ok=True)


def _save_ratio_hist(values, color, plot_title, xlabel, out_path, log_x=False):
    """Draw one 30-bin histogram of ``values`` and save it to ``out_path``."""
    plt.figure(figsize=(8, 6))
    plt.hist(values, bins=30, color=color, edgecolor='black')
    if log_x:
        plt.xscale('log')  # log scale on x-axis
    plt.title(plot_title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


# Samples are paired with `names` by position, so every sample needs a name.
# Checking up front gives a clear error before any plot is written instead of
# an IndexError part-way through the loop.
sample_numbers = sorted(df['number'].unique())
if len(names) < len(sample_numbers):
    raise ValueError(
        f"names has {len(names)} entries but there are {len(sample_numbers)} samples; "
        "add one name per sample, in number order"
    )

# Loop through each unique number and plot histograms using filtered_dataframes
for i, num in enumerate(sample_numbers):
    key = f"df_{num}"                     # dictionary key
    df_current = filtered_dataframes[key] # grab the subset DataFrame
    title = f"{num} - {names[i]}"
    base_filename = f"{num}_{names[i].replace(' ', '_')}"
    ratios = df_current['Ratio_Complex'].dropna()

    # --- 1. Regular histogram ---
    _save_ratio_hist(
        ratios, 'skyblue',
        f"{title} - Ratio_Complex",
        'Ratio_Complex',
        f"{hist_dir}/{base_filename}_Ratio_Complex_histogram.png",
    )

    # --- 2. Histogram with log-scaled x-axis ---
    _save_ratio_hist(
        ratios, 'orange',
        f"{title} - Ratio_Complex (X-axis Log Scale)",
        'Ratio_Complex (log scale)',
        f"{hist_dir}/{base_filename}_Ratio_Complex_xlog_histogram.png",
        log_x=True,
    )

    # --- 3. Histogram of log10-transformed values ---
    # log10(0) is -inf; infinite values are dropped before plotting
    ratio_log10 = np.log10(ratios)
    ratio_log10 = ratio_log10.replace([np.inf, -np.inf], np.nan).dropna()
    _save_ratio_hist(
        ratio_log10, 'salmon',
        f"{title} - Log10(Ratio_Complex)",
        'Log10(Ratio_Complex)',
        f"{hist_dir}/{base_filename}_Ratio_Complex_log10_histogram.png",
    )


In [None]:
#induced complex ratio / uninduced complex ratio (yes ratio of a ratio)
# Output folder for this cell's histograms
output_dir4 = f"{Lib_Name}_output_graphs/{Lib_Name}_output_graphs_complex_ratio_induced_divided_by_uninduced"
os.makedirs(output_dir4, exist_ok=True)

# Column used to pair rows between the two samples
merge_key = "Cat"   #UPDATE if you want to merge on a different column  

# Sample pairs to compare (first ÷ second)
tuple_combinations = [('df_4', 'df_3'), ('df_8', 'df_7')]  #UPDATE (first df is induced, second is UNinduced) (first_df/second_df)


def _save_fold_hist(values, plot_title, xlabel, file_name, log_x=False):
    """Draw one 50-bin histogram of ``values`` and save it in ``output_dir4``."""
    plt.figure(figsize=(8, 4))
    plt.hist(values, bins=50, color='mediumseagreen', edgecolor='black')
    if log_x:
        plt.xscale('log')
    plt.title(plot_title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir4, file_name), dpi=300)
    plt.close()


for induced_key, uninduced_key in tuple_combinations:
    # Keep only the pairing column and the ratio, without missing values
    df_induced = filtered_dataframes[induced_key][[merge_key, 'Ratio_Complex']].dropna()
    df_uninduced = filtered_dataframes[uninduced_key][[merge_key, 'Ratio_Complex']].dropna()

    # Pair rows present in both samples (inner merge on merge_key)
    merged = df_induced.merge(df_uninduced, on=merge_key, suffixes=('_induced', '_uninduced'))

    # Ratio of ratios (induced ÷ uninduced)
    merged['Ratio_Complex_divided'] = merged['Ratio_Complex_induced'] / merged['Ratio_Complex_uninduced']
    ratio_data = merged['Ratio_Complex_divided'].dropna()

    pair_label = f'{induced_key} ÷ {uninduced_key}'
    file_stem = f'{induced_key}_div_{uninduced_key}'

    # --- 1. Normal histogram ---
    _save_fold_hist(
        ratio_data,
        f'{pair_label} Ratio_Complex by {merge_key} (Normal)',
        f'{pair_label} Ratio_Complex',
        f'{file_stem}_ratio_hist_normal.png',
    )

    # --- 2. Log10-transformed histogram (zeros and infinities removed first) ---
    ratio_log10 = np.log10(ratio_data.replace(0, np.nan)).replace([np.inf, -np.inf], np.nan).dropna()
    _save_fold_hist(
        ratio_log10,
        f'{pair_label} Ratio_Complex by {merge_key} (Log10 transformed)',
        f'Log10({pair_label} Ratio_Complex)',
        f'{file_stem}_ratio_hist_log10.png',
    )

    # --- 3. Histogram with log-scaled x-axis ---
    _save_fold_hist(
        ratio_data,
        f'{pair_label} Ratio_Complex by {merge_key} (X-axis Log Scale)',
        f'{pair_label} Ratio_Complex (log scale)',
        f'{file_stem}_ratio_hist_xlog.png',
        log_x=True,
    )


In [None]:
#Graph UMI per unique Integration Histograms (normal, log10, xscale log)
#each row is assumed to already be one unique bc combination / integration, so the code works based on that assumption 
#for this one, log scale x usually gives the best visibility, but overall it is hard to get good resolution with the spread

output_dir2 = f"{Lib_Name}_output_graphs/{Lib_Name}_umi_per_integration"
os.makedirs(output_dir2, exist_ok=True)

# (count column, short label used in titles and file names, histogram colour)
umi_columns = [
    ('AD_umi_count_complex', 'ADBC', 'steelblue'),
    ('RTBC_umi_count_complex', 'RTBC', 'darkorange'),
]


def _save_umi_hist(values, color, plot_title, xlabel, stats_text, out_path, log_x=False):
    """Draw one 500-bin histogram with a stats box in the top-right corner and save it."""
    plt.figure(figsize=(6, 4))
    plt.hist(values, bins=500, color=color, alpha=0.6)
    if log_x:
        plt.xscale('log')
    plt.title(plot_title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.text(0.95, 0.95, stats_text, transform=plt.gca().transAxes,
             fontsize=9, va='top', ha='right',
             bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


# Loop through each dataframe in the filtered_dataframes dictionary.
# The loop variable is deliberately NOT called `df`, so the notebook-wide RNA
# DataFrame `df` is not replaced by the last sample's subset.
for name, sample_df in filtered_dataframes.items():
    unique_cat = sample_df['Cat'].nunique() if 'Cat' in sample_df.columns else 0

    for umi_col, label, color in umi_columns:
        if umi_col not in sample_df.columns:
            continue

        values = sample_df[umi_col].dropna()

        # Stats shown in the text box of every plot for this sample/column
        total_umi = values.sum()
        stats_text = f"Total UMI: {total_umi}\nUnique Cat: {unique_cat}"

        # 1. Normal histogram
        _save_umi_hist(
            values, color,
            f'{name} - {label} (Normal)',
            umi_col,
            stats_text,
            os.path.join(output_dir2, f'{name}_{label}_hist.png'),
        )

        # 2. Log10-transformed histogram (zeros and infinities removed first)
        values_log10 = np.log10(values.replace(0, np.nan)).replace([np.inf, -np.inf], np.nan).dropna()
        _save_umi_hist(
            values_log10, color,
            f'{name} - {label} (Log10 transformed)',
            f'Log10({umi_col})',
            stats_text,
            os.path.join(output_dir2, f'{name}_{label}_log10_hist.png'),
        )

        # 3. Histogram with log-scaled x-axis
        _save_umi_hist(
            values, color,
            f'{name} - {label} (X-axis Log Scale)',
            f'{umi_col} (log scale)',
            stats_text,
            os.path.join(output_dir2, f'{name}_{label}_xlog_hist.png'),
            log_x=True,
        )


**CORRELATIONS**

In [None]:
#Complex Ratio Correlations 
#list the tuple combinations you want to make correlations with  

# Step 1: Create output directory
output_dir3 = f"{Lib_Name}_output_graphs/{Lib_Name}_output_graphs_complex_ratio_correlations"
os.makedirs(output_dir3, exist_ok=True)

# Step 2: Define merge key 
merge_key = "Cat"   #UPDATE with a diff column if you want 

# Step 3: Define tuple combinations of DataFrame keys to compare
tuple_combinations = [('df_1', 'df_2'),('df_3', 'df_4'),('df_5', 'df_6'), ('df_7', 'df_8')] #induced vs uninduced 
#[('df_1', 'df_3'),('df_2', 'df_4'),('df_5', 'df_7'), ('df_6', 'df_8')] #R1 vs R2
#[('df_1', 'df_5'),('df_2', 'df_6'),('df_3', 'df_7'), ('df_4', 'df_8')] #24 vs 48

# Step 4: Loop through each tuple combination
for key_a, key_b in tuple_combinations:
    df_a = filtered_dataframes[key_a][[merge_key, 'Ratio_Complex']].dropna()
    df_b = filtered_dataframes[key_b][[merge_key, 'Ratio_Complex']].dropna()

    # Merge on chosen key (inner merge: only values present in both samples)
    merged = df_a.merge(df_b, on=merge_key, suffixes=(f'_{key_a}', f'_{key_b}'))

    # pearsonr needs at least two paired points; skip the pair with a message
    # rather than aborting the remaining comparisons
    if len(merged) < 2:
        print(f"Skipping {key_a} vs {key_b}: only {len(merged)} matched {merge_key} value(s)")
        continue

    # Compute correlation
    x = merged[f'Ratio_Complex_{key_a}']
    y = merged[f'Ratio_Complex_{key_b}']
    r, _ = pearsonr(x, y)
    r_squared = r ** 2

    # Count matches and unmatched
    matched_count = len(merged)
    unmatched_a = len(df_a) - matched_count
    unmatched_b = len(df_b) - matched_count

    # Plot correlation
    plt.figure(figsize=(6, 6))
    plt.scatter(x, y, alpha=0.6, edgecolor='black', linewidth=0.5)

    plt.xlabel(f'{key_a} Ratio_Complex ({merge_key})')
    plt.ylabel(f'{key_b} Ratio_Complex ({merge_key})')
    plt.title(f'Correlation of Ratio_Complex by {merge_key}: {key_a} vs {key_b}')

    # Annotate with stats in top right
    textstr = (
        f'Matched {merge_key}s: {matched_count}\n'
        f'Unmatched in {key_a}: {unmatched_a}\n'
        f'Unmatched in {key_b}: {unmatched_b}\n'
        f'r = {r:.3f}\n'
        f'r² = {r_squared:.3f}'
    )
    plt.text(0.95, 0.95, textstr, transform=plt.gca().transAxes,
             fontsize=10, verticalalignment='top', horizontalalignment='right',
             bbox=dict(boxstyle='round', facecolor='white', edgecolor='gray'))

    plt.tight_layout()

    # Save plot with merge_key in filename
    output_path = os.path.join(
        output_dir3,
        f'{key_a}_vs_{key_b}_correlation_by_{merge_key}.png'
    )
    plt.savefig(output_path, dpi=300)
    plt.close()


**OUTLIER REMOVAL**

In [None]:
#Remove Integration outliers within each tile based on z-score

output_dir5 = f"{Lib_Name}_output_graphs/{Lib_Name}_averge_ratio_complex_per_tile_outliers_removed"
os.makedirs(output_dir5, exist_ok=True)

# Maps "average_ratio_complex_outliers_removed_<sample key>" -> per-mutation summary table
average_ratio_complex_dfs = {}

# The loop variable is deliberately NOT called `df`, so the notebook-wide RNA
# DataFrame `df` is not replaced by a per-sample copy.
for name, sample_df in filtered_dataframes.items():
    sample_df = sample_df.copy()
    sample_df['Mutation'] = sample_df['Mutation'].replace('1-2-3-4', 'WT') #THIS is just because the shuffle in 1-2-3-4 order is just WT and I forgot to update earlier

    grouped = sample_df.groupby('Mutation')
    records = []

    for mutation, group in grouped:
        raw_ratios = group['Ratio_Complex'].dropna()
        initial_count = len(raw_ratios)

        # --- Z-score filtering ---
        if len(raw_ratios) > 1:
            z_scores = zscore(raw_ratios)
            # Remove only values with |z| > 3. When every value in the group is
            # identical the z-scores are NaN; NaN fails `> 3`, so those values
            # are kept instead of the whole group being discarded.
            cleaned_ratios = raw_ratios[~(np.abs(z_scores) > 3)]
        else:
            # keep single-value groups
            cleaned_ratios = raw_ratios.copy()

        avg = cleaned_ratios.mean()
        stderr = sem(cleaned_ratios) if len(cleaned_ratios) > 1 else np.nan
        count_cleaned = len(cleaned_ratios)

        # Print counts before and after
        #print(f"[{name}] Mutation {mutation}: before={initial_count}, after={count_cleaned}")

        # Safe extraction of metadata (taken from the first row of the group)
        mutant_seq = group['Mutant Sequence'].iloc[0] if 'Mutant Sequence' in group.columns else ''
        gene_name = group['Gene Name'].iloc[0] if 'Gene Name' in group.columns else ''

        records.append({
            'Mutant Sequence': mutant_seq,
            'Gene Name': gene_name,
            'Mutation': mutation,
            'average_Ratio_Complex': avg,
            'stderror_Ratio_Complex': stderr,
            'number_integrations': count_cleaned,
            'initial_ints': initial_count
        })

    summary_df = pd.DataFrame(records)
    summary_df = summary_df.sort_values(by='average_Ratio_Complex', ascending=False)

    # # Optional: global z-score of averages
    # if summary_df['average_Ratio_Complex'].std() == 0:
    #     summary_df['zscore_average_Ratio_Complex'] = np.nan
    # else:
    #     summary_df['zscore_average_Ratio_Complex'] = (
    #         (summary_df['average_Ratio_Complex'] - summary_df['average_Ratio_Complex'].mean()) /
    #         summary_df['average_Ratio_Complex'].std()
    #     )

    key = f'average_ratio_complex_outliers_removed_{name}'
    average_ratio_complex_dfs[key] = summary_df

    summary_df.to_csv(os.path.join(output_dir5, f'{key}.csv'), index=False)
    summary_df.to_excel(os.path.join(output_dir5, f'{key}.xlsx'), index=False)


**CORRELATIONS WITH OUTLIERS REMOVED**

In [None]:
# Per-tile correlations of the average complex ratio (outliers removed)

output_dir6 = f"{Lib_Name}_output_graphs/{Lib_Name}_output_graphs_complex_ratio_correlation_nooutliers"
os.makedirs(output_dir6, exist_ok=True)

# Step 1: load every summary CSV found in output_dir5, keyed by file name without extension
folder_path = output_dir5
nooutliers = {}
for filename in os.listdir(folder_path):
    if not filename.endswith('.csv'):
        continue
    key = os.path.splitext(filename)[0]
    nooutliers[key] = pd.read_csv(os.path.join(folder_path, filename))

# Step 2: pick which summary tables to compare.
# Each (a, b) pair below expands to the keys '<prefix>a' and '<prefix>b';
# uncomment a group of pairs to add those comparisons.
summary_prefix = 'average_ratio_complex_outliers_removed_df_'
df_number_pairs = [
    # ind vs unind combinations
    (1, 2), (3, 4), (5, 6), (7, 8),

    ## R1 vs R2 combinations
    # (1, 3), (2, 4), (5, 7), (6, 8),

    ## 24 vs 48 combinations
    # (1, 5), (2, 6), (3, 7), (4, 8),
]
tuple_combinations = [
    (f'{summary_prefix}{first}', f'{summary_prefix}{second}')
    for first, second in df_number_pairs
]

# Helper to shorten labels (strip prefix, keep df_X)
def short_label(key):
    """Return a compact plot/file label for a summary-table key.

    Keys that start with ``average_ratio_complex_outliers_removed`` are reduced
    to whatever follows that prefix (e.g. ``..._removed_df_1`` -> ``df_1``).
    Any other key is returned unchanged.
    """
    prefix = 'average_ratio_complex_outliers_removed'
    if not key.startswith(prefix):
        return key
    # Previously only the text after the last underscore ("1") was kept, which
    # dropped the "df_" part that the labels and file names are meant to carry.
    suffix = key[len(prefix):].lstrip('_')
    return suffix or key

# Step 3: Loop through each tuple combination and save one correlation scatter plot per pair
for induced_key, uninduced_key in tuple_combinations:
    df_induced = nooutliers[induced_key][['Mutation', 'average_Ratio_Complex']].dropna()
    df_uninduced = nooutliers[uninduced_key][['Mutation', 'average_Ratio_Complex']].dropna()

    # Merge on Mutation (inner join: only mutations present in both tables are compared)
    merged = df_induced.merge(df_uninduced, on='Mutation', suffixes=('_induced', '_uninduced'))

    x = merged['average_Ratio_Complex_uninduced']
    y = merged['average_Ratio_Complex_induced']

    # Count matches and unmatched
    matched_count = len(merged)
    unmatched_uninduced = len(df_uninduced) - matched_count
    unmatched_induced = len(df_induced) - matched_count

    # Compute correlation. pearsonr raises ValueError for fewer than two paired
    # points, which used to abort the whole loop; report NaN for such pairs instead
    # so the remaining comparisons are still plotted.
    if matched_count >= 2:
        r, _ = pearsonr(x, y)
    else:
        print(f"Fewer than 2 matched mutations for {induced_key} vs {uninduced_key}; r reported as NaN")
        r = np.nan
    r_squared = r ** 2

    # Short labels for plotting
    induced_short = short_label(induced_key)
    uninduced_short = short_label(uninduced_key)

    # Plot correlation
    plt.figure(figsize=(6, 6))
    plt.scatter(x, y, alpha=0.6, edgecolor='black', linewidth=0.5)
    plt.xlabel(f'{uninduced_short} average_Ratio_Complex No Outliers')
    plt.ylabel(f'{induced_short} average_Ratio_Complex No Outliers')
    plt.title(f'Correlation of average_Ratio_Complex No Outliers: {induced_short} vs {uninduced_short}')

    # Annotate with stats in top left
    textstr = (
        f'Matched: {matched_count}\n'
        f'Unmatched in {uninduced_short}: {unmatched_uninduced}\n'
        f'Unmatched in {induced_short}: {unmatched_induced}\n'
        f'r = {r:.3f}\n'
        f'r² = {r_squared:.3f}'
    )
    plt.text(0.05, 0.95, textstr, transform=plt.gca().transAxes,
             fontsize=10, verticalalignment='top', horizontalalignment='left',
             bbox=dict(boxstyle='round', facecolor='white', edgecolor='gray'))

    plt.tight_layout()

    # Save plot with short names; close the figure so it is not also rendered inline
    output_path = os.path.join(output_dir6, f'{induced_short}_vs_{uninduced_short}_correlation_plot_NO_outliers.png')
    plt.savefig(output_path, dpi=300)
    plt.close()


**GFP Activity for RTBC and ADBC pairs with >=20 UMIs, outliers removed, AND >= 5 Integrations per Tile**

In [None]:
# GFP activity per tile as colour-coded bar graphs: number of integrations printed above
# each bar, the same y-axis on every plot, and a broken y-axis for better resolution in
# the most common ratio range
int_min_per_tile = 5  #UPDATE minimum integrations per tile
input_folder = output_dir5

output_dir7 = (
    f"{Lib_Name}_output_graphs/"
    f"{Lib_Name}_output_graphs_complex_ratio_nooutliers_{int_min_per_tile}_ints_activity"
)
os.makedirs(output_dir7, exist_ok=True)

def short_df_name(filename):
    """Shorten a summary file name to its last two underscore-separated parts.

    The extension is dropped first. If the remaining stem contains ``df_`` the
    last two ``_``-separated pieces are returned (e.g. ``..._df_1.csv`` ->
    ``df_1``); otherwise the whole stem is returned.
    """
    stem, _extension = os.path.splitext(filename)
    if "df_" not in stem:
        return stem
    pieces = stem.split("_")
    return "_".join(pieces[-2:])

# Collect all mutations across every summary CSV so each mutation gets the same
# color in every plot
all_mutations = []
for filename in os.listdir(input_folder):
    if filename.endswith('.csv'):
        df = pd.read_csv(os.path.join(input_folder, filename))
        all_mutations.extend(df['Mutation'].dropna().unique().tolist())
mutation_order = sorted(set(all_mutations))

# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts the
# same (name, lut) arguments and is available in old and new releases.
cmap = plt.get_cmap('tab20', len(mutation_order))
mutation_colors = {mut: cmap(i) for i, mut in enumerate(mutation_order)}

# One broken-y-axis bar chart per summary CSV: bars are the average complex ratio per
# mutation (error bars = standard error), annotated with the number of integrations.
for filename in os.listdir(input_folder):
    if filename.endswith('.csv'):
        df = pd.read_csv(os.path.join(input_folder, filename))
        plot_df = df[df['number_integrations'] >= int_min_per_tile]
        total_valid_rows = df[df['number_integrations'] >= 1].shape[0]
        total_bars = plot_df.shape[0]

        if plot_df.empty:
            continue

        df_short = short_df_name(filename)
        # Keep the global mutation order but only for mutations present in this file
        present_mutations = [m for m in mutation_order if m in plot_df['Mutation'].values]
        plot_df = plot_df.set_index('Mutation').loc[present_mutations]

        # Broken y-axis with two panels. plt.subplots returns the axes top-to-bottom,
        # so the short high-range panel (ax_top) must be unpacked first and the tall
        # low-range panel (ax_main) second; the previous order drew the 0–5 range
        # above the 5–16 range.
        fig, (ax_top, ax_main) = plt.subplots(2, 1, sharex=True, figsize=(14, 8),
                                              gridspec_kw={'height_ratios': [1, 3]})

        colors = [mutation_colors.get(mut, 'gray') for mut in plot_df.index]

        # Plot the same bars on both axes; each panel shows a different y-window
        bars_main = ax_main.bar(plot_df.index, plot_df['average_Ratio_Complex'],
                                yerr=plot_df['stderror_Ratio_Complex'], capsize=5,
                                alpha=0.7, color=colors)
        bars_top = ax_top.bar(plot_df.index, plot_df['average_Ratio_Complex'],
                              yerr=plot_df['stderror_Ratio_Complex'], capsize=5,
                              alpha=0.7, color=colors)

        # Main (lower) axis shows 0–5
        ax_main.set_ylim(0, 5)
        # Top (upper) axis shows 5–16
        ax_top.set_ylim(5, 16)

        # Hide the spines that face each other across the break
        ax_top.spines['bottom'].set_visible(False)
        ax_main.spines['top'].set_visible(False)
        ax_top.xaxis.tick_top()
        ax_top.tick_params(labeltop=False)
        ax_main.xaxis.tick_bottom()

        # Add diagonal break marks only at the top of lower axis
        d = .015
        kwargs = dict(transform=ax_main.transAxes, color='k', clip_on=False)
        ax_main.plot((-d, +d), (1 - d, 1 + d), **kwargs)
        ax_main.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)

        # Labels and title
        ax_main.set_ylabel('GFP Activity (Average Ratio Complex)')
        fig.suptitle(f'{df_short} - Mutations with ≥{int_min_per_tile} Integrations')

        # Mutation names go on the lower panel only, rotated so they do not overlap
        ax_main.tick_params(axis='x', labelrotation=90, labelsize=8)

        # Add horizontal gridlines
        ax_main.grid(axis='y', linestyle='--', alpha=0.7)
        ax_top.grid(axis='y', linestyle='--', alpha=0.7)

        # Annotate counts above bars, on whichever panel contains the top of the error bar
        for mut, avg, err, count in zip(plot_df.index,
                                        plot_df['average_Ratio_Complex'],
                                        plot_df['stderror_Ratio_Complex'],
                                        plot_df['number_integrations']):
            if pd.notnull(avg):
                top_val = avg + err if pd.notnull(err) else avg
                if top_val <= 5:
                    ax = ax_main
                else:
                    ax = ax_top
                ax.text(mut, top_val + 0.1, f'{int(count):,}',
                        ha='center', va='bottom', fontsize=7)

        # Add legend with summary counts
        legend_text = f'Total bars plotted ≥{int_min_per_tile} ints: {total_bars:,}\nTotal rows with ≥1 ints: {total_valid_rows:,}'
        ax_top.legend([legend_text], loc='upper right', frameon=True)

        plt.tight_layout()

        # Add vertical dotted guide lines across the whole figure #UPDATE comment this out to remove vertical lines
        # These are placed in figure coordinates, so they must be computed after
        # tight_layout has moved the axes; otherwise they no longer line up with the bars.
        ticks = ax_main.get_xticks()
        for tick in ticks:
            x_disp = ax_main.transData.transform((tick, 0))[0]
            x_fig = fig.transFigure.inverted().transform((x_disp, 0))[0]
            fig.add_artist(Line2D([x_fig, x_fig], [0, 1],
                                  transform=fig.transFigure,
                                  color='gray', linestyle=':', linewidth=0.5, alpha=0.5))

        plot_filename = f'{df_short}_plot_min{int_min_per_tile}int_brokenY.png'
        plt.savefig(os.path.join(output_dir7, plot_filename))
        plt.show()
        plt.close(fig)


In [None]:
# #GFP Activity Bar Graphs (leaves empty spaces in graph if it is missing that mutation)
# input_folder = output_dir5
# int_min_per_tile = 3  # UPDATE with the number of min integrations you want per tile

# output_dir7 = f"{Lib_Name}_output_graphs/{Lib_Name}_output_graphs_complex_ratio_nooutliers_{int_min_per_tile}_ints_activity"
# os.makedirs(output_dir7, exist_ok=True)

# # Helper to shorten the filename to just df_X
# def short_df_name(filename):
#     base = os.path.splitext(filename)[0]
#     return "_".join(base.split("_")[-2:]) if "df_" in base else base

# # --- Step 1: Determine global mutation order across all CSVs ---
# all_mutations = []
# for filename in os.listdir(input_folder):
#     if filename.endswith('.csv'):
#         filepath = os.path.join(input_folder, filename)
#         df = pd.read_csv(filepath)
#         all_mutations.extend(df['Mutation'].dropna().unique().tolist())

# # Unique mutations in consistent order
# mutation_order = sorted(set(all_mutations))

# # Assign colors to mutations using a colormap
# cmap = cm.get_cmap('tab20', len(mutation_order))  # tab20 has 20 distinct colors
# mutation_colors = {mut: cmap(i) for i, mut in enumerate(mutation_order)}

# # --- Step 2: Loop through all CSV files and plot ---
# for filename in os.listdir(input_folder):
#     if filename.endswith('.csv'):
#         filepath = os.path.join(input_folder, filename)
#         df = pd.read_csv(filepath)

#         # Filter for plotting and total count
#         plot_df = df[df['number_integrations'] >= int_min_per_tile]
#         total_valid_rows = df[df['number_integrations'] >= 1].shape[0]
#         total_bars = plot_df.shape[0]

#         if plot_df.empty:
#             continue  # skip if no valid rows

#         # Short name for plot title and saved file
#         df_short = short_df_name(filename)

#         # Reindex plot_df to global mutation order (missing mutations will be NaN)
#         plot_df = plot_df.set_index('Mutation').reindex(mutation_order)

#         # Plot
#         plt.figure(figsize=(14, 6))
#         bars = plt.bar(
#             plot_df.index,
#             plot_df['average_Ratio_Complex'],
#             yerr=plot_df['stderror_Ratio_Complex'],
#             capsize=5,
#             alpha=0.7,
#             color=[mutation_colors.get(mut, 'gray') for mut in plot_df.index]
#         )
#         plt.xticks(rotation=90, ha='center', fontsize=8)
#         plt.ylabel('GFP Activity (Average Ratio Complex)')
#         plt.title(f'{df_short} - Mutations with ≥{int_min_per_tile} Integrations')
#         plt.tight_layout()

#         # Annotate each bar above the error bar
#         for bar, avg, err, count in zip(
#             bars,
#             plot_df['average_Ratio_Complex'],
#             plot_df['stderror_Ratio_Complex'],
#             plot_df['number_integrations']
#         ):
#             if pd.notnull(avg):
#                 top = avg + err if pd.notnull(err) else avg
#                 plt.text(bar.get_x() + bar.get_width() / 2, top + 0.01, f'{int(count):,}',
#                          ha='center', va='bottom', fontsize=7)

#         # Add legend with summary counts
#         legend_text = f'Total bars plotted ≥{int_min_per_tile} ints: {total_bars:,}\nTotal rows with ≥1 ints: {total_valid_rows:,}'
#         plt.legend([legend_text], loc='upper right', frameon=True)

#         # Save figure to same folder with short name
#         plot_filename = f'{df_short}_plot_min{int_min_per_tile}int.png'
#         plot_path = os.path.join(output_dir7, plot_filename)
#         plt.savefig(plot_path)
#         plt.show()  # or plt.close()


In [None]:
# GFP activity ratio between the two tables of each pair, per mutation

# Summary CSVs are read from here; int_min is the minimum number of integrations a
# mutation needs in BOTH tables of a pair to be plotted
input_folder = output_dir5
int_min = 3

base_graph_dir = f"{Lib_Name}_output_graphs"
out_dir8 = os.path.join(
    base_graph_dir,
    f"{Lib_Name}_output_graphs_complex_ratio_nooutliers_{int_min}_ints_activity_IND_vs_NI",
)
os.makedirs(out_dir8, exist_ok=True)
print("Saving plots to:", os.path.abspath(out_dir8))

# File name pairs to compare (no .csv needed), written as df numbers that are
# expanded with the common prefix below.
# List the induced first and the uninduced second ****
ratio_file_prefix = 'average_ratio_complex_outliers_removed_df_'
ratio_df_number_pairs = [(1, 2), (3, 4), (5, 6), (7, 8)]
file_pairs = [
    (f'{ratio_file_prefix}{first}', f'{ratio_file_prefix}{second}')
    for first, second in ratio_df_number_pairs
]

# Get sorted CSV files
csv_files = sorted([f for f in os.listdir(input_folder) if f.endswith('.csv')])

# --- Step 1: Determine global mutation order across all CSVs ---
# (so a mutation keeps the same position and color in every plot)
all_mutations = []
for filename in csv_files:
    df = pd.read_csv(os.path.join(input_folder, filename))
    all_mutations.extend(df['Mutation'].dropna().unique().tolist())
mutation_order = sorted(set(all_mutations))

# Assign colors to mutations using a colormap.
# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts the
# same (name, lut) arguments and is available in old and new releases.
cmap = plt.get_cmap('tab20', len(mutation_order))
mutation_colors = {mut: cmap(i) for i, mut in enumerate(mutation_order)}

# --- Step 2: one ratio bar chart per (file1, file2) pair ---
# NOTE(review): the pair list asks for the induced table first and the uninduced table
# second, yet the value plotted is file2 ÷ file1 (second ÷ first). Confirm that this is
# the intended direction of the ratio.
for file1_base, file2_base in file_pairs:
    file1_name = f'{file1_base}.csv'
    file2_name = f'{file2_base}.csv'

    if not (file1_name in csv_files and file2_name in csv_files):
        raise ValueError(f"One of the specified files ({file1_name}, {file2_name}) is not in the folder.")

    # Load both summary tables
    df1 = pd.read_csv(os.path.join(input_folder, file1_name))
    df2 = pd.read_csv(os.path.join(input_folder, file2_name))

    # Row counts reported in the legend: at least one integration vs. at least int_min
    df1_filt = df1[df1['number_integrations'] >= int_min]
    df2_filt = df2[df2['number_integrations'] >= int_min]
    count_1_ge1 = len(df1[df1['number_integrations'] >= 1])
    count_2_ge1 = len(df2[df2['number_integrations'] >= 1])
    count_1_ge_min = len(df1_filt)
    count_2_ge_min = len(df2_filt)

    # Keep only mutations that pass the filter in both tables
    wanted_columns = ['Mutation', 'average_Ratio_Complex']
    merged = df1_filt[wanted_columns].merge(
        df2_filt[wanted_columns],
        on='Mutation',
        suffixes=('_file1', '_file2'),
    )

    # Ratio of the second table's average to the first table's average
    merged['ratio'] = merged['average_Ratio_Complex_file2'] / merged['average_Ratio_Complex_file1']

    # Arrange the rows in the global mutation order
    ordered_mutations = [m for m in mutation_order if m in merged['Mutation'].values]
    merged = merged.set_index('Mutation').reindex(ordered_mutations)

    # --- Single continuous y-axis ---
    fig, ax = plt.subplots(figsize=(14, 6))
    colors = [mutation_colors.get(mut, 'gray') for mut in merged.index]
    bars = ax.bar(merged.index, merged['ratio'], alpha=0.7, color=colors)

    # Same y-range on every plot; raise the upper bound if your ratios exceed it
    ax.set_ylim(0, 6.5)

    ax.set_ylabel('File2 ÷ File1 Ratio')
    ax.set_title(f'{file2_base} ÷ {file1_base} (Mutations with ≥{int_min} Integrations)')
    plt.xticks(rotation=90, ha='center', fontsize=8)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Optional: print the ratio above each bar
    # for bar, ratio in zip(bars, merged['ratio']):
    #     ax.text(bar.get_x() + bar.get_width()/2, ratio + 0.1, f'{ratio:.2f}',
    #             ha='center', va='bottom', fontsize=7)

    # Legend box summarising how many mutations each table contributed
    legend_text = (
        f'{file1_base}: ≥1 ints = {count_1_ge1:,}, ≥{int_min} ints = {count_1_ge_min:,}\n'
        f'{file2_base}: ≥1 ints = {count_2_ge1:,}, ≥{int_min} ints = {count_2_ge_min:,}\n'
        f'Total mutations plotted: {merged.shape[0]:,}'
    )
    ax.legend([legend_text], loc='upper left', frameon=True)

    plt.tight_layout()

    # File name uses only the trailing df numbers of the two table names
    df1_num = file1_base.rsplit('_', 1)[-1]
    df2_num = file2_base.rsplit('_', 1)[-1]
    plot_filename = f'ratio_plot_df_{df1_num}_vs_df_{df2_num}_{int_min}ints.png'
    plot_path = os.path.join(out_dir8, plot_filename)
    plt.savefig(plot_path)
    plt.show()


## Step 4 Closer Look at specific tiles / genes
- hopefully you have lots of good tiles and looking at them all together is too crowded, so use the code below to identify and pull out specific tiles / genes / whatever else you want to look at

In [None]:
# Rank the tiles (mutations) of one dataframe by highest complex ratio, lowest complex
# ratio, or lowest spread. Handy when you need to pick a few mutants to put on a plot.
def get_top_mutations(df, label, metric="spread", top_n=10, min_reps=5):
    """
    Print, display and return the top-ranked mutations of one dataframe.

    df: table with 'Mutation' and 'Ratio_Complex' columns (rows with a missing
        Ratio_Complex are ignored)
    label: name of the dataframe, used only in the printed header
    metric: "spread" (lowest std), "high" (highest mean), "low" (lowest mean)
    top_n: number of mutations to return
    min_reps: minimum replicates (rows) a mutation needs to be ranked

    Returns a DataFrame indexed by Mutation with 'mean' and 'std' columns.
    Raises ValueError for an unknown metric.
    """
    # metric -> (column to sort on, ascending?, wording used in the printed header)
    ranking_rules = {
        "spread": ('std', True, 'lowest spread'),
        "high": ('mean', False, 'highest mean'),
        "low": ('mean', True, 'lowest mean'),
    }

    measured = df.dropna(subset=['Ratio_Complex'])

    # Mutations with enough replicate rows
    enough_reps = (
        measured['Mutation']
        .value_counts()
        .loc[lambda reps: reps >= min_reps]
        .index
    )

    # Per-mutation mean and standard deviation
    stats = (
        measured[measured['Mutation'].isin(enough_reps)]
        .groupby('Mutation')['Ratio_Complex']
        .agg(['mean', 'std'])
    )

    if metric not in ranking_rules:
        raise ValueError("metric must be 'spread', 'high', or 'low'")

    sort_column, ascending, wording = ranking_rules[metric]
    ranked = stats.sort_values(by=sort_column, ascending=ascending).head(top_n)
    print(f"\nTop {top_n} mutations with {wording} in {label} (≥{min_reps} reps):")

    display(ranked)
    return ranked

# Example: rank the mutations of every dataframe in `dataframes` with one metric
rank_metric = "spread"   # "spread" = lowest spread; alternatives: "high" (highest mean), "low" (lowest mean)
for key, df in dataframes.items():
    get_top_mutations(df, key, metric=rank_metric)


In [None]:
# Rank the tiles (mutations) of a single DataFrame by highest complex ratio, lowest
# complex ratio, or lowest spread, and return the top rows
def get_top_mutations(df, metric="spread", n=20, min_reps=5):
    """
    Return per-mutation mean/std of Ratio_Complex, sorted by the chosen metric.

    df: table with 'Mutation' and 'Ratio_Complex' columns (rows with a missing
        Ratio_Complex are ignored)
    metric: "spread" (lowest std), "highest" (highest mean), "lowest" (lowest mean)
    n: number of top mutations to return
    min_reps: minimum replicates (rows) a mutation needs to be ranked

    Returns a DataFrame indexed by Mutation with 'mean' and 'std' columns.
    Raises ValueError for an unknown metric.
    """
    # metric -> (column to sort on, ascending?)
    sort_rules = {
        "spread": ('std', True),
        "highest": ('mean', False),
        "lowest": ('mean', True),
    }

    measured = df.dropna(subset=['Ratio_Complex'])

    # Mutations with enough replicate rows
    enough_reps = (
        measured['Mutation']
        .value_counts()
        .loc[lambda reps: reps >= min_reps]
        .index
    )

    # Per-mutation mean and standard deviation
    stats = (
        measured[measured['Mutation'].isin(enough_reps)]
        .groupby('Mutation')['Ratio_Complex']
        .agg(['mean', 'std'])
    )

    if metric not in sort_rules:
        raise ValueError("metric must be 'spread', 'highest', or 'lowest'")

    sort_column, ascending = sort_rules[metric]
    return stats.sort_values(by=sort_column, ascending=ascending).head(n)

# Find the mutations that show up most often among the top-ranked mutations of several DataFrames
def compare_mutations(df_names, data_dict, metric="spread", n=20, min_reps=5, top=10):
    """
    Rank mutations by how many of the given DataFrames list them in their top n.

    df_names: keys of data_dict to include
    data_dict: mapping of name -> DataFrame accepted by get_top_mutations
    metric, n, min_reps: passed through to get_top_mutations for every DataFrame
    top: number of rows to keep in the final ranking

    The result is indexed by Mutation with columns 'appearances' (number of
    DataFrames whose top n contains the mutation), 'avg_mean' and 'avg_spread'
    (averages of the per-DataFrame mean and std). Ties in 'appearances' are
    broken by avg_spread (ascending) for "spread", avg_mean (descending) for
    "highest" and avg_mean (ascending) for "lowest". The ranking is printed,
    displayed and returned.
    """
    # Top-n table of every requested DataFrame, tagged with where it came from
    per_source = [
        get_top_mutations(data_dict[name], metric=metric, n=n, min_reps=min_reps)[['mean', 'std']]
        .assign(source=name)
        for name in df_names
    ]
    combined = pd.concat(per_source)

    # Count appearances and average values
    appearance_counts = combined.groupby('Mutation').agg(
        appearances=('source', 'count'),
        avg_mean=('mean', 'mean'),
        avg_spread=('std', 'mean')
    )

    # metric -> (tie-break column, ascending?)
    tie_breakers = {
        "spread": ('avg_spread', True),
        "highest": ('avg_mean', False),
        "lowest": ('avg_mean', True),
    }
    if metric in tie_breakers:
        tie_column, tie_ascending = tie_breakers[metric]
        ranked = appearance_counts.sort_values(
            by=['appearances', tie_column], ascending=[False, tie_ascending]
        ).head(top)

    print(f"Top {top} mutations ({metric}) across {', '.join(df_names)}:")
    display(ranked)
    return ranked

# Example usage: the same set of DataFrames ranked once per metric
# (df_names are keys of filtered_dataframes; this example is all Rep1)
df_names = ['df_1', 'df_2', 'df_5', 'df_6']
ranked_spread, ranked_highest, ranked_lowest = [
    compare_mutations(df_names, filtered_dataframes, metric=metric_name)
    for metric_name in ("spread", "highest", "lowest")
]


In [None]:
# Plot the mean Ratio_Complex of selected mutations across time points on one graph

out_dir9 = f"{Lib_Name}_output_graphs/{Lib_Name}_output_graphs_mutations"
os.makedirs(out_dir9, exist_ok=True)

# Mutations to plot; mut_identifier_name goes into the title and the saved file name so
# several different graphs can be told apart
mut_identifier_name = 'muts_I_like'
mutation_list = [
    'P > A',
    'P > L',
    'P > D',
    'VP16H1_killmotif1',
]  # Add any mutations you want

# Which dataframe (key of filtered_dataframes) belongs to which time point, in hours
time_map = {
    'df_3': 0,
    'df_4': 24,
    'df_8': 48
}

plt.figure(figsize=(8, 6))

required_columns = {'Mutation', 'Ratio_Complex'}

# One line (mean ± standard error per time point) for each mutation
for mutation_type in mutation_list:
    time_points = []
    mean_ratios = []
    stderr_ratios = []

    for df_name, time in time_map.items():
        df = filtered_dataframes[df_name]
        if not required_columns.issubset(df.columns):
            continue
        selected = df.loc[df['Mutation'] == mutation_type, 'Ratio_Complex'].dropna()
        if selected.empty:
            continue  # no measurement of this mutation at this time point
        time_points.append(time)
        mean_ratios.append(np.mean(selected))
        stderr_ratios.append(np.std(selected, ddof=1) / np.sqrt(len(selected)))

    plt.errorbar(time_points, mean_ratios, yerr=stderr_ratios, fmt='o-', capsize=5, label=mutation_type)

# Final formatting and export
plt.xlabel('Time (hours)')
plt.ylabel('Ratio Complex')
plt.title(f'Mean Ratio Complex Over Time by Mutation for {mut_identifier_name}')
plt.legend()
plt.tight_layout()
plot_filename2 = f'{Lib_Name}_{mut_identifier_name}_muts_over_time.jpg'
plot_pathnew = os.path.join(out_dir9, plot_filename2)
plt.savefig(plot_pathnew, dpi=300)
plt.show()

In [None]:
# Plot multiple mutations on the same plot at a single time point:
# every replicate as a gray dot, the mean ± SEM in black, and n above each column
timepoint = '24_Hours'
mutations_name = 'good_looking_muts' #name to add in the title and fig save naming to differentiate if you have multiple different graphs
mutation_list = ['P > A',
                 'P > L',
                 'P > D',
                 'T > A',
                 'Q > T',
                 'VP16H1_killmotif1',
                 'VP16C_Y2A']

# Select the 24-hour DataFrame
df_24 = filtered_dataframes['df_4']

# Prepare plot
plt.figure(figsize=(8, 6))

# Track max y-value to position sample counts
global_max = 0
sample_counts = []

# The column check does not depend on the mutation, so do it once
has_required_columns = 'Mutation' in df_24.columns and 'Ratio_Complex' in df_24.columns

# Loop through each mutation
for i, mutation_type in enumerate(mutation_list):
    if not has_required_columns:
        break
    selected = df_24[df_24['Mutation'] == mutation_type]['Ratio_Complex'].dropna()

    # Plot individual dots
    x_vals = np.full(len(selected), i)
    plt.scatter(x_vals, selected, color='gray', alpha=0.6, s=40)

    # Plot mean ± SEM
    n_selected = len(selected)
    if n_selected > 0:
        mean_val = np.mean(selected)
        # Named sem_val (not sem) so the scipy.stats.sem function imported at the top
        # of the notebook is not overwritten for the cells that call it.
        # A single replicate has no standard error: draw the mean without an error bar.
        if n_selected > 1:
            sem_val = np.std(selected, ddof=1) / np.sqrt(n_selected)
        else:
            sem_val = np.nan
        plt.errorbar(i, mean_val, yerr=sem_val, fmt='o', color='black', capsize=5, markersize=8)

        # A NaN SEM must not be added here: max(x, nan) keeps x, which silently left
        # single-replicate points out of the y-range.
        headroom = sem_val if np.isfinite(sem_val) else 0
        global_max = max(global_max, max(selected) + headroom)
        sample_counts.append((i, n_selected))

# Add sample counts at top of plot (fall back to 1.0 when nothing was plotted so the
# y-axis does not collapse to a zero-height range)
y_top = global_max + 0.1 * global_max if global_max > 0 else 1.0
for i, count in sample_counts:
    plt.text(i, y_top, f'n={count}', ha='center', va='bottom', fontsize=9)

# Final formatting
plt.xticks(range(len(mutation_list)), mutation_list, rotation=45)
plt.ylabel('TREBL-Seq Activity')
plt.xlabel('Mutation')
plt.title(f'TREBL-Seq Activity at {timepoint} for {mutations_name}')
plt.ylim(0, y_top + 0.1 * y_top)
plt.tight_layout()
plot_filename2 = f'{Lib_Name}_{mutations_name}_muts_{timepoint}.jpg'
plot_pathnew = os.path.join(out_dir9, plot_filename2)
plt.savefig(plot_pathnew, dpi=300)
plt.show()