In [1]:
from open_image import Img
import train
import utils_train

%reload_ext autoreload
%autoreload 2

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')




## Set paths

In [2]:
# Field of view and hybridization rounds used throughout the notebook.
fov = 1
hyb_list = [2, 3, 4]

# Root of the analysis output; the per-cell tables and the demultiplexing
# result for this FOV are addressed relative to it.
# NOTE(review): all paths below are absolute, machine-specific Windows paths.
analysis_root = r'X:\danielda\analysis\zp_auto_080524_late_exp_100x'

# CSV the labelling session saves to and the training-df cell reads back.
selection_path = r'X:\sandbox\for_yed\new_exp\sample_selections.csv'
whole_cell_data_path = analysis_root + r'\whole_cell_data'
demult_path = analysis_root + rf'\demultiplexing\samples_fov_{fov}_demult.txt'
automation_summary_path = r'C:\Users\yedidyab\Downloads\automation_summary.xlsx'

In [3]:
# (sample label, first probe, second probe): a cell is assigned to a sample
# when both of that sample's probe columns are called positive.
_sample_probe_pairs = (
    ('OD_0.15', 'R223', 'R224'),
    ('OD_0.85', 'R223', 'R225'),
    ('OD_1.1', 'R226', 'R228'),
    ('OD_1.1_B', 'R226', 'R230'),
    ('OD_2.8', 'R229', 'R224'),
    ('OD_3.8', 'R229', 'R225'),
)

# Values stay lists so they can be used directly as a column selection.
probs_dict = {
    sample: [first_probe, second_probe]
    for sample, first_probe, second_probe in _sample_probe_pairs
}


# Train Genes
### Load image

In [4]:
# Hybridization round -> image-set prefix, as passed to Img. The image
# attributes read below are named '<prefix>_<channel>_<round>'.
hybs = {1: 'ref', 2: 'bc1', 3: 'bc1', 4: 'bc1'}
temp = Img(
    path=r'X:\danielda\analysis\zp_auto_080524_late_exp_100x',
    fov=fov,
    automation_summary_path=automation_summary_path,
    hybs=hybs,
)

# Images handed to the labelling / plotting helpers: for every round (in the
# order of `hybs`) the A647, A488 and A550 channel images.
display_channels = ('A647', 'A488', 'A550')
to_show = [
    getattr(temp, f'{prefix}_{channel}_{hyb}')
    for hyb, prefix in hybs.items()
    for channel in display_channels
]

# One title per entry of `to_show`, in the same order.
titels = [
    'refA647',
    'refA488',
    'refA550',
    'bc1_hyb2_A647_R223',
    'bc1_hyb2_A488_R224',
    'bc1_hyb2_A550_R225',
    'bc1_hyb3_A647_R226',
    'bc1_hyb3_A488_R227',
    'bc1_hyb3_A550_R228',
    'bc1_hyb4_A647_R229',
    'bc1_hyb4_A488_R230',
    'bc1_hyb4_A550_R231',
]


# Label cells

In [5]:
# Start the labelling session on the loaded image; the session is given
# `selection_path` as its save location, which the training-df cell reads back.
# NOTE(review): the saved output of this cell ends in
# "TypeError: Image data of dtype <U... cannot be converted to float" --
# confirm the positional argument order against train.run's signature.
train.run(
    temp,
    titels,
    automation_summary_path,
    to_show,
    save_path=selection_path,
    session_length=250,
    space=5,
)

  0%|          | 0/250 [00:00<?, ?it/s]


TypeError: Image data of dtype <U616656 cannot be converted to float

# Build training df

In [None]:
def _with_sorted_columns(frame):
    """Return `frame` with its columns reordered into sorted order."""
    return frame.reindex(sorted(frame.columns), axis=1)


# Per-cell table for this FOV and the selected hybridization rounds.
whole_cell_data = _with_sorted_columns(
    utils_train.concat_files(
        fov_list=[fov],
        hyb_list=hyb_list,
        directory_path=whole_cell_data_path,
    )
)

# Saved selections, without the three reference-channel columns.
selections = _with_sorted_columns(
    pd.read_csv(selection_path).drop(columns=['refA647', 'refA488', 'refA550'])
)

# Attach the per-cell table to every labelled cell (selection 'Index' == 'cell_id').
df1 = selections.merge(whole_cell_data, left_on='Index', right_on='cell_id')

# Attach the demultiplexing table, then drop the join key and the sample name.
# NOTE(review): if df1 already carries a regular 'cell_id' column, this merge
# would suffix both copies (cell_id_x / cell_id_y) -- confirm that a plain
# 'cell_id' column still exists for the drop below.
demult = pd.read_csv(demult_path, sep='\t')
df = (
    df1
    .merge(demult, left_on='Index', right_on='cell_id')
    .drop(columns=['cell_id', 'sample_name'])
)
df

# Train new model

In [None]:
# Fit one model per imaging channel on the training frame. create_model is
# unpacked as a (model, prediction frame) pair for each channel.
train_channels = ['A647', 'A488', 'A550']
trained = {
    channel_name: utils_train.create_model(
        df,
        automation_summary_path=automation_summary_path,
        channel=channel_name,
        hyb_list=hyb_list,
        test_size=0.5,
    )
    for channel_name in train_channels
}

A647_random_forest_model, A647_pred_df = trained['A647']
A488_random_forest_model, A488_pred_df = trained['A488']
A550_random_forest_model, A550_pred_df = trained['A550']

# Same channel order as `train_channels`; the prediction cell zips
# `model_zoo` against its own channel list.
model_zoo = [A647_random_forest_model, A488_random_forest_model, A550_random_forest_model]
eval_dfs = [A647_pred_df, A488_pred_df, A550_pred_df]

# Build evaluation df

In [None]:
# evaluate
# Put the per-channel prediction frames side by side, keep the first copy of
# any repeated column, and order the columns by name.
eval_df = pd.concat(eval_dfs, axis=1)
eval_df = eval_df.loc[:, ~eval_df.columns.duplicated()].copy()
eval_df = eval_df.reindex(sorted(eval_df.columns), axis=1)

# Predicted columns contain 'prob'; label columns contain 'bc' but not 'prob'.
pred_cols = [col for col in eval_df.columns if 'prob' in col]
true_cols = [col for col in eval_df.columns if 'bc' in col and 'prob' not in col]
relevant_cols = ['cell_id'] + true_cols + pred_cols

# Take an explicit copy: new columns are assigned into this frame below, and
# writing into a bare column subset is a chained-assignment hazard.
eval_df = eval_df[relevant_cols].copy()

# Calculate the difference (prediction - label) and add it as new columns.
# NOTE(review): the pairing relies on both lists coming out of the sorted
# column order in matching positions, and zip() silently stops at the shorter
# list -- confirm pred_cols and true_cols line up one-to-one.
delta_cols = []
for pred_col, true_col in zip(pred_cols, true_cols):
    diff_col = f"{pred_col.split('_')[-1]}_diff"  # named after the last '_' token of the prediction column
    delta_cols.append(diff_col)
    eval_df[diff_col] = eval_df[pred_col] - eval_df[true_col]

eval_df

# Check problematic cells

In [None]:
# Plot the evaluation and keep the three values plot_eval returns: the table
# behind the heatmap, plus the positions and cell ids of the cells it flags.
data_for_heatmap, problematic_idx, problematic_cell_id = utils_train.plot_eval(
    eval_df,
    true_cols,
    pred_cols,
    delta_cols,
)

In [None]:
# Rows of the heatmap table that were flagged as problematic (positional lookup).
problematic = data_for_heatmap.iloc[problematic_idx]

# Annotated heatmap without a colour bar; cells are separated by black lines.
heatmap_ax = sns.heatmap(
    problematic,
    annot=True,
    cmap='viridis',
    cbar=False,
    linewidths=2,
    linecolor='black',
)
# Save the figure the heatmap was drawn on before showing it.
heatmap_ax.figure.savefig('sampels_problematic.png')
plt.show()

In [None]:
# Show problematic cells: one plot_by_idx call per flagged cell id, using the
# same images and titles as the labelling session.
for problem_cell in problematic_cell_id:
    utils_train.plot_by_idx(problem_cell, temp, to_show, titels)

# Predict for all the cells

In [None]:
# Run every trained model on its own channel. `channel_list` must stay in the
# same order as `model_zoo`, because the two are zipped together.
channel_list = ['A647', 'A488', 'A550']
predict_dfs = [
    utils_train.predict(
        model=channel_model,
        automation_summary_path=automation_summary_path,
        cutoff=0.6,
        channel=channel_name,
        fov_list=[fov],
        hyb_list=hyb_list,
        directory_path=whole_cell_data_path,
        demult_path=demult_path,
    )
    for channel_model, channel_name in zip(model_zoo, channel_list)
]

# Side-by-side concat, keep the first copy of repeated columns, sort columns by name.
predict_df = (
    pd.concat(predict_dfs, axis=1)
    .loc[:, lambda frame: ~frame.columns.duplicated()]
    .copy()
    .pipe(lambda frame: frame.reindex(sorted(frame.columns), axis=1))
)
# `pred_cols` is defined by the evaluation cell, which must have run first.
relevant_cols = ['cell_id'] + pred_cols

# Keep only cells whose id appears among the values of the image's mask array.
ids_in_masks = np.unique(temp.masks)
predict_df = predict_df[predict_df['cell_id'].isin(ids_in_masks)]

predict_df[relevant_cols]


In [22]:
# Probability above which a probe is called positive for a cell.
cutoff = 0.5

# `pred_cols` is defined by the evaluation cell, which must have run first.
relevant_cols = ['cell_id'] + pred_cols
# Short probe name = last '_'-separated token of each prediction column.
probe_names = [col.split('_')[-1] for col in pred_cols]

# Work on an independent copy so that renaming and adding columns never
# writes into (or warns about) a slice of predict_df.
df = predict_df[relevant_cols].copy()
df.columns = ['cell_id'] + probe_names

# One boolean per cell and probe; `total` is the number of positive probes.
positive = df[probe_names] > cutoff
df['total'] = positive.sum(axis=1)
df['predicted_sample'] = None

# A cell is assigned to a sample when both of that sample's probes are positive.
# The same `cutoff` mask is used here and for `total`, so the two cannot
# disagree (for probabilities in [0, 1] and cutoff 0.5 this matches rounding).
for sample, probs in probs_dict.items():
    both_positive = positive[probs[0]] & positive[probs[1]]
    df.loc[both_positive, 'predicted_sample'] = sample

# Probe-count overrides, applied after the sample assignment on purpose:
# they replace any sample label given to a cell with 0, 1 or more than 2 probes.
df.loc[df.total == 0, 'predicted_sample'] = 'no_signal'
df.loc[df.total == 1, 'predicted_sample'] = 'partial_signal'
df.loc[df.total > 2, 'predicted_sample'] = 'too_many_signals'
# Exactly two positive probes that match no known sample pair.
# Assigned back explicitly: an in-place fillna on `df.predicted_sample` is a
# chained operation that is not guaranteed to modify `df`.
df['predicted_sample'] = df['predicted_sample'].fillna('false_positive')

df

Unnamed: 0,cell_id,R224,R225,R223,R227,R228,R226,R230,R231,R229,R233,R234,R232,total,predicted_sample
0,1,0.0,0.0,0.40,0.0,0.0,0.40,0.31,0.0,0.0,0.0,0.49,0.0,0,no_signal
1,2,0.0,0.0,0.57,0.0,0.0,0.23,0.31,0.0,0.0,0.0,0.39,0.0,1,partial_signal
2,3,0.0,0.0,0.58,0.0,0.0,0.22,0.26,0.0,0.0,0.0,0.29,0.0,1,partial_signal
3,4,0.0,0.0,0.40,0.0,0.0,0.40,0.54,0.0,0.0,0.0,0.57,0.0,2,false_positive
4,5,0.0,0.0,0.14,0.0,0.0,0.66,0.58,0.0,0.0,0.0,0.18,0.0,2,OD_1.1_B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7958,7959,0.0,0.0,0.15,0.0,0.0,0.65,0.57,0.0,0.0,0.0,0.18,0.0,2,OD_1.1_B
7959,7960,0.0,0.0,0.55,0.0,0.0,0.25,0.38,0.0,0.0,0.0,0.53,0.0,2,false_positive
7960,7961,0.0,0.0,0.32,0.0,0.0,0.48,0.43,0.0,0.0,0.0,0.22,0.0,0,no_signal
7961,7962,0.0,0.0,0.23,0.0,0.0,0.57,0.26,0.0,0.0,0.0,0.62,0.0,2,false_positive


In [None]:

# Directory under which one folder per predicted sample is created
base_dir = "samples_QC"

# Maximum number of example cells plotted into each folder
max_cells_per_folder = 5

# Number of rows seen so far for each folder name
folder_count = {}

for index, row in df.iterrows():

    folder_name = f"{row['predicted_sample']}"
    folder_path = os.path.join(base_dir, folder_name)

    # Count this row for its folder (starting from 0 on first sight)
    folder_count[folder_name] = folder_count.get(folder_name, 0) + 1

    # Only the first `max_cells_per_folder` cells of each folder are plotted
    if folder_count[folder_name] > max_cells_per_folder:
        continue

    # Create the folder if needed; a no-op when it already exists
    os.makedirs(folder_path, exist_ok=True)

    cell_id = row['cell_id']
    print(folder_path, cell_id)
    # Build the file path with os.path.join instead of a hard-coded '\' separator
    utils_train.plot_by_idx(cell_id, temp, to_show, titels, save_path=os.path.join(folder_path, f'{cell_id}.png'))
