In [26]:
#Imports
import pathlib
import numpy as np
import pandas as pd

# pycytominer imports
from pycytominer.cyto_utils.cells import SingleCells
from pycytominer import aggregate, annotate, normalize, feature_select

# ignore mix type warnings from pandas
import warnings
#plotting
import matplotlib.colors as mcolors
from matplotlib.patches import Rectangle
import math
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import joypy
import scipy
from scipy import stats
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.ensemble import IsolationForest

from dask import dataframe as dd

In [27]:
# Setting file paths
data_dir = pathlib.Path("/mnt/bigdisk1/Allie_S/Replicative_Age_Project/CP_Output/Mar26").resolve(strict=True)
plate = "Mar26_mitolyso_v3"
metadata_dir = (data_dir / "metadata").resolve(strict=True)

plate_map = (metadata_dir / 'map.csv').resolve(strict=True)

In [None]:
# input file paths for csv
image_path = pathlib.Path(data_dir / plate / "Image.csv").resolve(strict=True)
cell_path = pathlib.Path(data_dir / plate / "CellOutline.csv").resolve(strict=True)
nuc_path = pathlib.Path(data_dir / plate / "Nuclei.csv").resolve(strict=True)
lyso_path = pathlib.Path(data_dir / plate / "Lysosomes.csv").resolve(strict=True)
mito_path = pathlib.Path(data_dir / plate / "Mitochondria.csv").resolve(strict=True)

In [45]:
#Input paths for db  
db_path = pathlib.Path(data_dir / plate / "Mar26_MitoLyso.db").resolve(strict=True)

#setting output paths
out_dir = pathlib.Path("/mnt/bigdisk1/Allie_S/Replicative_Age_Project/Data_Mining/test_output/results")
out_dir.mkdir(exist_ok=True)

sc_profiles_path = out_dir / "single_cell_profile.csv.gz"
anno_profiles_path = out_dir / "annotated_profile.csv.gz"
norm_profiles_path = out_dir / "normalized_profile.csv.gz"
feat_profiles_path = out_dir / "features_profile.csv.gz"

#img_db = pd.read_sql("SELECT ImageNumber, Metadata_Well, AreaShape_Area FROM Cell", conn)

# update compartment names and strata
strata = ["Image_Metadata_Well", "Image_Metadata_Plate"]
compartments = ["Cells", "Cytoplasm", "Nuclei"]

# Updating linking columns for merging all compartments
linking_cols = {
    "Cytoplasm": {
        "Cells": "Cytoplasm_Parent_Cells",
        "Nuclei": "Cytoplasm_Parent_Nuclei",
    },
    "Cells": {"Cytoplasm": "Cells_Number_Object_Number"},
    "Nuclei": {"Cytoplasm": "Nuclei_Number_Object_Number"},
}

In [30]:
#Import the db and make single-cell profiles
platemap_df = pd.read_csv(plate_map)
print(platemap_df.columns.tolist())



['Metadata_Well', 'Metadata_WellRow', 'Metadata_WellColumn', 'Metadata_Field', 'TimepointName', 'Replicate', 'Time', 'PassageNumber', 'Staining', 'Drug']


In [47]:

# setting up sqlite address
sqlite_address = f"sqlite:///{str(db_path)}"

# loading single cell morphology data into pycyotminer's SingleCells Object
single_cell_profile = SingleCells(
    sql_file=sqlite_address,
    compartments=compartments,
    compartment_linking_cols=linking_cols,
    image_table_name="Mar26_Per_Image",
    strata=strata,
    merge_cols=["ImageNumber"],
    image_cols="ImageNumber",
    load_image_data=True,
)

# merging all sqlite table into a single tabular dataset (csv) and save as
# compressed csv file
single_cell_profile.merge_single_cells(
    sc_output_file=sc_profiles_path, compression_options="gzip"
)


AttributeError: 'Connection' object has no attribute 'cursor'

In [3]:
# loading plate map and display it


# displaying platemap
print(platemap_df.columns.tolist())

#Import the cellprofiler csv files using dask to save memory
image_df = dd.read_csv(image_path)
pre_nuclei_df = dd.read_csv(nuc_path, usecols=['ObjectNumber','ImageNumber','Metadata_Well', 'Metadata_Field', 
                                          'Metadata_WellColumn', 'Metadata_WellRow', 'AreaShape_Area',
                                         'AreaShape_Eccentricity','Parent_CellOutline'],index_col = False)
pre_cell_df = dd.read_csv(cell_path, usecols=['ObjectNumber','ImageNumber','Metadata_Well', 'Metadata_Field', 
                                          'Metadata_WellColumn', 'Metadata_WellRow', 'AreaShape_Area',
                                         'AreaShape_Eccentricity','Children_Lysosomes_Count', 'Children_Mitochondria_Count',
                                          'Math_Lysosome_CellOutline_Ratio', 'Math_Mitochondria_CellOutline_Ratio'], index_col = False)
pre_lyso_df = dd.read_csv(lyso_path, usecols=['ObjectNumber','ImageNumber','Metadata_Well', 'Metadata_Field',
                                          'Metadata_WellColumn', 'Metadata_WellRow', 'AreaShape_Area',
                                         'AreaShape_Eccentricity','Parent_CellOutline','Intensity_MedianIntensity_LAMP1',
                                         'Intensity_MeanIntensity_LAMP1','Texture_Contrast_LAMP1_3_01_256'],index_col = False)
pre_mito_df = dd.read_csv(mito_path, usecols=['ObjectNumber','ImageNumber','Metadata_Well', 'Metadata_Field',
                                          'Metadata_WellColumn', 'Metadata_WellRow', 'AreaShape_Area',
                                         'AreaShape_Eccentricity','Parent_CellOutline','Intensity_MedianIntensity_MitoTracker',
                                         'Intensity_MeanIntensity_MitoTracker','Texture_Contrast_MitoTracker_3_01_256'],index_col = False)


['Metadata_Well', 'TimepointName', 'Staining', 'Metadata_WellRow', 'Metadata_WellColumn', 'Metadata_Field', 'Replicate', 'Time', 'PassageNumber', 'Drug', 'Polylysine']


In [4]:
#Merge the dfs with the metadata
cell_df = pre_cell_df.merge(platemap_df, on=['Metadata_Well','Metadata_WellRow','Metadata_WellColumn','Metadata_Field'], how ='left')
lyso_df = pre_lyso_df.merge(platemap_df, on=['Metadata_Well','Metadata_WellRow','Metadata_WellColumn','Metadata_Field'], how ='left')
mito_df = pre_mito_df.merge(platemap_df, on=['Metadata_Well','Metadata_WellRow','Metadata_WellColumn','Metadata_Field'], how ='left')
nuclei_df Show Co= pre_nuclei_df.merge(platemap_df, on=['Metadata_Well','Metadata_WellRow','Metadata_WellColumn','Metadata_Field'], how ='left')

###cell_df.set_index('ObjectNumber')
#dfs are still of the dask df type 
#print(type(cell_df))


# Welcome to join hell
Joining two 6GB files will definitley crash...without a different approach (dask, SQL, or concat and remove dupes)

In [None]:
#Trying with right join

#Rename the Cell object number column so pandas doesn't get confused
renamed_cell_df = cell_df.rename(columns={"ObjectNumber":"CellObjectNumber"})

intersect_cols = renamed_cell_df.columns.intersection(lyso_df.columns,sort=True).to_numpy
print(intersect_cols)

merged_lyso = renamed_cell_df.merge(lyso_df, how ='right', on=intersect_cols,
                            left_on='CellObjectNumber', right_on='Parent_CellOutline',
                            suffixes = ('_Cell','_Compartment'), indicator = True, broadcast = True)

display(merged_lyso.head(100).style)
#print(cell_df.head(100))

In [None]:
#Trying with leftsemijoin

#Rename the Cell object number column so pandas doesn't get confused
renamed_cell_df = cell_df.rename(columns={"ObjectNumber":"CellObjectNumber"})


merged_lyso = lyso_df.merge(renamed_cell_df, how ='leftsemi', #on=['Metadata*],
                            right_on='CellObjectNumber', left_on='Parent_CellOutline',
                            suffixes = ('_Cell','_Compartment'), indicator = True)

display(merged_lyso.head(20))
print(merged_lyso.columns)

## Getting closer; but dask will drop the right columns... not ideal

In [None]:
#Trying with a left whatever #crashes fml

#Rename the Cell object number column so pandas doesn't get confused
renamed_cell_df = cell_df.rename(columns={"ObjectNumber":"CellObjectNumber"})
#computed_lyso = lyso_df.compute()

merged_lyso = lyso_df.merge(renamed_cell_df, how = 'left',
                                  right_on='CellObjectNumber', left_on='Parent_CellOutline',
                                 suffixes = ('_Lyso','_Cell'), indicator = True)

merged_lyso.to_csv('path_to_merged.csv', index=False)

#print(merged_lyso.columns)


## Don't do this!
```python
##Try it in chunks

#Define the chunk size
chunk_size = 10000

# Initialize an empty list to store the merged chunks
merged_chunks = []

# Read the cell data once
pre_cell_df = pd.read_csv(cell_path, usecols=['ObjectNumber','ImageNumber','Metadata_Well', 'Metadata_Field', 
                                          'Metadata_WellColumn', 'Metadata_WellRow', 'AreaShape_Area',
                                         'AreaShape_Eccentricity','Children_Lysosomes_Count', 'Children_Mitochondria_Count',
                                          'Math_Lysosome_CellOutline_Ratio', 'Math_Mitochondria_CellOutline_Ratio'], index_col = False)

with open('merged.csv', 'w') as f:
    # Write the header
    header_written = False
    
    pre_lyso_df = pd.read_csv(lyso_path, usecols=['ObjectNumber','ImageNumber','Metadata_Well', 'Metadata_Field',
                                          'Metadata_WellColumn', 'Metadata_WellRow', 'AreaShape_Area',
                                         'AreaShape_Eccentricity','Parent_CellOutline','Intensity_MedianIntensity_LAMP1',
                                         'Intensity_MeanIntensity_LAMP1','Texture_Contrast_LAMP1_3_01_256'],index_col = False,chunksize = chunk_size)
    # Process the lysosome data in chunks
    for lysosome_chunk in pre_lyso_df:
        merged_chunk = pd.merge(lysosome_chunk, pre_cell_df, left_on='Parent_CellOutline', right_on='ObjectNumber', how='left')
        if not header_written:
            merged_chunk.to_csv(f, index=False)
            header_written = True
            merged_chunks.append(merged_chunk)
        else:
            merged_chunk.to_csv(f, index=False, header=False)

```
What's happening here is the **cartesian effect** - the join is seeing my rows as duplicates, and thus doing vector multiplication and making my life a living hell.
- Avoid left/right/outer joins when you aren't specifying "WHERE" to join on. Otherwise, the computer won't know what to do.


## This crashes
'''

danger_merged_lyso = merged_lyso.compute()
display(danger_merged_lyso.head(20))

'''

In [None]:
display(merged_lyso.compute().head(20))

In [7]:
#valid_merged_lyso = merged_lyso[merged_lyso['CellObjectNumber'].notnull() & merged_lyso['Parent_CellOutline'].notnull()]
                        
print(f"Number of rows in merged DataFrame: {merged_lyso.shape[0].compute()}")
print(f"Number of unique lysosomes: {lyso_df.shape[0].compute()}")
print(f"Number of unique cells: {cell_df.shape[0].compute()}")
#group = merge_lyso.where()

#display(merged_lyso.compute().head(10))

Number of rows in merged DataFrame: 587282296
Number of unique lysosomes: 1270303
Number of unique cells: 7376


In [None]:
query = lyso_df['Parent_CellOutline'].astype(int) < 1
display(query)


In [7]:
#Trying otut summary stats
sample_df = lyso_df.sample(frac=0.1).compute()  # Sample 1% of the data for visualization

display(sample_df.describe())

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Field,Metadata_WellColumn,Metadata_WellRow,AreaShape_Area,AreaShape_Eccentricity,Intensity_MeanIntensity_LAMP1,Intensity_MedianIntensity_LAMP1,Parent_CellOutline,Texture_Contrast_LAMP1_3_01_256,Replicate,Time,PassageNumber
count,127028.0,127028.0,127028.0,127028.0,127028.0,127028.0,127028.0,127028.0,127028.0,127028.0,127028.0,127028.0,127028.0,127028.0
mean,582.619289,955.06412,17.266453,4.118911,4.402982,140.886529,0.641414,0.109842,0.106502,4.510856,186.035958,3.118911,1.881089,11.030804
std,356.684464,839.815325,11.318295,1.347064,1.751927,125.296455,0.19396,0.057771,0.056714,5.026357,265.017958,1.347064,1.347064,3.631007
min,2.0,1.0,1.0,2.0,2.0,1.0,0.0,0.002317,0.002319,0.0,0.0,1.0,0.0,6.0
25%,266.0,341.0,8.0,3.0,3.0,48.0,0.532778,0.070899,0.068559,1.0,21.603488,2.0,1.0,9.0
50%,562.0,745.0,15.0,4.0,4.0,107.0,0.673879,0.100315,0.09691,3.0,103.834848,3.0,2.0,11.0
75%,900.0,1334.0,26.0,5.0,6.0,198.0,0.787133,0.137081,0.132311,7.0,246.845333,4.0,3.0,14.0
max,1200.0,7604.0,40.0,6.0,7.0,1287.0,1.0,0.932394,1.0,36.0,9516.976471,5.0,4.0,17.0


In [None]:
#Sample visualizattion
sample_df = merged_lyso.sample(frac=0.1).compute()  # Sample 1% of the data for visualization
plt.scatter(sample_df['Metadata_WellColumn'], sample_df['AreaShape_Area'])
plt.xlabel('Col')
plt.ylabel('Area')
plt.show()

In [None]:
#Welcome to join hell - Aligned?


merged_lyso = cell_df.align(lyso_df, how ='right', on=['Metadata_Well','Metadata_WellRow','Metadata_WellColumn','Metadata_Field'],
                            left_on='ObjectNumber', right_on='Parent_CellOutline',
                            suffixes = ('_Cell','_Compartment'), indicator = True)

display(merged_lyso.head(100).style)
#print(cell_df.head(100))

In [None]:
#Join hell but with vectors - if cross join works we might be so back
lyso_df_pd = lyso_df.compute()
cell_df_pd = cell_df.compute()

#merged_lyso = cell_df_pd.merge(lyso_df_pd, how ='cross') #its so over

In [None]:
#Features to be used in plots and stats
cell_features = ['AreaShape_Area','AreaShape_Eccentricity','Children_Lysosomes_Count', 'Children_Mitochondria_Count',
                 'Math_Lysosome_CellOutline_Ratio', 'Math_Mitochondria_CellOutline_Ratio']
lyso_features = ['AreaShape_Area','AreaShape_Eccentricity','Intensity_MedianIntensity_LAMP1',
                 'Intensity_MeanIntensity_LAMP1','Texture_Contrast_LAMP1_3_01_256']
mito_features = ['AreaShape_Area','AreaShape_Eccentricity','Texture_Contrast_MitoTracker_3_01_256']
nuclei_features = ['AreaShape_Area','AreaShape_Eccentricity']

In [None]:
#Make time int
for df in [mito_df, nuclei_df, cell_df, lyso_df]:

    df['Time'] = df['Time'].astype(int)

In [None]:
#Function to find the ratio between two columns in the two dataframes
def ratioCalc(df1, df2):
  int1 = df1['Intensity_MedianIntensity_CompensatedTfn']
  int2 = df2['Intensity_MedianIntensity_CompensatedTfn']

  temp_copy1 = outlier_removal(df1, int1)
  temp_copy2 = outlier_removal(df2, int2)

  intensity_ratio = temp_copy1[int1] / temp_copy1[int2]
  return df[intensity_ratio]

def normalize_timepoint(df, column):
    #A function to normalize values in a column to those of the time 0 
    time_zero_df = df.where(df['Time'] == 0)

 #normalizing intensity from 0 to 1 - DEPRECATE THIS
def normalization(df, column):
  min_intensity = df[column].min()
  max_intensity = df[column].max()
  df[column] = (df[column] - min_intensity) / (max_intensity - min_intensity)
  return df[column]

def mad_normalization(df, column): #Also deprecate this 

  mad = scipy.stats.median_abs_deviation(df[column])
  outlier_mask = np.abs(df[column] - df[column].median()) > mad*1.5
  cleaned_df = df[~outlier_mask]
  #recalaculate mad and make a mask of mads that go over the threshold (then negate them)
  return cleaned_df[column]

def outlier_removal(df, column):
    # Create a copy of the column and the 'Time' column

    if 'Ikarugamycin' in df: #remove this for now
      filtered_df = df[df['Drug'] != 'Ikarugamycin']

      column_copy = filtered_df[column].copy()
      time_column = filtered_df['Time'].copy()
      parent_column = filtered_df['Parent_Nuclei'].copy()

      mini_df = pd.DataFrame({column: column_copy,'Time': time_column,'Parent_Nuclei' : parent_column})
      mini_df = mini_df.dropna()
    else:
      column_copy = df[column].copy()
      time_column = df['Time'].copy()
       # Create a mini DataFrame with the column and 'Time' values
      mini_df = pd.DataFrame({column: column_copy, 'Time': time_column})
      mini_df = mini_df.dropna()

    #remove values for each type of data
    if df.equals(nuclei_df):
      # remove stuff within the range of t4
      four_mean = np.mean(column_copy[df['Time'] == 4])
      std_dev = np.std(column_copy[df['Time'] == 4])
      threshold = four_mean + (3 * std_dev)

      mini_df = mini_df[mini_df[column] <= threshold]
      mini_df = mini_df.reset_index(drop=True)


    if 'AreaShape' in column:
      mini_df[column] = mini_df[column].astype(float)
      mini_df = mini_df.loc[(mini_df[column] <= 300000) & (mini_df[column] > 0)]
      mini_df = mini_df.reset_index(drop=True)

    if 'Texture' in column:
      mini_df[column] = mini_df[column].astype(float)
      mini_df = mini_df.loc[(mini_df[column] <= 200) & (mini_df[column] > 0)]
      mini_df = mini_df.reset_index(drop=True)

      # Calculate top and bottom percentiles for Time == 0
    p1 = np.percentile(mini_df[column], 5)
    p3 = np.percentile(mini_df[column], 95)

    # Filter out values greater than 3IQR from Q1 or Q3
    filtered_mini_df = mini_df.loc[(mini_df[column] >= p1) & (mini_df[column] <= p3)]
    #Train outlier detection algorithm on non-images csvs
    if df.dropna().equals(image_df.dropna()):
    #pd.testing.assert_frame_equal(df,image_df):
      return filtered_mini_df
    else:
      cleaned_df = []
      groups = filtered_mini_df['Time'].unique()
      for group in groups:

        group = int(group)
        group_mini_df = filtered_mini_df[filtered_mini_df['Time'] == group]

        X_1D = group_mini_df[column].values
        X = X_1D.reshape(-1, 1)
        clf = IsolationForest(n_estimators=20, random_state=42, contamination='auto')
        clf.fit(X)  # fit 20 trees

        #Predict outliers and remove from X
        preds = clf.predict(X)
        outlier_indices = np.where(preds == -1)[0]
        # Create a masked df for each group to block out the values of outliers from the original dataset (use ~ to negate outliers so they can be removed)
        cleaned_group_df = group_mini_df.loc[~group_mini_df.index.isin(outlier_indices)]
        #add the masked df to the list
        cleaned_df.append(cleaned_group_df)

      cleaned_df = pd.concat(cleaned_df, ignore_index=True) #collapse list into df
    return pd.DataFrame(cleaned_df)



# if df.equals(mito_df) or df.equals(lysosomes_df):
# if df[Parents] == nuclei_df[ObjectNumber]. - only have the above if
#if 'Area' in column:
# Calculate mean and standard deviation for Time == 4

In [None]:
def normalization(df, column): #normalizing intensity from 0 to 1
  min_intensity = df[column].min()
  max_intensity = df[column].max()
  df[column] = (df[column] - min_intensity) / (max_intensity - min_intensity)
  return df[column]

def z_normalization(df, column):

  df[column] = df[column] - df[column].mean() / df[column].std()
  return df[column]

#median absolute deviation; normalize using median version of standard deviation
def mad_normalization(df, column):

  mad = scipy.stats.median_abs_deviation(df[column])
  outlier_mask = np.abs(df[column] - df[column].median()) > mad*1.5
  cleaned_df = df[~outlier_mask]
  #recalaculate mad and make a mask of mads that go over the threshold (then negate them)
  return cleaned_df[column]


In [None]:

def stats(df, cols, excel_name):
    
    stats_cols = cols
    with pd.ExcelWriter(excel_name) as writer:

      for column in cols:
        temp_copy = df.copy()  # Create a copy of the DataFrame for processing

        if 'Intensity' in column:
          temp_copy[column] = normamlization(temp_copy, column)
          temp_copy = outlier_removal(temp_copy, column)
        else:
          temp_copy[column] = mad_normalization(temp_copy, column)

          temp_copy = outlier_removal(temp_copy, column)

        # Perform pairwise Tukey's HSD test
        tukey = pairwise_tukeyhsd(endog=temp_copy[column], groups=temp_copy['Time'])

        # Extract relevant results
        results = np.array(tukey.summary().data)[:, [0, 1, 3, 6]]
        df_results = pd.DataFrame(results, columns=['Group 1', 'Group 2', 'p-value', 'Reject']).drop([0])
        df_results.reset_index(drop=True, inplace=True)
        df_results[['Group 1', 'Group 2']] = df_results[['Group 1', 'Group 2']].astype(int)
        df_results['p-value'] = df_results['p-value'].astype(float)
        # Truncate the column name if it exceeds 31 characters
        name = column[:31]

        # Save DataFrame to Excel sheet without the index column
        df_results.to_excel(writer, sheet_name=name, index=False)


# Call your stats below (anova)

In [None]:
stats(mito_df, # dataframe name: mito_df, nuclei_df, image_df, outline_df, lysosomes_df
      mito_features, # list of columns specified in the cell right above, modify to make it work for a specific dataset
      'mar13_mito_stats.xlsx') # name of the excel file you are gonna save stats to

# Make the plots and validate dist

In [None]:
def make_layout(xtitle,ytitle):
  design = go.Layout(
        plot_bgcolor="#FFF",
        xaxis=dict(
            title=xtitle,
            linecolor="black",
            showgrid=False,
            titlefont=dict(size=20),
            tickfont=dict(size=16, color="black")
        ),
        yaxis=dict(
            title=ytitle,
            linecolor="black",
            showgrid=False,
            titlefont=dict(size=20),
            tickfont=dict(size=16, color="black")
        ),
        font=dict(size=14),
        legend=dict(
            title="",
            itemsizing='constant',
            font=dict(size=16, color="black"),
            tracegroupgap=10,
            traceorder='normal',
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        boxgap=0.4,
        boxgroupgap=0.05,
        width=900,  # Specify width of the plot
        height=600
    )
  return design

def box_param(fig, color, design):
  fig.update_traces(boxmean=True)
  fig.update_traces(jitter=1.0)
  fig.update_traces(boxpoints=False)
  fig.update_layout(design)
  fig.update_traces({'opacity': 0.9})
  fig.update_traces(marker_color=color)
  return fig

In [None]:
def boxplot(df, variable, ytitle, xtitle, box_color, save=False, save_name=None):
    # #some styling stuff
  layout = make_layout(xtitle,ytitle)
  temp_copy = df.copy()
  if 'Intensity' in variable:
    temp_copy = outlier_removal(temp_copy, variable)
    temp_copy[variable] = normalization(temp_copy, variable)

  else:
    temp_copy = outlier_removal(temp_copy, variable)
    temp_copy[variable] = mad_normalization(temp_copy, variable)

  fig = px.box(temp_copy, x="Time", y=variable)
  fig = box_param(fig, box_color, layout)
  fig.update_traces(quartilemethod="linear")
  fig.show()
  
  if save==True:
      fig.write_image(save_name)

##  Call plotly
```python
boxplot(df, # dataframe name: mito_df, nuclei_df, image_df, outline_df, lysosomes_df
        'Variable', # the variable you wanna plot, column name
        'Y Title',# name of your y-axis (custom)
        'Time Point',#name of your x-axis (custom)
        'Red', # color of the boxes
        save=True, # if wanna save change to False to True, False is default
        save_name='_boxplot.png') #specify the name of the plot that you save

```

In [None]:
#save all the files for that one feature

colors_i = 0
compartment = 'Lyso'

for feature in lyso_features:
    title = compartment + '_' + feature
    boxplot(lyso_df,
            feature,
            feature,
            "Time Point",
            px.colors.qualitative.Pastel[colors_i],
            save = True,
            save_name = plate + "_" + compartment + "_" + feature + "_boxplot.png")
    colors_i = colors_i + 1
    
            
    

In [None]:
#fig = boxplot(cell_df,"AreaShape_Area","Area","Time","dodgerblue")

fig = px.bar(lyso_df,y="AreaShape_Area",x="Time")
fig.show()

In [None]:
fig = px.histogram(lyso_df, x='Intensity_MedianIntensity_LAMP1', color = 'Time')
fig.update_xaxes(range=[0,1])
fig.show()

In [None]:
fig = px.histogram(outlier_removal(lyso_df,'Intensity_MedianIntensity_LAMP1'), x='Intensity_MedianIntensity_LAMP1', color = 'Time')
fig.update_xaxes(range=[0,1])
fig.show()

# Pycytominer Testing: Use on single-cell profiles

In [5]:
# annotating merged single-cell profile with metadata
annotate(
    profiles=sc_profiles_path,
    platemap=platemap_df,
    join_on=["Metadata_Well", "Metadata_Field"],
    output_file=anno_profiles_path,
    compression_options="gzip",
)
single_cell_profile.merge_single_cells(
    sc_output_file=sc_profiles_path, compression_options="gzip"
)

# save message display
print(f"Annotated profile saved in: {anno_profiles_path}")


load_profiles() didn't find the path.
[Errno 2] No such file or directory: '/mnt/bigdisk1/Allie_S/Replicative_Age_Project/Data_Mining/test_output/results/single_cell_profile.csv.gz'


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/bigdisk1/Allie_S/Replicative_Age_Project/Data_Mining/test_output/results/single_cell_profile.csv.gz'

In [None]:
# normalize dataset
normalize(
    profiles=anno_profiles_path,
    features="infer",
    image_features=False,
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file=norm_profiles_path,
    compression_options="gzip",
)

# save message display
print(f"Normalized profile saved in: {norm_profiles_path}")


In [None]:
# creating selected features profile
feature_select(
    profiles=norm_profiles_path,
    features="infer",
    image_features=False,
    samples="all",
    operation=["variance_threshold", "correlation_threshold", "blocklist"],
    output_file=feat_profiles_path,
    compression_options="gzip",
)

# save message display
print(f"Selected features profile saved in: {feat_profiles_path}")
