In [83]:
import pandas as pd
from pandas.api.types import CategoricalDtype

# Per-cell feature tables, one file per cancer cohort. `cancers` is matched to
# `dataframe_paths` by position (the two lists are zipped together below).
dataframe_paths = [
    "/Users/shayecarver/Downloads/Colorectal_lizard_cells_SUB.csv",
    "/Users/shayecarver/Downloads/NSCLC_lizard_cells_SUB.csv",
]
cancers = ["Colorectal", "NSCLC"]

# Output locations: the full long-format table, plus two reduced tables whose
# names are derived by swapping the "ALL" tag in the base file name.
output_path = "/Users/shayecarver/CellVit/lizard_slide_level_features_ALL.csv"
output_path_all, output_path_immune = (
    output_path.replace("ALL", tag)
    for tag in ("ONLY_all_cells", "ONLY_immune_cells")
)

# Per-cell morphology columns that are averaged per slide.
morph_features = [
    'area',
    'perimeter',
    'eccentricity',
    'solidity',
    'orientation',
    'major_axis_length',
    'minor_axis_length',
    'aspect_ratio',
    'circularity',
    'centroid_x',
    'centroid_y',
]

# `type_label` values that are pooled into the 'immune_cells' summary row.
immune_types = ['Eosinophil', 'Lymphocyte', 'Neutrophil', 'Plasma']

# Labels of the two synthetic, slide-wide rows that sit next to the
# per-cell-type rows in the long-format table.
ALL_CELLS = 'all_cells'
IMMUNE_CELLS = 'immune_cells'


def summarize_slides(df, cancer_type, morph_features, immune_types):
    """Build the long-format, slide-level summary table for one cohort.

    For every ``slide_id`` the result holds one 'all_cells' row, one
    'immune_cells' row (only when the slide has at least one cell whose
    ``type_label`` is in ``immune_types``) and one row per cell type present
    on that slide. Each row carries the mean of every morphology feature,
    the number of cells, and that number as a fraction of the slide total.

    Args:
        df: Per-cell table with ``slide_id``, ``type_label`` and every column
            named in ``morph_features``.
        cancer_type: Value written to the ``cancer_type`` column of every row.
        morph_features: Names of the numeric columns to average.
        immune_types: ``type_label`` values pooled into the 'immune_cells' row.

    Returns:
        DataFrame with columns ``slide_id, type_label, prop_type_label,
        cell_count, cancer_type`` followed by ``morph_features``, sorted by
        slide and, within a slide, 'all_cells', 'immune_cells', then the
        remaining types alphabetically.
    """
    morph_features = list(morph_features)

    def aggregate(cells, keys):
        # Means and counts come from the same groupby and are joined on the
        # group index, so a count can never be attached to the wrong slide.
        grouped = cells.groupby(keys)
        summary = grouped[morph_features].mean()
        summary['cell_count'] = grouped.size()
        return summary.reset_index()

    all_cells = aggregate(df, 'slide_id')
    all_cells['type_label'] = ALL_CELLS

    immune_cells = aggregate(df[df['type_label'].isin(immune_types)], 'slide_id')
    immune_cells['type_label'] = IMMUNE_CELLS

    per_type = aggregate(df, ['slide_id', 'type_label'])

    # Skip empty pieces (e.g. a cohort without immune cells) so they cannot
    # influence the dtypes of the concatenated result.
    pieces = [part for part in (all_cells, immune_cells, per_type) if not part.empty]
    all_data = pd.concat(pieces or [per_type], ignore_index=True)

    # An ordered categorical is used only to get the row order; dropna() keeps
    # sorted() from failing on a missing label.
    specific_types = sorted(set(df['type_label'].dropna()) - {ALL_CELLS, IMMUNE_CELLS})
    row_order = CategoricalDtype([ALL_CELLS, IMMUNE_CELLS] + specific_types, ordered=True)
    all_data['type_label'] = all_data['type_label'].astype(row_order)
    all_data = all_data.sort_values(by=['slide_id', 'type_label']).reset_index(drop=True)
    # Back to plain labels: later pivots/concats should not carry the unused
    # categories of a categorical column around.
    all_data['type_label'] = all_data['type_label'].astype(object)

    # Proportion of each row's cells relative to the slide's total cell count.
    slide_totals = all_cells.set_index('slide_id')['cell_count']
    all_data['prop_type_label'] = all_data['cell_count'] / all_data['slide_id'].map(slide_totals)

    all_data['cancer_type'] = cancer_type
    cols = ['slide_id', 'type_label', 'prop_type_label', 'cell_count', 'cancer_type'] + morph_features
    return all_data[cols]


def widen_category_table(all_data, category, cell_types):
    """Return the ``category`` rows with per-cell-type counts/proportions as columns.

    Args:
        all_data: Long-format table as returned by ``summarize_slides``.
        category: ``type_label`` of the rows to keep ('all_cells' or
            'immune_cells').
        cell_types: Cell types to expose as columns, in output order. Passing
            the same list for every cohort gives every cohort the same columns.

    Returns:
        The selected rows plus, for each entry of ``cell_types``, a ``<type>``
        count column and a ``prop_<type>`` proportion column. Types that do
        not occur on a slide get 0 rather than NaN.
    """
    cell_types = list(cell_types)
    base = all_data[all_data['type_label'] == category]
    per_type = all_data[all_data['type_label'].isin(cell_types)]
    slide_index = pd.Index(base['slide_id'], name='slide_id')

    def pivot_on(value_column):
        # Fix both axes up front: one row per selected slide, one column per
        # requested cell type, regardless of what this cohort happens to contain.
        return (
            per_type.pivot(index='slide_id', columns='type_label', values=value_column)
            .reindex(index=slide_index, columns=cell_types)
        )

    count_matrix = pivot_on('cell_count').fillna(0).astype(int)
    prop_matrix = pivot_on('prop_type_label').fillna(0.0)
    prop_matrix.columns = [f"prop_{cell_type}" for cell_type in cell_types]

    prop_counts = pd.concat([count_matrix, prop_matrix], axis=1).reset_index()
    return base.merge(prop_counts, on='slide_id', how='left')


# Jupyter executes cells with __name__ == "__main__", so this still runs in the
# notebook; the guard only keeps the hard-coded file I/O from firing when the
# helpers above are imported on their own.
if __name__ == "__main__":
    if len(dataframe_paths) != len(cancers):
        raise ValueError("dataframe_paths and cancers must have the same length")

    long_tables = []
    for df_path, cancer_type in zip(dataframe_paths, cancers):
        df = pd.read_csv(df_path)
        long_tables.append(summarize_slides(df, cancer_type, morph_features, immune_types))

    if long_tables:
        # Union of cell types over all cohorts. Building every wide table
        # against this list keeps the columns aligned even when a cohort
        # lacks a cell type (previously each cohort was appended under the
        # first cohort's header, whatever its own columns were).
        cell_types = sorted(
            set().union(*(table['type_label'] for table in long_tables))
            - {ALL_CELLS, IMMUNE_CELLS}
        )

        # After this point the three names hold the tables for ALL cohorts.
        all_data = pd.concat(long_tables, ignore_index=True)
        all_cells_df = pd.concat(
            [widen_category_table(table, ALL_CELLS, cell_types) for table in long_tables],
            ignore_index=True,
        )
        immune_cells_df = pd.concat(
            [widen_category_table(table, IMMUNE_CELLS, cell_types) for table in long_tables],
            ignore_index=True,
        )

        # Each file is written once with a single header. Note that the files
        # are tab-separated even though the paths end in ".csv".
        all_data.to_csv(output_path, sep='\t', index=False)
        all_cells_df.to_csv(output_path_all, sep='\t', index=False)
        immune_cells_df.to_csv(output_path_immune, sep='\t', index=False)

    


In [None]:
/agusevlab/scarver/slide_ids_ALL.txt