# Aggregate, Annotate, Normalize, and Feature Select

This notebook runs all of the operations above on either a single-cell file or a well-aggregated file.

Each operation is optional: you can choose whether or not to run its cell.

In [1]:
import pandas as pd
import numpy as np
import pycytominer
import easygui as eg
import utilitary as util
import os
from generate_profiles import *
%load_ext autoreload
%autoreload 2

# 0) Inputs

In [2]:
# --- Profile (features) file ---
# Opens a file-picker dialog. easygui returns None when the dialog is cancelled,
# so fail here with a clear message instead of letting a later cell choke on None.
profile = eg.fileopenbox(msg="Choose a file with samples and their features", default=r"D:")
if profile is None:
    raise FileNotFoundError("No profile file was selected; re-run this cell and choose a file.")
print('Filename', profile)

# --- Project name: becomes the first component of the exported file name ---
project_name = input('Provide the name of this project:')
print('Project name:', project_name)

# --- Optional external annotation ---
# stringToBool is not defined in this notebook; presumably it comes from the
# `generate_profiles` star import and maps the typed answer to a boolean -- TODO confirm.
metadata_question = input(r"If you need to annotate your dataset with an external file, write yes and press enter. If already annotated, answer no and press enter.")
metadata_answer = stringToBool(metadata_question)

if metadata_answer:
    platemap = eg.fileopenbox(msg="Choose a map (csv file) with plates names and metadata filenames", default=r"G:")
    if platemap is None:
        raise FileNotFoundError("No platemap file was selected; re-run this cell and choose a file.")
    # Folder that contains the barcode map; the annotate cell looks for the
    # individual platemap files relative to this folder.
    platemap_path = os.path.split(platemap)[0]
    print('Platemap file selected', platemap)
    barcode_df = pd.read_csv(platemap)

# Tags of the processing steps that were executed; joined with '_' to build
# the output file name in the export cell.
cells_that_run = []
cells_that_run.append(project_name)

Filename G:\My Drive\2022_09_09_LiveCellPainting_fossa_Cimini\analysis\2022_06_07_DILI\agg_normalization_featselect\notebooks\DILI_singlecell.csv
Project name: 2022_06_07_DILI
Platemap file selected G:\My Drive\2022_09_09_LiveCellPainting_fossa_Cimini\metadata\platemaps\2022_06_07_DILI\barcode_platemap.csv


## 1) Import extracted features file (single cell or well-aggregated)

In [3]:
# Read the CSV chosen in the Inputs cell (`profile` holds its path) into a DataFrame.
df = pd.read_csv(filepath_or_buffer=profile)
# Show the first five rows as the cell output.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Metadata_ImageNumber,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_TableNumber,Metadata_ObjectNumber_cytoplasm,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_BoundingBoxArea,Cytoplasm_AreaShape_BoundingBoxMaximum_X,...,Nuclei_Texture_Variance_CorrPI_10_02_256,Nuclei_Texture_Variance_CorrPI_10_03_256,Nuclei_Texture_Variance_CorrPI_20_00_256,Nuclei_Texture_Variance_CorrPI_20_01_256,Nuclei_Texture_Variance_CorrPI_20_02_256,Nuclei_Texture_Variance_CorrPI_20_03_256,Nuclei_Texture_Variance_CorrPI_5_00_256,Nuclei_Texture_Variance_CorrPI_5_01_256,Nuclei_Texture_Variance_CorrPI_5_02_256,Nuclei_Texture_Variance_CorrPI_5_03_256
0,0,1,220607_092050_Plate_1,1,B10,1571898871,1,17563,28700,1057,...,23.38748,23.456552,24.89357,21.909327,25.605559,26.443746,23.180844,23.307391,22.693203,22.563807
1,1,1,220607_092050_Plate_1,1,B10,1571898871,2,26625,47064,544,...,27.535579,28.164301,29.948233,32.728835,29.76062,28.631319,26.867608,27.496918,26.937428,27.22747
2,2,1,220607_092050_Plate_1,1,B10,1571898871,3,10090,20703,722,...,190.739156,184.776425,193.994949,288.495965,229.479026,227.77847,226.038468,191.521848,187.765931,190.22764
3,3,1,220607_092050_Plate_1,1,B10,1571898871,4,20770,39366,1068,...,30.832619,32.460216,30.510909,30.444835,31.034468,29.68188,31.185042,31.649853,30.655306,30.647967
4,4,1,220607_092050_Plate_1,1,B10,1571898871,5,27108,59052,409,...,111.594226,111.98076,116.428891,115.604023,116.062754,119.931474,115.106248,109.352773,108.304546,108.767129


## 2) Generate profile

### 2A) Aggregate

- Run the next cell to list the metadata columns, then copy the ones you want to group the rows by (the `strata`) into the aggregation cell below.

- If using **already aggregated data by plates and wells**, skip to 2B.

In [4]:
# Ask pycytominer for the metadata columns of `df`
# so they can be copied into the `strata` argument of the aggregation cell.
metadata_columns = pycytominer.cyto_utils.infer_cp_features(df, metadata=True)
metadata_columns

['Metadata_ImageNumber',
 'Metadata_Plate',
 'Metadata_Site',
 'Metadata_Well',
 'Metadata_TableNumber',
 'Metadata_ObjectNumber_cytoplasm',
 'Metadata_Cytoplasm_Parent_Cells',
 'Metadata_Cytoplasm_Parent_Nuclei',
 'Metadata_ObjectNumber_cells',
 'Metadata_ObjectNumber']

- As a default we are using **Metadata_Plate and Metadata_Well** to join the rows, using the **'median'** operation. 

In [5]:
# Columns whose distinct value combinations define one aggregated row.
aggregation_strata = ['Metadata_Plate', 'Metadata_Well']

# Aggregate `df` per plate/well with the median operation.
# `df` is overwritten with the aggregated result.
df = pycytominer.aggregate(df, strata=aggregation_strata, operation='median')
df.head()

Unnamed: 0,Metadata_Plate,Metadata_Well,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_BoundingBoxArea,Cytoplasm_AreaShape_BoundingBoxMaximum_X,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Cytoplasm_AreaShape_BoundingBoxMinimum_Y,Cytoplasm_AreaShape_Center_X,Cytoplasm_AreaShape_Center_Y,...,Nuclei_Texture_Variance_CorrPI_10_02_256,Nuclei_Texture_Variance_CorrPI_10_03_256,Nuclei_Texture_Variance_CorrPI_20_00_256,Nuclei_Texture_Variance_CorrPI_20_01_256,Nuclei_Texture_Variance_CorrPI_20_02_256,Nuclei_Texture_Variance_CorrPI_20_03_256,Nuclei_Texture_Variance_CorrPI_5_00_256,Nuclei_Texture_Variance_CorrPI_5_01_256,Nuclei_Texture_Variance_CorrPI_5_02_256,Nuclei_Texture_Variance_CorrPI_5_03_256
0,220607_092050_Plate_1,B10,18752.0,37376.0,693.0,555.0,498.0,360.0,599.561854,462.747582,...,58.348243,58.993129,60.28589,59.150598,60.150878,61.308779,57.335272,57.522162,58.258208,57.18377
1,220607_092050_Plate_1,B11,11882.0,24180.0,693.0,529.0,537.0,384.0,611.65213,457.321057,...,54.123691,55.449175,56.883491,57.822113,55.794023,54.900001,52.717471,53.066292,52.151373,52.641898
2,220607_092050_Plate_1,B3,12076.5,23814.0,679.5,544.5,523.5,393.0,602.766028,470.136183,...,65.144702,67.16062,66.89097,65.61763,67.153566,66.880734,64.223359,64.614367,64.830408,64.647169
3,220607_092050_Plate_1,B4,11418.5,22496.5,720.0,522.0,560.0,358.5,635.978163,439.942632,...,77.924752,79.349215,80.568016,81.234404,81.987271,81.43308,75.882404,76.401983,76.015379,75.955258
4,220607_092050_Plate_1,B5,13157.0,26010.0,651.0,515.0,491.0,351.0,568.869559,431.739302,...,71.288318,72.710651,74.004674,72.930256,74.86425,72.184426,69.797468,70.182677,69.166294,69.960563


### 2B) Annotate 

- Run the following to generate a plate list based on Metadata_Plate column.

In [6]:
# Distinct plate identifiers, in order of first appearance, as a plain Python list.
plate_list = df['Metadata_Plate'].drop_duplicates().tolist()

- All metadata must be organized as follows (the individual platemap files live in a `platemap` folder next to the barcode file):
    ```
    |- metadata 
    |   |- <barcode_platemap.csv> 
    |   |- platemap
    |   |   |- <platemap_1.csv>
    |   |   |- <platemap_2.csv>
    ```

- Run the next cell to annotate the profiles. 

In [7]:
# Annotate each plate's profiles with the metadata from its platemap file.
# When the answer in the Inputs cell was "no", the dataset is already annotated
# and `barcode_df` / `platemap_path` were never created, so the step is skipped
# instead of failing with a NameError.
if metadata_answer:
    df_temp_list = []
    for pl in plate_list:
        df_plate = df.loc[df['Metadata_Plate'] == pl]

        # Find the barcode-map row for this plate; it names the platemap file to use.
        barcode_map = barcode_df[barcode_df['Assay_Plate_Barcode'] == pl]
        if barcode_map.empty:
            raise ValueError(
                "Plate {!r} was not found in the 'Assay_Plate_Barcode' column of the barcode platemap.".format(pl)
            )
        metadata_filename = barcode_map['Plate_Map_Name'].iloc[0]

        # Platemap files are expected in a 'platemap' folder next to the barcode file.
        metadata = pd.read_csv(os.path.join(platemap_path, 'platemap', metadata_filename + '.csv'))

        # Join the platemap onto the profiles by well.
        df_temp = pycytominer.annotate(profiles = df_plate, platemap = metadata, join_on = ["Metadata_well_position", "Metadata_Well"])
        df_temp_list.append(df_temp)
        print('Shape of each plate ', df_temp.shape)

    # Stack the annotated plates back into a single frame.
    df = pd.concat(df_temp_list, axis=0)
else:
    print('Annotation skipped: the dataset was declared as already annotated.')

Shape of each plate  (59, 1817)
Shape of each plate  (58, 1817)
Shape of each plate  (60, 1817)


### 2C) Normalize

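- Normalize the dataset on a per-plate basis, using either all samples (**samples = all**) or only the negative controls (**samples = negcon**) as the reference.
- CHOOSE one of the two cells below and run only that one.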

#### Normalize TO NEGCON => run next cell

In [8]:
# Normalize every plate separately, using only that plate's negative-control
# rows (Metadata_control_type == 'negcon') as the reference samples.
df_temp_list = []
for pl in plate_list:
    df_temp = df.loc[df['Metadata_Plate'] == pl]
    print(df_temp.shape)
    # NOTE(review): mad_robustize_epsilon = 0 presumably disables the small offset
    # pycytominer otherwise adds to the MAD, so zero-MAD features may become inf/NaN -- confirm.
    df_norm_temp = pycytominer.normalize(df_temp, method = 'mad_robustize', mad_robustize_epsilon = 0, samples = "Metadata_control_type == 'negcon'")
    df_temp_list.append(df_norm_temp)
df_norm2 = pd.concat(df_temp_list, axis=0)

# Keep exactly one normalization tag in `cells_that_run`, describing what
# `df_norm2` now holds. Re-running this cell, or running it after the
# "normalize to all" cell, would otherwise stack tags and mislabel the export.
normalization_tags = ('normalized_negcon', 'normalized')
previous_positions = [i for i, tag in enumerate(cells_that_run) if tag in normalization_tags]
insert_at = previous_positions[0] if previous_positions else len(cells_that_run)
cells_that_run[:] = [tag for tag in cells_that_run if tag not in normalization_tags]
cells_that_run.insert(insert_at, 'normalized_negcon')

(59, 1817)
(58, 1817)


  mad = np.median(np.abs(x - med))
  mad = np.median(np.abs(x - med))


(60, 1817)


  mad = np.median(np.abs(x - med))


#### Normalize TO ALL => run next cell

In [None]:
# Normalize every plate separately; no `samples` query is passed, so
# pycytominer's default reference set is used.
df_temp_list = []
for pl in plate_list:
    df_temp = df.loc[df['Metadata_Plate'] == pl]
    print(df_temp.shape)
    # NOTE(review): mad_robustize_epsilon = 0 presumably disables the small offset
    # pycytominer otherwise adds to the MAD, so zero-MAD features may become inf/NaN -- confirm.
    df_norm_temp = pycytominer.normalize(df_temp, method = 'mad_robustize', mad_robustize_epsilon = 0)
    df_temp_list.append(df_norm_temp)
df_norm2 = pd.concat(df_temp_list, axis=0)

# Keep exactly one normalization tag in `cells_that_run`, describing what
# `df_norm2` now holds. Re-running this cell, or running it after the
# "normalize to negcon" cell, would otherwise stack tags and mislabel the export.
normalization_tags = ('normalized_negcon', 'normalized')
previous_positions = [i for i, tag in enumerate(cells_that_run) if tag in normalization_tags]
insert_at = previous_positions[0] if previous_positions else len(cells_that_run)
cells_that_run[:] = [tag for tag in cells_that_run if tag not in normalization_tags]
cells_that_run.insert(insert_at, 'normalized')

### 2D) Feature selection

In [9]:
# Run pycytominer feature selection on the normalized profiles with the four
# listed operations; outlier_cutoff = 500 is the threshold passed for 'drop_outliers'.
df_selected = pycytominer.feature_select(df_norm2, operation = ['correlation_threshold', 'variance_threshold', 'drop_na_columns','drop_outliers'], outlier_cutoff = 500)

# Report how many columns were removed, as a count and as a percentage.
print('Number of columns removed:', df_norm2.shape[1] - df_selected.shape[1])
print('Percentage of columns removed:',100 - ((df_selected.shape[1]*100)/df_norm2.shape[1]))

# Tag the step once only, so re-running this cell does not duplicate
# 'feature_select' in the exported file name.
if 'feature_select' not in cells_that_run:
    cells_that_run.append('feature_select')

Number of columns removed: 1417
Percentage of columns removed: 77.98569069895433


# Export

In [10]:
# Ask for the destination folder. easygui returns None when the dialog is
# cancelled, so stop here with a clear message rather than failing in the export cell.
output_path = eg.diropenbox(msg="Choose an output folder", default=r"D:")
if output_path is None:
    raise FileNotFoundError("No output folder was selected; re-run this cell and choose a folder.")
print('Path to save the profile', output_path)

Path to save the profile D:\2022_09_09_LiveCellPainting_fossa_Cimini\workspace\profiles\2022_06_07_DILI


In [11]:
# File name = project name followed by the tags of the steps that were run.
output_name = '_'.join(cells_that_run)

# Build the destination path once with os.path.join, so the separator matches
# the platform instead of mixing '\' and '/'.
output_file = os.path.join(output_path, output_name + '.csv')

# NOTE(review): the DataFrame index is written as the first (unnamed) CSV column,
# as before; pass index=False here if downstream readers do not need it.
df_selected.to_csv(output_file)
print('Successfully exported to:', output_file)

Successfully exported to: D:\2022_09_09_LiveCellPainting_fossa_Cimini\workspace\profiles\2022_06_07_DILI/2022_06_07_DILI_normalized_negcon_feature_select.csv
