# Visualize batches, apply pyCombat and visualize again

- This notebook visualizes batch effects in high-content screening assays, attempts to correct those effects using pyCombat (Behdenna et al.; https://github.com/epigenelabs/pyComBat), and visualizes the data again with UMAP to check whether the effect was corrected.


github@fefossa

In [None]:
# Change the working directory and install the package found there into the kernel's environment.
# NOTE(review): the path below is machine-specific (presumably a local clone of the repository) —
# adjust it to your own checkout before running.
%cd G:\My Drive\GitHub\scripts_notebooks_fossa
%pip install .

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import pycytominer
import easygui as eg
import os 

from umap import UMAP
from pycombat_umap import combat_util
from combat.pycombat import pycombat
import plotly.io as pio
# Optional (development only): uncomment the two magics below to enable IPython's autoreload extension.
# %load_ext autoreload
# %autoreload 2
# Use plotly's 'notebook' renderer as the default for every figure shown below.
pio.renderers.default='notebook'

# 1. Inputs

In [None]:
# Ask the user to pick the input CSV through a file dialog.
myfile = eg.fileopenbox(msg="Choose a file", default=r"D:")
# fileopenbox returns None when the dialog is cancelled; stop here with a clear
# message instead of letting the next cell fail inside pd.read_csv.
if myfile is None:
    raise FileNotFoundError("No input file was selected; re-run this cell and choose a CSV file.")
print('Filename', myfile)


In [None]:
# Load the selected CSV into the working DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=myfile)
df.head(n=5)

## 1a) Apply filters or change labels (optional)

If there are any rows you'd like to filter out of your plate because you think they are affecting your analysis, do it here.

### Filters

In [None]:
# 1) drop every row of one plate, 2) build a "<compound> <size> <time>" label column,
# 3) drop the rows whose label is 'AgNP 40 0'. The index is renumbered after each filter.
df = (
    df.loc[lambda frame: frame['Metadata_Plate'] != '220604_111529_Plate_1']
    .copy()
    .reset_index(drop=True)
    .assign(
        Metadata_CompoundSizeTime=lambda frame: (
            frame['Metadata_Compound']
            + ' '
            + frame['Metadata_NPSize_nm'].astype(str)
            + ' '
            + frame['Metadata_Time'].astype(str)
        )
    )
    .loc[lambda frame: frame['Metadata_CompoundSizeTime'] != 'AgNP 40 0']
    .copy()
    .reset_index(drop=True)
)

### Change the Time column to be in hours instead of days

- We do this so that the time column contains no 0 values. This allows us to use Metadata_Time_hr as a size marker in the plots, for example.

In [None]:
# Mapping from the values stored in Metadata_Time to the values written to Metadata_Time_hr.
dict_change = {0: 24, 2: 48, 4: 96, 6: 144}
# Derive the new column from the original one; values that are not keys of the
# mapping are carried over unchanged.
df['Metadata_Time_hr'] = df['Metadata_Time'].replace(dict_change)

## 1b) Define UMAP parameters and hover cols

### UMAP

In [None]:
# UMAP settings, passed below as `n_neighbors` / `min_dist` to combat_util.generate_x_y_umap.
n_neighbors_input, min_dist_input = 15, 0.5

### Define the hover columns that are always shown on the plots

In [None]:
# Metadata columns passed as `hover_cols` to the plot_umap calls below.
hover_list = [
    'Metadata_Plate',
    'Metadata_Time_hr',
    'Metadata_Well',
    'Metadata_Concentration',
]

# 3. Plot to check for batch effects

- We plot columns that usually show batch effects, such as Plate (if the plates were read on different days) and Well position.

In [None]:
# Columns handed to combat_util.col_generator in the next cell.
cols_to_join = [
    "Metadata_Compound",
    "Metadata_Concentration",
]

In [None]:
# col_generator returns the (updated) frame and a column name; `new_col` is used
# as `color_col` in the compound plots below.
# Presumably it joins `cols_to_join` into one label column — verify in combat_util.
(df, new_col) = combat_util.col_generator(
    df,
    cols_to_join=cols_to_join,
)

## Plate

In [None]:
# Embed the uncorrected data and colour the points by plate.
# NOTE(review): this embedding uses metric='euclidean', while the post-pyCombat
# embedding later in the notebook uses metric='cosine' — confirm that is intended.
df_plot = combat_util.generate_x_y_umap(
    df,
    n_neighbors=n_neighbors_input,
    min_dist=min_dist_input,
    metric='euclidean',
)
combat_util.plot_umap(df_plot, color_col='Metadata_Plate', hover_cols=hover_list)

## Well

In [None]:
# Same embedding as the plate plot, now coloured by well.
combat_util.plot_umap(
    df_plot,
    color_col='Metadata_Well',
    hover_cols=hover_list,
    discrete=True,
    compound_color=True,
)

## Compounds

In [None]:
# Colour the whole (unsplit) embedding by the label column returned by col_generator.
combat_util.plot_umap(
    df_plot,
    color_col=new_col,
    hover_cols=hover_list,
    split_df=False,
    split_column=None,
    np=None,
)

In [None]:
# Draw one split plot per entry of np_list (each value is forwarded to plot_umap's `np` argument).
# NOTE(review): the values 0.5 / 4.0 are used together with split_column='Metadata_Time_hr',
# whose mapped values are 24/48/96/144 — confirm these are the values plot_umap should split on.
np_list = [0.5, 4.0]
# The loop variable is deliberately NOT called `np`: at cell level that would rebind the
# global name and clobber the `import numpy as np` alias for the rest of the session.
for np_value in np_list:
    combat_util.plot_umap(df_plot,
                        color_col=new_col,
                        hover_cols=hover_list,
                        split_df = True, split_column = 'Metadata_Time_hr', np = np_value,
                        # size=True, size_col = 'Metadata_Time_hr',
                        discrete=True,
                        time_color=True)

# 4. Apply pyCombat

- Based on the batch effects you could visually identify above, choose the batch_col you'd like to correct 

In [None]:
# Column treated as the batch label by pycombat_generator in the next cell.
batch_col = 'Metadata_Plate'

In [None]:
# Run the pyCombat helper with the chosen batch column; `df_plates` is the frame
# that is embedded, re-batched and exported in the following sections.
df_plates = combat_util.pycombat_generator(
    df,
    batch_column=batch_col,
    well_column='Metadata_Well',
)

### Generate UMAP visualization and plot

In [None]:
# Embed the pyCombat output and colour by the combined label column, sizing markers by time.
# NOTE(review): metric='cosine' here, whereas the pre-correction embedding used
# metric='euclidean' — confirm the before/after comparison is meant to differ in metric.
df_plot = combat_util.generate_x_y_umap(
    df_plates,
    n_neighbors=n_neighbors_input,
    min_dist=min_dist_input,
    metric='cosine',
)
combat_util.plot_umap(
    df_plot,
    color_col=new_col,
    hover_cols=hover_list,
    split_df=False,
    split_column=None,
    np=None,
    size=True,
    size_col="Metadata_Time_hr",
)

In [None]:
# Draw one split plot of the corrected embedding per entry of np_list
# (each value is forwarded to plot_umap's `np` argument).
np_list = [0.5, 4.0]
# The loop variable is deliberately NOT called `np`: at cell level that would rebind the
# global name and clobber the `import numpy as np` alias for the rest of the session.
for np_value in np_list:
    combat_util.plot_umap(df_plot,
                        color_col=new_col,
                        hover_cols=hover_list,
                        split_df = True, split_column = 'Metadata_Time_hr', np = np_value,
                        # size=True, size_col = 'Metadata_Time_hr',
                        discrete=True,
                        time_color=True)

# "Messing" with the data
## See if the effect is still there! 

- Treat plate rows or columns as batches and ask pyCombat to correct for them. If the effects in the data are strong enough, they won't disappear after that correction.

- Rows or columns:
    - Choose rows = True, the default is {'B':0, 'C':1, 'D':2, 'E':3, 'F':4, 'G':5}
    - Choose columns = True, the default is {'2':0, '3':1, '4':2, '5':3, '6':4, '7':5, '8':6, '9':7, '10':8, '11':9}
    

Other options

dict_change={'2': 0, '7': 0, '3':1, '8':1, '4':2, '9':2, '5':3, '10':3, '6':4, '11':4}

dict_change={'B':0, 'C':1, 'D':2, 'E':0, 'F':1, 'G':2}

### Columns

In [None]:
# Derive an artificial batch label from the plate columns, using the helper's default mapping.
df_combat = combat_util.generate_batch_col(
    df_plates,
    rows=False,
    columns=True,
    join_plate=False,
    change_default_dict=False,
)

In [None]:
# Run pyCombat again, this time treating the column-derived label as the batch.
df_combat_corr = combat_util.pycombat_generator(
    df_combat,
    well_column='Metadata_Well',
    batch_column='Metadata_batch_pycombat',
)

In [None]:
# Embed the column-corrected data (helper's default metric) and plot it coloured by
# time, with marker size taken from the nanoparticle-size column.
df_plot = combat_util.generate_x_y_umap(
    df_combat_corr,
    n_neighbors=n_neighbors_input,
    min_dist=min_dist_input,
)
combat_util.plot_umap(
    df_plot,
    color_col='Metadata_Time',
    hover_cols=hover_list,
    # split_df = True, split_column = 'Metadata_CompoundSize', np = np,
    size=True,
    size_col='Metadata_NPSize_nm',
    discrete=True,
    time_color=True,
)

### Rows

In [None]:
# Derive an artificial batch label from the plate rows, using the helper's default mapping.
df_combat2 = combat_util.generate_batch_col(
    df_plates,
    rows=True,
    columns=False,
    join_plate=False,
    change_default_dict=False,
)

In [None]:
# Apply pyCombat to the row-derived batches before embedding, mirroring the Columns
# section. Previously this step was missing, so the plot showed data that had never
# been corrected for the row batches.
# Assumes generate_batch_col writes the label to 'Metadata_batch_pycombat' for rows as
# it does for columns — verify in combat_util.
df_combat2_corr = combat_util.pycombat_generator(df_combat2, batch_column = 'Metadata_batch_pycombat', well_column='Metadata_Well')

# Embed the row-corrected data and plot it coloured by time, sized by nanoparticle size.
df_plot = combat_util.generate_x_y_umap(df_combat2_corr, n_neighbors=n_neighbors_input, min_dist=min_dist_input)
combat_util.plot_umap(df_plot, 
                        color_col='Metadata_Time', 
                        hover_cols=hover_list, 
                        # split_df = True, split_column = 'Metadata_CompoundSize', np = np, 
                        size=True, size_col = 'Metadata_NPSize_nm',
                        discrete=True,
                        time_color=True)

# Export to csv

In [None]:
# Write the plate-corrected frame next to the input file as "<input name>_pycombat.csv".
# os.path handles the platform's separators and any extension length; the previous
# manual split on '\\' with a fixed 4-character suffix only worked for Windows-style
# paths and raised IndexError when the path contained no backslash.
path, input_basename = os.path.split(myfile)
filename = os.path.splitext(input_basename)[0]
output = os.path.join(path, filename)
# NOTE(review): the DataFrame index is written as the first CSV column (to_csv default);
# pass index=False if downstream readers should not see it.
df_plates.to_csv(output+"_pycombat.csv")

# OPTIONAL: UMAP parameters search

In [None]:
# Grid of UMAP settings to explore, and the values forwarded to plot_umap's `np` argument.
# NOTE(review): 'Metadata_CompoundSize' is not created anywhere in the visible cells —
# confirm the input CSV already contains that column.
n_neighbors_list = [5, 15] 
min_dist_list = [0.5, 1]
np_list = ['AgNP 40', 'AgNP 100']

# The embedding depends only on (n_neighbors, min_dist), so compute it once per pair and
# reuse it for every np_list entry instead of re-running UMAP for each one. Plots are
# still produced in the original order.
umap_by_params = {}

# The loop variable is deliberately NOT called `np`: at cell level that would rebind the
# global name and clobber the `import numpy as np` alias for the rest of the session.
for np_value in np_list:
    for n in n_neighbors_list:
        for m in min_dist_list:
            if (n, m) not in umap_by_params:
                umap_by_params[(n, m)] = combat_util.generate_x_y_umap(df_plates, n_neighbors=n, min_dist=m, metric='cosine')
            df_umap = umap_by_params[(n, m)]
            combat_util.plot_umap(df_umap, 
                    color_col='Metadata_Time', 
                    hover_cols=['Metadata_CompoundSize'], 
                    split_df = True, 
                    split_column = 'Metadata_CompoundSize', 
                    np = np_value,
                    umap_param=True, neighbor=n, mindist=m,
                    discrete=True,
                    time_color=True)