 <font size="10">**Adapter notebook**</font>
***

<div class="alert alert-info">
    
<b> <h1> ℹ️ <strong> <font size="6" color="black"> Important notebook information </font> </strong> </h1> </b>
    <hr>
    <font size="4" color="black">
        The purpose of this notebook is to combine the binary masks generated using either Fiji or plotly with the scRNAseq data into a single anndata object<br> <br>
    This script will transform the data using a function which utilises scanpy preprocessing functions</font>
</div>

# Import packages

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image

# User input

In [None]:
# File path to the data object with transformed counts
data_path = '/dir/dir1/dir2/anndata.h5ad'

# Path the formatted object is written to at the end of the notebook
save_path = './formatted_anndata.h5ad'

# Names of the mask sets. The same names must be used as the keys of
# `region_cols` and `dict_masks` below: later cells look up the images and
# polygons of a set by the keys of `region_cols`.
masks_selector_keys = ['set1','set2','set3']

# Column in obs holding sectional information for each collection of masks
region_cols = {
'set1':'spatial_location',
'set2':'head_body',
'set3':'head_body',
}


# Dictionary for all masks required
# Include all sectional masks in the desired order, followed by the base image
# ('base_mask_key') and the reference image ('ref_img')
mask_set_1 = {
'Section_01' : 'Mask_section_01.jpg',
'Section_02' : 'Mask_section_02.jpg',
'Section_03' : 'Mask_section_03.jpg',
'Section_04' : 'Mask_section_04.jpg',
'Section_05' : 'Mask_section_05.jpg',
'Section_06' : 'Mask_section_06.jpg',
'Section_07' : 'Mask_section_07.jpg',
'Section_08' : 'Mask_section_08.jpg',
'Section_09' : 'Mask_section_09.jpg',
'Section_10' : 'Mask_section_10.jpg',
'Section_11' : 'Mask_section_11.jpg',
'Section_12' : 'Mask_section_12.jpg',
'base_mask_key' : 'mask_sections_outlined_reverse.jpg',
# Web portal additional:
'ref_img' : 'reference_image.jpg',
}

mask_set_2 = {
'head' : 'head_3.png', # head_3
'body' : 'body_3.png',
'base_mask_key' : 'orig_plotly.jpeg',
# Web portal additional:
'ref_img' : 'orig_plotly.jpeg',
}

mask_set_3 = {
'head' : 'head_multi_test_1.png', # head_3
'body' : 'body_multi.png',
'base_mask_key' : 'orig_plotly.jpeg',
# Web portal additional:
'ref_img' : 'orig_plotly.jpeg',
}

# Keyed by the mask-set names so that every key of `region_cols` has a matching
# entry here (a mismatch makes the later polygon/image lookups raise KeyError).
dict_masks = {
'set1' : mask_set_1,
'set2' : mask_set_2,
'set3' : mask_set_3,
}

# Catch a naming mismatch here instead of several cells further down
assert set(dict_masks) == set(region_cols) == set(masks_selector_keys), (
    'masks_selector_keys, region_cols and dict_masks must use the same mask-set names'
)


# Manual datetime dictionary if wanted - set as the same name as the date column of interest
# If not set it will use the original column to generate timeline

optional_date_dictionaries = {
    # 'col_of_interest' : {'dates':[], 'labels': []}
    'col_of_interest' : {'dates':[[2020, 12, 11],[2021, 2, 26],[2021, 9, 9],[2021, 12, 15]],
                         'labels':['\nTime point 1',
                                    '\nTime point 2, \ninfo',
                                    '\nTime point 3, \ninfo',
                                    '\nTime point 4, \ninfo']},
}

# Earlier spelling of the same dictionary, kept so existing references still work
option_date_dictionaries = optional_date_dictionaries


# Read in data

In [None]:
# Read the single-cell data object from the file configured in `data_path`
adata = sc.read(data_path)

In [None]:
# Load every image of every mask set so they can be turned into polygons.
# Result: mask_dict[set name][mask name] -> image array as read by OpenCV.
mask_dict = {}

for set_name, image_paths in dict_masks.items():
    loaded_images = {}
    for mask_name, image_path in image_paths.items():
        image = cv2.imread(image_path)
        # cv2.imread does not raise for a missing or unreadable file, it
        # returns None; stop here with the offending path instead of failing
        # later with an unrelated-looking error.
        if image is None:
            raise FileNotFoundError(
                f"Could not read image '{image_path}' "
                f"(mask '{mask_name}' of mask set '{set_name}')"
            )
        loaded_images[mask_name] = image
    mask_dict[set_name] = loaded_images

# Format and clean data if required

In [None]:
# Sections 01-06 are labelled 'body', sections 07-12 'head'; any other
# spatial_location value keeps the empty-string default.
body_sections = ['Section_%02d' % number for number in range(1, 7)]
head_sections = ['Section_%02d' % number for number in range(7, 13)]

adata.obs['head_body'] = ''
for label, sections in (('body', body_sections), ('head', head_sections)):
    in_sections = adata.obs['spatial_location'].isin(sections)
    adata.obs.loc[in_sections, 'head_body'] = label

# Collection date looked up from the haniffa_ID of each cell
collection_date_by_donor = {
    'F137': '26-02-2021',
    'F147': '09-09-2021',
    'F158': '15-11-2021',
}
donor_ids = adata.obs['haniffa_ID'].astype(str)
adata.obs['collection_date'] = donor_ids.map(collection_date_by_donor)

In [None]:
def _obs_to_bool(values, true_label, false_label):
    """Convert a two-label obs column to a plain boolean Series.

    Parameters
    ----------
    values : pandas.Series
        Column to convert. Its entries are compared as strings against the
        two labels; a column that is already boolean is returned as bool
        unchanged, which makes re-running the cell safe.
    true_label, false_label : str
        Labels that stand for True and False respectively.

    Returns
    -------
    pandas.Series of dtype bool.

    Raises
    ------
    ValueError
        If any entry matches neither label. Mapping such entries would give
        NaN, and NaN cast to bool is True, i.e. the flag would silently be set.
    """
    if pd.api.types.is_bool_dtype(values):
        return values.astype(bool)
    mapped = values.astype(str).map({false_label: False, true_label: True})
    unmapped = mapped.isna()
    if unmapped.any():
        unexpected = sorted(values[unmapped].astype(str).unique())
        raise ValueError(
            f"Column '{values.name}' has values other than "
            f"'{true_label}'/'{false_label}': {unexpected}"
        )
    return mapped.astype(bool)


# Keep only the obs columns used downstream. The copy gives adata.obs its own
# frame, so the column assignments below do not write into a slice.
obs_columns_to_keep = [
    'dataset', 'sequencing_lane_ID', 'haniffa_ID', 'HDBR_ID',
    'sequencing_type', '10x_kit', 'spatial_location',
    'spatial_location_replicates', 'sort_ID', 'procedure', 'age_in_cs',
    'sex', 'alignment_software', 'alignment_reference',
    'cutoff_1.4826_is_doublet', 'cutoff_2_is_doublet', 'cutoff_3_is_doublet',
    'nUMI', 'nGene', 'nCounts', 'percent.mito', 'percent.ribo',
    'cell_labels_lvl1', 'cell_labels_lvl2', 'maternal_predicted',
    'collection_date', 'head_body',
]
adata.obs = adata.obs[obs_columns_to_keep].copy()

# Store the 30-neighbour UMAP embedding under the key 'X_umap'
umap = adata.obsm['X_umap_neighbours_30'].copy()
adata.obsm['X_umap'] = umap

# Columns stored as categories of strings
for col in ['cell_labels_lvl1', 'cell_labels_lvl2', 'spatial_location',
            'head_body', 'age_in_cs', 'HDBR_ID', 'collection_date']:
    adata.obs[col] = adata.obs[col].apply(str).astype('category')

# Numeric QC columns stored as floats
for col in ['nUMI', 'nGene', 'nCounts', 'percent.mito', 'percent.ribo']:
    adata.obs[col] = adata.obs[col].astype(float)

# Doublet calls arrive as the strings 'True'/'False'
for col in ['cutoff_1.4826_is_doublet', 'cutoff_2_is_doublet', 'cutoff_3_is_doublet']:
    adata.obs[col] = _obs_to_bool(adata.obs[col], true_label='True', false_label='False')

# 'Maternal_predicted' -> True, 'Embryonic' -> False
adata.obs['maternal_predicted'] = _obs_to_bool(
    adata.obs['maternal_predicted'],
    true_label='Maternal_predicted',
    false_label='Embryonic',
)

In [None]:
# Remove the per-neighbour-count UMAP embeddings from obsm
for n_neighbours in (15, 20, 25, 30):
    del adata.obsm[f'X_umap_neighbours_{n_neighbours}']

# Remove the obsp and uns contents of the loaded object
del adata.obsp
del adata.uns

# Order the obs into categories for web portal dropdown - optional

In [None]:
# Group the obs columns under named headings and store the grouping in
# adata.uns['column_ordering'].

celltypes_columns = ['cell_labels_lvl1', 'cell_labels_lvl2']

spatial_regions = ['spatial_location', 'spatial_location_replicates', 'head_body']

donor_information = [
    'haniffa_ID', 'HDBR_ID', 'age_in_cs', 'sex', 'procedure', 'sort_ID',
]

technical_information = [
    '10x_kit',
    'cutoff_1.4826_is_doublet', 'cutoff_2_is_doublet', 'cutoff_3_is_doublet',
    'nUMI', 'nGene', 'nCounts',
    'percent.mito', 'percent.ribo',
    'maternal_predicted',
]

general_dataset_information = [
    'dataset', 'sequencing_lane_ID', 'sequencing_type',
    'alignment_software', 'alignment_reference', 'collection_date',
]

column_ordering = dict(
    celltypes_columns=celltypes_columns,
    spatial_regions=spatial_regions,
    donor_information=donor_information,
    technical_information=technical_information,
    general_dataset_information=general_dataset_information,
)

adata.uns['column_ordering'] = column_ordering

# Generate polygons

In [None]:
# Trace the outline(s) of every sectional mask image.
# Result: polygons[set name][region name] -> contour array. A mask with more
# than one contour is split into '<region>_0', '<region>_1', ... entries.
polygons = {}
for set_name, images in mask_dict.items():
    set_polygons = {}
    for region, image in images.items():
        # The base image and reference image are not sectional masks
        if region in ('base_mask_key', 'ref_img'):
            continue

        # Binarise the mask (grey level above 50 becomes 255), then trace it
        grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(grey, 50, 255, cv2.THRESH_BINARY)
        contours, hierarchy = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        if len(contours) > 1:
            for index, contour in enumerate(contours):
                set_polygons[region + '_' + str(index)] = np.array([contour])
        else:
            set_polygons[region] = np.array(contours, dtype='int32')
    polygons[set_name] = set_polygons


# Generate sectional gene expression dataframe

In [None]:
# For every obs column referenced in `region_cols`, store a genes x regions
# table of mean expression in adata.varm['<column>_gene_expression'].
for obs_col in set(region_cols.values()):

    table_name = str(obs_col) + '_gene_expression'

    # A table that already exists in varm is left untouched
    if table_name in adata.varm.keys():
        continue

    region_means = {}
    for region in adata.obs[obs_col].cat.categories:
        in_region = adata.obs[obs_col].isin([region])
        # The per-gene mean may come back 2-D (scipy sparse matrices return a
        # 1 x n_genes np.matrix), so flatten it to one float value per gene.
        region_means[str(region)] = np.asarray(
            adata[in_region, :].X.mean(0), dtype=float
        ).ravel()

    # Built in one go as a float table with string column names: one column
    # per region, one row per gene.
    adata.varm[table_name] = pd.DataFrame(region_means, index=adata.var_names)
   

# Make single anndata object holding all information

In [None]:
# Collect, per mask set, everything stored alongside the expression data:
#   'obs'      - obs column holding the region of each cell
#   'polygons' - contour arrays of the regions
#   'ref_img'  - reference image
#   'shape'    - shape of the base mask image
#   'varm'     - name of the varm table with the per-region mean expression
adata.uns['masks'] = {
    mask_group: {
        'obs': obs_col,
        'polygons': polygons[mask_group],
        'ref_img': mask_dict[mask_group]['ref_img'],
        'shape': np.array(mask_dict[mask_group]['base_mask_key'].shape),
        'varm': obs_col + '_gene_expression',
    }
    for mask_group, obs_col in region_cols.items()
}

adata.uns['Mask_selector'] = region_cols

adata.uns['optional_date_dictionaries'] = optional_date_dictionaries
    

# Save data object

In [None]:
# Write the formatted object to `save_path` and report where it went
adata.write(save_path)
print('Data saved at: {}'.format(save_path))

***
### Dependencies
Here is some information about the resources and imported package versions used within this script

In [1]:
# Report the Python/IPython versions (-v), machine details (-m) and the
# versions of the listed packages (-p) used to run this notebook
%load_ext watermark
%watermark -v -m -p numpy,pandas,scanpy,anndata,matplotlib,seaborn,cv2,PIL

Python implementation: CPython
Python version       : 3.9.4
IPython version      : 7.24.1

numpy     : 1.20.3
pandas    : 1.2.4
scanpy    : 1.7.2
anndata   : 0.7.6
matplotlib: 3.5.0
seaborn   : 0.11.1
cv2       : 4.6.0
PIL       : 8.2.0

Compiler    : GCC 9.3.0
OS          : Linux
Release     : 4.15.0-112-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 26
Architecture: 64bit

