In [1]:
import numpy as np
import pandas as pd
import re
import toytree as tt
from tasccoda import tree_utils as util

# Introduction

(Analysis by Maren Büttner)

In this notebook, we examine publicly available data of Smillie et al (2019), Cell. The purpose is to determine compositional changes using our scCODA model in the three conditions (Healthy, Non-inflamed, Inflamed). 

We perform the following steps:
1. Load the data
2. Preprocess data

# Read the data

In [2]:
data_path = '../../../tascCODA_data/applications/smillie_UC/SCP259/metadata/'

Read meta data. 
The data were downloaded from Single Cell Portal (SCP259).
The project contains also mtx-files, but we are only interested in the metadata.

In [3]:
meta = pd.read_table(data_path + 'all.meta2.txt', sep='\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
meta = meta.drop([0])

Set index to "NAME" column.

In [5]:
meta.index = meta['NAME']
meta = meta.drop(columns = ['NAME'])

meta.shape

(365492, 7)

In [6]:
meta.head(10)

Unnamed: 0_level_0,Cluster,nGene,nUMI,Subject,Health,Location,Sample
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
N7.EpiA.AAACATACACACTG,TA 1,328,891,N7,Non-inflamed,Epi,N7.EpiA
N7.EpiA.AAACCGTGCATCAG,TA 1,257,663,N7,Non-inflamed,Epi,N7.EpiA
N7.EpiA.AAACGCACAATCGC,TA 2,300,639,N7,Non-inflamed,Epi,N7.EpiA
N7.EpiA.AAAGATCTAACCGT,Enterocyte Progenitors,250,649,N7,Non-inflamed,Epi,N7.EpiA
N7.EpiA.AAAGATCTAGGCGA,Enterocyte Progenitors,284,769,N7,Non-inflamed,Epi,N7.EpiA
N7.EpiA.AAAGCCTGCTCGAA,Enterocyte Progenitors,339,951,N7,Non-inflamed,Epi,N7.EpiA
N7.EpiA.AAATCAACATCACG,TA 1,262,600,N7,Non-inflamed,Epi,N7.EpiA
N7.EpiA.AAATCAACCTTGGA,Immature Goblet,308,976,N7,Non-inflamed,Epi,N7.EpiA
N7.EpiA.AAATCATGGAAAGT,Enterocyte Progenitors,316,934,N7,Non-inflamed,Epi,N7.EpiA
N7.EpiA.AAATCCCTCACTTT,Enterocyte Progenitors,267,655,N7,Non-inflamed,Epi,N7.EpiA


# Adjust metadata according to manuscript

In [7]:
meta['Sample'].value_counts()

N58.LPB1      16723
N111.LPB1     13175
N661.LPA2     12345
N661.LPA1     12253
N661.LPB1      9450
              ...  
N106.EpiA       135
N52.EpiA2b      109
N52.EpiA2a      108
N58.EpiB2        33
N49.EpiA         21
Name: Sample, Length: 133, dtype: int64

In [8]:
meta['Location'].value_counts()

LP     266286
Epi     99206
Name: Location, dtype: int64

In [9]:
meta['Health'].value_counts()

Non-inflamed    130263
Inflamed        125119
Healthy         110110
Name: Health, dtype: int64

In [10]:
len(np.unique(meta['Subject']))

30

In [11]:
pd.crosstab(meta['Subject'], meta['Health'])

Health,Healthy,Inflamed,Non-inflamed
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N10,16643,0,0
N106,0,4848,2694
N11,6799,0,0
N110,0,3834,6570
N111,0,19648,5738
N12,0,1355,1009
N13,4695,0,0
N14,0,2276,2676
N15,10649,0,0
N16,5417,0,0


In [12]:
health = meta[['Health', 'Subject', 'Sample']]

replicates = pd.DataFrame([sample.split('.') for sample in np.unique(meta['Sample'])], 
                          columns=['Subject', 'Sample'], index= np.unique(meta['Sample']) )

replicates['Location']= [re.split('A|B',region)[0] for region in replicates['Sample']]
replicates['Replicate'] = [re.split('Epi|LP',region)[1] for region in replicates['Sample']]

In [13]:
replicates

Unnamed: 0,Subject,Sample,Location,Replicate
N10.EpiA,N10,EpiA,Epi,A
N10.EpiB,N10,EpiB,Epi,B
N10.LPA,N10,LPA,LP,A
N10.LPB,N10,LPB,LP,B
N106.EpiA,N106,EpiA,Epi,A
...,...,...,...,...
N8.LPB,N8,LPB,LP,B
N9.EpiA,N9,EpiA,Epi,A
N9.EpiB,N9,EpiB,Epi,B
N9.LPA,N9,LPA,LP,A


Merge health status info and replicate info.

In [14]:
new_meta = replicates.merge(health, how='outer', 
                            left_index=True, right_on='Sample', suffixes=('', '_y'))

new_meta = new_meta.drop(columns = ['Subject_y', 'Sample_y'])

Examine the different numbers of repicates per sample.

In [15]:
replicates['Subject'].value_counts()

N52     12
N58      8
N111     7
N10      4
N106     4
N8       4
N7       4
N661     4
N539     4
N51      4
N50      4
N46      4
N44      4
N26      4
N24      4
N23      4
N21      4
N20      4
N19      4
N18      4
N17      4
N16      4
N15      4
N14      4
N13      4
N12      4
N11      4
N9       4
N49      3
N110     3
Name: Subject, dtype: int64

In [16]:
new_meta.loc[new_meta['Subject']=='N52'].drop_duplicates()
new_meta.loc[new_meta['Subject']=='N58'].drop_duplicates()
new_meta.loc[new_meta['Subject']=='N111'].drop_duplicates()
new_meta.loc[new_meta['Subject']=='N110'].drop_duplicates()
new_meta.loc[new_meta['Subject']=='N49'].drop_duplicates()
new_meta.loc[new_meta['Subject']=='N19'].drop_duplicates()
new_meta.loc[new_meta['Subject']=='N8'].drop_duplicates()

Unnamed: 0_level_0,Subject,Sample,Location,Replicate,Health
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N8.EpiA.AAACATTGCGTTGA,N8,EpiA,Epi,A,Healthy
N8.EpiB.AAAGTTTGACCCTC,N8,EpiB,Epi,B,Healthy
N8.LPA.AACAATACAGTGTC,N8,LPA,LP,A,Healthy
N8.LPB.AACTCGGATGTCCC,N8,LPB,LP,B,Healthy


Check the number of cell types.

In [17]:
len(np.unique(meta['Cluster']))

51

Merge health status info and replicate info with the remaining metadata.

In [18]:
meta = new_meta.merge(meta, how='outer', left_index=True, right_index=True, suffixes=('', '_y'))
# Drop duplicate columns.
meta = meta.drop(columns = [full_name for full_name in meta.columns if full_name.endswith('_y')])
meta

Unnamed: 0_level_0,Subject,Sample,Location,Replicate,Health,Cluster,nGene,nUMI
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N10.EpiA.AAACATACAACCAC,N10,EpiA,Epi,A,Healthy,Enterocyte Progenitors,425,968
N10.EpiA.AAACATACAGGCGA,N10,EpiA,Epi,A,Healthy,Cycling TA,1695,7273
N10.EpiA.AAACATACCACTAG,N10,EpiA,Epi,A,Healthy,Immature Goblet,391,1190
N10.EpiA.AAACATACCCTTTA,N10,EpiA,Epi,A,Healthy,Secretory TA,1327,5620
N10.EpiA.AAACATACTGCAAC,N10,EpiA,Epi,A,Healthy,Immature Enterocytes 2,1383,4676
...,...,...,...,...,...,...,...,...
N9.LPB.TTTATCCTAACGAA,N9,LPB,LP,B,Inflamed,Enterocytes,2768,18811
N9.LPB.TTTATCCTGTAAAG,N9,LPB,LP,B,Inflamed,Plasma,1392,27685
N9.LPB.TTTATCCTGTCGTA,N9,LPB,LP,B,Inflamed,Plasma,574,5478
N9.LPB.TTTCAGTGGCGTTA,N9,LPB,LP,B,Inflamed,Macrophages,1437,5698


## Build lineage tree from Figure 1D

(Analysis from here by Johannes Ostner)

The following cell lineage assignments were extracted from Figure 1D and the methods section ("Cell Lineage dendrogram") of Smillie et al. (2019).
Unfortunately, the tree and the description do not match. We use the assignment provided in the text.

- Epithelial cells
    - Absorptive
        - TA cells
            - TA 1
            - TA 2
        - Immature cells
            - Immature Enterocytes 1
            - Immature Enterocytes 2
            - Enterocyte Progenitors
        - Mature cells
            - Enterocytes
            - BEST4+ Enterocytes
    - Secretory
        - Progenitor cells
            - Secretory TA
            - Immature Goblet
        - Mature cells
            - Goblet
            - Tuft
            - Enteroendocrine
    - Stem
    - Cycling TA
    - M cells
- Stromal cells
    - Glia
    - Fibroblast
        - WNT2B+
            - WNT2B+ Fos-hi
            - WNT2B+ Fos-lo 1
            - WNT2B+ Fos-lo 2
            - RSPO3+
        - WNT5B+
            - WNT5B+ 1
            - WNT5B+ 2
        - Inflammatory Fibroblast
        - Myofibroblast
    - Endothelial
        - Endothelial
        - Microvascular
        - Post-capillary venules
        - Pericytes
- Immune cells
    - Myeloid cells
        - Mast
            - CD69+ Mast
            - CD69- Mast
        - Monocytes
            - Macrophages
            - Cycling Monocytes
            - Inflammatory Monocytes
            - DCs
                - DC1
                - DC2
    - Lymphoid cells
        - NK cells
        - ILCs
        - B cells
            - Plasma
            - Follicular
            - GC
            - Cycling B
        - T cells
            - CD4+ T cells
                - CD4+ Activated Fos-lo
                - CD4+ Activated Fos-hi
                - CD4+ Memory, Tregs
                - PD1+
                - MT-hi
                - Tregs
            - CD8+ T cells
                - CD8+ IELs
                - CD8+ LP
                - CD8+IL-17+
                - Cycling T


The rough annotation in Figure 1C can be inferred as follows:
- Fibroblasts: WNT2B+ Fos-hi, WNT2B+ Fos-lo 1,WNT2B+ Fos-lo 2, WNT5B+ 1, WNT5B+ 2, RSPO3+, Inflammatory Fibroblasts, Myofibroblasts, 
- Endothelial: Endothelial, Microvascular, Post-capillary Venules, Pericytes 
- Glia
- NKs/ILCs: NKs, ILCs
- Myeloid: Macrophages, DC1, DC2, Inflammatory Monocytes, CD69+ Mast, CD69- Mast, Cycling Monocytes
- T cells: CD4+ Activated Fos-hi, CD4+ Activated Fos-lo, CD4+ Memory, Tregs, CD4+ PD1+, CD8+ IELs, CD8+ IL17+, CD8+ LP, MT-hi, Cycling T
- B cells: Plasma, Follicular, GC, Cycling B
- Epithelial: Stem, TA 1, TA 2, Cycling TA, Immature Enterocytes 1, Immature Enterocytes 2, Enterocytes, M-like cells, Best4+ Enterocytes, Secretory TA, Immature Goblet, Goblet, Tuft, Enteroendocrine, Enterocyte Progenitors

In [19]:
meta['Cluster'] = meta['Cluster'].astype('category')

meta

Unnamed: 0_level_0,Subject,Sample,Location,Replicate,Health,Cluster,nGene,nUMI
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N10.EpiA.AAACATACAACCAC,N10,EpiA,Epi,A,Healthy,Enterocyte Progenitors,425,968
N10.EpiA.AAACATACAGGCGA,N10,EpiA,Epi,A,Healthy,Cycling TA,1695,7273
N10.EpiA.AAACATACCACTAG,N10,EpiA,Epi,A,Healthy,Immature Goblet,391,1190
N10.EpiA.AAACATACCCTTTA,N10,EpiA,Epi,A,Healthy,Secretory TA,1327,5620
N10.EpiA.AAACATACTGCAAC,N10,EpiA,Epi,A,Healthy,Immature Enterocytes 2,1383,4676
...,...,...,...,...,...,...,...,...
N9.LPB.TTTATCCTAACGAA,N9,LPB,LP,B,Inflamed,Enterocytes,2768,18811
N9.LPB.TTTATCCTGTAAAG,N9,LPB,LP,B,Inflamed,Plasma,1392,27685
N9.LPB.TTTATCCTGTCGTA,N9,LPB,LP,B,Inflamed,Plasma,574,5478
N9.LPB.TTTCAGTGGCGTTA,N9,LPB,LP,B,Inflamed,Macrophages,1437,5698


In [20]:
meta['Major_l1'] = meta['Cluster'].cat.add_categories(['Stromal', 'Epithelial', 'Immune'])
meta['Major_l1'][np.in1d(meta['Major_l1'], ['WNT2B+ Fos-hi', 'WNT2B+ Fos-lo 1', 'WNT2B+ Fos-lo 2', 'WNT5B+ 1', 'WNT5B+ 2', 'RSPO3+', 
                                      'Inflammatory Fibroblasts', 'Myofibroblasts', 'Endothelial', 'Microvascular', 
                                      'Post-capillary Venules', 'Pericytes', 'Glia'])] = 'Stromal'
meta['Major_l1'][np.in1d(meta['Major_l1'], ['Stem', 'TA 1', 'TA 2', 'Cycling TA', 'Immature Enterocytes 1', 'Immature Enterocytes 2', 
                         'Enterocytes', 'M cells', 'Best4+ Enterocytes', 'Secretory TA', 'Immature Goblet', 'Goblet', 
                         'Tuft', 'Enteroendocrine', 'Enterocyte Progenitors'])] = 'Epithelial'
meta['Major_l1'][np.in1d(meta['Major_l1'], ['Macrophages', 'DC1', 'DC2', 'Inflammatory Monocytes', 'CD69+ Mast', 'CD69- Mast', 
                                      'Cycling Monocytes', 'CD4+ Activated Fos-hi', 'CD4+ Activated Fos-lo', 'CD4+ Memory', 
                                      'NKs', 'ILCs', 'Tregs', 'CD4+ PD1+', 'CD8+ IELs', 'CD8+ IL17+', 'CD8+ LP', 'MT-hi', 
                                      'Cycling T', 'Plasma', 'Follicular', 'GC', 'Cycling B'])] = 'Immune'
meta['Major_l1'] = meta['Major_l1'].cat.remove_unused_categories()

In [21]:
meta['Major_l2'] = meta['Cluster'].cat.add_categories(['Stromal', 'Epithelial', 'Myeloid', 'Lymphoid'])
meta['Major_l2'][np.in1d(meta['Major_l2'], ['WNT2B+ Fos-hi', 'WNT2B+ Fos-lo 1', 'WNT2B+ Fos-lo 2', 'WNT5B+ 1', 'WNT5B+ 2', 'RSPO3+',
                                      'Inflammatory Fibroblasts', 'Myofibroblasts', 'Endothelial', 'Microvascular',
                                      'Post-capillary Venules', 'Pericytes', 'Glia'])] = 'Stromal'
meta['Major_l2'][np.in1d(meta['Major_l2'], ['Stem', 'TA 1', 'TA 2', 'Cycling TA', 'Immature Enterocytes 1', 'Immature Enterocytes 2',
                         'Enterocytes', 'M cells', 'Best4+ Enterocytes', 'Secretory TA', 'Immature Goblet', 'Goblet',
                         'Tuft', 'Enteroendocrine', 'Enterocyte Progenitors'])] = 'Epithelial'
meta['Major_l2'][np.in1d(meta['Major_l2'], ['Macrophages', 'DC1', 'DC2', 'Inflammatory Monocytes', 'CD69+ Mast', 'CD69- Mast',
                                      'Cycling Monocytes'])] = 'Myeloid'
meta['Major_l2'][np.in1d(meta['Major_l2'], ['CD4+ Activated Fos-hi', 'CD4+ Activated Fos-lo', 'CD4+ Memory',
                                      'NKs', 'ILCs', 'Tregs', 'CD4+ PD1+', 'CD8+ IELs', 'CD8+ IL17+', 'CD8+ LP', 'MT-hi',
                                      'Cycling T', 'Plasma', 'Follicular', 'GC', 'Cycling B'])] = 'Lymphoid'
meta['Major_l2'] = meta['Major_l2'].cat.remove_unused_categories()

In [22]:
meta['Major_l3'] = meta['Cluster'].cat.add_categories(['Absorptive', 'Secretory', 'Stem3', 'Cycling TA3', 'M cells3', 'Glia3', 'Fibroblasts', 'Endothelial3', 'Mast', 'Monocytes', 'NKs3', 'ILCs3', 'B cells', 'T cells'])

meta['Major_l3'][np.in1d(meta['Major_l3'], ['WNT2B+ Fos-hi', 'WNT2B+ Fos-lo 1', 'WNT2B+ Fos-lo 2', 'WNT5B+ 1', 'WNT5B+ 2', 'RSPO3+',
                                      'Inflammatory Fibroblasts', 'Myofibroblasts'])] = 'Fibroblasts'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['Endothelial', 'Microvascular',
                                      'Post-capillary Venules', 'Pericytes'])] = 'Endothelial3'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['Glia'])] = 'Glia3'

meta['Major_l3'][np.in1d(meta['Major_l3'], ['TA 1', 'TA 2', 'Immature Enterocytes 1', 'Immature Enterocytes 2',
                         'Enterocytes', 'Best4+ Enterocytes', 'Enterocyte Progenitors'])] = 'Absorptive'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['Secretory TA', 'Immature Goblet', 'Goblet', 'Tuft', 'Enteroendocrine'])] = 'Secretory'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['Stem'])] = 'Stem3'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['Cycling TA'])] = 'Cycling TA3'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['M cells'])] = 'M cells3'

meta['Major_l3'][np.in1d(meta['Major_l3'], ['Plasma', 'Follicular', 'GC', 'Cycling B'])] = 'B cells'
meta['Major_l3'][np.in1d(meta['Major_l3'], [ 'CD4+ Activated Fos-hi', 'CD4+ Activated Fos-lo', 'CD4+ Memory',
                                      'Tregs', 'CD4+ PD1+', 'CD8+ IELs', 'CD8+ IL17+', 'CD8+ LP', 'MT-hi',
                                      'Cycling T'])] = 'T cells'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['CD69+ Mast', 'CD69- Mast'])] = 'Mast'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['Macrophages', 'DC1', 'DC2', 'Inflammatory Monocytes', 'Cycling Monocytes'])] = 'Monocytes'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['NKs'])] = 'NKs3'
meta['Major_l3'][np.in1d(meta['Major_l3'], ['ILCs'])] = 'ILCs3'

meta['Major_l3'] = meta['Major_l3'].cat.remove_unused_categories()

In [23]:
meta['Major_l4'] = meta['Cluster'].cat.add_categories(['TA cells', 'Immature cells', 'Absorptive Mature cells', 'Progenitor cells', 'Secretory Mature cells', 'Stem4', 'Cycling TA4', 'M cells4',
                                                       'Glia4', 'WNT2B+', 'WNT5B+', 'Inflammatory Fibroblast4', 'Myofibroblast4', 'EndothelialCells', 'Microvascular4', 'Post-capillary venules4', 'Pericytes4',
                                                       'CD69+ Mast4', 'CD69- Mast4', 'Macrophages4', 'Cycling Monocytes4', 'Inflammatory Monocytes4', 'DCs',
                                                       'NKs4', 'ILCs4', 'Plasma4', 'Follicular4', 'GC4', 'Cycling B4', 'CD4+ T', 'CD8+ T'])

meta['Major_l4'][np.in1d(meta['Major_l4'], ['WNT2B+ Fos-hi', 'WNT2B+ Fos-lo 1', 'WNT2B+ Fos-lo 2', 'RSPO3+'])] = 'WNT2B+'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['WNT5B+ 1', 'WNT5B+ 2'])] = 'WNT5B+'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Inflammatory Fibroblasts'])] = 'Inflammatory Fibroblast4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Myofibroblasts'])] = 'Myofibroblast4'

meta['Major_l4'][np.in1d(meta['Major_l4'], ['Endothelial', 'Microvascular', 'Post-capillary Venules', 'Pericytes'])] = 'EndothelialCells'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Glia'])] = 'Glia4'

meta['Major_l4'][np.in1d(meta['Major_l4'], ['TA 1', 'TA 2'])] = 'TA cells'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Immature Enterocytes 1', 'Immature Enterocytes 2', 'Enterocyte Progenitors'])] = 'Immature cells'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Enterocytes', 'Best4+ Enterocytes'])] = 'Absorptive Mature cells'

meta['Major_l4'][np.in1d(meta['Major_l4'], ['Secretory TA', 'Immature Goblet'])] = 'Progenitor cells'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Goblet', 'Tuft', 'Enteroendocrine'])] = 'Secretory Mature cells'

meta['Major_l4'][np.in1d(meta['Major_l4'], ['Stem'])] = 'Stem4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Cycling TA'])] = 'Cycling TA4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['M cells'])] = 'M cells4'

meta['Major_l4'][np.in1d(meta['Major_l4'], ['Plasma'])] = 'Plasma4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Follicular'])] = 'Follicular4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['GC'])] = 'GC4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Cycling B'])] = 'Cycling B4'

meta['Major_l4'][np.in1d(meta['Major_l4'], [ 'CD4+ Activated Fos-hi', 'CD4+ Activated Fos-lo', 'CD4+ Memory',
                                      'Tregs', 'CD4+ PD1+', 'MT-hi'])] = 'CD4+ T'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['CD8+ IELs', 'CD8+ IL17+', 'CD8+ LP', 'Cycling T'])] = 'CD8+ T'

meta['Major_l4'][np.in1d(meta['Major_l4'], ['CD69+ Mast'])] = 'CD69+ Mast4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['CD69- Mast'])] = 'CD69- Mast4'

meta['Major_l4'][np.in1d(meta['Major_l4'], ['Macrophages'])] = 'Macrophages4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['DC1', 'DC2'])] = 'DCs'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Inflammatory Monocytes'])] = 'Inflammatory Monocytes4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['Cycling Monocytes'])] = 'Cycling Monocytes4'

meta['Major_l4'][np.in1d(meta['Major_l4'], ['NKs'])] = 'NKs4'
meta['Major_l4'][np.in1d(meta['Major_l4'], ['ILCs'])] = 'ILCs4'

meta['Major_l4'] = meta['Major_l4'].cat.remove_unused_categories()

In [24]:
pd.value_counts(meta['Major_l1'])

Immune        210614
Epithelial    123006
Stromal        31872
Name: Major_l1, dtype: int64

In [25]:
pd.value_counts(meta['Major_l2'])

Lymphoid      183304
Epithelial    123006
Stromal        31872
Myeloid        27310
Name: Major_l2, dtype: int64

In [26]:
pd.value_counts(meta['Major_l3'])

B cells         107246
Absorptive       83833
T cells          73526
Fibroblasts      24290
Monocytes        21513
Cycling TA3      18204
Secretory        18125
Endothelial3      6320
Mast              5797
Stem3             2403
NKs3              2023
Glia3             1262
ILCs3              509
M cells3           441
Name: Major_l3, dtype: int64

In [27]:
pd.value_counts(meta['Major_l4'])

Plasma4                     82651
CD4+ T                      53968
TA cells                    49445
Immature cells              26585
Follicular4                 21468
CD8+ T                      19558
Cycling TA4                 18204
Macrophages4                16692
Progenitor cells            14436
WNT2B+                      14179
Absorptive Mature cells      7803
EndothelialCells             6320
WNT5B+                       5855
CD69+ Mast4                  5654
Secretory Mature cells       3689
DCs                          2819
Stem4                        2403
Inflammatory Fibroblast4     2268
Cycling B4                   2211
NKs4                         2023
Myofibroblast4               1988
Inflammatory Monocytes4      1652
Glia4                        1262
GC4                           916
ILCs4                         509
M cells4                      441
Cycling Monocytes4            350
CD69- Mast4                   143
Name: Major_l4, dtype: int64

In [28]:
meta['Major_l1'] = meta['Major_l1'].astype(str)
meta['Major_l2'] = meta['Major_l2'].astype(str)
meta['Major_l3'] = meta['Major_l3'].astype(str)
meta['Major_l4'] = meta['Major_l4'].astype(str)

Try to make a tree out of this

In [29]:
tree_levels = ["Major_l1", "Major_l2", "Major_l3", "Major_l4", "Cluster"]
newick = util.df2newick(meta.loc[:, tree_levels].reset_index(drop=True), tree_levels)
print(newick)

(((((Enterocyte Progenitors,Immature Enterocytes 2,Immature Enterocytes 1)Immature cells,(TA 1,TA 2)TA cells,(Best4+ Enterocytes,Enterocytes)Absorptive Mature cells)Absorptive,((Cycling TA)Cycling TA4)Cycling TA3,((Immature Goblet,Secretory TA)Progenitor cells,(Enteroendocrine,Goblet,Tuft)Secretory Mature cells)Secretory,((Stem)Stem4)Stem3,((M cells)M cells4)M cells3)Epithelial)Epithelial,((((CD8+ IELs,CD8+ LP,Cycling T,CD8+ IL17+)CD8+ T,(CD4+ Memory,CD4+ Activated Fos-lo,CD4+ PD1+,CD4+ Activated Fos-hi,MT-hi,Tregs)CD4+ T)T cells,((Cycling B)Cycling B4,(GC)GC4,(Follicular)Follicular4,(Plasma)Plasma4)B cells,((NKs)NKs4)NKs3,((ILCs)ILCs4)ILCs3)Lymphoid,(((CD69+ Mast)CD69+ Mast4,(CD69- Mast)CD69- Mast4)Mast,((Macrophages)Macrophages4,(DC2,DC1)DCs,(Inflammatory Monocytes)Inflammatory Monocytes4,(Cycling Monocytes)Cycling Monocytes4)Monocytes)Myeloid)Immune,((((WNT5B+ 2,WNT5B+ 1)WNT5B+,(WNT2B+ Fos-hi,WNT2B+ Fos-lo 1,RSPO3+,WNT2B+ Fos-lo 2)WNT2B+,(Inflammatory Fibroblasts)Inflammatory Fibrob

In [30]:
# Build tree
tree = tt.tree(newick=newick, tree_format=8)
tree.draw(tip_labels_align=True, node_sizes=10, node_labels='name')

(<toyplot.canvas.Canvas at 0x7fc355834c70>,
 <toyplot.coordinates.Cartesian at 0x7fc3558347c0>,
 <toytree.Render.ToytreeMark at 0x7fc354e32eb0>)

In [31]:
# Collapse all the singularities
tree2 = util.collapse_singularities(tree)
tree2.draw(tip_labels_align=True, node_sizes=10, node_labels='name')

(<toyplot.canvas.Canvas at 0x7fc3554a7f70>,
 <toyplot.coordinates.Cartesian at 0x7fc3554a7a60>,
 <toytree.Render.ToytreeMark at 0x7fc354e32b50>)

Save to file.

In [32]:
meta.to_csv(data_path + 'meta_processed.csv')

In [33]:
meta

Unnamed: 0_level_0,Subject,Sample,Location,Replicate,Health,Cluster,nGene,nUMI,Major_l1,Major_l2,Major_l3,Major_l4
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
N10.EpiA.AAACATACAACCAC,N10,EpiA,Epi,A,Healthy,Enterocyte Progenitors,425,968,Epithelial,Epithelial,Absorptive,Immature cells
N10.EpiA.AAACATACAGGCGA,N10,EpiA,Epi,A,Healthy,Cycling TA,1695,7273,Epithelial,Epithelial,Cycling TA3,Cycling TA4
N10.EpiA.AAACATACCACTAG,N10,EpiA,Epi,A,Healthy,Immature Goblet,391,1190,Epithelial,Epithelial,Secretory,Progenitor cells
N10.EpiA.AAACATACCCTTTA,N10,EpiA,Epi,A,Healthy,Secretory TA,1327,5620,Epithelial,Epithelial,Secretory,Progenitor cells
N10.EpiA.AAACATACTGCAAC,N10,EpiA,Epi,A,Healthy,Immature Enterocytes 2,1383,4676,Epithelial,Epithelial,Absorptive,Immature cells
...,...,...,...,...,...,...,...,...,...,...,...,...
N9.LPB.TTTATCCTAACGAA,N9,LPB,LP,B,Inflamed,Enterocytes,2768,18811,Epithelial,Epithelial,Absorptive,Absorptive Mature cells
N9.LPB.TTTATCCTGTAAAG,N9,LPB,LP,B,Inflamed,Plasma,1392,27685,Immune,Lymphoid,B cells,Plasma4
N9.LPB.TTTATCCTGTCGTA,N9,LPB,LP,B,Inflamed,Plasma,574,5478,Immune,Lymphoid,B cells,Plasma4
N9.LPB.TTTCAGTGGCGTTA,N9,LPB,LP,B,Inflamed,Macrophages,1437,5698,Immune,Myeloid,Monocytes,Macrophages4
