In [172]:
import pandas as pd

DATASETS_TAGS = ['tabula-muris-marrow_P7_3', 'tabula-muris-heart', 'peripheal-blood', 'zheng-4', 'zheng-8']
# Order dataset tags by length
DATASETS_TAGS = sorted(DATASETS_TAGS, key=lambda x: len(x))
# len of longest dataset tag
MAX_LEN = len(DATASETS_TAGS[-1]) + 15
dfs = {}
for dataset in DATASETS_TAGS:
    dfs[dataset] = pd.read_csv("./results/aggregate/{}/markers.csv".format(dataset))

In [173]:
dfClusterGene = {}
for dataset in DATASETS_TAGS:
    dfClusterGene[dataset] = dfs[dataset].groupby(['cluster', 'gene']).size().reset_index(name='count')    

In [208]:
dfClusterIntersection = {}
for dataset in DATASETS_TAGS:
    # number of cluster
    n_cluster = dfClusterGene[dataset]['cluster'].nunique()
    dfClusterIntersection[dataset] = dfClusterGene[dataset][dfClusterGene[dataset]['count'] == n_cluster].groupby('cluster').size().reset_index(name='count')

In [209]:
for dataset in DATASETS_TAGS:
    sum = dfClusterIntersection[dataset]['count'].sum()
    percentage = (sum/250)*100
    print(f'{dataset}: sum({sum}) {round(percentage, 10)} %')

zheng-4: sum(26) 10.4 %
zheng-8: sum(0) 0.0 %
peripheal-blood: sum(0) 0.0 %
tabula-muris-heart: sum(71) 28.4 %
tabula-muris-marrow_P7_3: sum(0) 0.0 %


In [206]:
# COTAN vs Rest results for each dataset
dfCotanVS = {}
dfClusterGeneTool = {}
for dataset in DATASETS_TAGS:
    dfCotanVS[dataset] = {}
    dfClusterGeneTool[dataset] = dfs[dataset].groupby(['cluster', 'gene', 'tool']).size().reset_index(name='count')
    for tool in ['scanpy', 'seurat', 'monocle', 'scvi-tools']:
        dfCotanVS[dataset]['COTANvs'+tool.title()] = dfClusterGeneTool[dataset][dfClusterGeneTool[dataset]['tool'] == 'COTAN'].merge(dfClusterGeneTool[dataset][dfClusterGeneTool[dataset]['tool'] == tool], on=['cluster', 'gene'], how='inner')
        dfCotanVS[dataset]['COTANvs'+tool.title()] = dfCotanVS[dataset]['COTANvs'+tool.title()].groupby(['cluster', 'gene']).size().reset_index(name='count')
        n_cluster = dfCotanVS[dataset]['COTANvs'+tool.title()]['cluster'].nunique()
        dfCotanVS[dataset]['COTANvs'+tool.title()] = dfCotanVS[dataset]['COTANvs'+tool.title()].groupby('cluster').size().reset_index(name='count').sum()['count']
        # dfCotanVS[dataset]['COTANvs'+tool.title()] = round(dfCotanVS[dataset]['COTANvs'+tool.title()]/n_cluster, 4)
        val = round(dfCotanVS[dataset]['COTANvs'+tool.title()], 4)
        print(f'{dataset} comparison COTAN with {tool.title()} {round(val/n_cluster, 4)} {dfCotanVS[dataset]["COTANvs"+tool.title()]}')

zheng-4 comparison COTAN with Scanpy 16.5 66
zheng-4 comparison COTAN with Seurat 28.0 112
zheng-4 comparison COTAN with Monocle 14.75 59
zheng-4 comparison COTAN with Scvi-Tools 16.25 65
zheng-8 comparison COTAN with Scanpy 16.4286 115
zheng-8 comparison COTAN with Seurat 24.1429 169
zheng-8 comparison COTAN with Monocle 18.4286 129
zheng-8 comparison COTAN with Scvi-Tools 16.125 129
peripheal-blood comparison COTAN with Scanpy 18.7 187
peripheal-blood comparison COTAN with Seurat 19.3 193
peripheal-blood comparison COTAN with Monocle 21.1 211
peripheal-blood comparison COTAN with Scvi-Tools 25.0 225
tabula-muris-heart comparison COTAN with Scanpy 21.2 106
tabula-muris-heart comparison COTAN with Seurat 24.8 124
tabula-muris-heart comparison COTAN with Monocle 32.0 160
tabula-muris-heart comparison COTAN with Scvi-Tools 26.2 131
tabula-muris-marrow_P7_3 comparison COTAN with Scanpy 12.0833 145
tabula-muris-marrow_P7_3 comparison COTAN with Seurat 15.4167 185
tabula-muris-marrow_P7_3 c

In [198]:
dfCotanVS['zheng-4']['COTANvsScanpy'].groupby('cluster').size().reset_index(name='count').sum()['count']

66