In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [2]:
# Dataset folders expected under ./data/
DATASET_TAGS = [
	'PBMC1',
	'PBMC2',
	'PBMC3',
	'PBMC4',
]
# Tools whose marker lists are compared (one sub-folder per tool inside each dataset)
TOOLS = [
	'seurat',
	'monocle',
	'scanpy',
	'scvi-tools',
	'COTAN',
]
# Only markers whose 'rank' is <= this value are kept as features
N_MARKERS = 200
# How many times the random-feature baseline is repeated per (dataset, tool)
RANDOM_MARKERS_TRIALS = 5
# Seed handed to the classifier
RANDOM_SEED = 42
# Single random-forest instance (default hyper-parameters, seeded with RANDOM_SEED)
# that is re-fitted for every experiment in the notebook.
clf = RandomForestClassifier(random_state=RANDOM_SEED)

In [None]:
# Benchmark: for every dataset and every tool, train the classifier on the
# tool's marker genes and compare its weighted F1 score against the score
# obtained with the same number of randomly chosen genes.
#
# Results are collected in nested dicts keyed as result[dataset][row_label]:
#   feature_perc - percentage of all genes used as features by each tool
#   f1           - F1 with markers, mean/std F1 of the random baseline, and their ratio
#   num_clusters - number of distinct 'cluster.ids' labels
feature_perc = {}
f1 = {}
num_clusters = {}

# Dedicated seeded generator so the random-feature baseline is reproducible
# when the cell is re-run (the global NumPy RNG was never seeded).
rng = np.random.default_rng(RANDOM_SEED)

for dataset in DATASET_TAGS:
	print(f"Processing dataset {dataset}")
	feature_perc[dataset] = {}
	f1[dataset] = {}
	num_clusters[dataset] = {}
	# load gene expression matrix
	data_path = f"./data/{dataset}/filtered/10X/"
	adata = sc.read_10x_mtx(
		data_path,
		var_names='gene_symbols',
		cache=False
	)
	# Labels, test cells and the train/test split depend only on the dataset,
	# so they are prepared once here instead of once per tool.
	# load cluster labels
	labels_path = f"./data/{dataset}/antibody_annotation/antibody_labels_postproc.csv"
	y_df = pd.read_csv(labels_path, index_col=0)
	n_clusters = np.unique(y_df['cluster.ids']).shape[0]
	# load test cells
	test_cells_path = f"./data/{dataset}/antibody_annotation/antibody_labels_test.csv"
	test_cells_df = pd.read_csv(test_cells_path)
	test_cells = test_cells_df['cell'].values
	# split data; cells that are absent from the expression matrix are dropped
	test_cells_sorted = y_df.loc[test_cells].index
	train_cells_sorted = y_df.loc[y_df.index.difference(test_cells)].index
	test_cells_sorted = [cell for cell in test_cells_sorted if cell in adata.obs_names]
	train_cells_sorted = [cell for cell in train_cells_sorted if cell in adata.obs_names]
	y_test = y_df.loc[test_cells_sorted]['cluster.ids'].values
	y_train = y_df.loc[train_cells_sorted]['cluster.ids'].values

	for tool in TOOLS:
		print(f"Tool {tool}")
		# the labels file is shared by all tools, so the count is identical per tool
		num_clusters[dataset][tool] = n_clusters
		# load markers: keep genes ranked within the top N_MARKERS, de-duplicated
		markers_path = f"./data/{dataset}/{tool}/antibody/markers.csv"
		markers_df = pd.read_csv(markers_path)
		markers = markers_df[markers_df['rank']<=N_MARKERS]['gene'].values
		markers = np.unique(markers)
		# build feature matrices restricted to the marker genes
		X_test = adata[test_cells_sorted, markers].X.toarray()
		X_train = adata[train_cells_sorted, markers].X.toarray()
		# the random baseline must use exactly this many features
		n_features = X_train.shape[1]
		# save feature percentage
		feature_perc[dataset][tool] = n_features/adata.shape[1]*100
		# fit model with markers
		clf.fit(X_train, y_train)
		y_pred = clf.predict(X_test)
		# save f1 score
		f1_tool = f1_score(y_test, y_pred, average="weighted")
		f1[dataset][tool] = f1_tool
		# fit model with random features multiple times
		random_f1s = []
		for i in range(RANDOM_MARKERS_TRIALS):
			print(f"Trial {i+1}/{RANDOM_MARKERS_TRIALS}")
			# select random gene positions (without replacement) from all genes
			features = rng.choice(adata.shape[1], n_features, replace=False)
			# separate names keep the marker-based matrices intact
			X_train_random = adata[train_cells_sorted, features].X.toarray()
			X_test_random = adata[test_cells_sorted, features].X.toarray()
			# fit model
			clf.fit(X_train_random, y_train)
			y_pred_random = clf.predict(X_test_random)
			# save f1 score
			f1_random = f1_score(y_test, y_pred_random, average="weighted")
			random_f1s.append(f1_random)
		mean_random_f1 = np.mean(random_f1s)
		f1[dataset][f"mean random {tool} size"] = mean_random_f1
		f1[dataset][f"std random {tool} size"] = np.std(random_f1s)
		# ratio > 1 means the tool's markers beat the random baseline
		f1[dataset][f"{tool} ratio"] = f1_tool / mean_random_f1

# persist and show the result tables
feature_perc_df = pd.DataFrame(feature_perc)
f1_df = pd.DataFrame(f1)
feature_perc_df.to_csv("feature_perc_antibody.csv")
f1_df.to_csv("f1_antibody.csv")
print("Percentage of features used:")
display(feature_perc_df)
print("F1 scores:")
display(f1_df)
print("Number of clusters:")
display(pd.DataFrame(num_clusters))

Processing dataset PBMC1
Tool seurat


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool monocle


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool scanpy


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool scvi-tools


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool COTAN


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Processing dataset PBMC2
Tool seurat


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool monocle


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool scanpy


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool scvi-tools


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool COTAN


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Processing dataset PBMC3
Tool seurat


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool monocle


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool scanpy


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool scvi-tools


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool COTAN


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Processing dataset PBMC4
Tool seurat


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool monocle


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool scanpy


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool scvi-tools


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Tool COTAN


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Percentage of features used:


Unnamed: 0,PBMC1,PBMC2,PBMC3,PBMC4
seurat,8.78463,9.559253,10.587697,9.514706
monocle,8.805513,9.493579,10.244683,9.147059
scanpy,6.661562,6.589317,7.13469,6.919118
scvi-tools,3.188083,3.327496,3.346292,3.323529
COTAN,8.408743,9.165207,9.871179,8.970588


F1 scores:


Unnamed: 0,PBMC1,PBMC2,PBMC3,PBMC4
seurat,0.900962,0.904747,0.885439,0.883279
mean random seurat size,0.810373,0.704284,0.733938,0.764606
std random seurat size,0.005713,0.044979,0.028524,0.009089
seurat ratio,1.111787,1.284634,1.206422,1.155208
monocle,0.900048,0.91225,0.88754,0.88891
mean random monocle size,0.813609,0.68652,0.701198,0.77134
std random monocle size,0.014922,0.037147,0.024635,0.029158
monocle ratio,1.106241,1.328804,1.265747,1.152423
scanpy,0.903075,0.907736,0.894956,0.885581
mean random scanpy size,0.817003,0.630408,0.67503,0.745771


Number of clusters:


Unnamed: 0,PBMC1,PBMC2,PBMC3,PBMC4
seurat,10,11,13,12
monocle,10,11,13,12
scanpy,10,11,13,12
scvi-tools,10,11,13,12
COTAN,10,11,13,12
