In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [2]:
# --- Experiment configuration -------------------------------------------
# Dataset folders expected under ./data/<tag>/ and the clustering tools
# whose labels/markers are evaluated for each of them.
DATASET_TAGS = ['PBMC1', 'PBMC2', 'PBMC3', 'PBMC4']
TOOLS = ['seurat', 'monocle', 'scanpy', 'scvi-tools', 'COTAN']
# Keep only marker rows whose 'rank' column is <= N_MARKERS.
N_MARKERS = 200
# Number of random-feature baselines trained per (dataset, tool) pair.
RANDOM_MARKERS_TRIALS = 5
# Single seed for every stochastic step so that Restart & Run All
# reproduces the reported tables.
SEED = 42
# Seeds NumPy's global (legacy) generator, which the np.random.choice
# call in the evaluation loop draws from.
np.random.seed(SEED)
# One classifier instance is re-fitted for every run; a fixed random_state
# makes each fit deterministic for a given training matrix.
clf = RandomForestClassifier(random_state=SEED)

In [3]:
# Result containers, all keyed as [dataset][row label].
feature_perc = {}  # % of the dataset's genes used as classifier features
f1 = {}            # weighted F1 for markers, "mean +/- std" string for random baselines
num_clusters = {}  # number of distinct cluster labels produced by each tool


def _weighted_f1(model, adata, train_cells, test_cells, y_train, y_test, features):
	"""Fit `model` on the selected feature columns and score it on the test cells.

	Parameters
	----------
	model : estimator exposing `fit` and `predict` (the notebook passes `clf`).
	adata : expression matrix returned by `sc.read_10x_mtx`; indexed as
		`adata[cells, features]`, and `.X` must support `.toarray()`.
	train_cells, test_cells : cell identifiers selecting the rows of each split.
	y_train, y_test : cluster labels, in the same order as the corresponding cells.
	features : gene names or integer column positions used as features.

	Returns
	-------
	float
		Weighted-average F1 score of the predictions on the test cells.
	"""
	X_train = adata[train_cells, features].X.toarray()
	X_test = adata[test_cells, features].X.toarray()
	model.fit(X_train, y_train)
	return f1_score(y_test, model.predict(X_test), average="weighted")


for dataset in DATASET_TAGS:
	print(f"Processing dataset {dataset}")
	feature_perc[dataset] = {}
	f1[dataset] = {}
	num_clusters[dataset] = {}
	# load gene expression matrix
	data_path = f"./data/{dataset}/filtered/10X/"
	adata = sc.read_10x_mtx(
		data_path,
		var_names='gene_symbols',
		cache=False
	)
	n_genes = adata.shape[1]
	for tool in TOOLS:
		print(f"Tool {tool}")
		tool_dir = f"./data/{dataset}/{tool}/default"
		# load cluster labels (first CSV column becomes the index used to select cells)
		y_df = pd.read_csv(f"{tool_dir}/clustering_labels.csv", index_col=0)
		num_clusters[dataset][tool] = y_df['cluster'].nunique()
		# load markers: keep rows ranked within N_MARKERS, then de-duplicate genes
		markers_df = pd.read_csv(f"{tool_dir}/markers.csv")
		markers = np.unique(markers_df.loc[markers_df['rank'] <= N_MARKERS, 'gene'].values)
		n_features = len(markers)
		# load test cells
		test_cells = pd.read_csv(f"{tool_dir}/test_cells.csv")['cell'].values
		# split data
		# Cell order and labels are taken from the SAME frame for each split, so
		# row i of the feature matrix always matches label i. (Previously the
		# training cells came from Index.difference, which sorts its result,
		# while the training labels stayed in file order.)
		test_df = y_df.loc[test_cells]
		train_df = y_df[~y_df.index.isin(test_cells)]
		test_cells_sorted = test_df.index
		train_cells_sorted = train_df.index
		y_test = test_df['cluster'].values
		y_train = train_df['cluster'].values
		# save feature percentage
		feature_perc[dataset][tool] = n_features / n_genes * 100
		# fit model with markers and save f1 score
		f1[dataset][tool] = _weighted_f1(
			clf, adata, train_cells_sorted, test_cells_sorted, y_train, y_test, markers
		)
		# fit model with random features multiple times
		random_f1s = []
		for i in range(RANDOM_MARKERS_TRIALS):
			print(f"Trial {i+1}/{RANDOM_MARKERS_TRIALS}")
			# select as many random gene columns as there are markers, without repeats
			features = np.random.choice(n_genes, n_features, replace=False)
			random_f1s.append(
				_weighted_f1(
					clf, adata, train_cells_sorted, test_cells_sorted, y_train, y_test, features
				)
			)
		# np.std defaults to the population standard deviation (ddof=0)
		f1[dataset][f"random {tool} size"] = f"{np.mean(random_f1s):.6f} +/- {np.std(random_f1s):.6f}"

# Assemble the summary tables (rows: tools / baselines, columns: datasets),
# persist the two metric tables, and display everything.
feature_perc_df = pd.DataFrame(feature_perc)
f1_df = pd.DataFrame(f1)
feature_perc_df.to_csv("feature_perc.csv")
f1_df.to_csv("f1.csv")
print("Percentage of features used:")
display(feature_perc_df)
print("F1 scores:")
display(f1_df)
print("Number of clusters:")
display(pd.DataFrame(num_clusters))

Processing dataset PBMC1
Tool seurat


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool monocle


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool scanpy


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool scvi-tools


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool COTAN


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Processing dataset PBMC2
Tool seurat


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool monocle


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool scanpy


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool scvi-tools


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool COTAN


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Processing dataset PBMC3
Tool seurat


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool monocle


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool scanpy


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool scvi-tools


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool COTAN


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Processing dataset PBMC4
Tool seurat


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool monocle


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool scanpy


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool scvi-tools


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Tool COTAN


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Trial 1/5


  if not is_categorical_dtype(df_full[k]):


Trial 2/5


  if not is_categorical_dtype(df_full[k]):


Trial 3/5


  if not is_categorical_dtype(df_full[k]):


Trial 4/5


  if not is_categorical_dtype(df_full[k]):


Trial 5/5


  if not is_categorical_dtype(df_full[k]):


Percentage of features used:


Unnamed: 0,PBMC1,PBMC2,PBMC3,PBMC4
seurat,9.035222,10.953006,12.615291,13.882353
monocle,3.765836,2.692644,4.001829,3.852941
scanpy,7.503828,7.953882,9.223264,8.220588
scvi-tools,3.48044,3.64857,3.811266,3.676471
COTAN,13.323124,16.367484,26.137663,17.455882


F1 scores:


Unnamed: 0,PBMC1,PBMC2,PBMC3,PBMC4
seurat,0.902581,0.898912,0.884333,0.838939
random seurat size,0.782675 +/- 0.015713,0.685673 +/- 0.037665,0.698362 +/- 0.036215,0.685852 +/- 0.026542
monocle,0.993076,0.999163,0.997716,0.99281
random monocle size,0.983490 +/- 0.007322,0.975987 +/- 0.015114,0.982095 +/- 0.010251,0.982804 +/- 0.003269
scanpy,0.78109,0.815444,0.803117,0.756628
random scanpy size,0.515230 +/- 0.020910,0.593174 +/- 0.034509,0.586823 +/- 0.036834,0.531525 +/- 0.018057
scvi-tools,0.816593,0.898578,0.871107,0.868501
random scvi-tools size,0.562101 +/- 0.023813,0.539898 +/- 0.026537,0.589238 +/- 0.015008,0.612459 +/- 0.039640
COTAN,0.835748,0.762722,0.640365,0.765197
random COTAN size,0.712619 +/- 0.022375,0.565720 +/- 0.032603,0.513200 +/- 0.019702,0.630279 +/- 0.014434


Number of clusters:


Unnamed: 0,PBMC1,PBMC2,PBMC3,PBMC4
seurat,11,14,18,19
monocle,3,2,3,3
scanpy,18,18,22,22
scvi-tools,16,16,17,16
COTAN,19,27,53,31
