In [1]:
import os
import zipfile

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Imputation libraries
import scprep # preprocessing
import saver
import magic
import scanpy as sc

from scipy.stats import spearmanr

# Clustering libraries
from sklearn.manifold import TSNE
import umap
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FastICA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
#from sklearn.cluster import DBSCAN
from sklearn.utils import shuffle
import anndata
import scanpy.external as sce
import harmonypy
import pynndescent
import hdbscan


from sklearn.metrics import silhouette_score, adjusted_rand_score, v_measure_score
#https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/scanpy/scanpy_04_clustering.html

  from .autonotebook import tqdm as notebook_tqdm


# Step 0: Download the data

In [2]:
path_data = "data/"
results_path = "results"
archive_name = "CarballoCastro_Alba.zip"

In [3]:
#Read the bulk and set gene names as variables and patients as observations

bulk_pancreas = pd.read_csv(os.path.join(path_data,"train_data","pancreas_bulk_train.csv")).set_index("level_0")
bulk = sc.read(os.path.join(path_data,"train_data","pancreas_bulk_train.csv")).transpose()
bulk.var['gene_id'] = bulk_pancreas.index
bulk.obs ['patient_id'] = bulk_pancreas.columns
bulk.var_names_make_unique()

In [4]:
#Now I read the single cell data and metadata
sc_pancreas = pd.read_csv(os.path.join(path_data,"train_data","pancreas_sc_train.csv")).set_index("Unnamed: 0")
sc_pancreas_metadata = pd.read_csv(os.path.join(path_data,"train_data","pancreas_sc_metadata_train.csv")).set_index("Source Name")
sc_train = sc.read(os.path.join(path_data,"train_data","pancreas_sc_train.csv")).transpose()
sc_train.var['gene_id'] = sc_pancreas.index
sc_train.obs['cell_id'] = sc_pancreas.columns
sc_train.obs['cell_type']= sc_pancreas_metadata['Celltype']
sc_train.var_names_make_unique()

# I also read the single cell test data and metadata

sc_pancreas_test = pd.read_csv(os.path.join(path_data,"test_data","pancreas_sc_test.csv")).set_index("Unnamed: 0")
sc_pancreas_metadata_test = pd.read_csv(os.path.join(path_data,"test_data","pancreas_sc_metadata_test_wocelltype.csv")).set_index("Source Name")

sc_test = sc.read(os.path.join(path_data,"test_data","pancreas_sc_test.csv")).transpose()
sc_test.var['gene_id'] = sc_pancreas_test.index
sc_test.obs['cell_id'] = sc_pancreas_test.columns
sc_test.var_names_make_unique()

In [5]:
print(f"Number of patients in the train dataset {bulk.X.shape[0]}")

print(f"Number of genes in the dataset {bulk.X.shape[1]}")

assert bulk.X.shape[1] == sc_train.X.shape[1]
print(f"Number of cells in the train dataset {sc_train.X.shape[0]}")

Number of patients in the train dataset 4
Number of genes in the dataset 25453
Number of cells in the train dataset 978


In [6]:
print(f"Number of cells in the test dataset {sc_test.X.shape[0]}")

Number of cells in the test dataset 789


In [7]:
for dis in sc_pancreas_metadata.Disease.unique():
    df = sc_pancreas_metadata[sc_pancreas_metadata.Disease==dis]
    print(f"There are {df.Sample.nunique()} train patients with {dis}")

There are 2 train patients with type II diabetes mellitus
There are 2 train patients with normal


In [8]:
print(f"There are {sc_pancreas_metadata.Celltype.nunique()} different cell types in the dataset")
print(f"The different cells types are {sc_pancreas_metadata.Celltype.unique()}")

There are 13 different cell types in the dataset
The different cells types are ['acinar cell' 'alpha cell' 'gamma cell' 'delta cell'
 'unclassified endocrine cell' 'beta cell' 'ductal cell'
 'endothelial cell' 'co-expression cell' 'PSC cell' 'epsilon cell'
 'MHC class II cell' 'mast cell']


In [9]:
for patient in sc_pancreas_metadata_test.Sample.unique():
    print(f"Number of cells for {patient} is {sc_pancreas_metadata_test[sc_pancreas_metadata_test.Sample==patient].shape[0]}")

Number of cells for patient5 is 203
Number of cells for patient7 is 284
Number of cells for patient6 is 302


In [10]:
for dis in sc_pancreas_metadata_test.Disease.unique():
    df = sc_pancreas_metadata_test[sc_pancreas_metadata_test.Disease==dis]
    print(f"There are {df.Sample.nunique()} test patients with {dis}")

There are 1 test patients with normal
There are 2 test patients with type II diabetes mellitus


# Step 1: Perform imputation

In [11]:
# For the imputation, we need to have the data log transformed
sc.pp.log1p(bulk)
sc.pp.log1p(sc_train)
sc.pp.log1p(sc_test)

In [12]:
magic_op = magic.MAGIC()
#Apply MAGIC in the whole training data
sc_magic = magic_op.fit_transform(sc_train.X)

Calculating MAGIC...
  Running MAGIC on 978 cells and 25453 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 4.20 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.13 seconds.
    Calculating affinities...
    Calculated affinities in 0.07 seconds.
  Calculated graph and diffusion operator in 4.39 seconds.
  Running MAGIC with `solver='exact'` on 25453-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.51 seconds.
Calculated MAGIC in 4.91 seconds.


In [13]:
#We will create the bulkified matrix to compare the bulk with the imputed data.
#In the bulk matrix we just have one measurement per patient per gene. 
#In the sc data we need to compute the average over each cell for each patient and gene

assert sc_pancreas_metadata.shape[0] == sc_magic.shape[0]
df_magic = pd.DataFrame(sc_magic, index= sc_pancreas_metadata.index, columns= sc_train.var['gene_id'])
train_merged = sc_pancreas_metadata.join(df_magic, on='Source Name')
train_bulkified = train_merged.groupby('Sample', as_index= False).mean()
train_bulkified.drop(train_bulkified.columns[0],axis=1, inplace=True)
train_bulkified.index = ["patient1","patient2","patient3","patient4"]
print(f'Train bulkified matrix shape is : {train_bulkified.shape}')
print(f'Train bulk matrix shape is : {bulk.X.shape}')

#I will transpose them so that they agree with the previous dimensions just in case
train_bulkified_t = train_bulkified.transpose()
bulk_t = bulk.transpose()

print(f'Transposed train bulkified matrix shape is : {train_bulkified_t.shape}')
print(f'Transposed train bulk matrix shape is : {bulk_t.X.shape}')

Train bulkified matrix shape is : (4, 25453)
Train bulk matrix shape is : (4, 25453)
Transposed train bulkified matrix shape is : (25453, 4)
Transposed train bulk matrix shape is : (25453, 4)


In [14]:
# MAGIC
corr, pval = spearmanr(bulk_t.X, train_bulkified_t)
spearman = corr.mean()
print(f'Spearman correlation for train is {spearman}')

Spearman correlation for train is 0.9580381876394828


## Imputation on test data

In [15]:
sc_magic_test = magic_op.fit_transform(sc_test.X)

assert sc_pancreas_metadata_test.shape[0] == sc_magic_test.shape[0]

df_magic_test = pd.DataFrame(sc_magic_test, index= sc_pancreas_metadata_test.index, columns= sc_test.var['gene_id'])
test_merged = sc_pancreas_metadata_test.join(df_magic_test, on='Source Name')
test_bulkified = test_merged.groupby('Sample', as_index= False).mean()
test_bulkified.drop(test_bulkified.columns[0],axis=1, inplace=True)

# We transpose them so that they agree with the previous dimensions
bulkified = test_bulkified.transpose()

Calculating MAGIC...
  Running MAGIC on 789 cells and 25453 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 3.36 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.13 seconds.
    Calculating affinities...
    Calculated affinities in 0.05 seconds.
  Calculated graph and diffusion operator in 3.55 seconds.
  Running MAGIC with `solver='exact'` on 25453-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.50 seconds.
Calculated MAGIC in 4.05 seconds.


In [16]:
bulkified.columns = ["patient5","patient6","patient7"]
bulkified.index.name = 'index'
bulkified.reset_index(inplace=True)

In [17]:
assert all(bulkified.columns == ["index","patient5","patient6","patient7"])
assert all(bulkified["index"] == sc_pancreas_test.index)

# Step 2: Perform clustering

In [18]:
sc_train = sc.read(os.path.join(path_data,"train_data","pancreas_sc_train.csv")).transpose()
sc_train.var['gene_id'] = sc_pancreas.index
sc_train.obs['cell_id'] = sc_pancreas.columns
sc_train.obs['cell_type']= sc_pancreas_metadata['Celltype']
sc_train.var_names_make_unique()

In [19]:
sc.pp.filter_genes(sc_train, min_cells=14)
sc.pp.log1p(sc_train)
sc.pp.normalize_total(sc_train, target_sum=1e4) 
sc.pp.log1p(sc_train, base=10)
sc_train.raw = sc_train
sc.pp.highly_variable_genes(sc_train, n_top_genes=800)
sc_train = sc_train[:, sc_train.var.highly_variable]
sc.pp.scale(sc_train)

df = pd.DataFrame(sc_train.X)



  view_to_actual(adata)


In [20]:
#Encode the true labels
cell_labels = sc_pancreas_metadata['Celltype']
encoder = LabelEncoder()
encoded_label = encoder.fit_transform(cell_labels)

clusterable_embedding = umap.UMAP(
    n_neighbors=9,
    min_dist=0.0,
    n_components=110,
    random_state=42).fit_transform(df)

pca = PCA(n_components=50)
pca_matrix = pca.fit_transform(clusterable_embedding)

labels = hdbscan.HDBSCAN(min_samples=2, min_cluster_size=14).fit_predict(clusterable_embedding)

# Evaluate the performance
ssc = silhouette_score(pca_matrix, labels)
print('Silhouette score:', ssc)

ari = adjusted_rand_score(encoded_label, labels)
print('Adjusted Rand Index:', ari)

v_measure = v_measure_score(encoded_label, labels)
print('V-Measure score:', v_measure)

performance = (1/3)*(ssc + ari + v_measure)
print('Performance:', performance)

Silhouette score: 0.77570915
Adjusted Rand Index: 0.9091723074345411
V-Measure score: 0.8980144331333542
Performance: 0.8609652975965251


## Clustering on test data

In [21]:
sc_test = sc.read(os.path.join(path_data,"test_data","pancreas_sc_test.csv")).transpose()
sc_test.var['gene_id'] = sc_pancreas_test.index
sc_test.obs['cell_id'] = sc_pancreas_test.columns

sc_test.var_names_make_unique()
sc.pp.filter_genes(sc_test, min_cells=14)
sc.pp.log1p(sc_test)
sc.pp.normalize_total(sc_test, target_sum=1e4) 
sc.pp.log1p(sc_test, base=10)
sc_test.raw = sc_test
sc.pp.highly_variable_genes(sc_test, n_top_genes=800)
sc_test = sc_test[:, sc_test.var.highly_variable]
sc.pp.scale(sc_test)

df_test = pd.DataFrame(sc_test.X)



  view_to_actual(adata)


In [22]:
clusterable_embedding_test = umap.UMAP(
    n_neighbors=9,
    min_dist=0.0,
    n_components=110,
    random_state=42).fit_transform(df_test)

pca_test = PCA(n_components=50)
pca_matrix_test = pca.fit_transform(clusterable_embedding_test)

labels_test = hdbscan.HDBSCAN(min_samples=2, min_cluster_size=14).fit_predict(clusterable_embedding_test)

# Evaluate the performance
ssc = silhouette_score(pca_matrix_test, labels_test)
print('Silhouette score:', ssc)

Silhouette score: 0.8306332


In [23]:
cluster_labels = pd.DataFrame(data = {'index': sc_pancreas_test.columns, 'cluster': (labels_test + 1)})
assert all(cluster_labels.columns == ["index","cluster"])
assert all(cluster_labels["index"] == sc_pancreas_test.columns)

In [25]:
# PCA should be a DataFrame containing the coordinates of each cell in the PCA transformed space for the 50 first PCs
pca_df = pd.DataFrame(data = pca_matrix_test, columns = ["PC"+str(i+1) for i in range(50)])
pca_df.insert(loc=0, column='index', value=sc_pancreas_test.columns)
assert all(pca_df.columns == ["index"] + ["PC"+str(i+1) for i in range(50)])
assert all(pca_df["index"] == sc_pancreas_test.columns)

In [33]:
with zipfile.ZipFile(os.path.join(results_path, archive_name), "x") as zf:
    with zf.open(f"imputed_bulkified.csv", "w") as buffer:
        bulkified.to_csv(buffer)
    with zf.open(f"cluster_membership.csv", "w") as buffer:
        cluster_labels.to_csv(buffer)
    with zf.open(f"PCA.csv", "w") as buffer:
        pca_df.to_csv(buffer)
    zf.close()

In [34]:
archive = zipfile.ZipFile(os.path.join(results_path, archive_name))
assert all(
        [
            i in archive.namelist()
            for i in [
                "imputed_bulkified.csv",
                "cluster_membership.csv",
                "PCA.csv",
            ]
        ]
    )