In [2]:
import numpy as np
import pandas as pd
import time

# Load the marker table; "temp.csv" is resolved relative to the current working directory.
d = pd.read_csv("temp.csv")
# Preview the first rows to sanity-check the parsed columns.
d.head()

Unnamed: 0,PD1,GzmB,CD8a,CD103,CD56,CD25,CD4,CD19,CD49a,CD3,FoxP3
0,1.760079,2.241769,0.447316,0.951417,1.085694,0.365136,1.15903,0.744424,1.75649,1.208541,0.897291
1,2.504053,1.743894,0.725944,1.063522,1.146853,0.92404,2.429543,1.594169,3.008561,1.925828,0.740108
2,1.171469,1.729824,0.94235,0.670537,1.141923,0.64487,0.998817,0.800615,1.419161,1.141207,0.937181
3,1.464891,1.908043,0.499899,0.927789,0.677614,1.049692,0.864599,0.746302,1.478173,1.268048,0.886642
4,1.989203,1.786417,0.857825,0.784642,0.61729,1.16993,0.36388,0.945732,1.475951,1.183369,0.803843


## Gaussian Mixture Model

In [3]:
from sklearn.mixture import GaussianMixture

# For every column of `d`, fit an independent 1-D Gaussian mixture and report
# the percentage of rows that some component claims with posterior > thresh.
cols = d.columns
m, n = d.shape

thresh = 0.9
# Two mixture components per column, except CD56 which gets three.
ncomp = np.full(n, 2, dtype=int)
ncomp[cols.tolist().index('CD56')] = 3

for i, marker in enumerate(cols):
    temp = d[marker].to_numpy().reshape(-1, 1)   # sklearn expects a 2-D (rows, 1) array
    gmm = GaussianMixture(n_components=ncomp[i]).fit(temp)
    probs = gmm.predict_proba(temp)
    # A row is "confident" when at least one component's posterior exceeds thresh.
    isconfident = (probs > thresh).any(axis=1)
    pct = round(np.sum(isconfident)/m*100, 2)
    print('For {}, confident classification: {} %'.format(marker, pct))

For PD1, confident classification: 21.2 %
For GzmB, confident classification: 84.37 %
For CD8a, confident classification: 99.11 %
For CD103, confident classification: 96.12 %
For CD56, confident classification: 1.87 %
For CD25, confident classification: 4.34 %
For CD4, confident classification: 99.22 %
For CD19, confident classification: 13.42 %
For CD49a, confident classification: 90.64 %
For CD3, confident classification: 98.05 %
For FoxP3, confident classification: 76.08 %


In [140]:
# hierarchical GMM
from sklearn.mixture import GaussianMixture

def gmm_gate(d,colname,ncomp=2,random_state=None):
    """Gate one column with a 1-D Gaussian mixture and return mean-ranked labels.

    A ``GaussianMixture`` with ``ncomp`` components is fitted to ``d[colname]``
    and every row is assigned to its most likely component. The raw component
    indices are arbitrary, so they are relabelled by the rank of the component
    mean: 0 is the component with the lowest mean, ``ncomp - 1`` the highest.

    Parameters
    ----------
    d : pandas.DataFrame
        Table containing the column to gate.
    colname : str
        Name of the column to fit.
    ncomp : int, default 2
        Number of mixture components.
    random_state : int or None, default None
        Forwarded to ``GaussianMixture`` so a gate can be made reproducible.

    Returns
    -------
    numpy.ndarray
        Integer array with one mean-ranked label per row of ``d``.
    """
    values = d[colname].to_numpy().reshape(-1, 1)  # sklearn expects (rows, 1)
    gmm = GaussianMixture(n_components=ncomp, random_state=random_state).fit(values)
    component = gmm.predict(values)

    # argsort yields rank -> component; relabelling needs the inverse
    # permutation (component -> rank). The two only coincide for <= 2
    # components, so invert explicitly to stay correct for ncomp >= 3.
    order = np.argsort(np.ravel(gmm.means_))
    rank_of_component = np.empty_like(order)
    rank_of_component[order] = np.arange(order.size)
    return rank_of_component[component]

def gmm_classify(d,gatename,thresh=0.9):
    """Print, per column of one gate, the share of confidently classified rows.

    For every column of ``d[gatename]`` a 1-D ``GaussianMixture`` is fitted and
    a row counts as confidently classified when at least one component has a
    posterior probability above ``thresh``.

    Parameters
    ----------
    d : dict of str -> pandas.DataFrame
        Mapping from gate name to the table of rows in that gate.
    gatename : str
        Key of the gate to analyse.
    thresh : float, default 0.9
        Posterior-probability cut-off. It is an explicit argument so the
        function no longer depends on a notebook-level ``thresh`` variable
        having been assigned by another cell; the default matches the value
        the notebook assigns.

    Returns
    -------
    None
        Results are printed, one line per column.
    """
    frame = d[gatename]
    m = frame.shape[0]

    print('\nIn {}:'.format(gatename))
    for col in frame.columns:
        # CD56 is fitted with three components, every other column with two.
        # Choosing per column avoids a ValueError when the gate has no CD56.
        k = 3 if col == 'CD56' else 2
        values = frame[col].to_numpy().reshape(-1,1)  # sklearn expects (rows, 1)
        gmm = GaussianMixture(n_components=k).fit(values)
        probs = gmm.predict_proba(values)
        isconfident = np.any(probs>thresh,axis=1)
        print('For {}, confident classification: {} %'.format(col,round(np.sum(isconfident)/m*100,2)))

In [142]:
# Hierarchical gating: split on one marker at a time with gmm_gate, drop the
# marker that was just gated on, and store each subset under its gate name.
cols = d.columns
m, n = d.shape
thresh = 0.9
gate = {}

def _rows_with_label(parent, labels, label, marker):
    """Rows of `parent` whose gate label equals `label`, without the `marker` column."""
    picked = np.flatnonzero(labels == label)
    return parent.iloc[picked].drop(columns=marker).reset_index(drop=True)

# Root split: CD3 yields both a positive (label 1) and a negative (label 0) gate.
labels = gmm_gate(d, 'CD3')
gate['CD3+'] = _rows_with_label(d, labels, 1, 'CD3')
gate['CD3-'] = _rows_with_label(d, labels, 0, 'CD3')

# Remaining splits keep only the positive population: (parent gate, marker, child gate).
for parent_name, marker, child_name in [
    ('CD3+',     'CD4',   'CD3+CD4+'),
    ('CD3+CD4+', 'FoxP3', 'CD3+CD4+FoxP3+'),
    ('CD3+',     'CD8a',  'CD3+CD8+'),
    ('CD3+CD8+', 'FoxP3', 'CD3+CD8+FoxP3+'),
    ('CD3-',     'CD19',  'CD3-CD19+'),
]:
    labels = gmm_gate(gate[parent_name], marker)
    gate[child_name] = _rows_with_label(gate[parent_name], labels, 1, marker)

# Size of every gate as a percentage of all rows in `d`.
for gatename in gate.keys():
    share = round(gate[gatename].shape[0]/m*100, 2)
    print('{}: {} %'.format(gatename, share))

CD3+: 35.8 %
CD3-: 64.2 %
CD3+CD4+: 25.15 %
CD3+CD4+FoxP3+: 2.58 %
CD3+CD8+: 9.99 %
CD3+CD8+FoxP3+: 6.33 %
CD3-CD19+: 1.54 %


In [135]:
# Report per-marker classification confidence inside each gate of interest.
for gatename in ('CD3+', 'CD3+CD4+', 'CD3+CD8+', 'CD3-'):
    gmm_classify(gate, gatename)


In CD3+:
For PD1, confident classification: 80.87 %
For GzmB, confident classification: 67.18 %
For CD8a, confident classification: 99.55 %
For CD103, confident classification: 98.83 %
For CD56, confident classification: 5.14 %
For CD25, confident classification: 78.52 %
For CD4, confident classification: 99.49 %
For CD19, confident classification: 14.05 %
For CD49a, confident classification: 86.49 %
For FoxP3, confident classification: 82.77 %

In CD3+CD4+:
For PD1, confident classification: 63.53 %
For GzmB, confident classification: 72.83 %
For CD8a, confident classification: 94.49 %
For CD103, confident classification: 96.47 %
For CD56, confident classification: 1.3 %
For CD25, confident classification: 84.17 %
For CD19, confident classification: 37.36 %
For CD49a, confident classification: 86.99 %
For FoxP3, confident classification: 81.17 %

In CD3+CD8+:
For PD1, confident classification: 78.57 %
For GzmB, confident classification: 50.93 %
For CD103, confident classification: 99

## Louvain Clustering

In [17]:
import scanpy as sc
import anndata
import igraph
# Keep only the first 11 columns (presumably the marker intensities; a later cell
# appends a `cluster` column). Copy so that later column assignment on `d` writes
# to an independent frame rather than to a slice.
d = d.iloc[:,:11].copy()
# Rows are observations and columns are variables. Their labels belong in the
# obs/var indices (obs_names / var_names); passing bare lists instead stores them
# in a data column named 0 and is not a documented input type.
adata = anndata.AnnData(
    X=d.to_numpy(),
    obs=pd.DataFrame(index=d.index.astype(str)),
    var=pd.DataFrame(index=d.columns.astype(str)),
)
adata

AnnData object with n_obs × n_vars = 818476 × 11 
    obs: 0
    var: 0

In [None]:
# Step 1: build the neighbourhood graph on `adata` and report the wall time.
print("Nearest neighbours...")
start = time.time()
sc.pp.neighbors(adata, n_neighbors=65)
neighbours_elapsed = time.time() - start
print("DONE: {} s".format(neighbours_elapsed))

# Step 2: Louvain community detection on that graph, timed separately.
print("Louvain clustering...")
mid = time.time()
sc.tl.louvain(adata, resolution=0.1)
louvain_elapsed = time.time() - mid
print("DONE: {} s".format(louvain_elapsed))

# The labels in adata.obs['louvain'] are cast to int and shifted by one.
cluster_ids = np.array(adata.obs['louvain']).astype(int) + 1
print("Number of clusters: ", cluster_ids.max())

# Attach the cluster id to every row and write the table out to "temp.csv".
d['cluster'] = cluster_ids
d.to_csv("temp.csv", index=False)

Nearest neighbours...


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "..\..\..\..\Anaconda3\lib\site-packages\umap\nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  self.func_ir.loc))


DONE: 916.7458536624908 s
Louvain clustering...
