In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import sklearn.metrics
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.mixture import GaussianMixture as GMM
from scipy.stats import norm

In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

Set up scanpy output properties (logging verbosity and default figure style)

In [None]:
# Verbosity levels: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Print scanpy's header so the run records which environment produced it.
sc.logging.print_header()
# Default look for figures drawn from here on.
sc.settings.set_figure_params(facecolor="white", dpi=80)

In [None]:
#%matplotlib inline

Set scanpy output files

In [None]:
# Output location and sample prefix shared by every file this notebook writes.
writeDir = "write/"
fileName = "pdacMouseVeh"

# Path of the QC-filtered AnnData file saved at the end of the notebook.
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"


Set figure parameters

In [None]:
# Prefix used in the names of saved figures.
figName = fileName
# Saved figures go into a per-sample sub-directory.
sc.settings.figdir = f"figures/{fileName}/"
# Resolution, font size and file format for displayed and saved figures.
sc.set_figure_params(scanpy=True, dpi=100, dpi_save=150, fontsize=10, format="png")

Read the input file

In [None]:
# Directory holding the raw input. The trailing slash matters: the read cell
# builds the file path by appending the file name directly to inDir.
inDir = 'data/PDAC/'
#inputFile = f'{inDir}/.txt'
#inMetaFile = f'{inDir}/.txt'

In [None]:
# Load the 10x HDF5 matrix; gex_only=False keeps every feature type in the
# file rather than only gene expression.
inputH5 = f"{inDir}matrix_BC1381_veh.h5"
adata = sc.read_10x_h5(inputH5, gex_only=False)
adata

In [None]:
# Inspect the per-feature annotation table of the freshly loaded matrix.
adata.var

In [None]:
# Look at the last 13 feature rows to see where the trailing block of
# features begins before it is split off in the next cell.
adata.var.tail(13)

In [None]:
# Number of trailing feature columns that are split off and stored per cell.
# NOTE(review): assumes the last 12 features are the hashtag (HTO) barcodes —
# confirm against the tail of adata.var shown above.
numGenes = 12

# Slice with the same constant on both sides so the two halves cannot drift
# apart, and copy so each object owns its data instead of being a view.
hto = adata[:, -numGenes:].copy()
adata = adata[:, :-numGenes].copy()

# Replace obs with the per-cell counts of the split-off features: one column
# per feature, rows indexed by cell barcode.
adata.obs = pd.DataFrame(
    hto.X.toarray(),
    columns=hto.var_names,
    index=adata.obs_names,
)

In [None]:
# obs now holds one count column per split-off feature (set in the cell above).
adata.obs

In [None]:
# De-duplicate feature names and cell barcodes so label-based indexing is unambiguous.
adata.var_names_make_unique()
adata.obs_names_make_unique()

In [None]:
# Re-display the per-cell table after the names were made unique.
adata.obs

In [None]:
# Re-display the per-feature table after the names were made unique.
adata.var

# Start QC
Investigate the most highly expressed genes

In [None]:
# Plot the 20 genes with the highest expression.
sc.pl.highest_expr_genes(adata, n_top=20)

In [None]:
# QC thresholds; the filtering cell that follows reuses both.
minGenes = 500   # minimum number of detected genes for a cell to be kept
minCells = 15    # minimum number of expressing cells for a gene to be kept

# Detection matrix: True wherever a count is non-zero.
valX = adata.X > 0

# .sum() on a sparse matrix returns an np.matrix; flatten to plain 1-D arrays
# so the histograms receive a single data series.
numCellwExp = np.asarray(valX.sum(axis=1)).ravel()    # detected genes per cell
numGeneswExp = np.asarray(valX.sum(axis=0)).ravel()   # expressing cells per gene

fig, axs = plt.subplots(1, 2, figsize=(8, 4))

# Both panels use log(x + 1): an entry with zero detections maps to 0 instead
# of -inf, which would make the histogram range non-finite.
axs[0].hist(np.log1p(numCellwExp), bins=100)
axs[0].axvline(np.log1p(minGenes), color='k', linestyle='dashed', linewidth=1)
axs[0].set_xlabel('num gene')
axs[0].set_ylabel('counts')

axs[1].hist(np.log1p(numGeneswExp), bins=100, log=True)
axs[1].axvline(np.log1p(minCells), color='k', linestyle='dashed', linewidth=1)
axs[1].set_xlabel('num cell')
axs[1].set_ylabel('counts')

# plt.show() rather than fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [None]:
# Drop cells with fewer than minGenes detected genes ...
sc.pp.filter_cells(adata, min_genes=minGenes)
# ... then genes detected in fewer than minCells of the remaining cells.
sc.pp.filter_genes(adata, min_cells=minCells)

In [None]:
# Remove the Malat1 gene. Copy the result so adata is a real AnnData rather
# than a view: the next cell adds a column to adata.var, which on a view
# triggers an implicit copy and a warning.
adata = adata[:, adata.var_names != "Malat1"].copy()

## Mito QC

In [None]:
#for g in adata.var_names:
#    if "-" in g:
#        print(g)

In [None]:
# Flag genes whose name starts with "mt-" and compute per-cell QC metrics,
# including the share of counts coming from the flagged genes.
isMito = adata.var_names.str.startswith("mt-")
adata.var["mt"] = isMito
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], log1p=False, inplace=True)

In [None]:
#sc.pl.violin(adata, ['n_genes_by_counts'], jitter=0.4)

In [None]:
#sc.pl.violin(adata, ['total_counts'], jitter=0.4)

In [None]:
#sc.pl.violin(adata, ['pct_counts_mt'], jitter=0.4)

In [None]:
# Total counts against mito percentage, then against number of detected genes.
for yKey in ("pct_counts_mt", "n_genes_by_counts"):
    sc.pl.scatter(adata, x="total_counts", y=yKey)

Remove cells with a high mitochondrial fraction, and cells whose gene or count totals are high enough that they could be doublets

In [None]:
# Upper bounds (exclusive) applied to the per-cell QC metrics.
maxGenes = 6000      # n_genes_by_counts
maxCounts = 25000    # total_counts
maxPctMito = 10      # pct_counts_mt

# Combine the three conditions into one mask and subset once. The copy makes
# adata a real AnnData instead of a view of a view of a view, so later
# in-place writes to adata.obs do not trigger implicit copies.
keepCells = (
    (adata.obs["n_genes_by_counts"] < maxGenes)
    & (adata.obs["total_counts"] < maxCounts)
    & (adata.obs["pct_counts_mt"] < maxPctMito)
).to_numpy()
adata = adata[keepCells, :].copy()

In [None]:
#sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True)

# Perform Demultiplex Hashing

In [None]:
# List the feature names that were split off into hto earlier.
hto.var_names

In [None]:
# Run HashSolo on the hashtag count columns stored in adata.obs.
# Optional arguments left at their defaults: priors, number_of_noise_barcodes.
hashNames = ["AK1654_VEH_B0307", "AK1656_VEH_B0308", "AK1660_VEH_B0309"]
sc.external.pp.hashsolo(adata, hashNames)
adata.obs.head()

visualize hashes

In [None]:
# Heatmap of the (log-scaled) hashtag counts, grouped by the "Classification"
# label; the figure is also saved.
hashHeatmapFile = f"_{figName}_hash.png"
sc.pl.heatmap(adata, hashNames, groupby="Classification", log=True, save=hashHeatmapFile)

count each hash

In [None]:
from collections import Counter

# Tally how many cells received each "Classification" label.
classificationCounts = Counter(adata.obs["Classification"])
classificationCounts

In [None]:
# Per-cell counts for the hashtags listed in hashNames.
hashCounts = adata.obs.loc[:, hashNames]
hashCounts

In [None]:
# Fit a two-component 1-D Gaussian mixture to the log10(count + 1) values of
# every hashtag, take the point where the predicted component switches from the
# low-mean to the high-mean Gaussian as that hashtag's threshold, and plot the
# histogram with both fitted components.
numHashes = len(hashNames)
plotLen = 3                                   # panels per row
numRows = max(1, -(-numHashes // plotLen))    # ceil(numHashes / plotLen)

# Set the figure size BEFORE creating the figure; changing rcParams afterwards
# does not affect a figure that already exists.
plt.rcParams["figure.figsize"] = (15, 5)
# squeeze=False keeps axs two-dimensional for any grid shape.
fig, axs = plt.subplots(numRows, plotLen, squeeze=False)

dfHashBoundry = pd.DataFrame(np.zeros(numHashes), hashNames, columns=["boundry"])
gmm = GMM(n_components=2, random_state=10, covariance_type='full', n_init=5, means_init=[[1], [4]])
x_axis = np.arange(0, 5, 0.1)                 # grid used to draw the fitted densities

for i, hashName in enumerate(hashNames):
    hashCount = np.log10(adata.obs[hashName].to_numpy(dtype=float) + 1).reshape(-1, 1)
    fitGMM = gmm.fit(hashCount)
    mean = fitGMM.means_.ravel()                   # (2,) component means
    std = np.sqrt(fitGMM.covariances_.reshape(-1)) # (2,) component standard deviations
    weights = fitGMM.weights_

    # The order of fitted components is not guaranteed, so identify the
    # low and high component by their means instead of assuming label 1 is high.
    lowComp = int(np.argmin(mean))
    highComp = int(np.argmax(mean))

    # Scan from the low mean to the high mean in 0.1 steps and take the first
    # grid point that the model assigns to the high component.
    binEx = np.arange(mean[lowComp], mean[highComp], 0.1).reshape(-1, 1)
    if len(binEx):
        isHigh = fitGMM.predict(binEx) == highComp
    else:
        isHigh = np.zeros(0, dtype=bool)
    if not isHigh.any():
        raise ValueError(f"No GMM decision boundary found for {hashName}")
    hashBoundry = float(binEx[int(np.argmax(isHigh)), 0])

    dfHashBoundry.loc[hashName] = hashBoundry

    # Weighted densities of the two fitted Gaussians.
    y_axis0 = norm.pdf(x_axis, mean[0], std[0]) * weights[0]
    y_axis1 = norm.pdf(x_axis, mean[1], std[1]) * weights[1]

    row, col = divmod(i, plotLen)
    ax = axs[row, col]
    ax.set_title(hashName)
    ax.axvline(hashBoundry, c='C2', linestyle='dashed', linewidth=1)   # green: threshold
    ax.hist(hashCount, density=True, color='black', bins=100)
    ax.plot(x_axis, y_axis0, lw=3, c='C6')                             # pink: component 0
    ax.plot(x_axis, y_axis1, lw=3, c='C1')                             # orange: component 1
    ax.plot(x_axis, y_axis0 + y_axis1, lw=3, c='C0', ls=':')           # dotted blue: mixture

# Hide any panels of the grid that have no hashtag to show.
for unusedAx in axs.ravel()[numHashes:]:
    unusedAx.set_visible(False)

plt.tight_layout(pad=1.0)
plt.show()

In [None]:
# Turn the counts into one boolean column per hashtag: True where the cell's
# log10(count + 1) lies above that hashtag's GMM boundary.
# The frame is built from scratch instead of writing booleans into a copy of the
# float count table, so every column is guaranteed to have bool dtype.
hashID = np.array(hashNames)

hashCalls = {}
for hashName in hashNames:
    boundary = dfHashBoundry.loc[hashName].values[0]
    print(hashName)
    print(boundary)
    hashCalls[hashName] = np.log10(hashCounts.loc[:, hashName] + 1) > boundary

hashIDs = pd.DataFrame(hashCalls, index=hashCounts.index)
hashIDs

In [None]:
# Label every cell from its row of hashtag calls:
#   exactly one positive hashtag -> that hashtag's name
#   more than one                -> "Doublet"
#   none                         -> "Negative"
# Vectorised, so no row-by-row loop or manual counter is needed and the global
# numHashes (number of hashtags) is not overwritten with a per-cell value.
hashPositive = hashIDs.to_numpy().astype(bool)   # (cells, hashtags)
numPositive = hashPositive.sum(axis=1)

classification = np.full(hashPositive.shape[0], "Negative", dtype="object")
classification[numPositive > 1] = "Doublet"

isSinglet = numPositive == 1
# For a row with a single True, argmax returns the column index of that True.
classification[isSinglet] = hashID[hashPositive[isSinglet].argmax(axis=1)]

In [None]:
# Store the threshold-based labels next to the other per-cell annotations
# and show how many cells fall into each label.
adata.obs["GMM"] = classification
adata.obs["GMM"].value_counts()

In [None]:
# Show the hashtag count columns; reuse hashNames so this stays in sync with
# the list used for demultiplexing.
adata.obs[hashNames]

In [None]:
# Heatmap of the (log-scaled) hashtag counts grouped by the GMM label; reuse
# hashNames so this stays in sync with the list used for demultiplexing.
sc.pl.heatmap(adata, hashNames, groupby="GMM", log=True)

In [None]:
# Keep only cells whose "Classification" label is one of the hashtag names,
# i.e. drop every label that is not a single hashtag.
# NOTE(review): this filters on the "Classification" column, not on the "GMM"
# labels computed above — confirm that is intended.
singlets = adata.obs["Classification"].isin(hto.var_names).tolist()
adata = adata[singlets, :]

In [None]:
# Show the path the filtered data will be written to.
resultsFileQC

In [None]:
# Make sure the output directory exists so the write cannot fail on a fresh
# checkout, then save the QC-filtered, demultiplexed data.
Path(resultsFileQC).parent.mkdir(parents=True, exist_ok=True)
adata.write(resultsFileQC)