In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as scex
import sklearn.metrics
from matplotlib import pyplot as plt
import matplotlib.cm
import bbknn
import seaborn as sns
from scipy import stats
from sklearn.mixture import GaussianMixture as GMM
from scipy.stats import norm

In [None]:
#adata.uns['log1p']['base'] = None

In [None]:
# Output directory and base name shared by every file this notebook writes.
writeLoc = "../../write/"
fileName = "sarcPDX_Treat_MFH9_V4"

# Final analysis results, and the post-QC checkpoint that later sections reload.
resultsFile = f"{writeLoc}{fileName}.h5ad"
resultsFileQC = f"{writeLoc}{fileName}_QC.h5ad"

In [None]:
# 10x HDF5 feature-barcode matrix for this sample (path is relative to the notebook's working directory).
inputFile = "../../data/sarcoma/sarcPDX_Treat_MFH9/filtered_feature_bc_matrix.h5"

In [None]:
# gex_only=False keeps the non-gene-expression features too; the next cell splits them out by feature_types.
adata =  sc.read_10x_h5(inputFile, gex_only=False)
adata

In [None]:
# Split the combined matrix into hashtag ("Antibody Capture") and gene-expression features.
# .copy() turns each slice into an independent AnnData, so assigning .obs below does not
# write into a view (which would trigger an implicit copy and a warning).
hto = adata[:, adata.var["feature_types"] == "Antibody Capture"].copy()
adata = adata[:, adata.var["feature_types"] == "Gene Expression"].copy()

# Store the raw per-cell hashtag counts as obs columns, one column per hashtag.
# NOTE: this replaces whatever obs columns were present before.
htoCounts = hto.X.toarray() if hasattr(hto.X, "toarray") else np.asarray(hto.X)
adata.obs = pd.DataFrame(htoCounts, columns=hto.var_names, index=adata.obs.index)

In [None]:
# Make gene names unique so they can be used as an index.
adata.var_names_make_unique()

In [None]:
# Hashtag feature names; these are also the obs column names holding the hashtag counts.
hto.var_names

In [None]:
# Genes taking the largest share of counts per cell (top 20).
sc.pl.highest_expr_genes(adata, n_top=20, )

In [None]:
adata

In [None]:
# Boolean matrix: True where a gene has at least one count in a cell (used by the QC histograms below).
valX = adata.X>0

In [None]:
# Thresholds used by the filter_cells / filter_genes cell that follows.
minGenes=400  # minimum number of detected genes per cell
minCells=3    # minimum number of cells a gene must be detected in

fig, axs = plt.subplots(2, 2, figsize=(8, 8))

# Per-cell and per-gene summaries, flattened to 1-D (matrix reductions come back 2-D).
numCellwExp = np.asarray(valX.sum(axis=1)).ravel()      # genes detected per cell
numGeneswExp = np.asarray(valX.sum(axis=0)).ravel()     # cells expressing each gene
sumCellswExp = np.asarray(adata.X.sum(axis=1)).ravel()  # total counts per cell
sumGeneswExp = np.asarray(adata.X.sum(axis=0)).ravel()  # total counts per gene

# log(0) is -inf; silence the warning here and drop non-finite values before plotting.
with np.errstate(divide='ignore'):
    # (axis, log-values, dashed reference line, x label, y label, log-scaled y axis)
    panels = [
        (axs[0,0], np.log(numCellwExp), np.log(minGenes),
         'log(genes detected per cell)', 'number of cells', False),
        (axs[0,1], np.log(numGeneswExp+1), np.log(minCells+1),
         'log(cells expressing gene + 1)', 'number of genes', True),
        # the two panels below reuse the same thresholds purely as visual reference lines
        (axs[1,0], np.log(sumCellswExp), np.log(minGenes),
         'log(total counts per cell)', 'number of cells', False),
        (axs[1,1], np.log(sumGeneswExp+1), np.log(minCells+1),
         'log(total counts per gene + 1)', 'number of genes', True),
    ]

for ax, logVals, cutoff, xLabel, yLabel, logY in panels:
    # a -inf entry would make the automatic histogram range invalid
    ax.hist(logVals[np.isfinite(logVals)], bins=100, log=logY)
    ax.axvline(cutoff, color='k', linestyle='dashed', linewidth=1)
    ax.set_xlabel(xLabel)
    ax.set_ylabel(yLabel)

fig.tight_layout()
# plt.show() rather than fig.show(): the latter warns on non-GUI (inline) backends
plt.show()

In [None]:
# Drop cells with fewer than minGenes detected genes, then genes detected in fewer than minCells cells.
sc.pp.filter_cells(adata, min_genes = minGenes)
sc.pp.filter_genes(adata, min_cells = minCells)

In [None]:
from collections import Counter
print(Counter(adata.var_names.str.startswith('GRCh38_')))
print(Counter(adata.var_names.str.startswith('GRCh38_MT-')))

print(Counter(adata.var_names.str.startswith('mm10___')))
print(Counter(adata.var_names.str.startswith('mm10___mt-')))

In [None]:
adata

In [None]:
# Flag every gene by species so per-cell human / mouse count fractions can be computed.
adata.var['human'] = adata.var_names.str.startswith('GRCh38_')  # human genes
adata.var['mouse'] = adata.var_names.str.startswith('mm10___')  # mouse genes (all of them, not just mitochondrial)

# A single pass adds pct_counts_human and pct_counts_mouse alongside the generic QC metrics.
sc.pp.calculate_qc_metrics(adata, qc_vars=['human', 'mouse'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Per-cell share of counts coming from mouse vs human genes (log-scaled axis).
sc.pl.violin(adata, ['pct_counts_mouse', 'pct_counts_human'],
             jitter=0.4, multi_panel=True, log = True)

In [None]:
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'],
             jitter=0.4, multi_panel=True)

In [None]:
# Number of cells that will pass the < 2 % mouse-count filter in the next cell.
sum(adata.obs.pct_counts_mouse < 2)

In [None]:
# Keep cells with less than 2 % mouse counts, and keep only human genes.
# One indexing step plus .copy() yields a real AnnData rather than a view of a view,
# so the var/obs columns added in later cells are written directly.
maxPctMouse = 2
isHumanCell = (adata.obs.pct_counts_mouse < maxPctMouse).values
isHumanGene = adata.var_names.str.startswith('GRCh38_')
adata = adata[isHumanCell, isHumanGene].copy()
adata

In [None]:
# NOTE: pct_counts_mouse / pct_counts_human are not recomputed after the subsetting above,
# so these are the values calculated on the full (human + mouse) gene set.
sc.pl.violin(adata, ['pct_counts_mouse', 'pct_counts_human'],
             jitter=0.4, multi_panel=True, log = True)

In [None]:
adata.var['hMT'] = adata.var_names.str.startswith('GRCh38_MT-')  # annotate the group of human mitochondrial genes as 'hMT'
#adata.var['mmt'] = adata.var_names.str.startswith('mm10___mt-')  # annotate the group of mouse mitochondrial genes as 'mmt'

# Recompute QC metrics on the human-only matrix; adds pct_counts_hMT.
sc.pp.calculate_qc_metrics(adata, qc_vars=['hMT'], percent_top=None, log1p=False, inplace=True)
#sc.pp.calculate_qc_metrics(adata, qc_vars=['mmt'], percent_top=None, log1p=False, inplace=True)

In [None]:
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', "pct_counts_hMT"], #pct_counts_mmt
             jitter=0.4, multi_panel=True)

In [None]:
# Scatter views used to pick the QC cut-offs applied two cells below.
sc.pl.scatter(adata, x='total_counts', y='pct_counts_hMT')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
adata

In [None]:
# Per-cell QC gates (all bounds are exclusive).
minGenesQC = 1000    # lower bound on detected genes
maxGenesQC = 5500    # upper bound on detected genes
maxCountsQC = 27500  # upper bound on total counts
maxPctMT = 12        # upper bound on % mitochondrial counts

keepCell = (
    (adata.obs.n_genes_by_counts > minGenesQC)
    & (adata.obs.n_genes_by_counts < maxGenesQC)
    & (adata.obs.total_counts < maxCountsQC)
    & (adata.obs.pct_counts_hMT < maxPctMT)
)
# A single mask plus .copy() replaces four chained views with one real AnnData.
adata = adata[keepCell.values, :].copy()

In [None]:
# Same QC violins as before, now on the cells that passed the filters above.
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', "pct_counts_hMT"], #pct_counts_mmt
             jitter=0.4, multi_panel=True)

In [None]:
hto.var_names

In [None]:
# Demultiplex cells from the hashtag counts stored in adata.obs (one column per hashtag).
# scanpy.external is already imported as `scex` at the top of the notebook, so no second
# import is needed here. Results are added to adata.obs, including the "Classification"
# column used by the cells below.
scex.pp.hashsolo(adata, list(hto.var_names))  # number_of_noise_barcodes left at its default
adata.obs.head()

In [None]:
# Hashtag counts (read from the obs columns) grouped by the hashsolo call.
sc.pl.heatmap(adata, hto.var_names, groupby="Classification", log=True)

In [None]:
hto.var_names

In [None]:
len(hto.var_names)

In [None]:
# Number of cells per hashsolo class.
Counter(adata.obs["Classification"])

In [None]:
# Raw hashtag counts per cell and their percentile summary.
hashCounts = adata.obs[hto.var_names]
hashCounts.describe([0.01,.1,.2,.3,.4,.5,.6,.7,.8,.9,0.99])

In [None]:
#hashCount = np.log1p(adata.obs[hashName])
#len(hashCount)

In [None]:
# Fit a two-component Gaussian mixture to the log1p counts of every hashtag and record the
# value at which the model switches from the low ("background") to the high ("signal")
# component. Boundaries are stored in dfHashBoundry and drawn on each panel.
hashNames = list(hto.var_names)
nCols = 5
nRows = max(1, -(-len(hashNames) // nCols))  # ceil division: the grid grows with the hashtag count

# figsize has to be passed at creation; setting rcParams afterwards does not resize this figure.
fig, axs = plt.subplots(nRows, nCols, figsize=(20, 2.5 * nRows), squeeze=False)
dfHashBoundry = pd.DataFrame(np.zeros(len(hashNames)), hashNames, columns=["boundry"])
gmm = GMM(n_components = 2, random_state=10,  n_init=5, covariance_type = 'full', means_init=[[2],[7]])
x_axis = np.arange(0, 12, 0.1)

for i, hashName in enumerate(hashNames):
    ax = axs[i // nCols, i % nCols]

    hashCount = np.log1p(adata.obs[hashName]).values
    trimV = len(hashCount) // 100  # drop the lowest 1 % of values before fitting
    hashCount = np.sort(hashCount)[trimV:].reshape(-1, 1)

    fitGMM = gmm.fit(hashCount)
    means = fitGMM.means_.ravel()
    sds = np.sqrt(fitGMM.covariances_.reshape(-1))  # one feature -> one variance per component
    weights = fitGMM.weights_

    # Do not rely on component numbering: find which component has the lower / higher mean.
    bgIdx, sigIdx = np.argsort(means)

    # Scan between the two means; the boundary is the first grid point assigned to the signal component.
    binEx = np.arange(means[bgIdx], means[sigIdx], 0.1).reshape(-1, 1)
    isSignal = fitGMM.predict(binEx) == sigIdx if len(binEx) else np.array([], dtype=bool)
    # NaN marks "no boundary found" instead of raising IndexError.
    hashBoundry = binEx[np.argmax(isSignal), 0] if isSignal.any() else np.nan
    dfHashBoundry.loc[hashName] = hashBoundry

    y_axisBg = norm.pdf(x_axis, means[bgIdx], sds[bgIdx]) * weights[bgIdx]      # background gaussian
    y_axisSig = norm.pdf(x_axis, means[sigIdx], sds[sigIdx]) * weights[sigIdx]  # signal gaussian

    ax.set_title(hashName)
    if np.isfinite(hashBoundry):
        ax.axvline(hashBoundry, c='C2', linestyle='dashed', linewidth=1)  # green
    ax.hist(hashCount.ravel(), density=True, color='black', bins=100)
    ax.plot(x_axis, y_axisBg, lw=3, c='C6')                               # pink
    ax.plot(x_axis, y_axisSig, lw=3, c='C1')                              # orange
    ax.plot(x_axis, y_axisBg + y_axisSig, lw=3, c='C0', ls=':')           # dotted blue

# Hide grid cells that have no hashtag.
for ax in axs.ravel()[len(hashNames):]:
    ax.set_visible(False)

plt.tight_layout(pad=1.0)
plt.show()

In [None]:
#hashIDs = hashCounts.copy()
#hashID = hto.var_names
#for hashName in hto.var_names:
#    print(hashName)
#    print(dfHashBoundry.loc[hashName].values[0])
#    hashIDs.loc[:,hashName] = np.log1p(hashCounts.loc[:,hashName]) > dfHashBoundry.loc[hashName].values[0]
#hashIDs

In [None]:
#classification = np.empty(len(adata), dtype="object")
#i = 0
#for cellBar, hashBool in hashIDs.iterrows():
#    numHashes = sum(hashBool)
#    if (numHashes == 1):
#        classif = hashID[hashBool.values].values[0]
#    elif (numHashes > 1):
#        classif = "doublet"
#    else:
#        classif = "negative"
#    classification[i] = classif
#    i = i + 1
#    #break

In [None]:
#adata.obs["ClassGMM"] = classification
#adata.obs["ClassGMM"].value_counts()

In [None]:
#sc.pl.heatmap(adata, hto.var_names, groupby="ClassGMM", log=True)#, save = f"_{figName}_hash.png")

In [None]:
sc.pl.heatmap(adata, hto.var_names, groupby="Classification", log=True)#, save = f"_{figName}_hash.png")

In [None]:
#['RPS8370_pSpCTRE_B0256', 'UPS2236_B0255']
singlets = [x in hto.var_names for x in adata.obs["Classification"]]
adata = adata[singlets,]
adata

In [None]:
# Hashtags grouped by treatment arm; inverted below into a hashtag -> arm lookup.
hashByGroup = {
    "ctl": ['Control1_B0251', 'Control2_B0252', 'Control3_B0253', 'Control4_B0254'],
    "wk1": ['1wk1_B0255', '1wk3_B0256', '1wk5_B0257'],
    "wk4": ['4wk2_B0258', '4wk4_B0259', '4wk5_B0260'],
}
sarc = {hashName: group for group, names in hashByGroup.items() for hashName in names}

# Label every cell with the arm of its hashtag (raises KeyError on an unmapped class).
adata.obs["chemo"] = [sarc[label] for label in adata.obs["Classification"]]

In [None]:
#['RPS8370_pSpCTRE_B0256', 'UPS2236_B0255']
#ups = [x in ["UPS"] for x in adata.obs["sarc"]]
#adata = adata[ups,]
#adata

In [None]:
# Global figure defaults for the scanpy plots that follow (100 dpi on screen, 150 dpi PNG when saved).
sc.set_figure_params(scanpy=True, dpi=100, dpi_save=150, fontsize=10, format='png')

In [None]:
# Strip the genome prefix from the gene names. Only a leading 'GRCh38_' is removed, so
# symbols that themselves contain '_' stay intact, and names that no longer carry the
# prefix are left untouched (re-running this cell is harmless).
adata.var_names = adata.var_names.str.replace(r"^GRCh38_", "", regex=True)
adata.var_names_make_unique()

In [None]:
print(resultsFileQC)
# NOTE(review): this replaces the in-memory object built by the cells above with the QC
# checkpoint on disk, while the matching adata.write(resultsFileQC) is commented out. The
# file therefore has to exist already — confirm it is current before a Restart & Run All.
adata = sc.read_h5ad(resultsFileQC)
#adata.write(resultsFileQC)
# Drop MALAT1; .copy() makes a real AnnData so the layer below is not added to a view.
adata = adata[:, adata.var_names != "MALAT1"].copy()
# Keep the matrix as loaded before normalize_total / log1p overwrite X in the next cells.
adata.layers["counts"] = adata.X.copy()
#adata.write(resultsFileQC)
adata

In [None]:
# Scale every cell to 10,000 total counts.
sc.pp.normalize_total(adata, target_sum=1e4)#, exclude_highly_expressed=True

In [None]:
# log(1 + x) transform of the normalised values.
sc.pp.log1p(adata)

In [None]:
# Cell-cycle marker list, one gene per line: the first 43 entries are used as S-phase
# genes and the remainder as G2/M genes. The file is opened in a with-block so the
# handle is closed once the list is read.
with open('../../data/regev_lab_cell_cycle_genes_Human.txt') as geneFile:
    cell_cycle_genes = [line.strip() for line in geneFile]
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
# Restrict the combined list to genes present in this dataset (used for the PCA below).
cell_cycle_genes = [gene for gene in cell_cycle_genes if gene in adata.var_names]

In [None]:
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

In [None]:
adata_cc_genes = adata[:, cell_cycle_genes]
sc.tl.pca(adata_cc_genes)
sc.pl.pca_scatter(adata_cc_genes, color=['phase',"Classification"])

In [None]:
adata = adata[adata.obs.phase=="G1",:]

In [None]:
adata.write("../../write/sarcPDX_Treat_G1_MFH9_V4_QC.h5ad")


In [None]:
# First pass with default thresholds; this fills adata.var with the means / dispersions_norm
# columns that the threshold plots below read.
sc.pp.highly_variable_genes(adata)#, flavor="seurat", n_top_genes=2000)
#min_disp=0.5,min_mean=0.0125, max_mean=3, span=0.3, n_bins=20,

In [None]:
# Number of genes flagged as highly variable.
sum(adata.var.highly_variable)

In [None]:
# NOTE(review): displays the whole per-gene table; adata.var.head() would be lighter.
adata.var

In [None]:
# Thresholds for the second highly-variable-gene pass; drawn as dashed lines on the histograms.
minMean = 0.045
maxMean = 2.9
minDisp = 0.35

fig, axs = plt.subplots(1, 2, figsize=(10, 5))
meanAx, dispAx = axs

# Values at or below the floor are masked to NaN so the log stays finite.
geneMeans = adata.var[["means"]]
means = geneMeans.where(geneMeans > np.exp(-14))
geneDisp = adata.var[["dispersions_norm"]]
dispNorm = geneDisp.where(geneDisp > np.exp(-10))

# (axis, values, dashed cut-offs, title, x label)
panelSpecs = (
    (meanAx, means, (minMean, maxMean), 'Gene means counts', 'means'),
    (dispAx, dispNorm, (minDisp,), 'Gene dispersions counts', 'dispersions'),
)
for ax, values, cutoffs, title, xLabel in panelSpecs:
    ax.hist(np.log(values), bins=100)
    for cutoff in cutoffs:
        ax.axvline(np.log(cutoff), color='k', linestyle='dashed', linewidth=1)
    ax.set_title(title)
    ax.set_xlabel(xLabel)
    ax.set_ylabel('counts')

# Re-flag highly variable genes with the chosen thresholds and report how many pass.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)
print(sum(adata.var.highly_variable))

In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
sum(np.logical_and(np.logical_not(adata.var.hMT), adata.var.highly_variable))

In [None]:
adata.raw = adata

In [None]:
adata = adata[:, np.logical_and(np.logical_not(adata.var.hMT), adata.var.highly_variable)]

In [None]:
#sc.pp.regress_out(adata, ['S_score', 'G2M_score'#]),
#                         ,'total_counts', 'pct_counts_hMT'])

In [None]:
#np.max(adata.X.todense())

In [None]:
#sc.pp.scale(adata, max_value=4)

In [None]:
# PCA on the selected genes. regress_out and scale are commented out above, so this runs
# on the log-normalised values directly.
sc.tl.pca(adata, n_comps=100,svd_solver='arpack')

In [None]:
# Variance explained per component, used to choose n_pcs for the neighbour graph.
sc.pl.pca_variance_ratio(adata, n_pcs=100, log=True)

In [None]:
#sc.pl.pca(adata, color=["phase"],ncols=3,annotate_var_explained=True, components=['1,2','1,3',"1,4", '2,3', '2,4', "3,4"])

In [None]:
#adata.obsm["X_pca_1_40"] = adata.obsm["X_pca"][:,1:41]

In [None]:
# Neighbour graph on the first 50 PCs with 25 neighbours per cell.
sc.pp.neighbors(adata, n_neighbors=25, n_pcs=50)#, use_rep="X_pca_1_40")

In [None]:
sc.tl.umap(adata)

In [None]:
# Cluster labels are stored in adata.obs['leiden'].
sc.tl.leiden(adata, resolution = 0.5)

In [None]:
sc.pl.umap(adata, color=['leiden',"phase",'chemo','Classification'],ncols=2)

In [None]:
# Density of each treatment group on the UMAP.
sc.tl.embedding_density(adata, basis='umap', groupby="chemo")
sc.pl.embedding_density(adata, groupby="chemo", ncols = 4)

In [None]:
sc.pl.umap(adata, color=['CA9',"BNIP3",'COL1A1','MDM2','FBN1',"POSTN","MEG3"],ncols=4)

In [None]:
#sc.pl.umap(adata, color=['COL4A1','PECAM1','CD34',"RGS5"],ncols=2)

In [None]:
sc.pl.umap(adata, color=['TPM2','TOP2A','VIM',"ISG15"],ncols=4)

In [None]:
#adata.write('../../write/sarcPDX_Treat_MFH9_G1.h5ad')

In [None]:
resultsFile

In [None]:
#adata.write(resultsFile)

In [None]:
#sc.pl.umap(adata, color=['KRT18','GAPDH','LDHB',"TPI1"],ncols=2)

In [None]:
#sc.pl.umap(adata, color=['LYZ','PTPRC'],ncols=2)

In [None]:
# Marker genes per leiden cluster (Wilcoxon), tested on adata.X rather than adata.raw.
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', use_raw=False)
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# Top 10 ranked gene names, one column per cluster.
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(10)

In [None]:
# NOTE(review): to_csv does not create directories, so "hvg/" must already exist.
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(200).to_csv("hvg/pdxTreated_MFH9_unfil.csv")

In [None]:
# Filter the ranked genes by in-group / out-group expression fraction and fold change;
# the result is stored under 'rank_genes_groups_filtered'.
sc.tl.filter_rank_genes_groups(adata, min_in_group_fraction=0.5, min_fold_change=1, max_out_group_fraction=0.5)
sc.pl.rank_genes_groups(adata, key='rank_genes_groups_filtered', sharey=False)

In [None]:
sc.tl.dendrogram(adata, groupby="leiden", n_pcs=60)
sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups_filtered',n_genes=4)

In [None]:
sc.pl.rank_genes_groups_heatmap(adata, key='rank_genes_groups_filtered',n_genes=4)

In [None]:
sc.pl.umap(adata, color=['MYEOV','SNED1','TFPI2',"VEGFA","GDF15"],ncols=3)

In [None]:
# Top 100 rows of the filtered marker table, one column per cluster.
topHVG = pd.DataFrame(adata.uns['rank_genes_groups_filtered']['names']).head(100)
topHVG

In [None]:
topHVG.to_csv("hvg/pdxTreated_MFH9_Filtered.csv")

0 - FAP/Apop
1 - FAP
2 - E2F pro
3 - G2M
4 - G2M pro
5 - Hypoxia
6 - NFKb

In [None]:
# Summary of the clustered object (obs / var / uns / obsm keys).
adata

TODO: compare clusters 3 and 7 with mouse.

TODO: compare cluster 7 to the other PDXs.

In [None]:
# Human-readable names for the leiden clusters, in cluster order (cluster 0 first).
features = ["MusImm","FAP","inflFAP","Hypox","p53Mdm2"]
sc.tl.leiden(adata, resolution=0.5, key_added = "chemoLabel")

# The renaming is positional, so the label list must match the cluster count exactly.
# Check it here so a mismatch names both numbers instead of failing inside pandas.
nClusters = len(adata.obs["chemoLabel"].cat.categories)
if nClusters != len(features):
    raise ValueError(
        f"leiden produced {nClusters} clusters but {len(features)} labels were given: {features}"
    )
adata.rename_categories('chemoLabel', features)
sc.pl.umap(adata, color=["chemoLabel"])

In [None]:
sc.pl.umap(adata, color=['leiden','chemo'],ncols=2)

In [None]:
#sc.set_figure_params(scanpy=True, dpi=100, dpi_save=300, fontsize=18, format='svg',color_map="magma")
#sc.settings.figdir = "figures/scSarcTime_Grant/"

In [None]:
# Recompute the dendrogram with default settings, then plot the (unfiltered) marker ranking.
sc.tl.dendrogram(adata, groupby = 'leiden')
sc.pl.rank_genes_groups_dotplot(adata, groupby = 'leiden',n_genes=4)

In [None]:
# Markers per hashtag (sample). use_raw is left at its default here, unlike the leiden
# marker test above, which passes use_raw=False.
sc.tl.rank_genes_groups(adata, groupby = 'Classification', method="wilcoxon", key_added="classMark")
pd.DataFrame(adata.uns["classMark"]['names']).head(10)

In [None]:
# Markers per treatment group, stored under the "sarcMark" key.
sc.tl.rank_genes_groups(adata, groupby = 'chemo', method="wilcoxon", key_added="sarcMark")
pd.DataFrame(adata.uns["sarcMark"]['names']).head(10)

In [None]:
#adata = sc.read(resultsFile)

In [None]:
resultsFile

In [None]:
# Persist the clustered object.
adata.write(resultsFile)

In [None]:
# Read-back check only: the loaded object is displayed, not assigned.
sc.read(resultsFile)

In [None]:
adata

In [None]:
adata = sc.read(resultsFile)

In [None]:
resultsFileQC

In [None]:
#adata.write(writeLoc+ "sarcPDX_UPS_UPS2236.h5ad")

In [None]:
adata = sc.read(resultsFileQC)
#adata.write(resultsFileQC)
adata

In [None]:
ctl = [x in ["ctl"] for x in adata.obs["sarc"]]
adata = adata[ctl,]
adata

In [None]:
adata.var_names = [x.split("_")[1] for x in adata.var_names]
adata.var_names_make_unique()

In [None]:
sc.pp.normalize_total(adata, exclude_highly_expressed=True)

In [None]:
sc.pp.log1p(adata)

In [None]:
cell_cycle_genes = [x.strip() for x in open('../../data/regev_lab_cell_cycle_genes_Human.txt')]
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]

In [None]:
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

In [None]:
adata_cc_genes = adata[:, cell_cycle_genes]
sc.tl.pca(adata_cc_genes)
sc.pl.pca_scatter(adata_cc_genes, color='phase')

In [None]:
sc.pp.highly_variable_genes(adata)#, flavor="seurat", n_top_genes=2000)

In [None]:
sum(adata.var.highly_variable)

In [None]:
x = adata.var[["means"]][adata.var[["means"]] > np.exp(-10)]#adata.var[["means"]]
plt.hist(np.log(x), bins=100)#, log=True)
plt.axvline(np.log(0.05), color='k', linestyle='dashed', linewidth=1)
plt.axvline(np.log(2.5), color='k', linestyle='dashed', linewidth=1)
plt.title('Gene means counts')
plt.xlabel('means')
plt.ylabel('counts')
plt.show()

In [None]:
x = adata.var[["dispersions_norms"]][adata.var[["dispersions_norms"]] > np.exp(-5)]#adata.var[["dispersions_norms"]]
plt.hist(np.log(x), bins=50)#, log=True)
plt.axvline(np.log(0.5), color='k', linestyle='dashed', linewidth=1)
plt.title('Gene dispersions_norms counts')
plt.xlabel('dispersions_norms')
plt.ylabel('counts')
plt.show()

In [None]:
sc.pp.highly_variable_genes(adata, min_disp=0.5, min_mean=0.05, max_mean=2.5)

In [None]:
sum(adata.var.highly_variable)

In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
adata.raw = adata

In [None]:
sum(adata.var.highly_variable)

In [None]:
adata = adata[:, np.logical_and(np.logical_not(adata.var.hMT), adata.var.highly_variable)]

In [None]:
# Regress the two cell-cycle scores out of the expression values.
sc.pp.regress_out(adata, ['S_score', 'G2M_score'])#,'total_counts', 'pct_counts_hMT'])

In [None]:
# Scale each gene, clipping values above 4.
sc.pp.scale(adata, max_value=4)

In [None]:
sc.tl.pca(adata, n_comps=100,svd_solver='arpack')

In [None]:
sc.pl.pca_variance_ratio(adata, n_pcs=100, log=True)

In [None]:
# Neighbour graph on the first 40 PCs with 20 neighbours per cell.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=40)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.tl.leiden(adata, resolution = 0.5)

In [None]:
sc.pl.umap(adata, color=['leiden',"phase",'Classification'],ncols=2)

In [None]:
sc.pl.umap(adata, color=['COL1A1','DES','MDM2',"CDK4"],ncols=2)

In [None]:
sc.pl.umap(adata, color=['PGK1','NDRG1'],ncols=2)

In [None]:
# Save the control-only analysis.
adata.write('../../write/sarcPDX_MFH9_ctl_V3.h5ad')

In [None]:
adata = sc.read(resultsFileQC)
#adata.write(resultsFileQC)
adata

In [None]:
wk1 = [x in ["wk1"] for x in adata.obs["sarc"]]
adata = adata[wk1,]
adata

In [None]:
adata.var_names = [x.split("_")[1] for x in adata.var_names]
adata.var_names_make_unique()

In [None]:
sc.pp.normalize_total(adata, exclude_highly_expressed=True)

In [None]:
sc.pp.log1p(adata)

In [None]:
cell_cycle_genes = [x.strip() for x in open('../../data/regev_lab_cell_cycle_genes_Human.txt')]
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]

In [None]:
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

In [None]:
adata_cc_genes = adata[:, cell_cycle_genes]
sc.tl.pca(adata_cc_genes)
sc.pl.pca_scatter(adata_cc_genes, color='phase')

In [None]:
sc.pp.highly_variable_genes(adata)#, flavor="seurat", n_top_genes=2000)

In [None]:
sum(adata.var.highly_variable)

In [None]:
x = adata.var[["means"]][adata.var[["means"]] > np.exp(-10)]#adata.var[["means"]]
plt.hist(np.log(x), bins=100)#, log=True)
plt.axvline(np.log(0.05), color='k', linestyle='dashed', linewidth=1)
plt.axvline(np.log(2.5), color='k', linestyle='dashed', linewidth=1)
plt.title('Gene means counts')
plt.xlabel('means')
plt.ylabel('counts')
plt.show()

In [None]:
x = adata.var[["dispersions_norms"]][adata.var[["dispersions_norms"]] > np.exp(-5)]#adata.var[["dispersions_norms"]]
plt.hist(np.log(x), bins=50)#, log=True)
plt.axvline(np.log(0.5), color='k', linestyle='dashed', linewidth=1)
plt.title('Gene dispersions_norms counts')
plt.xlabel('dispersions_norms')
plt.ylabel('counts')
plt.show()

In [None]:
sc.pp.highly_variable_genes(adata, min_disp=0.5, min_mean=0.05, max_mean=2.5)

In [None]:
sum(adata.var.highly_variable)

In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
adata.raw = adata

In [None]:
sum(adata.var.highly_variable)

In [None]:
adata = adata[:, np.logical_and(np.logical_not(adata.var.hMT), adata.var.highly_variable)]

In [None]:
# Regress the two cell-cycle scores out of the expression values.
sc.pp.regress_out(adata, ['S_score', 'G2M_score'])#,'total_counts', 'pct_counts_hMT'])

In [None]:
# Scale each gene, clipping values above 4.
sc.pp.scale(adata, max_value=4)

In [None]:
sc.tl.pca(adata, n_comps=100,svd_solver='arpack')

In [None]:
sc.pl.pca_variance_ratio(adata, n_pcs=100, log=True)

In [None]:
# Neighbour graph on the first 40 PCs with 20 neighbours per cell.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=40)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.tl.leiden(adata, resolution = 0.5)

In [None]:
sc.pl.umap(adata, color=['leiden',"phase",'Classification'],ncols=2)

In [None]:
sc.pl.umap(adata, color=['COL1A1','DES','MDM2',"CDK4"],ncols=2)

In [None]:
sc.pl.umap(adata, color=['PGK1','NDRG1'],ncols=2)

In [None]:
sc.tl.score_genes(adata, treatHvg["3"],score_name='treated_3')
sc.tl.score_genes(adata, treatHvg["7"],score_name='treated_7')
sc.pl.umap(adata,color=["treated_3","treated_7"])

In [None]:
adata.write('../../write/sarcPDX_MFH9_wk1_V3.h5ad')

In [None]:
adata = sc.read(resultsFileQC)
#adata.write(resultsFileQC)
adata

In [None]:
wk4 = [x in ["wk4"] for x in adata.obs["sarc"]]
adata = adata[wk4,]
adata

In [None]:
adata.var_names = [x.split("_")[1] for x in adata.var_names]
adata.var_names_make_unique()

In [None]:
sc.pp.normalize_total(adata, exclude_highly_expressed=True)

In [None]:
sc.pp.log1p(adata)

In [None]:
cell_cycle_genes = [x.strip() for x in open('../../data/regev_lab_cell_cycle_genes_Human.txt')]
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]

In [None]:
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

In [None]:
adata_cc_genes = adata[:, cell_cycle_genes]
sc.tl.pca(adata_cc_genes)
sc.pl.pca_scatter(adata_cc_genes, color='phase')

In [None]:
sc.pp.highly_variable_genes(adata)#, flavor="seurat", n_top_genes=2000)

In [None]:
sum(adata.var.highly_variable)

In [None]:
x = adata.var[["means"]][adata.var[["means"]] > np.exp(-10)]#adata.var[["means"]]
plt.hist(np.log(x), bins=100)#, log=True)
plt.axvline(np.log(0.05), color='k', linestyle='dashed', linewidth=1)
plt.axvline(np.log(2.5), color='k', linestyle='dashed', linewidth=1)
plt.title('Gene means counts')
plt.xlabel('means')
plt.ylabel('counts')
plt.show()

In [None]:
x = adata.var[["dispersions_norms"]][adata.var[["dispersions_norms"]] > np.exp(-5)]#adata.var[["dispersions_norms"]]
plt.hist(np.log(x), bins=50)#, log=True)
plt.axvline(np.log(0.5), color='k', linestyle='dashed', linewidth=1)
plt.title('Gene dispersions_norms counts')
plt.xlabel('dispersions_norms')
plt.ylabel('counts')
plt.show()

In [None]:
sc.pp.highly_variable_genes(adata, min_disp=0.5, min_mean=0.05, max_mean=2.5)

In [None]:
sum(adata.var.highly_variable)

In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
adata.raw = adata

In [None]:
sum(adata.var.highly_variable)

In [None]:
adata = adata[:, np.logical_and(np.logical_not(adata.var.hMT), adata.var.highly_variable)]

In [None]:
# Regress the two cell-cycle scores out of the expression values.
sc.pp.regress_out(adata, ['S_score', 'G2M_score'])#,'total_counts', 'pct_counts_hMT'])

In [None]:
# Scale each gene, clipping values above 4.
sc.pp.scale(adata, max_value=4)

In [None]:
sc.tl.pca(adata, n_comps=100,svd_solver='arpack')

In [None]:
sc.pl.pca_variance_ratio(adata, n_pcs=100, log=True)

In [None]:
# Neighbour graph on the first 40 PCs with 20 neighbours per cell.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=40)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.tl.leiden(adata, resolution = 0.5)

In [None]:
sc.pl.umap(adata, color=['leiden',"phase",'Classification'],ncols=2)

In [None]:
sc.pl.umap(adata, color=['COL1A1','DES','MDM2',"CDK4"],ncols=2)

In [None]:
sc.pl.umap(adata, color=['PGK1','NDRG1'],ncols=2)

In [None]:
# Save the week-4-only analysis.
adata.write('../../write/sarcPDX_MFH9_wk4_V3.h5ad')

In [None]:
# Marker table read from CSV; the columns named "3" and "7" are used as gene lists below.
# NOTE(review): presumably one column per cluster of a treated-sample analysis — confirm the source.
treatHvg = pd.read_csv("hvg/treatedHvg.csv")
treatHvg

In [None]:
# Score each cell of the current (week-4) object against the two gene lists and show the scores on the UMAP.
sc.tl.score_genes(adata, treatHvg["3"],score_name='treated_3')
sc.tl.score_genes(adata, treatHvg["7"],score_name='treated_7')
sc.pl.umap(adata,color=["treated_3","treated_7"])