

Date: October 2022

Goals (RNA data):
1. Find marker genes
2. Cluster leukemia stem cells
3. Infer trajectories
4. Prepare the gene matrix for the MCCF1 assay

Introduction to scanpy: https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html

# Load tools

In [None]:
import scanpy as sc
import anndata as ad
import matplotlib as mpl
import matplotlib.pyplot as pl
import seaborn as sns

In [None]:
import os
import sys
import pandas as pd
import numpy as np

In [None]:
import gzip

In [None]:
#https://stackoverflow.com/questions/71106940/cannot-import-name-centered-from-scipy-signal-signaltools
import scipy.signal.signaltools

def _centered(arr, newsize):
    """Return the central ``newsize``-shaped window of ``arr``.

    Local replacement for the private SciPy helper of the same name; it is
    patched onto ``scipy.signal.signaltools`` right after this definition.
    """
    target = np.asarray(newsize)
    # Leading offset on every axis; floor division leaves any odd leftover
    # element on the trailing side.
    first = (np.array(arr.shape) - target) // 2
    last = first + target
    window = tuple(slice(lo, hi) for lo, hi in zip(first, last))
    return arr[window]

scipy.signal.signaltools._centered = _centered

In [None]:
from matplotlib.pyplot import imshow
%matplotlib inline

In [None]:
# Notebook-wide scanpy configuration.
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# Emit scanpy's logging header for this session.
sc.logging.print_header()
# Figure defaults for every plot below (80 dpi).
sc.settings.set_figure_params(dpi= 80)

# Define Functions

In [None]:
def _nonredundant_genes(gene_df):
    """Keep, per column, only the genes that occur exactly once in the whole table."""
    from collections import Counter

    # One pass over every cell of the table instead of list.count() per gene.
    occurrences = Counter(gene for col in gene_df.columns for gene in gene_df[col])

    # Columns end up with different lengths; building the frame from Series
    # lets pandas pad the shorter ones with NaN. dtype=object keeps empty
    # columns from being inferred as float.
    return pd.DataFrame({
        col: pd.Series(
            [gene for gene in gene_df[col] if occurrences[gene] == 1],
            dtype=object,
        )
        for col in gene_df.columns
    })


def scanpyMarkerGenes(adata, clusters, met, filename, *, n_genes=100,
                      all_genes_filename="geneDF_ALl_logreg.csv"):
    '''
    Rank marker genes per group and export them as CSV.

    Inputs:
    1) adata = scanpy AnnData object; the ranking is stored in
       adata.uns['rank_genes_groups'] as a side effect
    2) clusters = obs column whose groups are compared (eg leiden, Cell Type)
    3) met = method for gene ranking (options: 't-test', 'wilcoxon', 'logreg')
    Docs on logreg: https://github.com/theislab/scanpy/issues/95, http://www.nxn.se/valent/2018/3/5/actionable-scrna-seq-clusters
    4) filename = CSV that receives the nonredundant genes per group
    5) n_genes = number of top-ranked genes kept per group (default 100)
    6) all_genes_filename = CSV that receives the full, unfiltered ranking.
       The default keeps the historical name; pass a method-specific name so
       that runs with different methods do not overwrite each other.

    Returns: a pd dataframe with one column per group that contains only the
    genes appearing in no other group's top list (in rank order, NaN-padded
    to a common length)

    use_raw=False >> use the PROCESSED data to avoid getting genes that aren't in the final annData object

    '''

    # use_raw default = True
    # penalty: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
    # NOTE(review): penalty is a LogisticRegression option, so it presumably only matters for met='logreg' - confirm.
    sc.tl.rank_genes_groups(adata, clusters, method=met, use_raw=False,
                            penalty='l2', n_genes=n_genes)

    #sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False) # plot

    ranked_genes = pd.DataFrame(adata.uns['rank_genes_groups']['names'])
    unique_genes = _nonredundant_genes(ranked_genes)

    ranked_genes.to_csv(all_genes_filename, sep=',', header=True, index_label='')
    unique_genes.to_csv(filename, sep=',', header=True, index_label='')

    return unique_genes

In [None]:
# Base directory for the .h5ad files and exported tables. Later cells build
# paths by plain string concatenation, so the trailing '/' is required.
file_path='/oak/stanford/groups/cgawad/home/Cancer_Studies/SC_RNA_SEQ/ALSF_AML/scanpy/H5AD/'

In [None]:
Combo=sc.read_h5ad(file_path +"ALSF_AML_Combo_3500_with_raw_count.h5ad")

In [None]:
HBM=sc.read_h5ad(file_path +"ALSF_total_HBM.h5ad")

In [None]:
# Force the stored log1p base entry to None on the loaded object.
# NOTE(review): presumably a workaround for scanpy expecting a 'base' key in uns['log1p'] after reloading - confirm.
HBM.uns['log1p']["base"] = None

In [None]:
#Combo_new.obs=Combo.obs

# Leukemia stem cell clustering

In [None]:
#get cluster and lineage information 
# One-column frames holding each cell's leiden cluster, cell type and lineage.
leiden = Combo.obs['leiden'].to_frame()
cluster = Combo.obs['Cell_Type'].to_frame()
lineage = Combo.obs['lineage'].to_frame()

In [None]:
# Cluster id next to its cell type / lineage for every cell, ordered by cluster id.
cluster_df = leiden.join(cluster, how='outer').sort_values('leiden')
lineage_df = leiden.join(lineage, how='outer').sort_values('leiden')

In [None]:
cluster_df.set_index('leiden').T.to_dict('index')

In [None]:
# Distinct leiden cluster ids in sorted order, used to plot one cluster at a time.
leiden_lis = sorted(Combo.obs['leiden'].unique().tolist())


In [None]:
for group in leiden_lis:
    
    sc.pl.umap(Combo, color=['leiden'],groups=group,frameon=False)

In [None]:
AML_leiden2manCT = {
   
  '0': 'AML',
  '1': 'AML',
  '2': 'Naïve_CD4T',
  '3': 'CD14_Monocyte',
  '4': 'AML',
  '5': 'AML-PCNA',
  '6': 'AML',
  '7': 'AML',
  '8': 'AML',
  '9': 'CD14_Monocyte',
  '10': 'AML',
  '11': 'AML',
  '12': 'AML',
  '13': 'AML',
  '14': 'AML',
  '15': 'AML',
  '16': 'AML',
  '17': 'AML',
  '18': 'AML',
  '19': 'AML',
  '20': 'AML',
  '21': 'AML-CD1C',
  '22': 'AML',
  '23': 'AML',
  '24': 'AML',
  '25': 'AML',
  '26': 'AML-MKI67',
  '27': 'Naïve_CD8T',
  '28': 'AML',
  '29': 'AML',
  '30': 'NK',
  '31': 'AML',
  '32': 'AML',
  '33': 'AML',
  '34': 'AML',
  '35': 'CD20+B',
  '36': 'AML-Ery',
  '37': 'AML',
  '38': 'AML',
  '39': 'AML',
  '40': 'AML',
  '41': 'AML-CTL',
  '42': 'AML',
  '43': 'AML-CD4T',
  '44': 'AML-MKI67',
  '45': 'AML',
  '46': 'AML',
  '47': 'AML',
  '48': 'CTL',
  '49': 'Myeloid_Pro',
  '50': 'AML',
  '51': 'Erythrocytes',
  '52': 'AML',
  '53': 'AML',
  '54': 'AML-PCNA',
  '55': 'AML',
  '56': 'AML',
  '57': 'AML',
  '58': 'AML',
  '59': 'AML',
  '60': 'AML-Ery',
  '61': 'AML-CD14',
  '62': 'CTL',
  '63': 'Activated_CD4T',
  '64': 'ProB',
  '65': 'AML',
  '66': 'AML',
  '67': 'PreB',
  '68': 'AML',
  '69': 'AML',
  '70': 'AML',
  '71': 'AML',
  '72': '0_HSPC',
  '73': 'mDC',
  '74': 'AML',
  '75': 'AML',
  '76': 'CD16_Monocyte',
  '77': 'AML-Ery',
  '78': 'AML-Naïve_CD8T',
  '79': 'AML-Ery',
  '80': 'AML-B',
  '81': 'AML',
  '82': 'AML-Ery',
  '83': 'AML',
  '84': 'AML',
  '85': 'PlasmaB',
  '86': 'pDC',
  '87': 'AML',
  '88': 'AML-NK',
  '89': 'AML-Ery',
  '90': 'AML',
  '91': 'AML-CD4T',
  '92': 'AML-Ery',
  '93': 'AML-B',
  '94': 'AML',
  '95': 'CD34+ProB',
  '96': 'AML',
  '97': 'AML',
  '98': 'Macrophage',
  '99': 'AML-CD4T',
  '100': 'AML',
  '101': 'AML'
}

# Translate every cell's leiden cluster id into the manual label from
# AML_leiden2manCT and store it as a categorical column.
# NOTE(review): ids missing from the dictionary presumably become NaN via Series.map - confirm all clusters are listed.
Combo.obs['Cell_Type'] = (
   Combo.obs['leiden']
    .map(AML_leiden2manCT)
    .astype('category')
)

In [None]:
sc.pl.umap(Combo, color=['Cell_Type'], 
           wspace = 0.2, frameon=False, cmap="Reds",
          )


In [None]:
sc.pl.umap(HBM, color=['Cell_Type'], 
           wspace = 0.2, frameon=False, cmap="Reds",
          )


In [None]:
HBMgene = scanpyMarkerGenes(Combo, clusters='lineage', met='logreg',filename='Combo_lineage-logreg.csv')

In [None]:
# Top five entries of every column of the marker table, keyed by column name.
markerGenesDict = {col: HBMgene[col].head(5).tolist() for col in HBMgene}
markerGenesDict

In [None]:
markerGenesDict={ 
    'AML': ['SPESP1', 'CFD', 'PHLDB2', 'EPCAM', 'HDC'],
    'HSC': ['CRHBP', 'AVP', 'HLF', 'RBPMS', 'ROBO4'],
    'Erythrocytes': ['ITGA2B', 'PRKAR2B', 'KLF1', 'CNRIP1', 'GATA1'],
    'Monocyte': ['S100A8', 'S100A9', 'FCN1', 'FTL', 'STAB1'],
    'Myeloid Pro': ['PRTN3', 'ELANE', 'AZU1', 'CTSG', 'PLPPR3'],
    'PlasmaB': ['IGHA1', 'TXNDC5', 'IGLC2', 'IGHG3', 'TNFRSF17'],
    'B': ['CD79A', 'MS4A1', 'CD79B', 'VPREB1', 'DNTT'],
    'NK': ['KLRF1', 'SH2D1B', 'IL2RB', 'MYOM2', 'CLIC3'],
    'T': ['IL32', 'CD3D', 'TRDV2', 'CD3E', 'CD3G']
}

In [None]:
# The dot plot in the next cell groups by 'lineage' with dendrogram=True, so
# compute the dendrogram for that same grouping (it was built for 'Cell_Type').
sc.tl.dendrogram(Combo, groupby='lineage')

In [None]:
sc.pl.dotplot(Combo,markerGenesDict, groupby='lineage',dendrogram=True,
             save='_Combo_lineage_Marker_genes.pdf')

In [None]:
# AML-related lineages only. sc.tl.leiden writes into .obs in the next cell,
# so take an independent copy instead of a view of Combo.
AML = Combo[
    Combo.obs["lineage"].isin(['AML', 'AML-Mono', 'Myeloid_Pro', '0_HSPC'])
].copy()

In [None]:
sc.tl.leiden(AML, resolution=2,key_added='leiden_AML')

In [None]:
# Distinct sub-cluster ids found by the AML-only leiden run.
leiden_lis = AML.obs['leiden_AML'].unique().tolist()

In [None]:
leiden_lis.sort()

In [None]:
for group in leiden_lis:
    
    sc.pl.umap(AML, color=['leiden_AML'],groups=group,frameon=False)

In [None]:
AML_leiden2manCT = {
   
  '0': 'AML_1',
  '1': 'AML_2',
  '2': 'AML_3',
  '3': 'AML_4',
  '4': 'AML_5',
  '5': 'AML_6',
  '6': 'AML_7',
  '7': 'AML_8',
  '8': 'AML_9',
  '9': 'AML_10',
  '10':'AML_11',
  '11': 'AML_12',
  '12': 'AML_13',
  '13': 'AML_14',
  '14': 'AML_15',
  '15': 'AML_16',
  '16': 'AML_17',
  '17': 'AML_18',
  '18':'AML_19',
  '19': 'AML_20',
  '20': 'AML_21',
  '21': 'AML_22',
  '22': 'AML_23',
  '23': 'AML_24',
  '24': 'Myeloid_Pro',
  '25': 'AML_25',
  '26': 'AML_26',
  '27': '0_HSPC',
  '28': 'AML_27'






}

# Name every AML sub-cluster using the AML_leiden2manCT mapping and store the
# result as a categorical column.
AML.obs['AML_subtype'] = (
   AML.obs['leiden_AML']
    .map(AML_leiden2manCT)
    .astype('category')
)

In [None]:
#label the prognosis associated clusters 
AML_leiden2manCT = {
   

  '1': 'PPAC_1',
  '2': 'FPAC_1',
  '3': 'FPAC_2',
  '6':'FPAC_3',
  '10':'PPAC_2',
  '13': 'PPAC_3',
  '22': 'PPAC_4',
  '24': 'Myeloid_Pro',
  '27': '0_HSPC',
  '28': 'FPAC_4',
 




}

# Label only the sub-clusters listed in AML_leiden2manCT (now the
# prognosis-associated ids plus Myeloid_Pro and 0_HSPC).
# NOTE(review): every other cluster presumably ends up as NaN via Series.map - confirm this is intended.
AML.obs['PAC_subtype'] = (
   AML.obs['leiden_AML']
    .map(AML_leiden2manCT)
    .astype('category')
)

In [None]:
sc.pl.umap(AML, color=['CD34','PAC_subtype'], frameon=False)

In [None]:
AML.obs['AML_subtype']
df_anno=pd.DataFrame(AML.obs['AML_subtype'])

In [None]:
df_2=pd.DataFrame(Combo.obs)
df_2=Combo.obs.drop(["AML_subtype"],axis=1)

In [None]:
df=df_2.join(df_anno)

In [None]:
Combo.obs=df

In [None]:
# Per-cell label: the AML sub-cluster where one exists, otherwise the manual cell type.
Combo.obs['AML_anno'] = np.where(
    Combo.obs['AML_subtype'].isna(),
    Combo.obs['Cell_Type'],
    Combo.obs['AML_subtype'],
)

In [None]:
# Per-cell label: the prognosis-associated cluster where one exists, otherwise the manual cell type.
Combo.obs['PAC_anno'] = np.where(
    Combo.obs['PAC_subtype'].isna(),
    Combo.obs['Cell_Type'],
    Combo.obs['PAC_subtype'],
)

In [None]:
# `Combo_new` is never created in this notebook (its only mention is a
# commented-out line), so apply the override to `Combo` itself: cells from
# batches '0' and '1' keep their plain cell type.
# NOTE(review): assumes Combo.obs has a 'batch' column - confirm.
Combo.obs['PAC_anno'] = np.where(
    Combo.obs['batch'].isin(['0', '1']),
    Combo.obs['Cell_Type'],
    Combo.obs['PAC_anno'],
)

In [None]:
# Display-friendly column name for the UMAP legend. The plot in the next cell
# reads this column from `Combo`, and `Combo_plot` is not defined anywhere.
Combo.obs['Prognosis-Associated Clusters'] = Combo.obs['PAC_subtype']

In [None]:
# UMAP coloured by the prognosis-associated cluster column; the figure is saved to file.
sc.pl.umap(Combo, color=['Prognosis-Associated Clusters'], frameon=False,
          save='_AML_PAC_Subtype.pdf')

# Rebuild AML_anno, falling back to 'LSC_Cluster' where no AML subtype exists.
# NOTE(review): no visible cell creates Combo.obs['LSC_Cluster'] - confirm it is present in the loaded file.
Combo.obs['AML_anno'] = np.where(Combo.obs['AML_subtype'].isnull(),
                                      Combo.obs['LSC_Cluster'],
                                      Combo.obs['AML_subtype']
                                     )

In [None]:
sc.pl.umap(Combo, color=['AML_subtype'], frameon=False,
         # save='_AML_anno_CD34.pdf'
          )

In [None]:
Combo.obs['Cell Type']=Combo.obs['Cell_Type']

In [None]:
# Every cell whose lineage is not one of the AML-derived groups (a view of Combo).
HBM = Combo[
    ~Combo.obs["lineage"].isin(
        ['AML', 'AML-B', 'AML-Ery', 'AML-Mono', 'AML-T', 'AML-NK']
    )
]


In [None]:
# Keep AML/HSPC-related lineages from every sample, plus all cells from HBM samples.
AML = Combo[
    Combo.obs["lineage"].isin(['AML', 'AML-Mono', 'Myeloid_Pro', 'AML-Ery', '0_HSPC'])
    | Combo.obs["SampleType"].isin(['HealthyBM'])
]

# Remove the cells whose sample-level lineage is an AML-specific group.
# (Also considered for removal but kept: 'B', 'Erythrocytes', 'Monocyte', 'NK', 'T', 'PlasmaB'.)
# .copy() turns the view-of-a-view into an independent AnnData, so the later
# rank_genes_groups and write calls act on a real object rather than
# triggering an implicit copy.
AML_2 = AML[
    ~AML.obs["lineage_Sample"].isin(['AML', 'AML-B', 'AML-Ery', 'AML-Mono', 'AML-T'])
].copy()



In [None]:
# Restrict to HSPC, myeloid progenitors and the FPAC/PPAC clusters. Copy
# because rank_genes_groups stores its results on this object in the next cell.
AML_3 = AML_2[AML_2.obs["PAC_anno"].isin([
    '0_HSPC', 'Myeloid_Pro',
    'FPAC_1', 'FPAC_2', 'FPAC_3', 'FPAC_4',
    'PPAC_1', 'PPAC_2', 'PPAC_3', 'PPAC_4',
])].copy()

In [None]:
sc.tl.rank_genes_groups(AML_3, 'PAC_anno',method='logreg')

In [None]:
LSCGenes= pd.DataFrame(AML_3.uns['rank_genes_groups']['names'])
LSCGenes_Score= pd.DataFrame(AML_3.uns['rank_genes_groups']['scores'])


In [None]:
LSCGenes_Score

In [None]:
LSCGenes.to_csv('AML_PACs_PreRank_gene_logreg.csv')
LSCGenes_Score.to_csv('AML_PACs_PreRank_gene_logreg_Score.csv')

In [None]:
LSCGenes

In [None]:
# Five top-ranked genes per PAC_anno group, keyed by group name.
markerGenesDict = {col: LSCGenes[col].head(5).tolist() for col in LSCGenes}
markerGenesDict

In [None]:
sc.tl.dendrogram(AML_3, groupby='PAC_anno')

In [None]:
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 20})

In [None]:
sc.pl.rank_genes_groups_heatmap(AML_3, var_names=markerGenesDict,dendrogram=True,
                                use_raw=False, swap_axes=False, 
                                vmin=-3, vmax=3, cmap='bwr', 
                               figsize=(30,30), show=True,show_gene_labels=True,
                               save='_ALSF_AML_LSC_Heatmap_.png')

In [None]:
sc.tl.rank_genes_groups(AML_2, 'PAC_anno', method='wilcoxon')

In [None]:
geneDF_AML= pd.DataFrame(AML_2.uns['rank_genes_groups']['names'])
geneDF_AML_log2= pd.DataFrame(AML_2.uns['rank_genes_groups']['logfoldchanges'])
geneDF_AML_padj= pd.DataFrame(AML_2.uns['rank_genes_groups']['pvals_adj'])

In [None]:
geneDF_AML.to_csv('AML_PAC_Rank_gene_wilcoxon.csv')
geneDF_AML_log2.to_csv('AML_PAC_Rank_gene_wilcoxon_logFC.csv')
geneDF_AML_padj.to_csv('AML_PAC_Rank_gene_wilcoxon_padj.csv')

In [None]:
Combo.write(file_path + "ALSF_AML_Combo_3500_with_raw_count.h5ad")

In [None]:
AML_2.write(file_path + "ALSF_AML_leukemia_NormalHSPC_rna.h5ad")

In [None]:
AML_3.write(file_path + "ALSF_AML_PAC_enriched_rna.h5ad")

# Export data matrix for seurat

In [None]:
from scipy import io

In [None]:
def adata_to_seurat(adata, folder_name):
    """Export ``adata`` as barcode/feature/matrix/metadata files for Seurat.

    Files are written into ``file_path/folder_name`` (``file_path`` is the
    notebook-level base directory); the folder is created if needed.

    adata: AnnData with the counts in ``adata.layers['Raw_Counts']``.
    folder_name: sub-folder of ``file_path`` that receives the files.

    Written files:
      barcodes.tsv  - one obs name per line
      features.tsv  - gene, gene, 'Gene Expression' (tab separated)
      matrix        - Matrix Market dump of the transposed count layer
                      (genes x cells), written via scipy.io.mmwrite
      metadata.csv  - the full ``adata.obs`` table
    """
    out_dir = os.path.join(file_path, folder_name)
    # open() does not create directories; without this a fresh folder name
    # fails with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, 'barcodes.tsv'), 'w') as f:
        f.writelines(f"{barcode}\n" for barcode in adata.obs_names)

    with open(os.path.join(out_dir, 'features.tsv'), 'w') as f:
        f.writelines(
            '\t'.join([gene, gene, 'Gene Expression']) + '\n'
            for gene in adata.var_names
        )

    # Transposed so that rows are genes and columns are cells.
    io.mmwrite(os.path.join(out_dir, 'matrix'), adata.layers['Raw_Counts'].T)

    adata.obs.to_csv(os.path.join(out_dir, 'metadata.csv'))

In [None]:
adata_to_seurat(AML_2, 'matrix_seurat')

# Trajectory inference with PAGA

In [None]:
sc.tl.louvain(Combo, resolution=1.0)

In [None]:
sc.tl.paga(Combo, groups='louvain')

In [None]:
sc.pl.paga(Combo, color=['louvain', 'AVP', 'CD34'])

In [None]:
Combo.obs['louvain'].cat.categories

In [None]:
Combo.obs['louvain_anno'] = Combo.obs['louvain']

In [None]:
# Tag the two annotated clusters by name. rename_categories leaves all other
# ids untouched, so this no longer needs exactly 21 clusters, and it avoids
# assigning to `.cat.categories`, which newer pandas no longer allows.
# NOTE(review): assumes the louvain categories are the strings '0', '1', ... in numeric order - confirm.
Combo.obs['louvain_anno'] = Combo.obs['louvain_anno'].cat.rename_categories(
    {'7': '7/HSPC', '10': '10/Ery'}
)

In [None]:
sc.tl.paga(Combo, groups='louvain_anno')

In [None]:
sc.pl.paga(Combo, threshold=0.03, show=False)

In [None]:
Combo.uns['iroot'] = np.flatnonzero(Combo.obs['louvain_anno']  == '7/HSPC')[0]

In [None]:
sc.tl.dpt(Combo)

In [None]:
Combo.write(file_path + "ALSF_AML_Combo_3500_with_raw_count.h5ad")

# Trajectory inference with palantir

In [None]:
import scanpy.external as sce
import palantir

In [None]:
sce.tl.palantir(Combo, n_components = 25, knn = 50, impute_data = True, 
                use_adjacency_matrix = True)

In [None]:
cell = ['AAGCCGCAGGTGATTA-1-0']
umap = pd.DataFrame(Combo.obsm['X_umap'][:,0:2], index=Combo.obs_names, columns=['x', 'y'])

pl.scatter(umap["x"], umap["y"], s=5, color="lightgrey")
pl.scatter(umap.loc[cell, "x"], umap.loc[cell, "y"], s=30)

In [None]:
start = 'AAGCCGCAGGTGATTA-1-0'

In [None]:
pr_res = sce.tl.palantir_results(Combo, knn = 50, early_cell = start)

In [None]:
Combo.obs['palantir_pseudotime'] = pr_res.pseudotime
Combo.obs['palantir_entropy'] = pr_res.entropy
pr_res = pr_res.branch_probs

In [None]:
Combo

In [None]:
sc.pl.umap(Combo, color=['dpt_pseudotime','palantir_pseudotime', 'palantir_entropy'],
           legend_fontsize = 10,cmap='bwr',save='_Combo_palantir_pseudotime_entropy.png')

In [None]:
Combo.write(file_path + "ALSF_AML_Combo_3500_with_raw_count.h5ad")

# Gene matrix input preparation for the MCCF1 assay

In [None]:
# Drop every cell whose sample-level lineage is one of the AML-specific groups
# (deliberately not excluded: 'B', 'Erythrocytes', 'Monocyte', 'NK', 'T', 'PlasmaB').
Combo = Combo[
    ~Combo.obs["lineage_Sample"].isin(['AML', 'AML-B', 'AML-Ery', 'AML-Mono', 'AML-T'])
].copy()


In [None]:
Obs_name='PAC_anno'
group_name='Sample'

In [None]:
# Two-column table (grouping column, annotation column) with one row per cell.
Cell_count = Combo.obs[Obs_name].to_frame()
Cell_count_batch = Combo.obs[group_name].to_frame()
Cell_sum = Cell_count_batch.join(Cell_count)

In [None]:
Cell_sum=Cell_sum.groupby([group_name,Obs_name]).size().to_frame('Cell_sum')

In [None]:
Cell_sum

In [None]:
Cell_sum.reset_index(inplace=True)
data = Cell_sum.pivot(index=[group_name], # Columns that will not change
columns=Obs_name, # Column holding new column names/categories
values='Cell_sum') # Name of value column to spread

data

In [None]:
# Total number of cells per group. Sum only the count column: after
# reset_index the frame also holds the annotation labels, which must not be summed.
Total = Cell_sum.groupby(group_name)['Cell_sum'].sum()

In [None]:
# Append the per-group totals and save the summary table.
data = data.join(Total)
data.to_csv(f"AML_scRNA_Cell_summary__{Obs_name}_by_{group_name}.csv")
data

In [None]:
adata=Combo
name="PAC_anno"

In [None]:
# For every group of the obs column `name`: in how many cells (count) and in
# what fraction of cells each raw gene has a non-zero value. The grouping
# column follows `name`, so it stays in step with the file name used when
# the table is saved.
gene_ids = adata.raw.var.index.values
clusters = adata.obs[name].cat.categories
obs = adata.raw[:, gene_ids].X.toarray()
obs = pd.DataFrame(obs, columns=gene_ids, index=adata.obs[name])
obs_bool = obs.astype(bool)  # True where the gene is detected in the cell
count_obs_adata = obs_bool.groupby(level=0).sum()
fraction_obs_adata = count_obs_adata / obs_bool.groupby(level=0).count()

In [None]:
data=fraction_obs_adata.T.loc[(fraction_obs_adata.T['0_HSPC']<0.05)]
data

In [None]:
data=data.loc[(data['AML']<0.01)]

In [None]:
data

In [None]:
color=['ARTN', 'FCGR1A', 'BGLAP', 'FCGR2A', 'IL5RA',
                         'CD96', 'CD180', 'THSD7A', 'SERPINA1', 'NRG4', 
                         'MSLN', 'ITGAX', 'CES1', 'SIGLEC12', 'OSCAR', 
                         'LILRA5', 'JAG1', 'LAMA5', 'GGT5', 'IL13RA1']

In [None]:
data.loc["OSCAR"]

In [None]:
data=fraction_obs_adata.T
data=data.loc[(data['AML-CTL']>0.1)|(data['CTL']>0.1)|(data['AML-CD4T']>0.1)|
             (data['AML-NK']>0.1)|(data['AML-Naïve_CD8T']>0.1)|
             (data['Activated_CD4T']>0.1)| (data['NK']>0.1)|(data['Naïve_CD8T']>0.1)|
             (data['Naïve_CD4T']>0.1)]

data.to_csv(file_path+"ALSF_RNA-seq_genelist_T_lineage.csv")

In [None]:
HSPC_gene=data.index.to_list()

In [None]:
data=fraction_obs_adata.T.loc[(fraction_obs_adata.T['0_HSPC']>0.1)]

In [None]:
# Keep genes whose detected fraction exceeds 0.1 in at least one PPAC/FPAC cluster.
data = data.loc[
    data[['PPAC_1', 'PPAC_2', 'PPAC_3', 'PPAC_4',
          'FPAC_1', 'FPAC_2', 'FPAC_3', 'FPAC_4']].gt(0.1).any(axis=1)
]

In [None]:
data

In [None]:
# Highest per-group fraction of every gene, keeping only genes above 0.1.
max_row_indexes = data.max(axis=1).to_frame()
max_row_indexes = max_row_indexes[max_row_indexes.iloc[:, 0] > 0.1]
max_row_indexes

In [None]:
Lineage_list=max_row_indexes.index.to_list()

In [None]:
data=fraction_obs_adata.T.loc[(fraction_obs_adata.T['AML']>0.2)]

In [None]:
AML_gene=data.index.to_list()
AML_gene

In [None]:
fraction_obs_adata.loc[:, (fraction_obs_adata != 0).any(axis=0)]

In [None]:
fraction_obs_adata.loc[:, (fraction_obs_adata != 0).any(axis=0)]

In [None]:
fraction_obs_adata.to_csv(file_path +'%s_gene_fraction.csv'%name)

In [None]:
# Remove duplicates: dict.fromkeys keeps the first occurrence of every gene
# and preserves the original order (a plain list() kept repeated names).
test_list = list(dict.fromkeys(max_row_indexes.index))

In [None]:
 %pprint

In [None]:
len(test_list)

In [None]:
df = pd.DataFrame(test_list)
print(df)

In [None]:
# Genes detected in more than 10% of the cells of at least one B-lineage group.
data = fraction_obs_adata.T
data = data.loc[
    data[['AML-B', 'CD20+B', 'ProB', 'PreB', 'PlasmaB', 'CD34+ProB']]
    .gt(0.1)
    .any(axis=1)
]

data.to_csv(file_path+"ALSF_RNA-seq_genelist_B_lineage.csv")

In [None]:
# NOTE(review): when the notebook runs top to bottom, `data` here is the
# B-lineage table built in the previous cell, not the HSPC/PAC table the
# file name suggests - confirm which frame should be written.
data.to_csv(file_path+"ALSF_RNA-seq_HSPC_shared_with_PACs_0.1.csv")

In [None]:
Sub_Gene=Combo[:,test_list].copy()

In [None]:
# Expression table (cells x selected genes). AnnData.to_df labels rows and
# columns with obs_names / var_names and handles a sparse X. The previous
# list-wrapped labels created one-level MultiIndexes.
expr = Sub_Gene.to_df()

In [None]:
expr

In [None]:
expr.to_csv(file_path+"ALSF_RNA-seq_expr_HSPC_0.05_other_0.1.csv")

In [None]:
Combo.obs.to_csv(file_path+"ALSF_RNA-seq_Combo_obs.csv")

In [None]:
# Dot plot of `LSC_CSF` grouped by 'Sample', drawn with an 8x4 figure at 300 dpi and saved to file.
# NOTE(review): neither `AML_4` nor `LSC_CSF` is assigned in any visible cell - confirm where they come from.
with plt.rc_context({"figure.figsize": (8, 4), "figure.dpi": (300)}):
    sc.pl.dotplot(AML_4, LSC_CSF,groupby='Sample', 
                 dendrogram=True, 
                 #swap_axes=True,
                 save='_Combo_AML_Cell_marker_LSC_CSF_by_Sample.png')

In [None]:
# Same dot plot on the full object, grouped by 'lineage'.
# NOTE(review): `LSC_CSF` is not assigned in any visible cell - confirm where it comes from.
with plt.rc_context({"figure.figsize": (8, 4), "figure.dpi": (300)}):
    sc.pl.dotplot(Combo, LSC_CSF,groupby='lineage', 
                 dendrogram=True, 
                 #swap_axes=True,
                 save='_Combo_AML_Cell_marker_LSC_CSF_by_lineage.png')

# LSC count matrix

In [None]:
Combo

In [None]:
adata=Combo[(Combo.obs["lineage"].isin(['AML','AML-Mono','Myeloid_Pro',
                                            '0_HSPC']))]

In [None]:
adata

In [None]:
# Draw at most 50,000 cells reproducibly. Capping at adata.n_obs keeps the
# call valid when the subset holds fewer cells than the target.
DownSample = sc.pp.subsample(adata, n_obs=min(50000, adata.n_obs),
                             random_state=0, copy=True)

In [None]:
#DownSample = adata

In [None]:
Count=pd.DataFrame(data=DownSample.layers['Raw_Counts'].toarray())

In [None]:
# Label the count table with gene names (columns) and cell barcodes (rows).
# Plain lists are assigned: wrapping them in another list made pandas build
# one-level MultiIndexes. Both labels come from DownSample, the object the
# counts were taken from.
Count.columns = DownSample.var.index.tolist()
Count.index = DownSample.obs.index.tolist()
Count

In [None]:
Count.index = Count.index.get_level_values(0)
Count.columns = Count.columns.get_level_values(0)

In [None]:
Count_f=Count.loc[:, Count.nunique() >10]

In [None]:
Count_f

In [None]:
anno_df=pd.DataFrame(DownSample.obs['AML_anno'])
anno_df

In [None]:
df_final= pd.merge(anno_df, Count_f,
                   left_index=True, right_index=True)

In [None]:
df=df_final.T

In [None]:
df.columns=df.iloc[0]

In [None]:
df=df.iloc[1:]

In [None]:
df

In [None]:
Count_T=Count.T


In [None]:
Count_T

In [None]:
df.to_csv(file_path+'ALSF_AML_subtype_Count_matrix.tsv',index=True,sep='\t')

In [None]:
Count_T.to_csv(file_path+'SQUID_scRNA_Reference_Count_matrix_1000.tsv',index=True,sep='\t')

In [None]:
obs_df=pd.DataFrame(DownSample.obs.reset_index())

In [None]:
obs_df

In [None]:
obs_df.to_csv(file_path+'SQUID_scRNA_Reference_Count_matrix_1000_meta.tsv',index=True,sep='\t')