Date: April 2022 

Goal: 

1. T cell subsetting and annotation

# Load tools

In [None]:
import scanpy as sc
import anndata as ad
import matplotlib as mpl
import matplotlib.pyplot as pl
import seaborn as sns

In [None]:
import os
import sys
import pandas as pd
import numpy as np


In [None]:
import muon as mu
from muon import prot as pt

In [None]:
import gzip

In [None]:
#https://stackoverflow.com/questions/71106940/cannot-import-name-centered-from-scipy-signal-signaltools
import scipy.signal.signaltools

def _centered(arr, newsize):
    """Return the central ``newsize``-shaped window of ``arr``.

    Stand-in for the SciPy helper of the same name (see the link above this
    cell).  ``newsize`` is broadcast against ``arr.shape``; along every axis
    the window starts at ``(current_size - new_size) // 2``.
    """
    target = np.asarray(newsize)
    lower = (np.array(arr.shape) - target) // 2
    upper = lower + target
    window = tuple(slice(lo, hi) for lo, hi in zip(lower, upper))
    return arr[window]

# Install the local helper on the SciPy module under the name `_centered`.
# NOTE(review): presumably needed because a dependency imports `_centered`
# from `scipy.signal.signaltools` (see the link above) — confirm it is still
# required for the installed SciPy version.
scipy.signal.signaltools._centered = _centered

In [None]:
from matplotlib.pyplot import imshow
%matplotlib inline

In [None]:
# Scanpy session setup: verbosity level, version banner, default figure DPI.
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi= 80)

# Define Functions

In [None]:
file_path='/oak/stanford/groups/cgawad/home/Cancer_Studies/SC_RNA_SEQ/ALSF_AML/scanpy/H5AD/'

In [None]:
Combo=sc.read_h5ad(file_path+"ALSF_AML_Combo_3500_with_raw_count.h5ad")

In [None]:
# Explicitly set the stored log1p base to None on the reloaded object.
# NOTE(review): presumably a workaround for scanpy raising KeyError: 'base'
# on objects read back from .h5ad — confirm against the scanpy version in use.
Combo.uns['log1p']["base"] = None

In [None]:
# Protein object stored next to the RNA object.
# NOTE(review): `mdata_prot` is only displayed here and not referenced again
# in the visible part of the notebook.
mdata_prot=sc.read_h5ad(file_path +"ALSF_AML_total_protein.h5ad")

In [None]:
mdata_prot

## T Cell
  Build a new dataset named `Tcell` by selecting the T and NK lineages (`AML-T`, `T`, `NK`, `AML-NK`) from the `lineage` column.

In [None]:
# Overview UMAP of the combined object, coloured by annotated cell type.
sc.pl.umap(
    Combo,
    color=['Cell_Type'],
    wspace=0.6,
    legend_fontsize=10,
    frameon=False,
    # save="_Combo_cluster_Sample.png",
)

In [None]:
# T and NK lineage cells (healthy and AML-derived) from the combined object.
# `.copy()` makes this an independent AnnData rather than a view of `Combo`:
# the following cells run in-place tools (PCA, neighbors, UMAP, Leiden) and
# assign to `.obs`, which on a view triggers an implicit copy plus warnings.
Tcell = Combo[
    Combo.obs['lineage'].isin(['AML-T', 'T', 'NK', 'AML-NK']), :
].copy()

In [None]:
sc.pl.highly_variable_genes(Tcell) 

In [None]:
sc.tl.pca(Tcell, svd_solver='arpack',n_comps=50)

In [None]:
sc.pp.neighbors(Tcell, n_neighbors=100, n_pcs=50)

In [None]:
sc.tl.umap(Tcell,min_dist=0.5,spread=6)

In [None]:
sc.tl.leiden(Tcell, resolution=3
            )

In [None]:

# Marker-gene UMAP panel for the T/NK subset (one panel per gene).
sc.pl.umap(
    Tcell,
    color=[
        'CD34', 'IL7R', 'CD7',  # LYMPH progenitor
        'TRAC', 'KLRD1', 'KLRG1', 'FGFBP2', 'LYAR', 'GZMM', 'FCRL6',
        'CXCR3', 'SLC4A10', 'FOXP3',
        'CD3D', 'CD4', 'CD8A', 'CD8B', 'CCR7',  # T cells
        'KLRB1', 'GNLY', 'GZMB', 'GZMA', 'GZMH', 'GZMK',
        'KLRF1',  # NK cells
        'IFNG',  # dysfunction T cells
        'TNF', 'PRF1',
        'CD40', 'CD44', 'SELL', 'CD58', 'FAS', 'B3GAT1',
        'CX3CR1', 'PDCD1', 'HNF1A',
        'TCF7', 'LEF1',
        'LAG3', 'TIGIT', 'HAVCR2',
        'CTLA4', 'LAYN', 'ENTPD1', 'ITGAE',
    ],
    cmap='Reds',
    save='_ALSF_AML_Lymph_lymph.Marker.Genes.png',
)

In [None]:
# Manual annotation: Leiden cluster id (as a string) -> curated T/NK label.
Tcell_leiden2manCT = {
    '0': 'GZMB NK',
    '1': 'Naïve CD8T',
    '2': 'AML-Naïve CD4T',
    '3': 'AML-Naïve CD8T',
    '4': 'Activated CD4T',
    '5': 'GZMK CD8T',
    '6': 'Naïve CD4T',
    '7': 'Naïve CD8T',
    '8': 'Naïve CD4T',
    '9': 'AML-GZMK CD8T',
    '10': 'GZMB CD8T',
    '11': 'AML-Activated CD4T',
    '12': 'AML-GZMB CD8T',
    '13': 'AML-NK',
    '14': 'Naïve CD4T',
    '15': 'GZMK NK',
    '16': 'MAIT',
    '17': 'GZMB NK',
    '18': 'Effector memory CD8T',
    '19': 'AML-MAIT',
    '20': 'GZMB DNT',
    '21': 'Naïve CD4T',
    '22': 'Naïve CD4T',
    '23': 'Naïve CD4T',
    '24': 'Naïve CD4T',
    '25': 'Naïve CD4T',
    '26': 'Naïve CD4T',
    '27': 'AML-Activated CD4T',
    '28': 'Naïve CD4T',
    '29': 'GZMB NK',
    '30': 'Naïve CD4T',
    '31': 'Naïve CD4T',
    '32': 'Activated CD4T',
    '33': 'Naïve CD4T',
    '34': 'Naïve CD4T',
    '35': 'Naïve CD4T',
    '36': 'Naïve CD4T',
}

# Translate each cell's cluster id into its label.  A cluster id that is not
# a key of the dictionary maps to NaN, so report such clusters instead of
# silently leaving their cells unlabelled (cluster ids change whenever the
# clustering is re-run).
_tcell_labels = Tcell.obs['leiden'].map(Tcell_leiden2manCT)
_unmapped_clusters = sorted(
    set(Tcell.obs['leiden'][_tcell_labels.isna()].astype(str))
)
if _unmapped_clusters:
    import warnings

    warnings.warn(
        "Leiden clusters without a manual T-cell label: "
        + ", ".join(_unmapped_clusters)
    )
Tcell.obs['TCell'] = _tcell_labels.astype('category')

In [None]:
# UMAP of the T/NK subset coloured by cell type.  The obs column is spelled
# 'Cell_Type' (underscore) everywhere else in this notebook; 'Cell Type' with
# a space is not a key that any visible cell creates.
sc.pl.umap(
    Tcell,
    color=['Cell_Type'],
    wspace=0.5,
    frameon=False,
    save='_ALSF_AML_T_CELL_TYPE.png',
)

In [None]:
# UMAP of the T/NK subset coloured by sample of origin.
sc.pl.umap(
    Tcell,
    color=['SampleID'],
    wspace=0.5,
    frameon=False,
    save='_ALSF_AML_T_Sample.png',
)

In [None]:
# Cells of the T/NK subset that belong to batches '0' and '1'.
_in_first_two_batches = Tcell.obs['batch'].isin(['0', '1'])
HBM_Tcell = Tcell[_in_first_two_batches, :]

In [None]:
# Persist the annotated T/NK subset next to the input files.
# NOTE(review): the file name contains "Tcelll" (three l's); left unchanged in
# case downstream code reads this exact name — confirm before renaming.
Tcell.write(file_path + "ALSF_AML_total_3500_Tcelll_rna.h5ad")

In [None]:

# UMAP panels of the `*_pos` columns on the HBM_Tcell subset.
sc.pl.umap(
    HBM_Tcell,
    color=[
        'CD127_pos', 'CD274_pos', 'CD3_pos',
        'CD7_pos', 'CD49f_pos', 'CD25_pos', 'CD279_pos',
        'CD152_pos', 'CD366_pos', 'CD71_pos', 'CD45RA_pos', 'CD123_pos',
        'CD36_pos', 'CD133_pos', 'CD33_pos', 'CD32_pos',
        'CD90_pos', 'CD10_pos', 'CD235ab_pos', 'CD19_pos',
    ],
    cmap='Reds',
)

In [None]:
# Same kind of panel on the full T/NK subset, six panels per row, saved to disk.
sc.pl.umap(
    Tcell,
    color=[
        'CD127_pos', 'CD274_pos', 'CD3_pos',
        'CD7_pos', 'CD49f_pos', 'CD25_pos', 'CD279_pos',
        'CD152_pos', 'CD366_pos', 'CD71_pos', 'CD45RA_pos',
        'CD33_pos',
    ],
    cmap='Reds',
    ncols=6,
    save='_ALSF_AML_lymphCombo_TCells.MarkerGenes_csf.png',
)

In [None]:
#T cell  1M/1ML blood 

In [None]:
# UMAP coloured by the manual T/NK annotation.  scanpy appends the `save`
# string to its default file name, so the suffix starts with an underscore
# like every other saved figure in this notebook (previously the output was
# named "umapALSF_...").
sc.pl.umap(
    Tcell,
    color=['TCell'],
    save='_ALSF_AML_Combo_T_Cells.png',
)

In [None]:
# Keep AML / HSPC-type lineages from any sample, plus every cell whose
# SampleType is HealthyBM.
_leukemic_lineage = Combo.obs["lineage"].isin(
    ['AML', 'AML-Mono', 'Myeloid_Pro', '0_HSPC']
)
_from_healthy_bm = Combo.obs["SampleType"].isin(['HealthyBM'])
AML = Combo[_leukemic_lineage | _from_healthy_bm]

# From that selection, drop cells whose `lineage_Sample` label is one of the
# AML-specific groups.  The other labels (B, Erythrocytes, Monocyte, NK, T,
# PlasmaB) were commented out of the exclusion list and are therefore kept.
_aml_specific = AML.obs["lineage_Sample"].isin(
    ['AML', 'AML-B', 'AML-Ery', 'AML-Mono', 'AML-T']
)
AML_2 = AML[~_aml_specific].copy()



In [None]:
# Curated gene sets used by the dot plots and UMAP panels in this notebook.

# Two sets of exhaustion / dysfunction markers.
EX_T_1 = ['ENTPD1', 'ITGAE', 'TNFRSF9', 'TNFRSF18']
EX_T_2 = ['HAVCR2', 'PDCD1', 'LAG3']

# Transcription factors.
TF_1 = ["NR4A1", "TOX", 'TOX2']

# Receptor/ligand gene lists.  The hand-written lists repeat some genes
# ('FAS' and 'TNFRSF1A' in list 1; 'TNF' and 'TNFRSF1B' in list 2);
# dict.fromkeys() removes the repeats while keeping first-seen order, so a
# plot built from these lists shows each gene once.
RL_list_1 = list(dict.fromkeys([
    'ANXA1', 'FAS', 'TNFSF12', 'TNFSF13B', 'TNFRSF1A', 'TNFSF13', 'CD99', 'HLA-G',
    'CD1D', 'LGALS9', 'RNASET2', 'TBXAS1', 'TNFSF4', 'CXCL2', 'TNFSF9', 'TNFSF10',
    'CTSG', 'LILRB2', 'OPRL1', 'TNFRSF14', 'TNFRSF1A', 'TNFRSF25', 'TNFRSF12A', 'LTBR',
    'FAS', 'CSF1R', 'MERTK', 'TNFRSF10A', 'TNFRSF10B',
    'TNFRSF10D', 'FZD2', 'ADORA2A', 'CCR1', 'TNFRSF6B', 'HAVCR2',
    'CD40', 'ITGB1', 'TBXA2R', 'PTGER1', 'TNFRSF4', 'ITGA5',
]))
RL_list_2 = list(dict.fromkeys([
    'HLA-F', 'CD70', 'MICB', 'PTGDS', 'TNF', 'ICAM1', 'LTA', 'NT5E', 'TNFRSF1B',
    'CD72', 'TNFSF8', 'IGF2', 'CCL5', 'SEMA4D',
    'LILRB2', 'ADORA2B', 'TNF', 'ITGAL', 'TNFRSF1B', 'OXER1',
    'CCR3', 'LILRB1', 'CCR5', 'GPR75', 'ADORA2A',
]))

# T-cell annotation panel, grouped by the population each marker set targets.
TCell_anno = [
    'CD3E', 'CD4', 'CD8A', 'CD8B',
    'LRRN3', 'CCR7', 'SELL',  # Human naïve T cells
    'CCL5', 'CCL4', 'NKG7', 'GZMK', 'GZMB', 'GNLY', 'KLRG1', 'PRF1', 'KLRB1', 'KLRD1',  # Activated cytotoxic CD8+ T cells
    'NCAM1', 'TNF', 'LTA', 'STAT4', 'ANXA1',  # Th1
    'GATA3', 'STAT6',  # Th2
    'RORC', 'STAT3',  # Th17
    'CD69', 'ITGAE', 'FAS',  # Memory CD8+ T cells
    'IL2RA', 'FOXP3', 'TNFRSF18', 'TNFRSF4',  # Treg
    'CXCR5', 'ICOS',  # follicular B helper T cells
    'SLC4A10',
]

In [None]:
sc.tl.dendrogram(AML_2, groupby='lineage_Sample')

In [None]:
sc.pl.dotplot(AML_2, RL_list_1, 'lineage_Sample', 
              dendrogram=True,cmap='bwr',
             save='_Combo_Cell_Cell_interaction_with_overlap_normal_mye.png')

In [None]:
sc.pl.dotplot(Tcell, TCell_anno, 'TCell', 
              dendrogram=True,
             save='_T_Cell_annotation_gene.png')

In [None]:
sc.pl.dotplot(Tcell, EX_T_2, 'TCell', dendrogram=True,cmap='bwr',
              save='_tumour_T_Cell_dysfuction_gene_set2.png')

In [None]:
color_list=Tcell.uns['Cell_Type_colors']
color_list

In [None]:
Cell_Type_List=pd.DataFrame(Tcell.obs['Cell_Type'])
Cell_Type=Cell_Type_List.Cell_Type.unique()
Cell_Type.categories.values

In [None]:
# Cell-type label -> colour.  scanpy keeps `uns['<key>_colors']` in the order
# of the obs column's categories, so pair the colours with `.cat.categories`
# directly.  The categories of a `unique()` result are not a safe substitute:
# depending on the pandas version they are reordered by first appearance
# and/or stripped of unused categories, which can shift the pairing.
color_dict = dict(
    zip(Tcell.obs['Cell_Type'].cat.categories, Tcell.uns['Cell_Type_colors'])
)
color_dict

In [None]:
# UMAP panels for an additional gene list on the T/NK subset.
sc.pl.umap(
    Tcell,
    color=[
        'S100A10', 'FOXP3',
        'PTGER4', 'ZNF331', 'VIM', 'CYBA',
        'CD37', 'PKM', 'CD53', 'ARPC1B',
        'RCSD1', 'LAPTM5', 'LINC01578', 'SCGB3A1',
        'DOK2', 'CYTH4',
        'TIGIT', 'PDCD1', 'LAG3', 'IFNG',
    ],
    cmap='Reds',
)

In [None]:
# Per-cluster marker genes via a project helper.
# NOTE(review): `scanpyMarkerGenes` is not defined or imported anywhere in the
# visible part of this notebook — it must come from a cell that is not shown,
# otherwise this line raises NameError.  The next cell iterates the result's
# columns and calls `.head()`, so it is presumably a DataFrame — confirm.
Tcellgene = scanpyMarkerGenes(Tcell, clusters='leiden', met='logreg',filename='AML_T_cells-logreg.csv')

In [None]:
markerGenesDict = {}
for col in Tcellgene:
    markerGenesDict[col] = list(Tcellgene[col].head(15))
markerGenesDict

In [None]:
sc.tl.rank_genes_groups(Tcell, groupby='Cell_Type', method='wilcoxon')
geneDF = pd.DataFrame(Tcell.uns['rank_genes_groups']['names'])
geneDF.to_csv('ALSF_AML_LymphoRankedGenes_Cell_Type_wilcoxon.csv',sep=",",header=True,index_label="")

In [None]:
markerGenesDict = {}
for col in geneDF :
    markerGenesDict[col] = list(geneDF[col].head(20))
markerGenesDict

In [None]:
# Logistic-regression ranking per Leiden cluster; export the ranked gene names.
# The output file is named after the grouping actually used ('leiden'); it
# was previously labelled "Cell_Type" although the groups are Leiden clusters.
sc.tl.rank_genes_groups(Tcell, groupby='leiden', method='logreg')
geneDF = pd.DataFrame(Tcell.uns['rank_genes_groups']['names'])
geneDF.to_csv(
    'ALSF_AML_LymphoRankedGenes_leiden_logreg.csv',
    sep=",",
    header=True,
    index_label="",
)

In [None]:
sc.tl.rank_genes_groups(Tcell, groupby='Cell_Type', method='logreg')
geneDF = pd.DataFrame(Tcell.uns['rank_genes_groups']['names'])
geneDF.to_csv('ALSF_AML_LymphoRankedGenes_logreg.csv',sep=",",header=True,index_label="")

In [None]:
markerGenesDict = {}
for col in geneDF :
    markerGenesDict[col] = list(geneDF[col].head(7))
markerGenesDict

In [None]:
sc.pl.umap(Tcell, color='Cell_Type', 
            legend_loc='on data', legend_fontsize = 8,title='Cell Type', save='_lymph.CellType.png') 



In [None]:
# Re-cluster at a finer resolution and plot clusters next to sample identity.
# NOTE(review): this overwrites obs['leiden'], the column the
# Tcell_leiden2manCT mapping was built against (resolution=3).  Re-running the
# mapping cell after this one would assign labels to the wrong clusters.
sc.tl.leiden(Tcell, resolution=4)
sc.pl.umap(Tcell, color=['leiden', 'SampleID'], wspace = 0.6,save='_ALSF_AML_Lymph.leiden.batch.png')

In [None]:
# UMAP panels for the T-cell annotation genes.  The gene list is exactly the
# `TCell_anno` list defined earlier in this notebook, so reuse it rather than
# keeping a second hand-maintained copy that can drift out of sync.
sc.pl.umap(
    Tcell,
    color=TCell_anno,
    cmap="Reds",
    ncols=6,
    save='_ALSF_AML_lymphCombo_TCells.MarkerGenes.png',
)

In [None]:
sc.tl.rank_genes_groups(Tcell, groupby='Cell_Type', method='logreg')
geneDF = pd.DataFrame(Tcell.uns['rank_genes_groups']['names'])
geneDF.to_csv('ALSF_AML_LymphoRankedGenes_logreg.csv',sep=",",header=True,index_label="")

In [None]:
markerGenesDict = {}
for col in geneDF:
    markerGenesDict[col] = list(geneDF[col].head(5))
markerGenesDict

In [None]:
sc.tl.dendrogram(Tcell, groupby='Cell_Type')

In [None]:
sc.pl.rank_genes_groups_heatmap(Tcell, var_names=markerGenesDict,
                                dendrogram=True,
                                use_raw=False, swap_axes=False, 
                                vmin=-3, vmax=3, cmap='bwr', 
                               figsize=(12,10), show=True,show_gene_labels=True,
                               save='Tcell_ranked_genes_Heatmap_logreg.png')

In [None]:
sc.tl.rank_genes_groups(Tcell, groupby='Cell_Type', method='wilcoxon')
geneDF = pd.DataFrame(Tcell.uns['rank_genes_groups']['names'])
geneDF.to_csv('ALSF_AML_LymphoRankedGenes_wilcoxon.csv',sep=",",header=True,index_label="")

In [None]:
geneDF_Tcell= pd.DataFrame(Tcell.uns['rank_genes_groups']['names'])
geneDF_Tcell_log2= pd.DataFrame(Tcell.uns['rank_genes_groups']['logfoldchanges'])
geneDF_Tcell_padj= pd.DataFrame(Tcell.uns['rank_genes_groups']['pvals_adj'])

In [None]:
geneDF_Tcell

In [None]:
# Write the three Wilcoxon tables (names, log fold change, adjusted p-value)
# to the current working directory.
# NOTE(review): the file names say "leu_stem" and "wilcoxonn" although the
# tables come from the T/NK subset — possibly copied from another notebook;
# left unchanged in case other code reads these exact names.
geneDF_Tcell.to_csv('Tcell_leu_stem_Rank_gene_wilcoxonn.csv')
geneDF_Tcell_log2.to_csv('Tcell_leu_stem_Rank_gene_wilcoxonn_logFC.csv')
geneDF_Tcell_padj.to_csv('Tcell_leu_stem_Rank_gene_wilcoxonn_padj.csv')

In [None]:
# Per-sample detection statistics for every gene in `Tcell.raw`:
#   count_obs_Tcell    - number of cells in the sample with a non-zero value
#   fraction_obs_Tcell - that number divided by the sample's cell count
gene_ids = Tcell.raw.var.index.values
clusters = Tcell.obs['Sample'].cat.categories

# Dense cells x genes table indexed by each cell's sample label.
# NOTE(review): `.toarray()` assumes the raw matrix is sparse, and the dense
# copy can be large — confirm it fits in memory for this dataset.
obs = Tcell.raw[:, gene_ids].X.toarray()
obs = pd.DataFrame(obs, columns=gene_ids, index=Tcell.obs['Sample'])
obs_bool = obs.astype(bool)

# Group once and reuse the per-sample sum for both outputs instead of
# recomputing the same aggregation twice.
_by_sample = obs_bool.groupby(level=0)
count_obs_Tcell = _by_sample.sum()
fraction_obs_Tcell = count_obs_Tcell / _by_sample.count()

In [None]:
# Bare expression: not the last statement of the cell, so it produces no output.
fraction_obs_Tcell
# Export the per-sample detection fractions to the working directory.
fraction_obs_Tcell.to_csv('Combo_AML_Tcell_gene_by_Sample_fraction.csv')

In [None]:
Tcell_count=pd.DataFrame(Tcell.obs['Cell_Type'])
Tcell_count_batch=pd.DataFrame(Tcell.obs['Sample'])
Tcell_sum=Tcell_count_batch.join(Tcell_count)

In [None]:
Tcell_sum=Tcell_sum.groupby(['Sample','Cell_Type']).size().to_frame('Tcell_sum')

In [None]:
Tcell_sum.reset_index(inplace=True)
data = Tcell_sum.pivot(index=['Sample'], # Columns that will not change
columns='Cell_Type', # Column holding new column names/categories
values='Tcell_sum') # Name of value column to spread

data

# Raw count matrix for T cells

In [None]:
adata=Combo[Combo.obs['lineage'].isin(['AML-T','T',
                                         'NK','AML-NK']),:]

In [None]:
Count=pd.DataFrame(data=adata.layers['Raw_Counts'].toarray())

In [None]:
Count

In [None]:
# Label columns with gene names and rows with cell barcodes.  Assign the flat
# label lists directly: wrapping them in another list (as before) built a
# one-level MultiIndex that then had to be flattened again.  The later
# `get_level_values(0)` cells still work on a plain Index.
Count.columns = adata.var.index.tolist()
Count.index = adata.obs.index.tolist()


In [None]:
Count

In [None]:
# Replace the column index by its level-0 values (flattens the nested labels
# assigned above).
Count.columns = Count.columns.get_level_values(0)

In [None]:
# Same flattening for the row index.
Count.index = Count.index.get_level_values(0)

In [None]:
anno_df=pd.DataFrame(Tcell.obs['Cell_Type'])

In [None]:
anno_df

In [None]:
df_final= anno_df.join(Count)

In [None]:
df_final

In [None]:
# Keep only columns with more than three distinct values.  `Cell_Type` is
# always kept: a later cell turns the first row of the transposed table into
# the header, so if the label column were filtered out (three or fewer cell
# types) a gene's counts would silently become the header instead.
_keep_column = df_final.nunique() > 3
_keep_column['Cell_Type'] = True
df = df_final.loc[:, _keep_column]

In [None]:
df

In [None]:
# Transpose: the former columns become rows, and cells become columns.
df=df.T

In [None]:
# Use the first row of the transposed table as the column header.
df.columns=df.iloc[0]


In [None]:
# Drop that header row from the data.
df=df.iloc[1:]

In [None]:
# Output directory (same value as assigned at the top of the notebook).
file_path='/oak/stanford/groups/cgawad/home/Cancer_Studies/SC_RNA_SEQ/ALSF_AML/scanpy/H5AD/'

In [None]:
# Tab-separated count table, including the index column.
df.to_csv(file_path+'ALSF_AML_T_Cell_Count_matrix.tsv',index=True,sep="\t")

In [None]:
# Tab-separated per-cell metadata of the T/NK subset.
Tcell.obs.to_csv(file_path+'ALSF_AML_T_Cell_Count_meta_matrix.tsv',index=True,sep="\t")

In [None]:
# Manual T/NK labels as a one-column frame indexed like Tcell.obs.
df_anno=pd.DataFrame(Tcell.obs['TCell'])
df_anno

In [None]:
# Combo.obs without a pre-existing `TCell` column, so the join in the next
# cell can add the fresh labels without a column-name clash.  The column only
# exists when the loaded file was written by an earlier run of this notebook
# (the final cell writes `Combo` back to the file it was read from);
# `errors="ignore"` keeps a first run from failing with KeyError.  The earlier
# `pd.DataFrame(Combo.obs)` assignment was immediately overwritten and is gone.
df_2 = Combo.obs.drop(columns=["TCell"], errors="ignore")
df_2

In [None]:
# Index-aligned join: rows of df_2 with no match in df_anno get NaN in `TCell`.
# Note that this rebinds `df`, which earlier held the transposed count table.
df=df_2.join(df_anno)
df

In [None]:
# Replace the combined object's per-cell metadata with the joined table.
Combo.obs=df

In [None]:
# Combined annotation: the manual T/NK label for cells of the T/NK lineages,
# the existing `PAC_anno` label for every other cell.
_is_t_or_nk = Combo.obs['lineage'].isin(['AML-T', 'T', 'AML-NK', 'NK'])
Combo.obs['TCell_anno'] = np.where(
    _is_t_or_nk,
    Combo.obs['TCell'],
    Combo.obs['PAC_anno'],
)

In [None]:
# Save the combined object with the new annotation columns.
# NOTE(review): this is the same file name the notebook loads `Combo` from, so
# the input file is overwritten in place — confirm that is intended.
Combo.write(file_path + "ALSF_AML_Combo_3500_with_raw_count.h5ad")