# Processing the QC file we created from the raw single-cell data into clusters

## Imports

In [None]:
import numpy as np
import scanpy as sc
import os
import pandas as pd
import bbknn

## Single Cell settings

In [None]:
# Most verbose scanpy logging level.
sc.settings.verbosity = 4
# Pass the resolution by keyword: the first positional parameter of
# set_figure_params is the `scanpy` style flag, not `dpi`.
sc.settings.set_figure_params(dpi=80)
# scanpy reads `file_format_figs` and `autosave`; other attribute names are
# accepted by Python but are not consulted when a figure is saved.
sc.settings.file_format_figs = 'png'
sc.settings.autosave = False
# NOTE(review): the two names below are not referenced in the visible part of
# this notebook — confirm whether they are still needed.
use_first_n_samples = 0
full_sparse = False

## Defining a function for processing the data & determining the parameters

In [None]:
def processing_batch(adata, neighbors=15, key='batch', reso=0.4, random_state=0):
    """Scale, embed, batch-balance and cluster ``adata`` in place.

    Runs, in order: scaling (values clipped at 10), PCA, a kNN graph, a PCA
    variance-ratio plot, BBKNN on ``key``, UMAP and Leiden clustering, then
    plots the UMAP coloured by the Leiden clusters.

    Parameters
    ----------
    adata
        Annotated data matrix; modified in place.
    neighbors
        Passed as ``n_neighbors`` to ``sc.pp.neighbors``.
    key
        Column of ``adata.obs`` passed to BBKNN as ``batch_key``.
    reso
        Leiden resolution. Higher resolution yields more leiden clusters.
    random_state
        Seed forwarded to PCA, the neighbour search, UMAP and Leiden so a
        run can be reproduced or varied. The default of 0 matches scanpy's
        own defaults.

    Returns
    -------
    None
    """
    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata, svd_solver='arpack', random_state=random_state)
    sc.logging.print_memory_usage()
    # NOTE(review): bbknn is documented as an alternative to sc.pp.neighbors,
    # so the graph built here is presumably replaced below — confirm whether
    # this call (and therefore `neighbors`) is still needed.
    sc.pp.neighbors(adata, n_neighbors=neighbors, random_state=random_state)
    sc.pl.pca_variance_ratio(adata, log=True)
    bbknn.bbknn(adata, batch_key=key, neighbors_within_batch=2, metric='euclidean', n_pcs=30, trim=None)
    sc.tl.umap(adata, random_state=random_state)
    sc.tl.leiden(adata, resolution=reso, random_state=random_state)
    sc.pl.umap(adata, color='leiden')

## Reading the existing h5ad file we created previously, in the QC pipeline

In [None]:
# adata after QC.
# The file is read once; the loaded object is displayed at the end of the
# cell instead of loading the same file a second time just to show it.
adata = sc.read(r'/Code/data/single_cell/animals/turtle/turtle_5/turtle_5_after_qc.h5ad')

# If the anndata object is processed and contains the raw data in adata.raw run -  adata = adata.raw.to_adata()

adata

## Processing the data 

In [None]:
# Display the summary of the loaded object.
adata

In [None]:
# Display the per-observation annotation table.
adata.obs

In [None]:
# Display the per-variable annotation table.
adata.var

### Cleaning, normalizing, reducing dimensionality, clustering cells and creating visualizations

In [None]:
# Normalize total counts per observation, then apply log1p.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Flag highly variable genes with the mean/dispersion cut-offs below and plot them.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
# Keep the normalized, log-transformed data in adata.raw before the
# regression and scaling steps that follow modify adata.
adata.raw = adata
# adata = adata[:, adata.var.highly_variable] # For filtering non Highly variable genes. We usually keep all the genes. 

In [None]:
# Regress out the 'total_counts' and 'pct_counts_MT' covariates
# (presumably library size and mitochondrial fraction — confirm against the QC step).
sc.pp.regress_out(adata, ['total_counts','pct_counts_MT'])

# Got it from: https://github.com/theislab/single-cell-tutorial/issues/35
# Scale (values clipped at 10), run a 50-component PCA restricted to the
# highly variable genes, then build the neighbour graph and the UMAP.
# NOTE(review): processing_batch(), called in the next cell, scales the data
# and recomputes PCA / neighbors / UMAP again — confirm this first pass is needed.
sc.pp.scale(adata, max_value=10)
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata) 

In [None]:
# Run the scale / PCA / BBKNN / UMAP / Leiden pipeline, using 'treatment' as the batch key.
processing_batch(adata, key='treatment')

### Ranking genes (top 20 per treatment group)

In [None]:
# Rank genes for each 'treatment' group with the Wilcoxon method.
sc.tl.rank_genes_groups(adata, groupby='treatment', method='wilcoxon')
# Plot the 50 top-ranked genes per group on a shared y-axis.
sc.pl.rank_genes_groups(adata, n_genes=50, sharey=True)
# Tabulate the first 20 ranked gene names per group.
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(20)

# sc.pl.umap(adata[adata.obs['treatment'] == 'control'], color = ['PLAC8', 'CD83', 'CD74', 'CD79B', 'CD38', 'IL10', 'CD7'], size=50)

In [None]:
# Count the observations in each 'treatment' group.
adata.obs['treatment'].value_counts()

### UMAPs

#### Markers test box

In [None]:
# Scratch cell: colour the UMAP by a single marker to try it out.
sc.pl.umap(adata, color=['CD81'], size=20)

#### T cells

In [None]:
# sc.pl.umap(adata, color=['CD3D','CD247','TCF7','GPR171'], size=20)
# sc.pl.umap(adata, color='leiden', size=20)

In [None]:
# T cells. 'CCR7' is a marker for T cells and also B cells. Markers from Roy's Extended_Data_Figures.
# Markers for T cells https://www.beckman.com/resources/cell-types/blood-cells/leukocytes/lymphocytes/t-cells
# https://panglaodb.se/markers.html?cell_type=%27T%20cells%27

# One UMAP panel per T-cell marker listed below.
sc.pl.umap(adata, color=['CD3D','CD3E','CD28','CD226','CD247', 
                         'CD4','CD7_3','CD38','STMN1','LEF1',
                         'RORA','GPR171','CCR7','CXCR4','TCF7',
                         'GPR183_1','GPR183_2','ALOX5AP','IL7R','HOPX',
                         'CD8A_2','CD8B','CD82'], size=30)

# Leiden clusters on the same embedding, for matching markers to cluster ids.
sc.pl.umap(adata, color='leiden', size=30)

In [None]:
# sc.pl.umap(adata[adata.obs['treatment'] == 'control'], color='CD3D', size=30)
# sc.pl.dotplot(adata, 'CD3D', groupby='treatment', standard_scale='var')

# sc.pl.umap(adata[adata.obs['treatment'] == 'lps'], color='CD3D', size=30)
# sc.pl.dotplot(adata, 'CD3D', groupby='treatment', standard_scale='var')

In [None]:
# sc.pl.umap(adata[adata.obs['treatment'] == 'control'], color='IFNAR2', size=30)
# sc.pl.dotplot(adata, 'IFNAR2', groupby='treatment', standard_scale='var')

# sc.pl.umap(adata[adata.obs['treatment'] == 'pic'], color='IFNAR2', size=30)
# sc.pl.dotplot(adata, 'IFNAR2', groupby='treatment', standard_scale='var')

#### B cells

In [None]:
# B cells markers from Roy's data and from https://panglaodb.se/markers.html?cell_type=%27Platelets%27#google_vignette
# NOTE(review): the URL above points at the Platelets page — confirm the intended B-cell source.

# One UMAP panel per B-cell marker listed below.
sc.pl.umap(adata, color=['CD79B', 'CD38','IRF8','STMN1','IGJ','TCF4_1',
                         'TCF4_2', 'CCR7','PXK','BANK1','BACH2','CD74',
                         'PLAC8_1','PLAC8_2','IGHM','HLA-DPB1_5',
                         'HLA-DPB1_6','HLA-DMA','CD83','CD81'], size=30)

# Provisional annotation: Leiden clusters 2, 4, 6 and 7 are labelled T_cells;
# every other cluster keeps its cluster id as its label for now.
celltype = {'0':'0','1':'1','2':'T_cells','3':'3','4':'T_cells','5':'5','6':'T_cells',
            '7':'T_cells','8':'8','9':'9','10':'10'}
adata.obs['cell_type'] = adata.obs.leiden.map(celltype).astype('category')
sc.pl.umap(adata, color='cell_type', size=20)

#### RBC

In [None]:
# RBC markers from Roy's data and from https://panglaodb.se/markers.html?cell_type=%27Platelets%27#google_vignette
# NOTE(review): the URL above points at the Platelets page — confirm the intended RBC source.

# One UMAP panel per RBC marker listed below.
sc.pl.umap(adata, color=['STMN1','HBA','HBM','SLC4A1','ANK1'], size=30)

# Provisional annotation: adds B_cells (cluster 5) to the T_cells labels;
# the remaining clusters keep their cluster id as their label for now.
celltype = {'0':'0','1':'1','2':'T_cells','3':'3','4':'T_cells','5':'B_cells','6':'T_cells',
            '7':'T_cells','8':'8','9':'9','10':'10'}
adata.obs['cell_type'] = adata.obs.leiden.map(celltype).astype('category')
sc.pl.umap(adata, color='cell_type', size=20)

#### Platelets

In [None]:
# Platelets markers from Roy's data and from https://panglaodb.se/markers.html?cell_type=%27Platelets%27#google_vignette

# One UMAP panel per platelet marker listed below.
sc.pl.umap(adata, color=['TUBB1','PLEK','B2M','GP1BA_1','F13A1',
                         'GNAS','TAGLN2','OAZ1','CD226','PECAM1',
                         'CD36','GP1BA_2','ITGB3','ENSGALG00010014668'], size=30)

# Provisional annotation: adds RBC (cluster 3) to the T_cells and B_cells labels;
# the remaining clusters keep their cluster id as their label for now.
celltype = {'0':'0','1':'1','2':'T_cells','3':'RBC','4':'T_cells','5':'B_cells','6':'T_cells',
            '7':'T_cells','8':'8','9':'9','10':'10'}
adata.obs['cell_type'] = adata.obs.leiden.map(celltype).astype('category')
sc.pl.umap(adata, color='cell_type', size=20)

#### Monocytes

In [None]:
# Monocytes markers from Roy's data and from https://panglaodb.se/markers.html?cell_type=%27Platelets%27#google_vignette
# NOTE(review): the URL above points at the Platelets page — confirm the intended monocyte source.

# One UMAP panel per monocyte marker. Each marker is listed once so that no
# panel is drawn twice.
sc.pl.umap(adata, color=['SPI1','S100A4','C1QA','LYZ','LGALS3','CSF1R',
                         'MAFB','CD7_3','TET2','CD40','TLR4','S100A10_2',
                         'S100A11','CSTA','CCR2','CCL8_1','CCL8_3','TXN',
                         'ACP5','GBP1_1','GBP1_2','GBP1_3','IRF7',
                         'PLSCR1','RGS1','S100A6','MRC1_3','OASL',
                         'IFIT5','S100A9','RSAD2','ZNFX1','IFIH1','SMCHD1',
                         'PARP14_2','ENSGALG00010007911','CMPK2'], size=30)

# Provisional annotation: adds Platelets (cluster 1) to the labels assigned so
# far; the remaining clusters keep their cluster id as their label for now.
celltype = {'0':'0','1':'Platelets','2':'T_cells','3':'RBC','4':'T_cells','5':'B_cells','6':'T_cells',
            '7':'T_cells','8':'8','9':'9','10':'10'}
adata.obs['cell_type'] = adata.obs.leiden.map(celltype).astype('category')
sc.pl.umap(adata, color='cell_type', size=20)

#### All clusters after annotations

In [None]:
# Final annotation: every Leiden cluster (0-10) is assigned a cell-type label.
celltype = {'0':'Monocytes','1':'Platelets','2':'T_cells','3':'RBC','4':'T_cells','5':'B_cells','6':'T_cells',
            '7':'T_cells','8':'Monocytes','9':'Monocytes','10':'Monocytes'}
adata.obs['cell_type'] = adata.obs.leiden.map(celltype).astype('category')
# save=True additionally writes the figure to disk.
sc.pl.umap(adata, color='cell_type', size=30, save=True)

In [None]:
# Count the observations assigned to each cell-type label.
adata.obs['cell_type'].value_counts()

#### TLR genes in adata by treatment

In [None]:
# TLR gene names used for the per-treatment plots in the next cell.
TLR_genes = ['TLR7', 'TLR5', 'TLR2_1', 'TLR2_2', 'TLR3', 'TLR1', 'TLR21', 'TLR4']

In [None]:
# TLR expression on the UMAP, one figure per treatment subset
# ('control', 'lps', 'pic').
sc.pl.umap(adata[adata.obs['treatment'] == 'control'], color=TLR_genes, size=50)
# The dotplot uses the full object grouped by treatment, so it is drawn once;
# the commented-out repeats below would produce the same figure.
sc.pl.dotplot(adata, TLR_genes, groupby='treatment', standard_scale='var')

sc.pl.umap(adata[adata.obs['treatment'] == 'lps'], color=TLR_genes, size=50)
#sc.pl.dotplot(adata, TLR_genes, groupby='treatment', standard_scale='var')

sc.pl.umap(adata[adata.obs['treatment'] == 'pic'], color=TLR_genes, size=50)
#sc.pl.dotplot(adata, TLR_genes, groupby='treatment', standard_scale='var')

#### ISG genes in adata

In [None]:
# Load the ISG gene table, using the first CSV column as the index, and
# order the rows by that index in a single chained expression.
df_isg_genes = pd.read_csv(
    '/Code/data/single_cell/animals/chicken/ca1/new_genome/ISG_genes_ca1_new_genome.csv',
    index_col=0,
).sort_index()

# Display the sorted table.
df_isg_genes

In [None]:
# Index labels of the ISG table as a plain list, then display it.
isg_genes_list = df_isg_genes.index.tolist()
isg_genes_list

##### UMAPs of the ISG genes

In [None]:
# One UMAP panel per ISG gene.
# `adata.obs['cell_type']` is deliberately left as assigned in the
# "All clusters after annotations" cell: re-mapping it here with a dictionary
# that covers only clusters 0-6 would overwrite that annotation, leave
# clusters 7-10 unlabelled (NaN) and change which cells the T-cell
# subclustering below selects. This plot does not use `cell_type`.
sc.pl.umap(adata, color=isg_genes_list, size=50)

In [None]:
# TLR expression on the full UMAP; reuses the TLR_genes list defined in the
# "TLR genes in adata by treatment" section so the two plots cannot drift apart.
sc.pl.umap(adata, color=TLR_genes, size=50)

##### Dotplot of the ISG genes expressed in chicken

In [None]:
# Dotplot of the ISG genes grouped by treatment, each gene scaled independently.
sc.pl.dotplot(adata, isg_genes_list, groupby='treatment', standard_scale='var')

### Subclustering

#### T cells

In [None]:
# Subclustering the T_cells cluster I found. There are two options to do the same thing:
# (1) select cells by their `cell_type` label (used here), or
# (2) select them by Leiden cluster id (commented-out alternative below).

# .copy() makes the subset an independent object rather than a view of `adata`.
subset_adata_t = adata[adata.obs.cell_type == 'T_cells'].copy()
subset_adata_t

# NOTE(review): for the two options to be equivalent, the cluster ids below
# must be exactly the clusters labelled 'T_cells' in `cell_type` — confirm.
# t_cluster = adata[adata.obs.leiden.isin(['1','4'])]

# bbknn.bbknn(t_cluster, batch_key='treatment', neighbors_within_batch=2, metric='euclidean', n_pcs=30, trim=None)
# sc.tl.umap(t_cluster)
# sc.tl.leiden(t_cluster, resolution=0.4)
# sc.pl.umap(t_cluster, color =['leiden'])

In [None]:
# Rebuild the batch-balanced graph on the T-cell subset, then re-embed and
# re-cluster it so that `leiden` now holds the T-cell sub-clusters.
bbknn.bbknn(subset_adata_t, batch_key='treatment', neighbors_within_batch=2, metric='euclidean', n_pcs=30, trim=None)
sc.tl.umap(subset_adata_t)
sc.tl.leiden(subset_adata_t, resolution=0.4)
# One UMAP panel per marker plus the sub-clusters. Each marker is listed once
# so that no panel is drawn twice.
sc.pl.umap(subset_adata_t, color=['CD3E','CD3D','CD28','TCF7','IL7R','CD8B','CD82','CD226',
                                  'CD247','CD4','CD7_3','CD38','STMN1','LEF1','RORA','GPR171','CCR7',
                                  'CXCR4','GPR183_1','GPR183_2','ALOX5AP','HOPX',
                                  'CD8A_2','leiden'], size=30)

In [None]:
# Rank genes for each Leiden sub-cluster of the T-cell subset (Wilcoxon) and
# plot the 25 top-ranked genes per sub-cluster, each on its own y-axis.
sc.tl.rank_genes_groups(subset_adata_t, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(subset_adata_t, n_genes=25, sharey=False)
# Do not truncate columns when pandas displays a table.
pd.set_option('display.max_columns', None)
# Export the first 100 ranked gene names per sub-cluster to CSV.
# NOTE(review): the output directory is under chicken/ca1 while the input data
# was read from turtle/turtle_5 — confirm the intended destination.
pd.DataFrame(subset_adata_t.uns['rank_genes_groups']['names']).head(100).to_csv(r'/Code/data/single_cell/animals/chicken/ca1/new_genome/''subset_adata_rank_genes_t_cells_top100.csv', index=False)

In [None]:
# CD28 on the T-cell UMAP for the 'control' and 'pic' treatment subsets.
sc.pl.umap(subset_adata_t[subset_adata_t.obs['treatment'] == 'control'], color='CD28', size=30)
# The dotplot uses the whole T-cell subset grouped by treatment, so it is the
# same figure regardless of the treatment shown above/below; draw it once.
sc.pl.dotplot(subset_adata_t, 'CD28', groupby='treatment', standard_scale='var')

sc.pl.umap(subset_adata_t[subset_adata_t.obs['treatment'] == 'pic'], color='CD28', size=30)

In [None]:
# Compute and draw the dendrogram of the T-cell Leiden sub-clusters.
sc.tl.dendrogram(subset_adata_t, groupby='leiden')
sc.pl.dendrogram(subset_adata_t, groupby='leiden')

In [None]:
# Count the observations in each T-cell Leiden sub-cluster.
subset_adata_t.obs['leiden'].value_counts()

In [None]:
# The Leiden sub-clusters computed earlier are reused as-is: re-running the
# clustering here would repeat work already done and risks the labels
# drifting away from the ranked-gene table stored in
# subset_adata_t.uns['rank_genes_groups'].

# First 100 ranked gene names for sub-cluster '0'.
clusgene = list(pd.DataFrame(subset_adata_t.uns['rank_genes_groups']['names']).head(100)['0'])
# Dotplot of the first 50 of those genes, skipping names that start with 'RP'
# (presumably ribosomal-protein genes — confirm).
sc.pl.dotplot(subset_adata_t, [gene for gene in clusgene[:50] if not gene.startswith('RP')], groupby='leiden', standard_scale=None)
# Dotplot of the T-cell marker panel across sub-clusters, each gene scaled independently.
sc.pl.dotplot(subset_adata_t, ['CD3D','CD3E','CD28','CD226','CD247','CD4',
                               'CD7_3','CD38','STMN1','LEF1','RORA','GPR171',
                               'CCR7','CXCR4','TCF7','GPR183_1','GPR183_2',
                               'ALOX5AP','IL7R','HOPX'], groupby='leiden', standard_scale='var')
sc.pl.umap(subset_adata_t, color='leiden')    # For visualization only

In [None]:
# sc.pl.rank_genes_groups_dotplot(subset_adata, n_genes=5, standard_scale='var')

# The prefix lookup is defined in this cell because the notebook's
# `find_gene` helper is only defined in a later section, so calling it here
# fails with NameError when the notebook is run from top to bottom.
def _genes_with_prefix(prefix):
    """Return the names in ``adata.raw.var`` that start with ``prefix`` (upper-cased)."""
    raw_gene_names = adata.raw.var.index
    return list(raw_gene_names[raw_gene_names.str.startswith(prefix.upper())])


# Every gene whose name starts with one of these prefixes gets its own panel,
# in prefix order.
marker_prefixes = ['CD4', 'CD8', 'NKG7', 'CD226', 'IL2RA', 'CD247', 'TCF7']
marker_genes = [gene for prefix in marker_prefixes for gene in _genes_with_prefix(prefix)]
sc.pl.umap(subset_adata_t, color=marker_genes, size=100)
sc.pl.umap(subset_adata_t, color='treatment')
sc.pl.umap(subset_adata_t, color='leiden')    # For visualization only

In [None]:
# Provisional T-cell sub-type annotation: sub-clusters 2 and 5 are labelled
# T_memory; the others keep their sub-cluster id as their label for now.
t_cell_type = {'0':'0','1':'1','2':'T_memory','3':'3','4':'4','5':'T_memory','6':'6'}
# Map with the sub-cluster dictionary defined above, not the whole-dataset
# `celltype` dictionary, whose keys refer to a different clustering.
subset_adata_t.obs['cell_type'] = subset_adata_t.obs.leiden.map(t_cell_type).astype('category')
sc.pl.umap(subset_adata_t, color='cell_type', size=30)

## Gene search tools
This function returns the genes present in `adata.raw` whose names start with the given text (for example a marker-name prefix); the input is upper-cased before matching.

In [None]:
def find_gene(gene):
    """Return the gene names in ``adata.raw`` that start with ``gene``.

    The query is upper-cased before matching, so ``'tlr'`` and ``'TLR'``
    give the same result. Reads the notebook-level ``adata`` object.
    """
    prefix = gene.upper()
    raw_gene_names = adata.raw.var.index
    is_match = raw_gene_names.str.startswith(prefix)
    return list(raw_gene_names[is_match])


# Example: list every gene whose name starts with 'TLR'.
find_gene('tlr')

In [None]:
# adata.var[adata.var_names == 'GZMK']
# Show the adata.var row(s) whose 'gene_ids' column equals this identifier.
adata.var[adata.var['gene_ids'] == 'ENSGALG00010003777']