In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import sys
from joblib import Parallel, delayed
# --- Scanpy session settings ---
# Verbosity levels: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# --- Load the 10x matrix from the "datasets" directory ---
# Genes are indexed by symbol; cache=True asks scanpy to cache the parsed matrix.
adata = sc.read_10x_mtx("datasets", var_names="gene_symbols", cache=True)
# Make var_names unique in case the same gene symbol appears more than once.
adata.var_names_make_unique()
# Attach a per-cell "patient" label to adata.obs using the metadata table.
meta = pd.read_csv("datasets/metadata.csv.gz")
# Positional rename: assumes the file has exactly these four columns, in this
# order -- TODO confirm against the metadata file.
meta.columns = ["cell_id", "tissue", "patient", "cell-type"]
# Every cell id listed in the metadata (name kept as-is; later cells may use it).
patient_list = meta["cell_id"].values.tolist()

# Cell ids belonging to each of the two patients of interest.
patient_14_df = meta[meta["patient"] == "PA14"]
patient_14_transcripts = patient_14_df["cell_id"].values.tolist()
patient_04_df = meta[meta["patient"] == "PA04"]
patient_04_transcripts = patient_04_df["cell_id"].values.tolist()

# Default label for every cell; cells of PA04 / PA14 are overwritten below.
adata.obs["patient"] = "ignore"
# Label via a boolean mask over obs_names rather than a list of labels:
# .loc[list_of_ids, ...] raises KeyError as soon as the metadata contains a
# cell id that is absent from the count matrix, whereas the mask simply
# skips such ids. Assignment order (PA04, then PA14) is preserved.
for patient_id, cell_ids in (
    ("PA04", patient_04_transcripts),
    ("PA14", patient_14_transcripts),
):
    in_matrix = adata.obs_names.isin(cell_ids)
    adata.obs.loc[in_matrix, "patient"] = patient_id

In [None]:
# Restrict the dataset to cells labelled PA04 or PA14; copy so that adata2 is
# an independent object rather than a view of adata.
patients_of_interest = ["PA04", "PA14"]
is_selected = adata.obs["patient"].isin(patients_of_interest)
adata2 = adata[is_selected].copy()
print(adata2.obs)

In [None]:
# Basic QC: drop low-complexity cells and rarely detected genes, then compute
# per-cell QC metrics for mitochondrial, ribosomal and hemoglobin genes.
# Work on a copy so adata2 is left untouched and this cell can be re-run.
adata3 = adata2.copy()
sc.pp.filter_cells(adata3, min_genes=200)  # keep cells with >= 200 detected genes
sc.pp.filter_genes(adata3, min_cells=3)    # keep genes detected in >= 3 cells

# Flag gene families by gene-symbol prefix.
adata3.var['mt'] = adata3.var_names.str.startswith('MT-')
adata3.var['ribo'] = adata3.var_names.str.startswith(("RPS", "RPL"))
# The hemoglobin pattern is a regular expression, so it must go through
# str.contains; str.startswith compares against the literal text
# "^HB[^(P)]" and would flag no gene at all.
adata3.var['hb'] = adata3.var_names.str.contains("^HB[^(P)]")

# inplace=True stores the metrics (e.g. pct_counts_mt, pct_counts_ribo,
# pct_counts_hb) in adata3.obs / adata3.var. Without it the function only
# returns DataFrames and the threshold filter that follows fails with KeyError.
sc.pp.calculate_qc_metrics(adata3, qc_vars=["mt", "ribo", "hb"], inplace=True)

In [None]:
# Keep only cells below every QC ceiling:
#   mitochondrial < 10 %, ribosomal < 20 %, hemoglobin < 5 % of counts.
cell_qc = adata3.obs
passes_qc = (
    (cell_qc['pct_counts_mt'] < 10)
    & (cell_qc['pct_counts_ribo'] < 20)
    & (cell_qc['pct_counts_hb'] < 5)
)
adata4 = adata3[passes_qc, :].copy()

In [None]:
# Violin plots of the three QC percentages for the cells kept in adata4.
qc_metrics = ['pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb']
sc.pl.violin(adata4, qc_metrics)

In [None]:
# Doublet detection with Scrublet (flagged cells are removed in the next cell).
# Run on a copy so adata4 stays unchanged; the next cell reads the resulting
# adata5.obs["predicted_doublet"] column.
# NOTE(review): cells from both patients are scored together here. If PA04 and
# PA14 were captured as separate samples, per-sample scoring via scrublet's
# batch_key argument may be more appropriate -- confirm with the data source.
adata5 = adata4.copy()
sc.pp.scrublet(adata5)

In [None]:
# Show the per-cell doublet calls, then keep only the cells whose call is False.
doublet_calls = adata5.obs["predicted_doublet"]
print(doublet_calls)
adata6 = adata5[doublet_calls.eq(False)].copy()

In [None]:
# Stash the current (un-normalized) matrix in a "counts" layer before X is
# overwritten by the two in-place transforms below.
raw_counts = adata6.X.copy()
adata6.layers["counts"] = raw_counts
# target_sum=None (the default) scales every cell to the median total count.
sc.pp.normalize_total(adata6, target_sum=None)
# Natural-log transform of X: log(1 + x).
sc.pp.log1p(adata6)

In [None]:
# CellTypist annotation by cell type