In [None]:
import scanpy as sc
from pathlib import Path
import requests
import os
from tqdm import tqdm
import anndata as ad
import tarfile


In [None]:
# helper functions that wrap text in ANSI color codes for colored terminal output
def red(text: str) -> str:
    """Return ``text`` wrapped in the ANSI codes for red, followed by a reset."""
    return "\033[31m{}\033[0m".format(text)


def cyan(text: str) -> str:
    """Return ``text`` wrapped in the ANSI codes for cyan, followed by a reset."""
    return "\033[36m{}\033[0m".format(text)


def green(text: str) -> str:
    """Return ``text`` wrapped in the ANSI codes for green, followed by a reset."""
    return "\033[32m{}\033[0m".format(text)


def yellow(text: str) -> str:
    """Return ``text`` wrapped in the ANSI codes for yellow, followed by a reset."""
    return "\033[33m{}\033[0m".format(text)

In [None]:
# Quick visual check of the color helpers: the same greeting once per color.
print(*(paint("Hello World!") for paint in (red, cyan, green, yellow)))

## 1. Downloading the Dataset and Loading the Data

__Note:__
Each sample name encodes the developmental stage in weeks and an internal ID.
For example, `week8_001` was collected at week 8 of development and has the internal ID 001.
Some developmental stages have replicates.

Kameneva P, Artemov AV, Kastriti ME, Faure L et al. 
Single-cell transcriptomics of human embryos identifies multiple sympathoblast 
lineages with potential implications for neuroblastoma origin. 
Nat Genet 2021 May;53(5):694-706. 

Main link for the dataset: 
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE147821

In [None]:
# Direct download link for the GEO series GSE147821 (the accession is passed
# in the `acc` query parameter).
dataset_url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE147821&format=file"

In [None]:
# Folder for processed outputs; created (including parents) if it is missing.
processed_dir = Path("data") / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)
# Path of the concatenated .h5ad file inside that folder.
concat_file = processed_dir.joinpath("concatenated.h5ad")

In [None]:
# Map each sample identifier (GSM accession) to its sample label.
# Labels have the form "week<stage>_<internal id>", optionally followed by a
# further "_<suffix>" (e.g. "week9_031_paraganglia").
files = {
    "GSM4446535": "week8_001",
    "GSM4446536": "week9_063",
    "GSM4446537": "week6_088",
    "GSM4446538": "week14_123",
    "GSM4446539": "week12_124",
    "GSM4446540": "week8_125",
    "GSM4446541": "week9_005",
    "GSM4446542": "week11_006",
    "GSM4446543": "week9_007",
    "GSM4734601": "week8_016",
    "GSM4734602": "week9_031_paraganglia",
    "GSM4734603": "week12_035",
    "GSM4734604": "week12_036_extraadrenal",
}

In [None]:
# Display the mapping as a quick visual check.
files

In [None]:
def get_dataset(
    url: str, name: str, folder: str = "data/raw", timeout: float = 60.0
) -> None:
    """
    Download the Dataset from the URL into ``folder / name``.

    The download is skipped when a file of the same size as the one
    advertised by the server (``Content-Length`` header) is already present.

    Parameters
    ----------
    url : str
        URL of the Dataset.
    name : str
        Name of the Dataset to save.
    folder : str, optional
        Folder to save the Dataset, by default "data/raw"
    timeout : float, optional
        Seconds to wait for the server to connect or to send data before
        giving up, by default 60.0

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    ref_dir = Path(folder)
    ref_dir.mkdir(exist_ok=True, parents=True)
    output_path = ref_dir / name

    # Using the response as a context manager releases the connection even
    # when the download is skipped or fails half-way.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTML error page as the dataset.
        response.raise_for_status()
        # 0 means the server did not advertise a size.
        total_size = int(response.headers.get("content-length", 0))

        # Check if the file already exists; an unknown size cannot confirm
        # that an existing file is complete, so it never triggers the skip.
        if (
            total_size
            and output_path.exists()
            and output_path.stat().st_size == total_size
        ):
            print(f"{output_path} already exists, skipping downloading...")
            return

        # Download the file
        with (
            output_path.open("wb") as f,
            tqdm(  # progress bar
                desc="Downloading",
                total=total_size or None,  # None -> size unknown to tqdm
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                colour="green",
            ) as bar,
        ):
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    bar.update(len(chunk))

### Fetch the dataset

In [None]:
# Download the archive into the default folder ("data/raw") as GSE147821_RAW.tar.
get_dataset(dataset_url, name="GSE147821_RAW.tar")

### Extract the tar file

In [None]:
# Unpack the downloaded archive into data/raw/GSE147821_RAW.
with tarfile.open("data/raw/GSE147821_RAW.tar", "r") as tar:
    # The archive comes from the network. When this Python version offers
    # extraction filters, use the "data" filter so that members which would
    # end up outside the target folder (absolute paths, "..", links) and
    # special files are refused.
    if hasattr(tarfile, "data_filter"):
        tar.extractall("data/raw/GSE147821_RAW", filter="data")
    else:
        tar.extractall("data/raw/GSE147821_RAW")

In [None]:
# List the contents of the extraction folder.
os.listdir("data/raw/GSE147821_RAW")

In [None]:
def concat_h5_files(
    files: dict, raw_dir="data/raw/GSE147821_RAW", out_dir="data/processed"
) -> "ad.AnnData":
    """
    Load every sample listed in ``files`` and concatenate them.

    The result is cached as ``<out_dir>/concatenated.h5ad``. When that file
    already exists it is read back and the raw files are not touched.

    Parameters
    ----------
    files : dict
        Maps a file-name prefix (the sample identifier) to a sample label of
        the form ``"week<stage>_<id>"``, optionally with a further
        ``"_<suffix>"``.
    raw_dir : str, optional
        Folder containing the 10x ``.h5`` files, by default
        "data/raw/GSE147821_RAW"
    out_dir : str, optional
        Folder for the concatenated file, by default "data/processed"

    Returns
    -------
    anndata.AnnData
        The concatenated samples (outer join on the variables).

    Raises
    ------
    FileNotFoundError
        If ``raw_dir`` holds no file starting with one of the keys of
        ``files``.
    """
    concat_file = Path(out_dir) / "concatenated.h5ad"
    if concat_file.exists():
        return ad.read_h5ad(concat_file)

    # List the folder once; sorting makes the choice deterministic when
    # several files share the same prefix.
    raw_files = sorted(os.listdir(raw_dir))

    samples = []
    sample_ids = []
    for accession, info in files.items():
        # find the file belonging to this sample
        matches = [fname for fname in raw_files if fname.startswith(accession)]
        if not matches:
            raise FileNotFoundError(
                f"No file starting with '{accession}' found in {raw_dir}"
            )
        h5_file = matches[0]

        # extract the information from the label
        label_parts = info.split("_")
        week = label_parts[0].split("week")[1]  # week number, kept as a string
        sample_name = label_parts[1]  # internal sample ID

        # assign the full path
        full_path = Path(raw_dir) / h5_file

        # read the file
        sample = sc.read_10x_h5(full_path)
        sample.var_names_make_unique()
        # prefix the barcodes with the label so they stay unique across samples
        sample.obs_names = [f"{info}_{cell}" for cell in sample.obs_names]

        # Add metadata
        sample.obs["sample_id"] = sample_name
        sample.obs["week"] = week
        sample.obs["filename"] = h5_file

        samples.append(sample)
        sample_ids.append(sample_name)

    # concatenate the samples; the "sample" column records each cell's origin
    adt = ad.concat(
        samples,
        join="outer",
        label="sample",
        keys=sample_ids,
    )

    # save the file (create the output folder first if it is missing)
    concat_file.parent.mkdir(parents=True, exist_ok=True)
    adt.write_h5ad(concat_file)

    return adt

In [None]:
# Build the concatenated object, or load it from the cached .h5ad file if present.
adt = concat_h5_files(files=files)

In [None]:
# Largest value stored in the expression matrix.
adt.X.max()

## 2. Performing data normalization (consider cell cycle correction), quality control, and data cleaning.

### Quality control

In [None]:
# Flag gene groups by name pattern; the flags are then passed as `qc_vars` below.
# mitochondrial genes
adt.var["mt"] = adt.var_names.str.startswith("MT-")
# ribosomal genes
adt.var["ribo"] = adt.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes: names starting with "HB" whose next character is not "(", "P" or ")"
adt.var["hb"] = adt.var_names.str.contains("^HB[^(P)]")

# compute the QC metrics in place, including metrics for each flagged gene group
sc.pp.calculate_qc_metrics(adt, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)

In [None]:
# Show the object summary after the QC metrics were added.
adt

In [None]:
# Plot before filtering: one violin panel per QC metric
# (genes by counts, total counts, percentage of mitochondrial counts).
sc.pl.violin(
    adt,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.4,
    multi_panel=True,
)


## 3. Generate the UMAP and perform cluster annotation.

Take note of the clusters in Figure 1b of <https://www.nature.com/articles/s41588-021-00818-x> for marker genes.

## 4. Visualize the data on the dot plot showing the 5 top differentially expressed genes per cluster

## 5. Sub-select adrenal medulla clusters (Schwann cell precursors (SCPs), chromaffin cells, sympathoblasts) and re-cluster them to improve the resolution of transitions.

Similar to the re-clustering in Figure 2 of <https://www.nature.com/articles/s41588-021-00818-x>.
Note that not all datasets need to be included in the re-clustering.
If you choose to exclude some, explain why that may be necessary.

For the last task, you can choose between two alternatives:
 
6a. Perform the trajectory analysis between SCPs and chromaffin cells, SCPs and 
sympathoblasts, and between chromaffin cells and sympathoblasts.  
Plot the important gene changes along the trajectories on heatmaps. 
You can use any trajectory analysis tool. Please explain your choice. 
 
6b. Using the SCENIC tool, analyse the regulons in SCPs, chromaffin cells, and 
sympathoblasts.  
Visualize important regulons whose activity spans the transitions between the 
clusters. Compare the regulon's expression to the expression of the 
corresponding transcription factors.

### 6a. Perform the trajectory analysis between SCPs and chromaffin cells, SCPs and sympathoblasts, and between chromaffin cells and sympathoblasts. Plot the important gene changes along the trajectories on heatmaps. You can use any trajectory analysis tool. Please explain your choice.

## 7. Generate the report:
7a. Please provide annotated code, documentation, results, and plots organized
in a PDF file.  
7b. Please provide short explanations of why you chose specific methods of
analysis.  
7c. Please provide a short, plausible interpretation of the results of task 6.  
7d. Please provide a small abstract of the overall results of the analysis (max 150
words).  