In [None]:
import scanpy as sc
from pathlib import Path
import requests
import os
from tqdm import tqdm
import tarfile


In [19]:
# Helper functions that wrap text in ANSI escape codes for colored output
def red(text: str) -> str:
    """Return ``text`` wrapped in the ANSI escape codes ``\\033[31m`` ... ``\\033[0m``."""
    ansi_start, ansi_reset = "\033[31m", "\033[0m"
    return f"{ansi_start}{text}{ansi_reset}"

def cyan(text: str) -> str:
    """Return ``text`` wrapped in the ANSI escape codes ``\\033[36m`` ... ``\\033[0m``."""
    ansi_start, ansi_reset = "\033[36m", "\033[0m"
    return f"{ansi_start}{text}{ansi_reset}"

def green(text: str) -> str:
    """Return ``text`` wrapped in the ANSI escape codes ``\\033[32m`` ... ``\\033[0m``."""
    ansi_start, ansi_reset = "\033[32m", "\033[0m"
    return f"{ansi_start}{text}{ansi_reset}"

def yellow(text: str) -> str:
    """Return ``text`` wrapped in the ANSI escape codes ``\\033[33m`` ... ``\\033[0m``."""
    ansi_start, ansi_reset = "\033[33m", "\033[0m"
    return f"{ansi_start}{text}{ansi_reset}"

In [20]:
# Smoke test: print the same sample text once per color helper, space-separated.
print(*(paint("Hello World!") for paint in (red, cyan, green, yellow)))

[31mHello World![0m [36mHello World![0m [32mHello World![0m [33mHello World![0m


## 1. Downloading the Dataset

__Note:__
Each sample name includes the developmental stage in weeks and an internal ID.
For example, week8_001 is a sample collected at week 8 of development with the internal ID 001.
Some developmental stages have replicates.

Kameneva P, Artemov AV, Kastriti ME, Faure L et al. 
Single-cell transcriptomics of human embryos identifies multiple sympathoblast 
lineages with potential implications for neuroblastoma origin. 
Nat Genet 2021 May;53(5):694-706. 

Main link for the dataset: 
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE147821

In [21]:
# Direct download link for the files of GEO accession GSE147821
# (the accession is passed via the ``acc`` query parameter).
dataset_url = (
    "https://www.ncbi.nlm.nih.gov/geo/download/"
    "?acc=GSE147821&format=file"
)

In [27]:
def get_dataset(url: str, name: str, folder: str = "data/raw") -> None:
    """Download the file at ``url`` to ``folder/name`` with a progress bar.

    The download is skipped when a file of the size announced by the server
    (``Content-Length`` header) already exists at the destination. Data is
    first streamed into ``<name>.part`` and only moved to the final path once
    the transfer finished, so an interrupted download never leaves a truncated
    file under the final name.

    Parameters
    ----------
    url : str
        URL of the dataset.
    name : str
        File name to store the download under (inside ``folder``).
    folder : str, default "data/raw"
        Target directory; created (including parents) if it does not exist.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    OSError
        If fewer/more bytes than announced by ``Content-Length`` were received.
    """
    ref_dir = Path(folder)
    ref_dir.mkdir(exist_ok=True, parents=True)
    output_path = ref_dir / name
    partial_path = output_path.with_name(output_path.name + ".part")

    # (connect, read) timeouts so a stalled server cannot block the notebook
    # forever; the ``with`` block releases the connection in every case.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        # Fail loudly instead of saving an HTML error page as the dataset.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))

        # Only trust the size comparison when the server actually reported a
        # size; otherwise an empty leftover file would count as "complete".
        if (
            total_size > 0
            and output_path.exists()
            and output_path.stat().st_size == total_size
        ):
            print(f"{output_path} already exists, skipping downloading...")
            return

        written = 0
        try:
            with (
                partial_path.open("wb") as f,
                tqdm(  # progress bar
                    desc="Downloading",
                    total=total_size or None,  # None -> unknown total
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    colour="green",
                ) as bar,
            ):
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
                        written += len(chunk)
                        bar.update(len(chunk))

            # Content-Length counts bytes on the wire; it is only comparable to
            # what we wrote when the body was not transparently decompressed.
            if (
                total_size
                and not response.headers.get("content-encoding")
                and written != total_size
            ):
                raise OSError(
                    f"Incomplete download of {url}: "
                    f"got {written} of {total_size} bytes"
                )

            # Same-directory rename: publish the finished file in one step.
            partial_path.replace(output_path)
        finally:
            # Remove leftovers after a failure/interrupt (no-op on success).
            partial_path.unlink(missing_ok=True)

    return

In [None]:
# Fetch the archive into the default target folder of get_dataset().
get_dataset(url=dataset_url, name="GSE147821_RAW.tar")

Downloading:  68%|[32m██████▊   [0m| 1.40G/2.06G [37:50<19:10, 623kB/s]  

In [None]:
# get_dataset() stores the archive in its default folder "data/raw", so open it
# from there (not from the working directory) and unpack it next to the archive.
# The previous target ".data/raw" was a typo that created a hidden directory.
with tarfile.open("data/raw/GSE147821_RAW.tar", "r") as tar:
    tar.extractall("data/raw")

## 2. Perform data normalization (consider cell cycle correction), quality control, and data cleaning.

## 3. Generate the UMAP and perform cluster annotation.

Take note of the clusters in Figure 1b of
https://www.nature.com/articles/s41588-021-00818-x for marker genes.

## 4. Visualize the data on the dot plot showing the 5 top differentially expressed genes per cluster

## 5. Sub-select adrenal medulla clusters (Schwann cell precursors (SCPs), chromaffin cells, sympathoblasts) and re-cluster them to improve the resolution of transitions.

Similar to the re-clustering in Figure 2 of
https://www.nature.com/articles/s41588-021-00818-x. Note that not all datasets may be included in the re-clustering.
If you choose to do so, explain why that may be necessary.

For the last task, you can choose between two alternatives 
 
6a. Perform the trajectory analysis between SCPs and chromaffin cells, SCPs and 
sympathoblasts, and between chromaffin cells and sympathoblasts.  
Plot the important gene changes along the trajectories on heatmaps. 
You can use any trajectory analysis tool. Please explain your choice. 
 
6b. Using the SCENIC tool, analyse the regulons in SCPs, chromaffin cells, and 
sympathoblasts.  
Visualize important regulons whose activity spans the transitions between the 
clusters. Compare the regulon's expression to the expression of the 
corresponding transcription factors.

### 6a. Perform the trajectory analysis between SCPs and chromaffin cells, SCPs and sympathoblasts, and between chromaffin cells and sympathoblasts. Plot the important gene changes along the trajectories on heatmaps. You can use any trajectory analysis tool. Please explain your choice.

## 7. Generate the report:
7a. Please provide annotated code, documentation, results, and plots organized
in a PDF file.  
7b. Please provide short explanations of why you chose specific methods of
analysis.  
7c. Please provide a short, plausible interpretation of the results of task 6.  
7d. Please provide a small abstract of the overall results of the analysis (max 150
words).  