In [None]:
import os
import tarfile
from pathlib import Path

import requests
from tqdm import tqdm


## 1. Downloading and extracting the dataset

__Note:__
Each sample name encodes the developmental stage in weeks followed by an internal ID.
For example, `week8_001` is a sample collected at week 8 of development, with internal ID 001.
Some developmental stages have replicates.

__Reference:__
Kameneva P, Artemov AV, Kastriti ME, Faure L et al.
Single-cell transcriptomics of human embryos identifies multiple sympathoblast
lineages with potential implications for neuroblastoma origin.
Nat Genet 2021 May;53(5):694-706.

Main link for the dataset: 
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE147821

In [None]:
# Direct download link for the GSE147821 archive (GEO download endpoint, `format=file`).
# This is the URL handed to `get_dataset` when the dataset is fetched.
dataset_url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE147821&format=file"

In [None]:
def get_dataset(url: str, name: str, folder: str = "data/raw") -> None:
    """
    Download the Dataset from the URL unless a complete copy already exists.

    The size announced by the server (``Content-Length``) is compared with the
    size of an existing local file; the download is skipped only when both
    match. Data is first written to ``<name>.part`` and renamed on success, so
    an interrupted download never leaves a truncated file under the final name.

    Parameters
    ----------
    url : str
        URL of the Dataset.
    name : str
        Name of the Dataset to save.
    folder : str, optional
        Folder to save the Dataset, by default "data/raw"

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code (4xx/5xx).

    """
    ref_dir = Path(folder)
    ref_dir.mkdir(exist_ok=True, parents=True)
    output_path = ref_dir / name
    partial_path = output_path.with_name(output_path.name + ".part")

    # stream=True fetches only the headers up front; the context manager
    # releases the connection even when the download is skipped or fails.
    with requests.get(url, stream=True) as response:
        # Fail early instead of saving an HTML error page as the dataset.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))

        # A missing Content-Length is read as 0 and cannot prove that an
        # existing file is complete, so only a positive size allows skipping.
        if (
            total_size > 0
            and output_path.exists()
            and output_path.stat().st_size == total_size
        ):
            print(f"{output_path} already exists, skipping downloading...")
            return

        # Download the file
        with (
            partial_path.open("wb") as f,
            tqdm(  # progress bar
                desc="Downloading",
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                colour="green",
            ) as bar,
        ):
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
                    bar.update(len(chunk))

    # Publish the file under its final name only after a complete download.
    partial_path.replace(output_path)

### Fetch the dataset

In [None]:
# Fetch the raw GEO archive into data/raw (the function's default target folder).
get_dataset(url=dataset_url, name="GSE147821_RAW.tar", folder="data/raw")

### Extract the tar file

In [None]:
# Unpack the downloaded archive into its own sub-folder of data/raw.
with tarfile.open("data/raw/GSE147821_RAW.tar", "r") as tar:
    # The archive comes from the network: when the interpreter supports
    # extraction filters, use the "data" filter so members with absolute paths,
    # ".." components or links pointing outside the target are rejected.
    # Older Python releases without filter support fall back to plain extraction.
    if hasattr(tarfile, "data_filter"):
        tar.extractall("data/raw/GSE147821_RAW", filter="data")
    else:
        tar.extractall("data/raw/GSE147821_RAW")