# dimenSNEon Tests

In [None]:
# Compare dimenSNEon's t-SNE against scanpy's built-in t-SNE on a sample
# dataset from 10x Genomics. Adjust the settings below to change the test.

# Upper bound on the number of datapoints kept; smaller values run faster.
NUM_DATAPOINTS = 1000

# Iteration count handed to dimenSNEon.
NUM_ITERATIONS = 1000

# Target perplexity, shared by both t-SNE runs.
PERPLEXITY = 30

# Where the test data lives, relative to this notebook.
DATADIR = "data/filtered_feature_bc_matrix"

In [None]:
# First, ensure test data is downloaded.
# The archive is fetched and unpacked only when the extracted directory is
# missing, so re-running this cell is cheap.
import os
import shutil
import tarfile
import urllib.request

DATA_URL = "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_10k_v3/pbmc_10k_v3_filtered_feature_bc_matrix.tar.gz"
FILENAME = "data/data.tar.gz"

# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs("data", exist_ok=True)
if not os.path.isdir("data/filtered_feature_bc_matrix"):
    print("Hold on, downloading sample data from 10x genomics...")

    # We have to fake the user agent to make 10x happy so we don't get a 403...
    request = urllib.request.Request(
        DATA_URL,
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    # Stream the body to disk in chunks rather than holding the whole archive
    # in memory; the `with` closes both the HTTP response and the file, even
    # if the transfer fails part-way.
    with urllib.request.urlopen(request) as response, open(FILENAME, "wb") as file:
        shutil.copyfileobj(response, file)

    print("Downloaded! Uncompressing...")
    with tarfile.open(FILENAME, "r:gz") as tar:
        # The archive comes from the network, so use the "data" extraction
        # filter (rejects absolute paths, path traversal, special files) when
        # this Python provides it; older interpreters fall back to the
        # unfiltered behaviour.
        if hasattr(tarfile, "data_filter"):
            tar.extractall("data", filter="data")
        else:
            tar.extractall("data")

    print("Done!")

In [None]:
# Import libraries and load data

# Put the parent directory on the import path so dimensneon can be imported
# straight from the source tree, without installing it first.
import sys
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

import scanpy as sc
import dimensneon.tsne as dtsne

# Read the 10x matrix directory configured in DATADIR (scanpy caching enabled).
data = sc.read_10x_mtx(DATADIR, cache=True)
print("Loaded data!")

In [None]:
# Normalize counts and get highly variable genes.
# All three calls are applied to `data` without rebinding it, so they are
# relied on to update the object in place.
# NOTE(review): normalize_per_cell appears to be deprecated in newer scanpy
# releases in favour of normalize_total -- confirm before upgrading scanpy.
sc.pp.normalize_per_cell(data, counts_per_cell_after=1e4)
sc.pp.log1p(data)
# The selection made here is read back later via data.var['highly_variable'].
sc.pp.highly_variable_genes(data, n_top_genes=100)

In [None]:
# Keep only the highly variable genes, then arbitrarily limit to the first
# NUM_DATAPOINTS datapoints (rows) in the interest of speed.
# Only the row axis is truncated: slicing the gene axis by NUM_DATAPOINTS as
# well would silently drop genes whenever NUM_DATAPOINTS is smaller than the
# number of highly variable genes.
# .copy() turns the subset into a standalone object instead of a view of
# `data`, since the calls below write their results into it.
data_var = data[:, data.var['highly_variable']][0:NUM_DATAPOINTS, :].copy()
sc.pp.neighbors(data_var) # computes neighborhood graphs. Needed to run clustering.
sc.tl.leiden(data_var) # clusters cells based on expression profiles. This is needed to color cells by cluster.

In [None]:
# Set up one dataset per t-SNE implementation: one for running with the
# builtin, one for running with dimensneon.
# Note that only data_builtin is an independent copy; data_dsne is another
# name for data_var itself, so anything written to data_dsne is also visible
# through data_var.

data_builtin = data_var.copy()
data_dsne = data_var

In [None]:
# Compute scanpy's built-in t-SNE on its own copy of the data, then plot the
# result with each point colored and labelled by its Leiden cluster.
sc.tl.tsne(data_builtin, perplexity=PERPLEXITY)
sc.pl.tsne(
    data_builtin,
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=10,
    alpha=0.8,
    size=20,
)

In [None]:
# While iterating on dimenSNEon itself, uncomment the next two lines so the
# module is reloaded each time this cell runs:
# import importlib
# importlib.reload(dtsne)

# Compute the dimenSNEon t-SNE with the configured iteration count and
# perplexity, then plot it with the same settings as the built-in run so the
# two figures can be compared side by side.
# NOTE(review): presumably dtsne.tsne stores its embedding where sc.pl.tsne
# expects to find it -- confirm against the dimensneon implementation.
dtsne.tsne(data_dsne, iterations=NUM_ITERATIONS, perplexity=PERPLEXITY)
sc.pl.tsne(
    data_dsne,
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=10,
    alpha=0.8,
    size=20,
)