# Tile Detection

As we don't completely know which tiles belong to which Nova-ST chips, we need to detect this using the data itself.

We do this by checking a small subset of the reads from the spatial data against the subset files we saved from the HDMI analysis earlier.

In [None]:
import pickle
from collections import defaultdict

import pysam
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
tiles_dir = 'HDMI_Tiles_Data'
subsets = defaultdict(dict)

# One subset file exists per (lane, surface, swath) combination:
# lanes 1-4, surfaces 1-2 and swaths 1-6.
swath_keys = [
    f'{lane}_{surface}{swath}'
    for lane in range(1, 5)
    for surface in range(1, 3)
    for swath in range(1, 7)
]

for swath_key in swath_keys:
    subset_file = f'{tiles_dir}/{swath_key}_subset_barcodes.pickle'
    with open(subset_file, 'rb') as fh:
        subsets[swath_key] = pickle.load(fh)

In [None]:
data_dir = "" # Directory holding the demultiplexed spatial libraries; the "<sample>_R1_001.fastq.gz" files are read from here

In [None]:
samples = [
    # Put the names of the samples here. Each name should match the demultiplexing sheet;
    # "_R1_001.fastq.gz" is appended to it later to build the FASTQ file name.
]

In [None]:
# Maps each sample name to the set of distinct barcode sequences read from its R1 FASTQ file
samples_data = defaultdict(set)

We load the barcodes (the first 32 bases of read 1) from the first 100k reads of each sample, to profile this small subset of reads.

In [None]:
for sample in samples:
    print(f'On {sample}:')
    fastq_path = f'{data_dir}/{sample}_R1_001.fastq.gz'
    with pysam.FastqFile(fastq_path, persist=False) as fh:
        for en, entry in enumerate(fh):
            # Only the first 32 bases of the read are kept as the barcode
            samples_data[sample].add(entry.sequence[:32])

            reads_done = en + 1
            if reads_done % 100000 != 0:
                continue

            # Stop at the first multiple of 100000 reads
            print(f'    Finished {reads_done} reads')
            break


Plot the barcode pattern from the spatial library and ensure it looks correct

In [None]:
for sample in samples:
    barcodes = samples_data[sample]
    if not barcodes:
        # Nothing was read for this sample, so there is nothing to plot
        print(f'No barcodes loaded for {sample}, skipping plot')
        continue

    # base_pairs[position][base] -> number of barcodes with `base` at `position`
    base_pairs = defaultdict(lambda: defaultdict(int))
    for seq in barcodes:
        for n, base in enumerate(seq):
            base_pairs[n][base] += 1

    # Rows are bases, columns are positions; a base never seen at a position counts as 0
    bp_df = pd.DataFrame(base_pairs, index=['A', 'C', 'G', 'T', 'N']).fillna(0)

    # Ambiguous calls are left out of the per-position fractions
    bp_df = bp_df.drop('N')
    # Show positions as 1-based on the x axis
    bp_df.columns = [int(x) + 1 for x in bp_df.columns]

    # Fraction of each base per position, one line per base
    (bp_df / bp_df.sum()).T.plot(kind='line', figsize=(18, 12))
    plt.grid(True, which='major')
    plt.title(sample)


In [None]:
# Helper functions to convert tiles to plotted columns and back again

def get_x(lane, surface, swath):
    """Return the 1-based plot column for a (lane, surface, swath) triple.

    Every lane occupies 14 columns and every surface 7 of them, so a
    surface with 6 swaths is followed by one spare column that acts as
    a gap before the next surface.
    """
    lane_offset = 14 * (lane - 1)
    surface_offset = 7 * (surface - 1)
    return lane_offset + surface_offset + swath

def reverse_get_x(x):
    """Return the (lane, surface, swath) triple for a 1-based plot column.

    Examples:
        1  -> (1, 1, 1)
        2  -> (1, 1, 2)
        8  -> (1, 2, 1)
        44 -> (4, 1, 2)

    The spare gap column after each surface is not special-cased: it
    comes back with swath 7, e.g. 7 -> (1, 1, 7).
    """
    # 14 columns per lane, 7 columns per surface within a lane
    lane_idx, within_lane = divmod(x - 1, 14)
    surface_idx, swath_idx = divmod(within_lane, 7)
    return lane_idx + 1, surface_idx + 1, swath_idx + 1


Here, we count how many times we see a barcode from each tile (subset) in the reads we took from the library

In [None]:
lanes = 4
surfaces = 2
swaths = 6
tiles = 78

# Per sample: a (tile x plot column) table holding the number of barcodes
# shared between the sampled reads and each tile's barcode subset.
# Cells with no subset data (including the gap columns) stay NaN.
counts_dfs = {}
for s, data in samples_data.items():
    df = pd.DataFrame(
        data=np.nan,
        # The last swath of the last surface of the last lane has the largest
        # column number, so every column written below already exists.
        columns=list(range(get_x(lanes, surfaces, swaths))),
        index=list(range(1, tiles + 1)),
    )
    for lane in range(1, lanes + 1):
        for surface in range(1, surfaces + 1):
            for swath in range(1, swaths + 1):
                # The column depends only on the swath, not on the tile
                x = get_x(lane, surface, swath) - 1
                for tile, tile_bcs in subsets[f"{lane}_{surface}{swath}"].items():
                    # Number of distinct barcodes seen both on this tile and in the sample
                    df.loc[int(tile), x] = len(tile_bcs & data)

    counts_dfs[s] = df

Highlight every tile where one or more reads were identified, to visually inspect where the tissue is

In [None]:
# squeeze=False keeps `axs` a 2-D array even when there is only one sample,
# so it can always be iterated
fig, axs = plt.subplots(
    1, len(samples), figsize=((20 * len(samples)), 78), facecolor='white', squeeze=False
)

# Draw cells without data (NaN) in black. The colormap is passed to imshow
# explicitly because get_cmap may hand back a copy, in which case changing it
# would not affect the default colormap used by imshow.
current_cmap = plt.get_cmap()
current_cmap.set_bad(color='black')

for ax, (sample, df) in zip(axs.ravel(), counts_dfs.items()):
    # vmax=1 saturates the colour scale, so any tile with at least one hit is highlighted
    ax.imshow(df, vmax=1, vmin=0, cmap=current_cmap)
    ax.set_title(sample)

In [None]:
def extract_whitelist(start, end, path, lane=1, surface=1, swaths=(1, 2, 3, 4, 5, 6), trim_to_31=True, sample=None):
    """Write the barcodes of a range of tiles to a whitelist file.

    For every tile number from ``start`` to ``end`` (inclusive) and every
    swath in ``swaths``, the pickled barcodes of that tile are loaded from
    ``tiles_dir`` and their keys are written one per line.

    Args:
        start: First tile number to include.
        end: Last tile number to include.
        path: Output directory; it is created if it does not exist.
        lane: Lane used in the tile file names and the output file name.
        surface: Surface used in the tile file names and the output file name.
        swaths: Swaths to include for every tile number.
        trim_to_31: If true, write only the first 31 characters of each
            barcode and add ``_31bp`` to the output file name.
        sample: Optional sample name added to the output file name.

    Returns:
        The path of the whitelist file that was written.
    """
    import os  # local import: only needed to create the output directory

    final_path = f"{path}/whitelist"
    if sample:
        final_path += f"_{sample}"
    final_path += f"_{lane}_{surface}_tiles_{start}_to_{end}"
    if trim_to_31:
        final_path += "_31bp"
    final_path += ".tsv"

    if path:
        os.makedirs(path, exist_ok=True)

    with open(final_path, "w") as whitelist_fh:
        for tile_y in range(start, end + 1):
            for swath in swaths:
                # NOTE(review): tile_y is not zero-padded here; confirm this matches
                # the file names written by the HDMI analysis.
                tile = f"{surface}{swath}{tile_y}"
                with open(f"{tiles_dir}/{lane}_{tile}_barcodes.pickle", "rb") as tile_fh:
                    tile_data = pickle.load(tile_fh)

                barcodes = tile_data.keys()
                if trim_to_31:
                    barcodes = (bc[:31] for bc in barcodes)
                whitelist_fh.writelines(f"{bc}\n" for bc in barcodes)

    return final_path

Detect the surface for each sample

In [None]:
# For every sample, pick the (lane, surface) pair whose columns hold the most barcode hits
sample_surface_detections = {}
for sample, df in counts_dfs.items():
    surface_counts = defaultdict(int)
    column_totals = df.sum()
    for ix, val in column_totals.items():
        # Columns are stored 0-based, reverse_get_x expects 1-based
        lane, surface, swath = reverse_get_x(ix + 1)
        surface_key = f"{lane}_{surface}"
        surface_counts[surface_key] += val

    best_key = max(surface_counts, key=surface_counts.get)
    max_lane, max_surface = best_key.split("_")[0:2]
    sample_surface_detections[sample] = (max_lane, max_surface)

Identify the exact tiles, then create and save the whitelist for each sample

In [None]:
path = "output"

# Minimum number of matching barcodes for a tile to count as covered
THRESHOLD = 2
# Number of extra tiles kept on either side of the detected region
MARGIN = 3

for sample, (lane, surface) in sample_surface_detections.items():
    counts = counts_dfs[sample]

    # The six swath columns of the detected surface
    x = get_x(int(lane), int(surface), 1) - 1
    surface_counts = counts.loc[:, x : x + 5]

    # Tiles without data are NaN, and NaN compares as "not equal to 0";
    # treat them as zero hits so they cannot be picked as an edge tile
    selected_counts = surface_counts.fillna(0)
    selected_counts = selected_counts.loc[:, (selected_counts != 0).any(axis=0)]
    selected_counts = selected_counts.mask(selected_counts < THRESHOLD, 0)
    if selected_counts.empty:
        print(f"{sample} - {lane}_{surface} - no tiles with hits, skipping")
        continue

    # Detect edge tiles: median over the swaths of the first / last tile with hits
    first = int(selected_counts.ne(0).idxmax().median()) - MARGIN
    last = int(selected_counts.iloc[::-1].ne(0).idxmax().median()) + MARGIN
    # Keep the margin inside the tile range of the table, otherwise
    # extract_whitelist would look for tiles that do not exist
    first = max(first, int(counts.index.min()))
    last = min(last, int(counts.index.max()))

    print(f"{sample} - {lane}_{surface} - {first} to {last}")
    plt.figure()
    cumsum = surface_counts.sum(axis=1).cumsum()
    plt.plot(cumsum)
    plt.title(f"{sample} - {lane}_{surface} - {first} to {last}")
    plt.vlines(first, 0, cumsum.max(), colors="red")
    plt.vlines(last, 0, cumsum.max(), colors="red")

    extract_whitelist(first, last, path, lane=lane, surface=surface, trim_to_31=True, sample=sample)