# HDMI Processing

This notebook is for the initial processing of the HDMI sequencing round.

We will confirm the quality of the barcodes, extract the coordinates per barcode and save these to disk, and prepare some files for later analyses.

In [None]:
import pysam
import collections as c
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os

%matplotlib inline

Check overall nucleotide distribution of HDMIs

In [None]:
# Generated from demultiplex_per_tile.sh
# Directory containing the per-tile FASTQ files; it is listed for '*.fastq.gz'
# files below, so it must be set to a real path before running the notebook.
tiles_fastq_dir = '' # /path/to/Demultiplexed_per_tile/output

In [None]:
# os.listdir returns entries in arbitrary order; sort so that "the first tile"
# used for the barcode structure check (and the processing order) is reproducible.
tile_fastqs = sorted(x for x in os.listdir(tiles_fastq_dir) if x.endswith('.fastq.gz'))

# Barcode structure check

We look at the reads from the first tile of the flowcell and ensure that all of the barcode reads together appear to have the expected sequence (NNVNBVNNVNNVNNVNNVNNVNNVNNVNNNNN)

In [None]:
# Nested counter: base_pairs[position][base] -> occurrences, starting at 0
base_pairs = c.defaultdict(lambda: c.defaultdict(int))

In [None]:
# Grab reads from the first tile to check barcode structure.
# Tally how often each base occurs at each (0-based) read position.
# FastxFile replaces the deprecated FastqFile alias and matches extract_coords;
# persist=False is safe because each entry is only used inside the loop.
with pysam.FastxFile(f'{tiles_fastq_dir}/{tile_fastqs[0]}', persist=False) as fh:
    for entry in fh:
        for n, base in enumerate(entry.sequence):
            base_pairs[n][base] += 1

In [None]:
# Tabulate the counts: one column per read position, one row per base
bp_df = pd.DataFrame.from_dict(base_pairs, orient='columns')

In [None]:
# Rebuild from the raw counts so this cell can be re-run without shifting the
# column labels again, and tolerate tiles that contain no 'N' calls
# (a plain drop('N') raises KeyError when that row is absent).
bp_df = pd.DataFrame(base_pairs).drop(index='N', errors='ignore')
# Label read positions 1-based for plotting
bp_df.columns = [int(x) + 1 for x in bp_df.columns]

In [None]:
# Normalise each position (column) by its total count to get per-base proportions
position_totals = bp_df.sum()
base_proportions = bp_df.div(position_totals, axis='columns')

# NNVNBVNNVNNVNNVNNVNNVNNVNNVNNNNN

In [None]:
# One line per base across read positions, drawn on an explicit Axes so the
# figure carries its own title and axis labels.
fig, ax = plt.subplots(figsize=(18, 12))
base_proportions.T.plot(kind='line', ax=ax)
ax.set(
    title='Per-position base composition of first-tile reads',
    xlabel='Read position',
    ylabel='Proportion of base calls',
)
ax.grid(True, which='major');


# Coordinate Extraction

Next, we load each read and check it against the expected patterns, discarding any which do not match.

For those reads that pass filtering, we store the barcode and its x and y coordinates (local to the tile) in one dictionary per tile, and then write each dictionary to disk as a pickle file.

We also save a small subset of the barcodes (taken from the first 10000 reads of each tile) to a separate file, allowing fast matching of tiles to Nova-ST chips later.

In [None]:
# Output folder: the per-tile barcode pickles and the subset pickles are written here
tiles_dir = 'HDMI_Tiles_Data' # Path to the folder to output results

In [None]:
# Create the output folder (including any missing parents); exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
os.makedirs(tiles_dir, exist_ok=True)

In [None]:
bc_pattern = 'NNVNBVNNVNNVNNVNNVNNVNNVNNVNNNNN'

# Base that must NOT appear at a position carrying the given pattern code:
# V positions may not be T, B positions may not be A. Other codes are unconstrained.
_forbidden_base_for_code = {'V': 'T', 'B': 'A'}

# (position, forbidden base) pairs, in pattern order
filter_list = [
    (position, _forbidden_base_for_code[code])
    for position, code in enumerate(bc_pattern)
    if code in _forbidden_base_for_code
]


In [None]:
def is_read_bad(seq, filter_list, allowed_mismatches=0):
    """Return True if a read violates the expected barcode pattern.

    Parameters
    ----------
    seq : str
        Read sequence to check.
    filter_list : iterable of (int, str)
        (position, forbidden base) pairs; a mismatch is counted whenever
        ``seq[position]`` equals the forbidden base.
    allowed_mismatches : int, optional
        Number of mismatches tolerated before the read is rejected (default 0).

    Returns
    -------
    bool
        True when the mismatch count exceeds ``allowed_mismatches`` or when the
        read is too short to contain a checked position; False otherwise.
    """
    bad_count = 0
    seq_len = len(seq)
    for n, not_base in filter_list:
        # A truncated read cannot match the pattern; reject it instead of
        # raising IndexError on seq[n].
        if n >= seq_len:
            return True
        if seq[n] == not_base:
            bad_count += 1
        # Stop as soon as the tolerance is exceeded
        if bad_count > allowed_mismatches:
            return True
    return False
        

In [None]:
def extract_coords(tileinfo, subset_size = 10000):
    """Filter one tile's reads and save barcode -> (x, y) coordinates to disk.

    Parameters
    ----------
    tileinfo : tuple
        ``(fastq_dir, tiles_dir, tile_id)``. Reads are taken from
        ``{fastq_dir}/{tile_id}.fastq.gz`` and the result is pickled to
        ``{tiles_dir}/{tile_id}_barcodes.pickle``.
    subset_size : int, optional
        Only reads whose index in the file is below this value are eligible
        for the returned subset (default 10000). Reads that fail filtering
        still use up an index, so the subset may hold fewer barcodes.

    Returns
    -------
    dict
        ``tile`` (the tile id), ``data`` (path of the written pickle),
        ``subset`` (set of barcode sequences), ``good_barcodes`` and
        ``bad_barcodes`` (counts of reads that passed / failed filtering).
    """
    fastq_dir, tiles_dir, tile_id = tileinfo
    total_reads = 0
    total_bad = 0
    tile_data = {}
    subset = set()
    with pysam.FastxFile(f"{fastq_dir}/{tile_id}.fastq.gz", persist=False) as fh:
        for en, entry in enumerate(fh):
            # Count reads explicitly: the loop index is one less than the
            # number of reads and is undefined for an empty file.
            total_reads += 1
            seq = entry.sequence
            # Don't save barcodes which don't match the expected pattern
            if is_read_bad(seq, filter_list, allowed_mismatches=0):
                total_bad += 1
                continue
            # The last three ':'-separated fields of the read name are taken
            # as (ignored), x, y; coordinates are kept as strings.
            _, x, y = entry.name.split(":")[-3:]
            tile_data[seq] = (x, y)
            if en < subset_size:
                subset.add(seq)
    pickle_path = f"{tiles_dir}/{tile_id}_barcodes.pickle"
    with open(pickle_path, "wb") as tile_fh:
        pickle.dump(tile_data, tile_fh)
    return {
        "tile": tile_id,
        "data": pickle_path,
        "subset": subset,
        "good_barcodes": total_reads - total_bad,
        "bad_barcodes": total_bad,
    }

In [None]:
# One (fastq_dir, output_dir, tile_id) task per FASTQ file; the tile id is the
# file name up to its first '.'
all_tiles_data = [
    (tiles_fastq_dir, tiles_dir, fastq_name.split(".")[0])
    for fastq_name in tile_fastqs
]

In [None]:
from multiprocessing import Pool

In [None]:
# Use at most 70 workers, but never more than the machine's CPU count or the
# number of tiles to process (and always at least one).
n_workers = max(1, min(70, os.cpu_count() or 1, len(all_tiles_data)))
with Pool(processes=n_workers) as pool:
    results = pool.map(extract_coords, all_tiles_data)

In [None]:
# Aggregate the per-tile tallies into run-wide totals
total_bad_reads = sum(tile_result['bad_barcodes'] for tile_result in results)
total_good_reads = sum(tile_result['good_barcodes'] for tile_result in results)

print('Final counts:')
print(f"Total good barcodes: {total_good_reads}")
print(f"Total bad barcodes: {total_bad_reads}")

In [None]:
# '.2f' prints two decimal places; the previous ':2f' only set a minimum field
# width of 2 and left the default six decimals.
print(f'{total_bad_reads / (total_good_reads + total_bad_reads) * 100:.2f}% of barcodes were bad')

In [None]:
# Group the per-tile barcode subsets: outer key "<lane>_<surface><swath>",
# inner key tile number, value the set of subset barcodes for that tile.
subset_datas = c.defaultdict(lambda: c.defaultdict(set))
for tile_result in results:
    # The tile id is parsed as "<lane>_<position>", where the position string is
    # read as one surface character, one swath character, then the tile number.
    id_parts = tile_result['tile'].split('_')
    lane = id_parts[0]
    position = id_parts[1]
    surface = position[0]
    swath = position[1]
    tile_no = position[2:]
    subset_datas[f'{lane}_{surface}{swath}'][tile_no].update(tile_result['subset'])

# Write one pickle per group
for subset, barcodes_by_tile in subset_datas.items():
    with open(f'{tiles_dir}/{subset}_subset_barcodes.pickle', 'wb') as fh:
        pickle.dump(barcodes_by_tile, fh)