# Barcode and UMI Correction

## UMI Correction

Number of possible sequences for one 16 bp barcode considering 1 SNP:

In [6]:
inflations_snp = 16*3
inflations_snp

48

Number of possible 1bp indels:

In [7]:
inflations_indel = (16+1)*4 + 16 * 4
inflations_indel

132

Number of inflated barcodes given a list of N barcodes:

In [12]:
barcode_list_size = 7000
inflations = barcode_list_size * (inflations_snp + inflations_indel)
print('{:,}'.format(inflations))

1,260,000


Ratio to original list (fix):

In [13]:
print(inflations / barcode_list_size)

180.0


In [14]:
barcodes_in_whitelist = 7000
print(barcodes_in_whitelist * inflations / barcode_list_size)

1260000.0


### Error Rate Calculation

In [27]:
q_value = 27
error_probability = 10 ** (-q_value/10)

In [28]:
barcode_length = 16
probability_all_correct = (1 - error_probability) ** barcode_length

In [29]:
probability_all_correct

0.9685491120344801

# Barcode Correction

In [2]:
barcodes_not_found = 147756

## Method 1: CellRanger
In this method, each barcode that is not contained in the whitelist is inflated and compared if a sequence that is 1 Hamming distance away from the observed barcode is within the whitelist. The sequence is more likely to be corrected to this whitelisted barcode if that barcode occurred frequently and the read base quality of the given sequence is low.

In [5]:
corrected_1 = 13789
print('{:.2}%'.format(corrected_1 / barcodes_not_found * 100))

9.3%


## Method 2: Inflated Graph
This method does not use a whitelist. Each observed barcode sequence is inflated (Levenshtein distance 1). If any of these inflated codes match an observed barcode, an edge is created between the observed barcode and the inflated code (i.e. another observed barcode). Subsequently, for all edges it is evaluated whether or not the number of occurrences of one barcode is significantly larger than that of the other code. If so, the less frequently observed barcode is corrected.

In [6]:
corrected_2 = 9539
print('{:.2}%'.format(corrected_2 / barcodes_not_found * 100))

6.5%


## Method 3: Correct to Whitelist
In this method, each observed barcode is inflated (Levenshtein distance 1) and compared to a whitelist. If there is only one whitelisted barcode with distance 1 away from the observed code, it is corrected accordingly.

In [7]:
corrected_3 = 13766
print('{:.2}%'.format(corrected_3 / barcodes_not_found * 100))

9.3%


## Method 4: Compare two Min Hamming Distances
For this method, the Hamming distances to the closest and second closest barcode in the whitelist is calculated. If the closest whitelisted code is less/equal than 2 away and the second closest code is more/equal than 3 away, the sequence is corrected.

## Method 5: Starcode

In [9]:
corrected_5 = 71687
print('{:.2f}%'.format(corrected_5 / barcodes_not_found * 100))

48.52%


# Analysis

In [44]:
import csv
import os
import gzip

def read_rg_stats(filename):
    csv_data = dict()
    with open(filename) as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t')
        for line in reader:
            csv_data[line['rg']] = line
    return csv_data
def read_barcode_stats(filename):
    barcode_histogram = dict()
    with open(filename) as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t')
        for line in reader:
            barcode_histogram[line['barcode']] = barcode_histogram.get('barcode', 0) + int(line['occurrence'])
    return barcode_histogram
def read_barcodes(barcodes_filename):
    barcodes = set()
    _, barcodes_file_extension = os.path.splitext(barcodes_filename)
    is_gzip = barcodes_file_extension == '.gz'
    with (open(barcodes_filename) if not is_gzip else gzip.open(barcodes_filename)) as barcodes_file:
        for line in barcodes_file:
            if is_gzip:
                line = line.decode('utf-8')
            if line[-3:-1] == '-1':
                barcodes.add(line[:-3])
            else:
                barcodes.add(line[:-1])
    return barcodes
def read_starcode_clusters(filename):
    starcode_clusters = dict()
    with open(filename) as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t')
        for line in reader:
            starcode_clusters[line['cluster']] = int(line['occurrence'])
    return starcode_clusters

# PacBio

In [81]:
data = read_rg_stats('results/pacbio/SM-HLD5O_sw_rg_stats.tsv')['all']
barcode_histogram = read_barcode_stats('results/pacbio/SM-HLD5O_sw_barcode_stats.tsv')
whitelist_10x = read_barcodes('/Users/mgatzen/files/bam/10x_tool/barcodes/3M-february-2018.txt')
whitelist_illumina = read_barcodes('/Users/mgatzen/files/bam/10x_tool/barcodes.1.tsv')
starcode_data = read_starcode_clusters('results/pacbio/SM-HLD5O_sw_starcode.tsv')

In [82]:
adapter_found = int(data['adapter_found'])
barcodes_in_10x_and_illumina = int(data['barcode_in_10x_and_illumina_whitelist'])
barcodes_in_10x = int(data['barcode_in_10x_whitelist'])
barcodes_not_in_any = int(data['barcode_not_in_any_whitelist'])

unique_barcodes = set(barcode_histogram.keys())
unique_barcodes_in_10x_and_illumina = set()
unique_barcodes_in_10x = set()
unique_barcodes_not_in_any = set()

for barcode in unique_barcodes:
    if barcode in whitelist_10x:
        if barcode in whitelist_illumina:
            unique_barcodes_in_10x_and_illumina.add(barcode)
        else:
            unique_barcodes_in_10x.add(barcode)
    else:
        unique_barcodes_not_in_any.add(barcode)


print('Adapters found:                       {:10,}'.format(adapter_found))
print('Barcodes in 10x + Illumina whitelist: {:10,} ({:5.2f}%)'.format(barcodes_in_10x_and_illumina, barcodes_in_10x_and_illumina / adapter_found * 100))
print('Barcodes in 10x whitelist:            {:10,} ({:5.2f}%)'.format(barcodes_in_10x, barcodes_in_10x / adapter_found * 100))
print('Barcodes in no whitelist:             {:10,} ({:5.2f}%)'.format(barcodes_not_in_any, barcodes_not_in_any / adapter_found * 100))
print()
print('Number of unique barcodes:            {:10,}'.format(len(unique_barcodes)))
print('Unique barcodes in 10x + Illumina:    {:10,} ({:5.2f}%)'.format(len(unique_barcodes_in_10x_and_illumina), len(unique_barcodes_in_10x_and_illumina) / len(unique_barcodes) * 100))
print('Unique barcodes in 10x:               {:10,} ({:5.2f}%)'.format(len(unique_barcodes_in_10x), len(unique_barcodes_in_10x) / len(unique_barcodes) * 100))
print('Unique barcodes not in any:           {:10,} ({:5.2f}%)'.format(len(unique_barcodes_not_in_any), len(unique_barcodes_not_in_any) / len(unique_barcodes) * 100))

Adapters found:                        4,427,717
Barcodes in 10x + Illumina whitelist:  3,726,334 (84.16%)
Barcodes in 10x whitelist:               540,037 (12.20%)
Barcodes in no whitelist:                161,346 ( 3.64%)

Number of unique barcodes:               164,743
Unique barcodes in 10x + Illumina:         7,626 ( 4.63%)
Unique barcodes in 10x:                   99,827 (60.60%)
Unique barcodes not in any:               57,290 (34.78%)


### Starcode

In [83]:
starcode_clusters = set(starcode_data.keys())
illumina_in_starcode = [barcode for barcode in whitelist_illumina if barcode in starcode_clusters]
illumina_not_in_starcode = [barcode for barcode in whitelist_illumina if barcode not in starcode_clusters]
starcode_in_illumina = [barcode for barcode in starcode_clusters if barcode in whitelist_illumina]
starcode_not_in_illumina = [barcode for barcode in starcode_clusters if barcode not in whitelist_illumina]


print('Clusters in starcode: {:,}'.format(len(starcode_clusters)))
print('Illumina clusters covered by starcode: {:10,} ({:5.2f}%)'.format(len(illumina_in_starcode), len(illumina_in_starcode) / len(whitelist_illumina) * 100))
print('Illumina clusters not in starcode:     {:10,} ({:5.2f}%)'.format(len(illumina_not_in_starcode), len(illumina_not_in_starcode) / len(whitelist_illumina) * 100))
print('Starcode clusters covered by Illumina: {:10,} ({:5.2f}%)'.format(len(starcode_in_illumina), len(starcode_in_illumina) / len(starcode_clusters) * 100))
print('Starcode clusters not in Illumina:     {:10,} ({:5.2f}%)'.format(len(starcode_not_in_illumina), len(starcode_not_in_illumina) / len(starcode_clusters) * 100))

Clusters in starcode: 113,094
Illumina clusters covered by starcode:      7,551 (99.02%)
Illumina clusters not in starcode:             75 ( 0.98%)
Starcode clusters covered by Illumina:      7,551 ( 6.68%)
Starcode clusters not in Illumina:        105,543 (93.32%)


In [84]:
starcode_corrected = 0
for code in [code for code in starcode_clusters if code in whitelist_illumina]:
    starcode_corrected += starcode_data[code]
print(starcode_corrected)
print(starcode_data['ACTGTCCTCCATCACC'])

3861162


KeyError: 'ACTGTCCTCCATCACC'

### Mehrtash method

In [None]:
print('Number of corrected barcodes (to Illumina whitelist): {:10,} ({:5.2f}%)'.format(int(data['corrected_mehrtash']), int(data['corrected_mehrtash']) / (int(data['barcode_in_10x_whitelist']) + int(data['barcode_not_in_any_whitelist'])) * 100))

### Kiran method

In [53]:
print('Number of corrected barcodes (to Illumina whitelist): {:10,} ({:5.2f}%)'.format(int(data['corrected_kiran']), int(data['corrected_kiran']) / (int(data['barcode_in_10x_whitelist']) + int(data['barcode_not_in_any_whitelist'])) * 100))

Number of corrected barcodes (to Illumina whitelist):     13,766 ( 2.07%)


# ONT

In [77]:
data = read_rg_stats('results/ont/bam1_sw_rg_stats.tsv')['all']
barcode_histogram = read_barcode_stats('results/ont/bam1_sw_barcode_stats.tsv')
whitelist_10x = read_barcodes('/Users/mgatzen/files/bam/10x_tool/barcodes/3M-february-2018.txt')
whitelist_illumina = read_barcodes('/Users/mgatzen/files/bam/10x_tool/barcodes/ont/samp1.ont.tsv')
starcode_data = read_starcode_clusters('results/ont/bam1_sw_starcode.tsv')

In [78]:
adapter_found = int(data['adapter_found'])
barcodes_in_10x_and_illumina = int(data['barcode_in_10x_and_illumina_whitelist'])
barcodes_in_10x = int(data['barcode_in_10x_whitelist'])
barcodes_not_in_any = int(data['barcode_not_in_any_whitelist'])

unique_barcodes = set(barcode_histogram.keys())
unique_barcodes_in_10x_and_illumina = set()
unique_barcodes_in_10x = set()
unique_barcodes_not_in_any = set()

for barcode in unique_barcodes:
    if barcode in whitelist_10x:
        if barcode in whitelist_illumina:
            unique_barcodes_in_10x_and_illumina.add(barcode)
        else:
            unique_barcodes_in_10x.add(barcode)
    else:
        unique_barcodes_not_in_any.add(barcode)


print('Adapters found:                       {:10,}'.format(adapter_found))
print('Barcodes in 10x + Illumina whitelist: {:10,} ({:5.2f}%)'.format(barcodes_in_10x_and_illumina, barcodes_in_10x_and_illumina / adapter_found * 100))
print('Barcodes in 10x whitelist:            {:10,} ({:5.2f}%)'.format(barcodes_in_10x, barcodes_in_10x / adapter_found * 100))
print('Barcodes in no whitelist:             {:10,} ({:5.2f}%)'.format(barcodes_not_in_any, barcodes_not_in_any / adapter_found * 100))
print()
print('Number of unique barcodes:            {:10,}'.format(len(unique_barcodes)))
print('Unique barcodes in 10x + Illumina:    {:10,} ({:5.2f}%)'.format(len(unique_barcodes_in_10x_and_illumina), len(unique_barcodes_in_10x_and_illumina) / len(unique_barcodes) * 100))
print('Unique barcodes in 10x:               {:10,} ({:5.2f}%)'.format(len(unique_barcodes_in_10x), len(unique_barcodes_in_10x) / len(unique_barcodes) * 100))
print('Unique barcodes not in any:           {:10,} ({:5.2f}%)'.format(len(unique_barcodes_not_in_any), len(unique_barcodes_not_in_any) / len(unique_barcodes) * 100))

Adapters found:                        1,953,035
Barcodes in 10x + Illumina whitelist:  1,418,346 (72.62%)
Barcodes in 10x whitelist:               158,087 ( 8.09%)
Barcodes in no whitelist:                376,602 (19.28%)

Number of unique barcodes:               235,802
Unique barcodes in 10x + Illumina:         8,740 ( 3.71%)
Unique barcodes in 10x:                   58,280 (24.72%)
Unique barcodes not in any:              168,782 (71.58%)


### Starcode

In [79]:
starcode_clusters = set(starcode_data.keys())
illumina_in_starcode = [barcode for barcode in whitelist_illumina if barcode in starcode_clusters]
illumina_not_in_starcode = [barcode for barcode in whitelist_illumina if barcode not in starcode_clusters]
starcode_in_illumina = [barcode for barcode in starcode_clusters if barcode in whitelist_illumina]
starcode_not_in_illumina = [barcode for barcode in starcode_clusters if barcode not in whitelist_illumina]


print('Clusters in starcode: {:,}'.format(len(starcode_clusters)))
print('Illumina clusters covered by starcode: {:10,} ({:5.2f}%)'.format(len(illumina_in_starcode), len(illumina_in_starcode) / len(whitelist_illumina) * 100))
print('Illumina clusters not in starcode:     {:10,} ({:5.2f}%)'.format(len(illumina_not_in_starcode), len(illumina_not_in_starcode) / len(whitelist_illumina) * 100))
print('Starcode clusters covered by Illumina: {:10,} ({:5.2f}%)'.format(len(starcode_in_illumina), len(starcode_in_illumina) / len(starcode_clusters) * 100))
print('Starcode clusters not in Illumina:     {:10,} ({:5.2f}%)'.format(len(starcode_not_in_illumina), len(starcode_not_in_illumina) / len(starcode_clusters) * 100))

Clusters in starcode: 168,740
Illumina clusters covered by starcode:      8,595 (98.34%)
Illumina clusters not in starcode:            145 ( 1.66%)
Starcode clusters covered by Illumina:      8,595 ( 5.09%)
Starcode clusters not in Illumina:        160,145 (94.91%)


In [80]:
starcode_corrected = 0
for code in [code for code in starcode_clusters if code in whitelist_illumina]:
    starcode_corrected += starcode_data[code]
print(starcode_corrected)
print(starcode_data['ACTGTCCTCCATCACC'])

1640602
1632


### Mehrtash method

In [63]:
print('Number of corrected barcodes (to Illumina whitelist): {:10,} ({:5.2f}%)'.format(int(data['corrected_mehrtash']), int(data['corrected_mehrtash']) / (int(data['barcode_in_10x_whitelist']) + int(data['barcode_not_in_any_whitelist'])) * 100))

Number of corrected barcodes (to Illumina whitelist):     35,212 (10.63%)


### Kiran method

In [64]:
print('Number of corrected barcodes (to Illumina whitelist): {:10,} ({:5.2f}%)'.format(int(data['corrected_kiran']), int(data['corrected_kiran']) / (int(data['barcode_in_10x_whitelist']) + int(data['barcode_not_in_any_whitelist'])) * 100))

Number of corrected barcodes (to Illumina whitelist):     31,461 ( 9.50%)
