## Processing DADA2 output - Denoised Barcodes 

In [8]:
import pandas as pd
import Levenshtein

In [51]:
# Read the DADA2 denoising output (tab-separated): one row per unique ASV with its per-sample counts
asvtab_barcodes = pd.read_csv('asvtab_barcodes.txt', sep='\t')
asvtab_barcodes.head(5)

Unnamed: 0,BARCODE,C1110GR1_S132,C1110GR2_S133,C1110GR3_S134,C115GR1_S87,C115GR2_S88,C115GR3_S89,C1710GR1_S150,C1710GR2_S151,C1710GR3_S152,C175GR1_S105,C175GR2_S106,C175GR3_S107
0,CCCTGATGACTAAGTTGTTC,1925,2459,2560,3382,3902,5457,2854,2795,3407,2918,1993,2588
1,CATCAGAGTCCATGTCAGGG,749,639,702,562,402,588,768,882,874,327,309,363
2,TATAACACTGGTGAACTAGC,630,755,582,517,759,681,1056,805,721,510,370,439
3,TTACACTCTCTAGCATTCGG,811,683,819,817,677,861,800,817,818,487,508,461
4,TAGTACCCGGAACCTAGGGC,601,882,578,600,785,637,672,785,859,365,399,423


In [52]:
# How many ASVs the denoising step produced (one table row per ASV)
asvtab_barcodes.shape[0]

8395

In [53]:
# Pull the ASV sequences (BARCODE column) out as a plain Python list
asvtab_barcodes_seqs = asvtab_barcodes['BARCODE'].tolist()

In [67]:
# Reference table for the mutant yeast pool: barcode and ORF info, indexed by the UPTAG 20-mer sequence
barcodes = pd.read_csv(
    'yeast_pool_barcodes_info.tsv',
    sep='\t',
    index_col='UPTAG_sequence_20mer',
)
barcodes.shape[0]

6337

In [76]:
# Preview the barcode/ORF reference table (indexed by UPTAG_sequence_20mer)
barcodes.head()

Unnamed: 0_level_0,ORF_name,deletion_alias,ORF
UPTAG_sequence_20mer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACTATATGTGAAGGCATGGC,YAL001C,,YAL001C
ATACTGACAGCACGCATGGC,YAL002W,,YAL002W
GACATATCAGCATACATGGC,YAL003W,,YAL003W
TATGGCACGGCAGACATTCC,YAL004W,,YAL004W
AGGCATACTACACAGATTCC,YAL005C,,YAL005C


In [68]:
# Expected barcode sequences: the UPTAG_sequence_20mer index as a plain Python list
barcodes_seq = list(barcodes.index)

### Levenshtein distance filtering

Remove ASVs whose Levenshtein distance to every expected barcode sequence is greater than 2. For each barcode, only the ASV(s) at the smallest distance (at most 2) are kept.

In [56]:
# Pairwise Levenshtein distances, stored as {barcode: {ASV: distance}}
levenshtein_dict = {}
for barcode in barcodes_seq:
    levenshtein_dict[barcode] = {
        asv: Levenshtein.distance(barcode, asv) for asv in asvtab_barcodes_seqs
    }

In [57]:
# Per barcode, keep only the closest ASV(s), provided that distance is <= 2
levenshtein_dict_filtered = {}
for barcode, distances in levenshtein_dict.items():
    if not distances:
        # No ASV was compared against this barcode: leave it out entirely
        continue
    within_cutoff = {asv: dist for asv, dist in distances.items() if dist <= 2}
    # Smallest distance among the candidates (None when nothing is within the cutoff,
    # in which case the barcode keeps an empty entry)
    nearest = min(within_cutoff.values(), default=None)
    levenshtein_dict_filtered[barcode] = {
        asv: dist for asv, dist in within_cutoff.items() if dist == nearest
    }

In [58]:
# Number of barcodes with an entry in the filtered dictionary
# (a barcode with no ASV within distance 2 is still present, with an empty dict)
len(levenshtein_dict_filtered)

6337

In [59]:
# Dictionary of ASV:Barcode
# The same ASV can be the nearest hit of more than one barcode. Assign it to the
# barcode it is closest to, instead of to whichever barcode happens to be
# iterated last (which could steal an exact match from its true barcode).
invdict = {}
asv_best_distance = {}
for barcode, asv_hits in levenshtein_dict_filtered.items():
    for asv, distance in asv_hits.items():
        # `<=` keeps the previous last-one-wins behaviour when distances tie
        if asv not in asv_best_distance or distance <= asv_best_distance[asv]:
            invdict[asv] = barcode
            asv_best_distance[asv] = distance
len(invdict)

5447

In [60]:
# Turn the ASV -> barcode mapping into a one-column frame indexed by ASV
levenshtein_df = pd.DataFrame.from_dict(invdict, orient='index', columns=['BARCODE'])
levenshtein_df.index.name = 'ASV'

In [61]:
# Number of distinct barcodes that have at least one ASV assigned to them
levenshtein_df['BARCODE'].nunique(dropna=False)

5405

### Filter `asvtab_barcodes` to contain only ASVs with Levenshtein distance <= 2 to a barcode

In [62]:
# Keep only the ASVs that were assigned to a barcode (Levenshtein distance <= 2).
# The sequence-indexed table gets its own name so that `asvtab_barcodes` is not
# overwritten: this cell, and the earlier cell that reads the BARCODE column,
# can then be re-run without a KeyError.
asvtab_by_seq = asvtab_barcodes.set_index("BARCODE")
asvtab_filtered = (
    asvtab_by_seq
    .loc[asvtab_by_seq.index.isin(levenshtein_df.index)]
    .rename_axis('ASV')  # the index holds ASV sequences, not reference barcodes
)
len(asvtab_filtered)

5447

In [63]:
# Attach to every retained ASV the reference barcode it was assigned to (distance <= 2)
asvtab_filtered = asvtab_filtered.join(levenshtein_df[['BARCODE']], how='left')

### Add ORF name to filtered asvtab_barcodes

In [72]:
# Look up the ORF for each ASV's assigned barcode, then index the table by ORF
asvtab_orf = (
    asvtab_filtered
    .join(barcodes['ORF'], on=['BARCODE'])
    .set_index('ORF')
)
asvtab_orf.head(5)

Unnamed: 0,C1110GR1_S132,C1110GR2_S133,C1110GR3_S134,C115GR1_S87,C115GR2_S88,C115GR3_S89,C1710GR1_S150,C1710GR2_S151,C1710GR3_S152,C175GR1_S105,C175GR2_S106,C175GR3_S107,BARCODE
YDL060W,1925,2459,2560,3382,3902,5457,2854,2795,3407,2918,1993,2588,CCCTGATGACTAAGTTGTTC
YJR039W,749,639,702,562,402,588,768,882,874,327,309,363,CATCAGAGTCCATGTCAGGG
YLR308W,630,755,582,517,759,681,1056,805,721,510,370,439,TATAACACTGGCGAACTAGC
YOL090W,811,683,819,817,677,861,800,817,818,487,508,461,TTACACTCTCTAGCATTCGG
YLR287C,601,882,578,600,785,637,672,785,859,365,399,423,TAGTACCCGGAACCTAGAGC


In [73]:
# Summing counts of ASVs that are from the same barcode
# Strip surrounding whitespace from the ORF names so identical ORFs fall into one group.
asvtab_orf.index = [i.strip() for i in asvtab_orf.index]
# numeric_only=True sums only the count columns. Without it, pandas >= 2.0 would
# concatenate the BARCODE strings into the result instead of dropping that column.
# `axis=0` is omitted: it is the default and is deprecated in recent pandas.
asvtab_orf = asvtab_orf.groupby(level=0).sum(numeric_only=True)

In [74]:
# Number of ORFs left after summing the ASVs that map to the same ORF
asvtab_orf.shape[0]

5405

### Final counts table

In [75]:
# FINAL BARCODE/ORF COUNT TABLE FOR DESEQ2
# Rows are ORFs, columns are samples; each value is the ASV count summed per ORF.
# NOTE(review): the table is only displayed here; no cell visible in this notebook writes it to disk.
asvtab_orf

Unnamed: 0,C1110GR1_S132,C1110GR2_S133,C1110GR3_S134,C115GR1_S87,C115GR2_S88,C115GR3_S89,C1710GR1_S150,C1710GR2_S151,C1710GR3_S152,C175GR1_S105,C175GR2_S106,C175GR3_S107
YAL001C,18,16,13,33,45,20,23,9,44,26,25,28
YAL002W,5,28,11,17,39,11,12,4,4,6,1,12
YAL003W,33,27,39,55,56,56,22,14,21,30,19,20
YAL004W,33,48,57,91,41,71,22,15,33,32,29,19
YAL005C,5,25,16,4,28,21,12,7,6,23,21,5
...,...,...,...,...,...,...,...,...,...,...,...,...
YPR197C,98,68,140,175,139,191,158,77,111,91,68,93
YPR198W,62,99,96,135,205,281,147,105,181,142,74,149
YPR199C,84,124,82,185,278,112,125,119,127,113,94,109
YPR200C,114,74,146,228,139,196,170,164,152,121,145,77


In [78]:
import os  # kept so the `os` name stays available in the notebook namespace
import subprocess

# Export this notebook to HTML. Passing an argument list avoids going through a
# shell, and check=True raises CalledProcessError if nbconvert fails instead of
# silently returning a non-zero status. The exit status (0 on success) is still
# shown as the cell output.
subprocess.run(
    ['jupyter', 'nbconvert', '--to', 'html', 'Levenshtein_distance_filtering.ipynb'],
    check=True,
).returncode

0