In [8]:
import numpy as np
import pandas as pd
import tkinter as tk
from tkinter import ttk
import tkinter.font as tkf
from tkinter import messagebox
from tkinter import filedialog
import threading
import time

In [9]:
headers = ['gene_id', 'UID', 'seq', 'Reserved', 'count']
header_widths = [200, 150, 350, 100, 80]

In [10]:
def reverseComplement(sequence):
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    rc_sequence=''
    for s in sequence:
        rc_sequence = complement[s] + rc_sequence
    return rc_sequence

# Preprocess

In [11]:
def preprocessFASTQ():
    global reads, indicator_preprocess, kmer_dict_reads
    
    num = len(reads)
    indicator_preprocess = 0
    gain = 500000/num
    
    
    kmer_dict_reads = {}

    k = 20
    
    start_time = time.time()

    for read in reads:
        for i in range(len(read)-k+1):
            kmer_dict_reads[read[i:i+k]] = set()
        indicator_preprocess += gain 

    for read in reads:
        for i in range(len(read)-k+1):
            kmer_dict_reads[read[i:i+k]].add(read)
        indicator_preprocess += gain
        
    end_time = time.time()
    delta_time = end_time - start_time
    
    text_time.delete('1.0', tk.END)
    text_time.insert('1.0', str(delta_time))
    
    messagebox.showinfo("Preprocess FASTQ Completed", "Current FASTQ preprocess successfully completed!")

In [12]:
def start_preprocess_thread(event):
    global preprocess_thread, indicator_preprocess
    preprocess_thread = threading.Thread(target=preprocessFASTQ)
    preprocess_thread.daemon = True
    
    progressbar['value'] = indicator_preprocess
    
    preprocess_thread.start()
    root.after(20, check_preprocess_thread)

def check_preprocess_thread():
    if preprocess_thread.is_alive():
        progressbar['value'] = indicator_preprocess
        
        root.after(20, check_preprocess_thread)

# Match

In [13]:
def buttonMatch():
    gotten = text_sequence.get('1.0', tk.END)
    p1 = gotten.rstrip()
    
    try:
        count = len(kmer_dict_reads[p1])
    except KeyError:
        count = 0
    
    text_count.delete('1.0', tk.END)
    text_count.insert('1.0', str(count))

# Match All

In [14]:
def matchAll():
    global  kmer_dict_reads, indicator_matchAll, df
    num = len(df)
    indicator_matchAll = 0
    gain = 1000000/num
    
    arr = np.array(df)
    
    for i in range(len(arr)):
        key = arr[i,2]
        try:
            n1 = len(kmer_dict_reads[key])
        except KeyError:
            n1 = -1
        arr[i, 4] = n1
        indicator_matchAll += gain
    
    df = pd.DataFrame(arr)
    df = df.set_index('UID', drop=False) 

In [15]:
def start_matchAll_thread(event):
    global matchAll_thread, indicator_matchAll
    matchAll_thread = threading.Thread(target=matchAll)
    matchAll_thread.daemon = True
    
    progressbar['value'] = indicator_matchAll
    
    matchAll_thread.start()
    root.after(20, check_matchAll_thread)

def check_matchAll_thread():
    if matchAll_thread.is_alive():
        progressbar['value'] = indicator_matchAll
        
        root.after(20, check_matchAll_thread)

# FASTQ File Load

In [16]:
def buttonBrowseFASTQ():
    global filenameFASTQ
    progressbar_loadSequences['value'] = 0
    try:
        filenameFASTQ = filedialog.askopenfilename(filetypes=(('FASTQ files', '*.fastq'), ('All files', '*.*')))
        text_fileFASTQ.delete('1.0', tk.END)
        text_fileFASTQ.insert('1.0', filenameFASTQ.split('/')[-1])
    except:
        filenameFASTQ = ''    

In [17]:
def loadFASTQ():
    global reads
    
    start_time = time.time()
    if filenameFASTQ == '':
        messagebox.showwarning("No File", "Sorry, no file loaded! Please choose FASTQ file first.")
    else:       

        f = open(filenameFASTQ)
        reads = []

        try:
            while 1:
                name = f.readline().rstrip()
                read = f.readline().rstrip()
                f.readline()
                quality = f.readline().rstrip()

                if len(name) == 0:
                    break

                reads.append(read)           
            
            end_time = time.time()
            delta_time = end_time - start_time
                       
            text_time.delete('1.0', tk.END)
            text_time.insert('1.0', str(delta_time))           
            
        except:
            messagebox.showwarning("File Loading Failed", "Sorry, file loading failed! Please check the file format.")

In [18]:
def start_loadFASTQ_thread(event):
    global loadFASTQ_thread
    loadFASTQ_thread = threading.Thread(target=loadFASTQ)
    loadFASTQ_thread.daemon = True
    
    progressbar_loadFASTQ.start(10)
    loadFASTQ_thread.start()
    root.after(20, check_loadFASTQ_thread)

def check_loadFASTQ_thread():
    if loadFASTQ_thread.is_alive():
        progressbar_loadFASTQ.start(10)
        root.after(20, check_loadFASTQ_thread)
    else:
        progressbar_loadFASTQ.stop()
        progressbar_loadFASTQ['value']=100
        messagebox.showinfo("FASTQ File Loaded", "FASTQ file successfully loaded!")

# Target Gene File Load

In [19]:
def buttonBrowseSequences():
    global filenameSequences
    
    try:
        filenameSequences = filedialog.askopenfilename(filetypes=(('Comma-Separated (CSV) text file', '*.csv'), ('All files', '*.*')))
        text_fileSequences.delete('1.0', tk.END)
        text_fileSequences.insert('1.0', filenameSequences.split('/')[-1])
    except:
        filenameSequences = ''    

In [20]:
def loadSequences():
    global filenameSequences, df, recordNum
   
    if filenameSequences == '':
        messagebox.showwarning("No File", "Sorry, no file chosen! Please choose file of sequences first.")
    else:        
        try:
            start_time = time.time()
            
            df = pd.read_csv(filenameSequences)
            df['count'] = 0
            df = df.set_index('UID', drop=False)  
            
            recordNum = len(df)
            
            progressbar_loadSequences['value'] = 100
            
            end_time = time.time()
            delta_time = end_time - start_time
                       
            text_time.delete('1.0', tk.END)
            text_time.insert('1.0', str(delta_time))
            
            text_recordNum.delete('1.0', tk.END)
            text_recordNum.insert('1.0', str(recordNum))
            
            messagebox.showinfo("File of Sequences Loaded", "File of sequences successfully loaded!")        
        except:
            messagebox.showwarning("File Loading Failed", "Sorry, file loading failed! Please check the file format.")    

# Table Events

In [21]:
def OnDoubleClick(event):
    item = table.selection()[0]
    value = table.item(item, 'values')
    geneID = value[0]
    uid = value[1]
    sequence = value[2]
    rc_sequence = reverseComplement(sequence)
    
    text_geneID.delete('1.0', tk.END)
    text_geneID.insert('1.0', str(geneID))
    
    text_uid.delete('1.0', tk.END)
    text_uid.insert('1.0', str(uid))
    
    text_sequence.delete('1.0', tk.END)
    text_sequence.insert('1.0', str(sequence))
    
    text_rc_sequence.delete('1.0', tk.END)
    text_rc_sequence.insert('1.0', str(rc_sequence))
    

In [22]:
def sortby(tree, col, descending):
    """sort tree contents when a column header is clicked on"""
    # grab values to sort
    data = [(tree.set(child, col), child) for child in tree.get_children('')]
    # if the data to be sorted is numeric change to float
    #data =  change_numeric(data)
    # now sort the data in place
    data.sort(reverse=descending)
    for ix, item in enumerate(data):
        tree.move(item[1], '', ix)
    # switch the heading so it will sort in the opposite direction
    tree.heading(col, command=lambda col=col: sortby(tree, col, int(not descending)))

In [23]:
def display_in_table():
    for a in df.index:
        row = df.ix[a]
        table.insert("", "end", "", values=tuple(row)) 

In [24]:
def clear():
    for i in table.get_children():
        table.delete(i)

In [25]:
def refresh():
    start_time = time.time()
    clear()
    display_in_table()
    delta_time = time.time() - start_time
    
    text_time.delete('1.0', tk.END)
    text_time.insert('1.0', str(delta_time))           

In [26]:
def buttonExport():
    pass

# String Preprocess

# Main Flow

In [27]:
root = tk.Tk()

indicator_preprocess = 0
indicator_loadSequences = 0
indicator_matchAll = 0
filenameSequences = ''
recordNum = 0
count = 0

root.geometry("{0}x{1}+0+0".format(root.winfo_screenwidth(), root.winfo_screenheight()))
#root.attributes('-fullscreen', True)
root.title('Sequence Matching Tool')


# Multicolumn Listbox/////////////////////////////////////////////////////////////////////////////
table = ttk.Treeview(height="20", columns=headers, selectmode="extended")
table.pack(padx=10, pady=20, ipadx=1200, ipady=140)

i = 1
for header in headers:
    table.heading('#'+str(i), text=header.title(), anchor=tk.W, command=lambda c=header: sortby(table, c, 0))
    table.column('#'+str(i), stretch=tk.NO, minwidth=0, width=tkf.Font().measure(header.title())+header_widths[i-1]) 
    i+=1    
table.column('#0', stretch=tk.NO, minwidth=0, width=0)

table.bind("<Double-1>", OnDoubleClick)
#///////////////////////////////////////////////////////////////////////////////////////////

# Scrollbar////////////////////////////////////////////////////////////////////////////////////////
vsb = ttk.Scrollbar(table, orient="vertical",  command = table.yview)
hsb = ttk.Scrollbar(table, orient="horizontal", command = table.xview)
## Link scrollbars activation to top-level object
table.configure(yscrollcommand=vsb.set, xscrollcommand=hsb.set)
## Link scrollbar also to every columns
map(lambda col: col.configure(yscrollcommand=vsb.set,xscrollcommand=hsb.set), table)
vsb.pack(side = tk.RIGHT, fill = tk.Y)
hsb.pack(side = tk.BOTTOM, fill = tk.X)        

#//////////////////////////////////////////////////////////////////////////////////////////////
y0 =400
y1 = 440
y2 = 510
y3 = 550
y4 = 610
y5 = 645
y6 = 685
# Text /////////////////////////////////////////////////////////////////////////////////////
text_recordNum=tk.Text(root, width=10, height=1, font=('tahoma', 9), bd=2, wrap='none')
text_recordNum.place(x=1170, y=y0)
label_recordNum=tk.Label(root, text='records', font=('tahoma', 9))
label_recordNum.place(x=1270,y=y0)

text_fileSequences=tk.Text(root, width=50, height=1, font=('tahoma', 9), bd=2, wrap='none')
text_fileSequences.place(x=60, y=y0)

text_fileFASTQ=tk.Text(root, width=36, height=1, font=('tahoma', 9), bd=2, wrap='none')
text_fileFASTQ.place(x=60, y=y4)

text_count=tk.Text(root, width=16, height=1, font=('tahoma', 9), bd=2)
text_count.place(x=1000, y=y3)
label_count=tk.Label(root, text='Count:', font=('tahoma', 9))
label_count.place(x=940,y=y3)

text_time=tk.Text(root, width=20, height=1, font=('tahoma', 9), bd=2)
text_time.place(x=1090, y=y1)
label_time=tk.Label(root, text='Time:', font=('tahoma', 9))
label_time.place(x=1030,y=y1)
label_seconds=tk.Label(root, text='second(s)', font=('tahoma', 9))
label_seconds.place(x=1270,y=y1)

text_geneID=tk.Text(root, width=20, height=1, font=('tahoma', 9), bd=2)
text_geneID.place(x=140, y=y2)
label_geneID=tk.Label(root, text='Gene ID:', font=('tahoma', 9))
label_geneID.place(x=60,y=y2)

text_uid=tk.Text(root, width=20, height=1, font=('tahoma', 9), bd=2)
text_uid.place(x=390, y=y2)
label_uid=tk.Label(root, text='UID:', font=('tahoma', 9))
label_uid.place(x=340,y=y2)

text_sequence=tk.Text(root, width=38, height=1, font=('tahoma', 9), bd=2)
text_sequence.place(x=680, y=y2)
label_sequence=tk.Label(root, text='Sequence:', font=('tahoma', 9))
label_sequence.place(x=600,y=y2)

text_rc_sequence=tk.Text(root, width=38, height=1, font=('tahoma', 9), bd=2)
text_rc_sequence.place(x=1000, y=y2)


# ProgressBar /////////////////////////////////////////////////////////////////////////////
progressbar_loadSequences = ttk.Progressbar(root, length=200, maximum=100, mode='determinate')
progressbar_loadSequences.place(x=500,y=y0)

progressbar_loadFASTQ = ttk.Progressbar(root, length=250, mode='indeterminate')
progressbar_loadFASTQ.place(x=400,y=y4)

progressbar = ttk.Progressbar(root, length=420, maximum=1000000, mode='determinate')
progressbar.place(x=720,y=y4)

# Button /////////////////////////////////////////////////////////////////////////////////
button_browseSequences = ttk.Button(root, text="Browse sgRNA...", width=20, command=buttonBrowseSequences)
button_browseSequences.place(x=60, y=y1)

button_loadSequences = ttk.Button(root, text="Load sgRNA", width=20, command=loadSequences)
button_loadSequences.place(x=500, y=y1)

button_clear = ttk.Button(root, text="Clear", width=20, command=clear)
button_clear.place(x=770, y=y1)

button_refresh = ttk.Button(root, text="Browse", width=20, command=refresh)
button_refresh.place(x=770, y=y0)

button_loadFASTQ = ttk.Button(root, text="Load FASTQ", width=20, command=lambda:start_loadFASTQ_thread(None))
button_loadFASTQ.place(x=400, y=y5)

button_preprocessFASTQ = ttk.Button(root, text="Preprocess FASTQ", width=20, command=lambda:start_preprocess_thread(None))
button_preprocessFASTQ.place(x=720, y=y5)

button_match = ttk.Button(root, text="Preprocess FASTQ", width=20, command=lambda:start_preprocess_thread(None))
button_match.place(x=720, y=y5)

button_browseFASTQ = ttk.Button(root, text="Browse FASTQ...", width=20, command=buttonBrowseFASTQ)
button_browseFASTQ.place(x=60, y=y5)

button_matchAll = ttk.Button(root, text="Match All", width=20, command=lambda:start_matchAll_thread(None))
button_matchAll.place(x=960, y=y5)

button_match = ttk.Button(root, text="Match", width=20, command=buttonMatch)
button_match.place(x=680, y=y3)

button_export = ttk.Button(root, text="Export", width=20, command=buttonExport)
button_export.place(x=1180, y=y5)

button_exit = ttk.Button(root, text="Exit", width=20, command=root.destroy)
button_exit.place(x=1180, y=y6)

root.bind('<Return>', start_preprocess_thread)
root.bind('<Return>', start_loadFASTQ_thread)
root.bind('<Return>', start_matchAll_thread)

root.mainloop()

Exception in thread Thread-7:
Traceback (most recent call last):
  File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:3704)
  File "pandas\hashtable.pyx", line 375, in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:7200)
TypeError: an integer is required

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Anaconda3\lib\threading.py", line 911, in _bootstrap_inner
    self.run()
  File "C:\Anaconda3\lib\threading.py", line 859, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-14-2051aa437e9a>", line 19, in matchAll
    df = df.set_index('UID', drop=False)
  File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2607, in set_index
    level = frame[col].values
  File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 1797, in __getitem__
    return self._getitem_column(key)
  File "C:\Anaconda3\lib\site-packages\pandas\core\frame.