# Imports

In [1]:
import os
import numpy 
import pandas 
import sys

In [2]:
testFile = """@M01757:9:000000000-AN67B:1:1101:13276:1772 1:N:0:1
TGACAGGACCAGTCACGCTTTTTCTCGGAGAAGATCAAAATCTGTCGTCTTTATTGACCATATACATAGTTCAGTCGCTGTACAACACTTATCTGAAA
+
11AAAFA1AAAFGDDF11EFGGH0F300000001D1111111D22A/BAFFG2DF1211111D222D212D222D1//B//1D21B//BF1B1F2221
"""
if os.path.exists("testFile.fq"):
    os.remove("testFile.fq")
f = open("testFile.fq", "a")
f.write(testFile)
f.close()


In [3]:
!cat testFile.fq

@M01757:9:000000000-AN67B:1:1101:13276:1772 1:N:0:1
TGACAGGACCAGTCACGCTTTTTCTCGGAGAAGATCAAAATCTGTCGTCTTTATTGACCATATACATAGTTCAGTCGCTGTACAACACTTATCTGAAA
+
11AAAFA1AAAFGDDF11EFGGH0F300000001D1111111D22A/BAFFG2DF1211111D222D212D222D1//B//1D21B//BF1B1F2221


# Functions

In [4]:
def read_fastq_to_pandas(filenames, num_records = 50, verbose = True):
    header = []
    sequence = []
    quality = []
    counter = 0 
    stop_count = 0
    stop = num_records * 4
    for filename in filenames:
        f = open(filename, "r")
        lines = f.readlines()
        print("Processing File: {}".format( filename))
        delim = "@"
        if lines[0].startswith("@"):
            delim = lines[0].split(":")[0]
        for i in range(len(lines)):
            line = lines[i]
            if verbose:
                print("\tProcessing Line: {}/{}".format( i +1 ,len(lines)))
            if line.startswith(delim):
                counter = 0
            counter +=1
            if counter == 1:
                header.append(line)
            if counter == 2:
                sequence.append(line)
            if counter == 4:
                quality.append(line)
                
            
            stop_count +=1
            if stop_count == stop: break
    #print(len(header), len(sequence), len(quality))
    dictionary = {'header': header, 'sequence': sequence, 'quality': quality} 
    df = pandas.DataFrame(dictionary)

    df[[
        "instrument_name",
        "run_id", 
        "flowcell_id",
        "flowcell_lane_tile_number",
        "x-coord_within_tile",
        "y-coord_within_tile",
        "member_pair",
        "filtered",
        "control_bits",
        "index_sequence"]] = df["header"].str.split(pat=":", expand=True)
    
    df['instrument_name'] =  df['instrument_name'].apply(lambda x: x[1:] if x.startswith("@") else x)

    return df
    


In [5]:
list_of_files = [x for x in os.listdir(os.getcwd()) if x.endswith(".fq")]
print(list_of_files)
df = read_fastq_to_pandas(list_of_files)   
df.head()

['testFile.fq']
Processing File: testFile.fq
	Processing Line: 1/4
	Processing Line: 2/4
	Processing Line: 3/4
	Processing Line: 4/4


Unnamed: 0,header,sequence,quality,instrument_name,run_id,flowcell_id,flowcell_lane_tile_number,x-coord_within_tile,y-coord_within_tile,member_pair,filtered,control_bits,index_sequence
0,@M01757:9:000000000-AN67B:1:1101:13276:1772 1:...,TGACAGGACCAGTCACGCTTTTTCTCGGAGAAGATCAAAATCTGTC...,11AAAFA1AAAFGDDF11EFGGH0F300000001D1111111D22A...,M01757,9,000000000-AN67B,1,1101,13276,1772 1,N,0,1\n


# Data Import

In [6]:
number_of_sequences = 100
datapath = os.getcwd() + "/data/SRR7817180_hBM-MSC_3.fastq"
df = read_fastq_to_pandas([datapath], 
                          num_records = number_of_sequences, verbose = False)

Processing File: /Users/awells/capstone/capstone2022/data/SRR7817180_hBM-MSC_3.fastq


In [7]:
df

Unnamed: 0,header,sequence,quality,instrument_name,run_id,flowcell_id,flowcell_lane_tile_number,x-coord_within_tile,y-coord_within_tile,member_pair,filtered,control_bits,index_sequence
0,@HWI-ST1075L:314:C3RTEACXX:8:1101:2032:1996 1:...,NGTCTTGTTATCGTTCACGCGCCTATCAAAATCTTT\n,#4:DDFFFHHHHHJJJJJJJJJJJIJJIJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,2032,1996 1,N,0,ACATTGGC\n
1,@HWI-ST1075L:314:C3RTEACXX:8:1101:3159:1999 1:...,NTGTATAGCACATACTTCAGGCCTGCGGCACCACCC\n,#1=DDFFFHHHHHJJJIJJGHJJIJJJJJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3159,1999 1,N,0,ACATTGGC\n
2,@HWI-ST1075L:314:C3RTEACXX:8:1101:3698:1978 1:...,NTCTCAGTGATAAATACTTCGACAGGACCACTTGAG\n,#1=DDFFEHHHHHJJJJJJJJJJJJIJJJIJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3698,1978 1,N,0,ACATTGGC\n
3,@HWI-ST1075L:314:C3RTEACXX:8:1101:3790:1978 1:...,NCCTGAAGCAACTGCTGTAACTGTGTTACAGTCTGA\n,#1=BDDFDFHHFDGGGGHIHGGGCFFGGEBCAFFAF\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3790,1978 1,N,0,ACATTGGC\n
4,@HWI-ST1075L:314:C3RTEACXX:8:1101:3902:1995 1:...,NTACTGTCTTCATTAGCACCATCCGCTCTCGCTTTG\n,#1=DDFFFHHHHHJJJIJJJIJJJIJJJIIGIJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3902,1995 1,N,0,ACATTGGC\n
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,@HWI-ST1075L:314:C3RTEACXX:8:1101:7164:2064 1:...,TCCAGCTTTTATTCTTTCTTTCTCTTTCCCTAGGTC\n,@@@DBDDDHHFFDGGIIAEHCH@EHGGIGGIH9CHH\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7164,2064 1,N,0,ACATTGGC\n
96,@HWI-ST1075L:314:C3RTEACXX:8:1101:7156:2084 1:...,TGCACATACAAACGGCCTATCTCGAGGAATTAAGTC\n,CCCFFFFFHHGHHJJIJJJJJJJJIIJIIIJHIJHI\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7156,2084 1,N,0,ACATTGGC\n
97,@HWI-ST1075L:314:C3RTEACXX:8:1101:7038:2125 1:...,GGAAGATCCAGGTACTGTGGCTCATCTCCCTCCTCA\n,@BCFFFFFHHHHDHIJIJJIJJJJJJIIJJJJJIJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7038,2125 1,N,0,ACATTGGC\n
98,@HWI-ST1075L:314:C3RTEACXX:8:1101:7201:2144 1:...,GTGGGATGCCTTCGTCTTGGCCCTCGACTTGGCCTT\n,<@@DADDFHGHHGIIJJJAEHIJJJIHIIJGIEGGI\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7201,2144 1,N,0,ACATTGGC\n


# Pre-processing

### Strip Newline Character from Sequence

In [8]:
df['sequence'] = df['sequence'].apply(lambda x: x.strip())
df['index_sequence'] = df['index_sequence'].apply(lambda x: x.strip())

### Update Dtypes

In [9]:
df['filtered'] = df['filtered'].astype('category')

### DateFrame Inspection

In [10]:
df.head()

Unnamed: 0,header,sequence,quality,instrument_name,run_id,flowcell_id,flowcell_lane_tile_number,x-coord_within_tile,y-coord_within_tile,member_pair,filtered,control_bits,index_sequence
0,@HWI-ST1075L:314:C3RTEACXX:8:1101:2032:1996 1:...,NGTCTTGTTATCGTTCACGCGCCTATCAAAATCTTT,#4:DDFFFHHHHHJJJJJJJJJJJIJJIJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,2032,1996 1,N,0,ACATTGGC
1,@HWI-ST1075L:314:C3RTEACXX:8:1101:3159:1999 1:...,NTGTATAGCACATACTTCAGGCCTGCGGCACCACCC,#1=DDFFFHHHHHJJJIJJGHJJIJJJJJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3159,1999 1,N,0,ACATTGGC
2,@HWI-ST1075L:314:C3RTEACXX:8:1101:3698:1978 1:...,NTCTCAGTGATAAATACTTCGACAGGACCACTTGAG,#1=DDFFEHHHHHJJJJJJJJJJJJIJJJIJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3698,1978 1,N,0,ACATTGGC
3,@HWI-ST1075L:314:C3RTEACXX:8:1101:3790:1978 1:...,NCCTGAAGCAACTGCTGTAACTGTGTTACAGTCTGA,#1=BDDFDFHHFDGGGGHIHGGGCFFGGEBCAFFAF\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3790,1978 1,N,0,ACATTGGC
4,@HWI-ST1075L:314:C3RTEACXX:8:1101:3902:1995 1:...,NTACTGTCTTCATTAGCACCATCCGCTCTCGCTTTG,#1=DDFFFHHHHHJJJIJJJIJJJIJJJIIGIJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3902,1995 1,N,0,ACATTGGC


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   header                     100 non-null    object  
 1   sequence                   100 non-null    object  
 2   quality                    100 non-null    object  
 3   instrument_name            100 non-null    object  
 4   run_id                     100 non-null    object  
 5   flowcell_id                100 non-null    object  
 6   flowcell_lane_tile_number  100 non-null    object  
 7   x-coord_within_tile        100 non-null    object  
 8   y-coord_within_tile        100 non-null    object  
 9   member_pair                100 non-null    object  
 10  filtered                   100 non-null    category
 11  control_bits               100 non-null    object  
 12  index_sequence             100 non-null    object  
dtypes: category(1), object(12)
memory us

In [12]:
def generate_N_grams(sequence, maxlen=5, reverse = False):
    
    array = []
    for i in range(len(sequence)- maxlen + 1):
        array.append(sequence[i:i+maxlen])
    
    sequence = sequence[::-1] # Reverse sequence
    
    if reverse:
        for i in range(len(sequence)- maxlen + 1):
            array.append(sequence[i:i+maxlen])
        
    return array

In [13]:
reverse_sequence = True
max_length = 8
df['short_sequence'] = df['sequence'].apply(lambda x: generate_N_grams(x,max_length,reverse_sequence ))

In [14]:
df

Unnamed: 0,header,sequence,quality,instrument_name,run_id,flowcell_id,flowcell_lane_tile_number,x-coord_within_tile,y-coord_within_tile,member_pair,filtered,control_bits,index_sequence,short_sequence
0,@HWI-ST1075L:314:C3RTEACXX:8:1101:2032:1996 1:...,NGTCTTGTTATCGTTCACGCGCCTATCAAAATCTTT,#4:DDFFFHHHHHJJJJJJJJJJJIJJIJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,2032,1996 1,N,0,ACATTGGC,"[NGTCTTGT, GTCTTGTT, TCTTGTTA, CTTGTTAT, TTGTT..."
1,@HWI-ST1075L:314:C3RTEACXX:8:1101:3159:1999 1:...,NTGTATAGCACATACTTCAGGCCTGCGGCACCACCC,#1=DDFFFHHHHHJJJIJJGHJJIJJJJJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3159,1999 1,N,0,ACATTGGC,"[NTGTATAG, TGTATAGC, GTATAGCA, TATAGCAC, ATAGC..."
2,@HWI-ST1075L:314:C3RTEACXX:8:1101:3698:1978 1:...,NTCTCAGTGATAAATACTTCGACAGGACCACTTGAG,#1=DDFFEHHHHHJJJJJJJJJJJJIJJJIJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3698,1978 1,N,0,ACATTGGC,"[NTCTCAGT, TCTCAGTG, CTCAGTGA, TCAGTGAT, CAGTG..."
3,@HWI-ST1075L:314:C3RTEACXX:8:1101:3790:1978 1:...,NCCTGAAGCAACTGCTGTAACTGTGTTACAGTCTGA,#1=BDDFDFHHFDGGGGHIHGGGCFFGGEBCAFFAF\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3790,1978 1,N,0,ACATTGGC,"[NCCTGAAG, CCTGAAGC, CTGAAGCA, TGAAGCAA, GAAGC..."
4,@HWI-ST1075L:314:C3RTEACXX:8:1101:3902:1995 1:...,NTACTGTCTTCATTAGCACCATCCGCTCTCGCTTTG,#1=DDFFFHHHHHJJJIJJJIJJJIJJJIIGIJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,3902,1995 1,N,0,ACATTGGC,"[NTACTGTC, TACTGTCT, ACTGTCTT, CTGTCTTC, TGTCT..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,@HWI-ST1075L:314:C3RTEACXX:8:1101:7164:2064 1:...,TCCAGCTTTTATTCTTTCTTTCTCTTTCCCTAGGTC,@@@DBDDDHHFFDGGIIAEHCH@EHGGIGGIH9CHH\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7164,2064 1,N,0,ACATTGGC,"[TCCAGCTT, CCAGCTTT, CAGCTTTT, AGCTTTTA, GCTTT..."
96,@HWI-ST1075L:314:C3RTEACXX:8:1101:7156:2084 1:...,TGCACATACAAACGGCCTATCTCGAGGAATTAAGTC,CCCFFFFFHHGHHJJIJJJJJJJJIIJIIIJHIJHI\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7156,2084 1,N,0,ACATTGGC,"[TGCACATA, GCACATAC, CACATACA, ACATACAA, CATAC..."
97,@HWI-ST1075L:314:C3RTEACXX:8:1101:7038:2125 1:...,GGAAGATCCAGGTACTGTGGCTCATCTCCCTCCTCA,@BCFFFFFHHHHDHIJIJJIJJJJJJIIJJJJJIJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7038,2125 1,N,0,ACATTGGC,"[GGAAGATC, GAAGATCC, AAGATCCA, AGATCCAG, GATCC..."
98,@HWI-ST1075L:314:C3RTEACXX:8:1101:7201:2144 1:...,GTGGGATGCCTTCGTCTTGGCCCTCGACTTGGCCTT,<@@DADDFHGHHGIIJJJAEHIJJJIHIIJGIEGGI\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7201,2144 1,N,0,ACATTGGC,"[GTGGGATG, TGGGATGC, GGGATGCC, GGATGCCT, GATGC..."


### Unstack n-seq
https://stackoverflow.com/questions/42012152/unstack-a-pandas-column-containing-lists-into-multiple-rows

In [15]:
lst_col = 'short_sequence'

df = pandas.DataFrame({ col:numpy.repeat(df[col].values, df[lst_col].str.len())
                       for col in df.columns.difference([lst_col])}).assign(**{lst_col:numpy.concatenate(df[lst_col].values)})[df.columns.tolist()]


In [16]:
df

Unnamed: 0,header,sequence,quality,instrument_name,run_id,flowcell_id,flowcell_lane_tile_number,x-coord_within_tile,y-coord_within_tile,member_pair,filtered,control_bits,index_sequence,short_sequence
0,@HWI-ST1075L:314:C3RTEACXX:8:1101:2032:1996 1:...,NGTCTTGTTATCGTTCACGCGCCTATCAAAATCTTT,#4:DDFFFHHHHHJJJJJJJJJJJIJJIJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,2032,1996 1,N,0,ACATTGGC,NGTCTTGT
1,@HWI-ST1075L:314:C3RTEACXX:8:1101:2032:1996 1:...,NGTCTTGTTATCGTTCACGCGCCTATCAAAATCTTT,#4:DDFFFHHHHHJJJJJJJJJJJIJJIJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,2032,1996 1,N,0,ACATTGGC,GTCTTGTT
2,@HWI-ST1075L:314:C3RTEACXX:8:1101:2032:1996 1:...,NGTCTTGTTATCGTTCACGCGCCTATCAAAATCTTT,#4:DDFFFHHHHHJJJJJJJJJJJIJJIJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,2032,1996 1,N,0,ACATTGGC,TCTTGTTA
3,@HWI-ST1075L:314:C3RTEACXX:8:1101:2032:1996 1:...,NGTCTTGTTATCGTTCACGCGCCTATCAAAATCTTT,#4:DDFFFHHHHHJJJJJJJJJJJIJJIJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,2032,1996 1,N,0,ACATTGGC,CTTGTTAT
4,@HWI-ST1075L:314:C3RTEACXX:8:1101:2032:1996 1:...,NGTCTTGTTATCGTTCACGCGCCTATCAAAATCTTT,#4:DDFFFHHHHHJJJJJJJJJJJIJJIJJJJJJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,2032,1996 1,N,0,ACATTGGC,TTGTTATC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5795,@HWI-ST1075L:314:C3RTEACXX:8:1101:7089:2213 1:...,GTCTGGGCTTCCGCAGCTTCTTCTTCTTGAAGTAAG,=@@FFFFFHHGHHIJJIJJJIIJJJJIJIIJDIJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7089,2213 1,N,0,ACATTGGC,CCTTCGGG
5796,@HWI-ST1075L:314:C3RTEACXX:8:1101:7089:2213 1:...,GTCTGGGCTTCCGCAGCTTCTTCTTCTTGAAGTAAG,=@@FFFFFHHGHHIJJIJJJIIJJJJIJIIJDIJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7089,2213 1,N,0,ACATTGGC,CTTCGGGT
5797,@HWI-ST1075L:314:C3RTEACXX:8:1101:7089:2213 1:...,GTCTGGGCTTCCGCAGCTTCTTCTTCTTGAAGTAAG,=@@FFFFFHHGHHIJJIJJJIIJJJJIJIIJDIJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7089,2213 1,N,0,ACATTGGC,TTCGGGTC
5798,@HWI-ST1075L:314:C3RTEACXX:8:1101:7089:2213 1:...,GTCTGGGCTTCCGCAGCTTCTTCTTCTTGAAGTAAG,=@@FFFFFHHGHHIJJIJJJIIJJJJIJIIJDIJJJ\n,HWI-ST1075L,314,C3RTEACXX,8,1101,7089,2213 1,N,0,ACATTGGC,TCGGGTCT


# Test Area

In [17]:
"""
import re

def quick_highlight(text_main, text_short):
    #text_main = subframe['sequence'][0]
    highlight_list = [text_short]
    highlight_str = r"(?:" + '|'.join(highlight_list) + r")\S"
    text_highlight = re.sub(highlight_str, '\033[44;33m\g<0>\033[m', text_main)
    return text_highlight
for i in range(len(df['sequence'])):
    df['sequence'][i] = quick_highlight(df['sequence'][i],df['short_sequence'][i])
df.sequence
"""

pass