# Useful links

- https://www.tensorflow.org/datasets/api_docs/python/tfds#modules
- https://www.kaggle.com/thomasnelson/working-with-dna-sequence-data-for-ml
- https://www.kaggle.com/thomasnelson/working-with-dna-sequence-data-for-ml-part-2
- https://www.biorxiv.org/content/10.1101/353474v3.full
- https://biopython.org/
- https://en.wikipedia.org/wiki/K-mer

In [3]:
pip install --upgrade pandas biopython sklearn #install missing libraries

Collecting pandas
[?25l  Downloading https://files.pythonhosted.org/packages/52/3f/f6a428599e0d4497e1595030965b5ba455fd8ade6e977e3c819973c4b41d/pandas-0.25.3-cp36-cp36m-manylinux1_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 1.4MB/s eta 0:00:01
[?25hRequirement already up-to-date: biopython in /usr/local/lib/python3.6/dist-packages (1.76)
Requirement already up-to-date: sklearn in /usr/local/lib/python3.6/dist-packages (0.0)
Collecting pytz>=2017.2 (from pandas)
[?25l  Downloading https://files.pythonhosted.org/packages/e7/f9/f0b53f88060247251bf481fa6ea62cd0d25bf1b11a87888e53ce5b7c8ad2/pytz-2019.3-py2.py3-none-any.whl (509kB)
[K     |████████████████████████████████| 512kB 2.2MB/s eta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-0.25.3 pytz-2019.3
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [31]:
from Bio import SeqIO
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd 

In [45]:
# Parse FASTQ files

# './data/SRR062634.filt.fastq'
def readFastq(filePath):
    """Read a FASTQ file into a one-column DataFrame.

    Parameters
    ----------
    filePath
        Whatever ``SeqIO.parse`` accepts as its first argument
        (e.g. './data/SRR062634.filt.fastq').

    Returns
    -------
    pandas.DataFrame
        One row per record, in the order ``SeqIO.parse`` yields them, with a
        single ``seq`` column holding each record's sequence as a ``str``.
    """
    sequences = [str(record.seq) for record in SeqIO.parse(filePath, "fastq")]
    return pd.DataFrame(sequences, columns=['seq'])

def calculateKmers(data):
    """Split every sequence of ``data['seq']`` into its overlapping k-mers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``seq`` column of sequence strings.

    Returns
    -------
    pandas.Series
        Unnamed Series sharing ``data``'s index; each value is the list
        returned by ``getKmers`` (called with its default size) for that row.
    """
    # Map over the column itself instead of DataFrame.apply(axis=1): the
    # row-wise form builds a Series per row, and for a frame without rows it
    # returns an empty DataFrame rather than an empty Series.
    kmers = data['seq'].apply(getKmers)
    # Drop the inherited 'seq' name so the result matches the unnamed Series
    # the row-wise apply produced.
    return kmers.rename(None)

def calculateWordBag(data):
    """Join each entry's tokens into one space-separated sentence.

    Parameters
    ----------
    data : pandas.Series
        Each value is an iterable of strings (e.g. a list of k-mers).

    Returns
    -------
    pandas.Series
        Result of ``data.apply`` where every value has been replaced by its
        tokens joined with a single space.
    """
    def _to_sentence(tokens):
        return ' '.join(tokens)

    return data.apply(_to_sentence)
        
def getKmers(sequence, size=6):
    """Convert a string into its overlapping, lower-cased k-mers.

    Parameters
    ----------
    sequence : str
        String to convert.
    size : int, default 6
        K-mer word length; must be at least 1.

    Returns
    -------
    list of str
        Every window of ``size`` consecutive characters, from left to right,
        lower-cased. Empty when ``sequence`` is shorter than ``size``.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size < 1:
        # A non-positive size would otherwise silently produce empty or
        # wrongly sized "k-mers".
        raise ValueError("size must be a positive integer, got %r" % (size,))
    # Lower-case once up front instead of once per window.
    lowered = sequence.lower()
    return [lowered[start:start + size] for start in range(len(lowered) - size + 1)]






In [46]:
# Load the sequence of every FASTQ record into a DataFrame with one 'seq' column.
data = readFastq('./data/SRR062634.filt.fastq')

In [47]:
# Split each sequence into overlapping k-mers (getKmers' default size of 6).
kmers = calculateKmers(data)

In [48]:
# Join each row's k-mers with spaces into one "sentence" per sequence.
words = calculateWordBag(kmers)

In [50]:
# NOTE(review): this copies every sentence into a Python list and dumps all of
# them into the cell output; consider a preview such as words.head() instead.
list(words)

['tgatca gatcat atcatt tcattt catttg atttga tttgat ttgatt tgatta gattaa attaat ttaata taatac aatact atactg tactga actgac ctgaca tgacat gacatg acatgt catgta atgtag tgtaga gtagac tagaca agacaa gacaag acaaga caagaa aagaag agaaga gaagaa aagaaa agaaaa gaaaag aaaagt aaagta aagtat agtatg gtatgt tatgtt atgttt tgtttc gtttca tttcat ttcatg tcatgc catgct atgcta tgctat gctatt ctattt tatttt attttg ttttga tttgag ttgagt tgagta gagtaa agtaac gtaact taactt aacttc acttcc cttcca ttccat tccatt ccattt cattta atttag tttaga ttagaa tagaag agaagc gaagcc aagcct agccta gcctac cctact ctactc tactcc actcct ctcctg tcctga cctgag ctgagc tgagca gagcac agcaca gcacaa cacaac acaaca caacat aacatt',
 'aatgtt atgtta tgttat gttatt ttatta tattaa attaaa ttaaaa taaaaa aaaaat aaaatg aaatgg aatgga atggac tggaca ggacac gacacc acacct cacctt accttt cctttt cttttt tttttc ttttct tttctc ttctca tctcac ctcaca tcacac cacaca acacat cacatt acattc cattca attcag ttcagt tcagtt cagttt agtttc gtttca tttcat ttcatt tcattg cattgt attgtc ttgtct tgtctc 

In [43]:
# Preview the space-joined k-mer sentences through the shared helper rather
# than repeating its lambda inline.
calculateWordBag(kmers)

0         tgatca gatcat atcatt tcattt catttg atttga tttg...
1         aatgtt atgtta tgttat gttatt ttatta tattaa atta...
2         cagatc agatca gatcag atcaga tcagaa cagaat agaa...
3         tgggta gggtac ggtaca gtacag tacagt acagta cagt...
4         tgagtg gagtgt agtgtt gtgtta tgttat gttatt ttat...
                                ...                        
308841    cagctg agctgc gctgcc ctgcct tgccta gcctat ccta...
308842    attaaa ttaaat taaatg aaatgt aatgtt atgttt tgtt...
308843    cccgca ccgcat cgcatc gcatcc catccc atcccc tccc...
308844    agatgg gatggg atgggg tggggt ggggtt gggttt ggtt...
308845    aattaa attaaa ttaaaa taaaat aaaatt aaatta aatt...
Length: 308846, dtype: object

In [None]:
# Sanity-check getKmers on three short, hand-written sequences
mySeq, mySeq2, mySeq3 = (
    'CATGGCCATCCCCCCCCGAGCGGGGGGGGGG',
    'GATGGCCATCCCCGCCCGAGCGGGGGGGG',
    'CATGGCCATCCCCGCCCGAGCGGGCGGGG',
)

# Build one space-separated k-mer sentence per sequence.
# NOTE(review): `words` is rebound here to the k-mer list of the first
# sequence, replacing whatever an earlier cell stored under that name.
words = getKmers(mySeq, size=6)
kmer_sentence = ' '.join(words)
kmer_sentence2, kmer_sentence3 = (
    ' '.join(getKmers(sequence, size=6)) for sequence in (mySeq2, mySeq3)
)

print(kmer_sentence)

In [None]:
# Build the Bag of Words model over the three k-mer sentences
cv = CountVectorizer()
X = cv.fit_transform(
    [kmer_sentence, kmer_sentence2, kmer_sentence3]
)
X = X.toarray()  # convert the fitted result to a plain array before printing

# Display the 3 vectorized DNA sequences
print(X)