-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmers2.py
143 lines (122 loc) · 4.8 KB
/
kmers2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas as pd
import numpy as np
from itertools import product
from git.beiko_lab.DNA_encoders.base_for_encoders import BaseForEncoder
from Codes.Libraries.raw_data.fasta import parse_fasta
class Kmers2(BaseForEncoder):
"""
Decompose a sequence into kmers and count them.
This version 2 is almost twice faster than version 1 (i.e. Class Kmers)
"""
def __init__(self, binary=True, list_of_ks=[4]):
super().__init__(binary=binary)
self.list_of_ks = list_of_ks
self.oligonucs = {
1: ["A", "C", "G", "T"],
2: ["".join(c) for c in product("ACGT", repeat=2)],
3: ["".join(c) for c in product("ACGT", repeat=3)],
4: ["".join(c) for c in product("ACGT", repeat=4)],
5: ["".join(c) for c in product("ACGT", repeat=5)],
6: ["".join(c) for c in product("ACGT", repeat=6)],
}
def bulk_count_kmers_several_ks(self, seqs, list_of_ks=None):
"""
Parameters
----------
seq : str
DNA sequence to encode
list_of_ks : list
List of values for the k-mers encoder. The values must be integers.
Returns
----------
feature_vector : np.ndarray
Array of all the encoded sequences.
"""
if list_of_ks is None:
list_of_ks = self.list_of_ks
# decompose each seq in series into the ks in list of ks
decompose = []
length = len(seqs[0])
for seq in seqs:
d=[]
for k in list_of_ks:
d.extend([[seq[i:i + k]] for i in range(0, length - k + 1)])
# converting list to array and counting unique elements
u,c = np.unique(d, return_counts=True)
decompose.append(pd.DataFrame(c.reshape([1,u.shape[0]]), columns=u))
del seqs
# create an empty DataFrame
columns = []
for k in list_of_ks:
columns.extend(self.oligonucs[k])
feature_vector = pd.DataFrame(columns=columns)
for d in decompose:
feature_vector = pd.concat([feature_vector, d], ignore_index=True, axis=0, sort=True)
feature_vector.fillna(value=0, inplace=True)
return feature_vector
# def count_kmers_single_k(self, seq, k=3):
# """
# Count k-mer occurrences in a given seq.
#
# Parameters
# ----------
# seq : string
# A single DNA sequence.
# k : int
# The value of k for which to count kmers.
#
# Returns
# -------
# counts : pd.DataFrame
# DataFrame where columns are oligonucleotides and line 0 has the # of times each oligonucleotide appears in seq
# A dictionary of counts keyed by their individual kmers (strings
# of length k).
# """
# # Calculate how many kmers of length k there are
# num_kmers = len(seq) - k + 1
# # decompose sequence into k-mers
# decomposed = [seq[i:i + k] for i in range(0, num_kmers)]
# # count how many k-mers there are in the list 'decomposed'
# u,c = np.unique(decomposed, return_counts=True)
# df = pd.DataFrame(c, columns=u)
# # Return the final counts
# return df
def encode_fasta_file(
self,
fastafile: str,
outputfile: str,
list_of_ks: list = None,
add_class_to_entries: bool = True,
):
"""
Encodes a fastafile into a feature vector of k-mers for k in list_of_k
and stores it into outputfile.
Parameters
----------
fastafile : string
Each entry in this file must have 58 bp
list_of_ks : list
List of the numbers we want k to have
outputfile : string
Path and file name where to store the encoded sequences
add_class_to_entries : boolean
If the sequence's label should be appended to the encoded sequence
verbose : boolean
If the print statements should be enabled
Returns
-------
None
"""
if not list_of_ks:
list_of_ks = self.list_of_ks
else:
self.list_of_ks = list_of_ks
print(f"Encoding {fastafile} into k-mers for k in {list_of_ks}...")
regs = parse_fasta(fastafile, split_key_into_keydescription=False, notify_if_more_than_one_contig=False)
# count the number of oligonucleotides
kmers_to_save_in_file = self.bulk_count_kmers_several_ks(regs.sequence.values)
if add_class_to_entries:
kmers_to_save_in_file = pd.concat([kmers_to_save_in_file, regs[["key"]]], axis=1)
del regs
kmers_to_save_in_file.to_csv(outputfile, header=False, index=False)
print(f"Encoded sequence saved at {outputfile}")