In [52]:
# Define k-mer merge function
# Reference : https://github.com/jerryji1993/DNABERT/blob/master/motif/motif_utils.py
# @param kmers : Array of k-mers
# @return : original sequence.
def kmer2seq(kmers):
    """
    Convert kmers to original sequence
    
    Arguments:
    kmers -- str, kmers separated by space.
    
    Returns:
    seq -- str, original sequence.
    """
    kmers_list = kmers.split(" ")
    bases = [kmer[0] for kmer in kmers_list[0:-1]]
    bases.append(kmers_list[-1])
    seq = "".join(bases)
    assert len(seq) == len(kmers_list) + len(kmers_list[0]) - 1
    return seq

# Reads file, merge k-mers in original file, and write the sequence in new file.
def generate_original_sequence(origin_file_path, target_file_path):
    origin = open(origin_file_path, 'r')
    target = open(target_file_path, 'w+')
    
    next(origin)
    for line in origin:
        arr = line.strip().split('\t')
        kmers = arr[0]
        label = arr[1]
        seq = kmer2seq(kmers)
        target.write(seq + '\t' + arr[1] + '\n')
    
    origin.close()
    target.close()
    
# Data and target path.
DATA_PATH = "data/ft/fine_tuning_sample_k-mer_3_ALPHA_BETA_DELTA.tsv"
TARGET_PATH = "data/ft/fine_tuning_sample_k-mer_3_ALPHA_BETA_DELTA_merged.txt"

# Vector constants for each base.
ONE_HOT_ENCODING = { 
    'A' : [1,0,0,0],
    'T' : [0,1,0,0], # T and U are basically the same thing. 
    'U' : [0,1,0,0], # T is in DNA and U is in RNA.
    'G' : [0,0,1,0],
    'C' : [0,0,0,1]
}

INTEGER_ENCODING = { 
    'A' : 1,
    'T' : 2, # T and U are basically the same thing. 
    'U' : 2, # T is in DNA and U is in RNA.
    'G' : 3,
    'C' : 4
}

LABELS_INTEGER = {
    '0' : 0,
    '1' : 1,
    '2' : 2
}

LABELS = {
    '0' : [1,0,0],
    '1' : [0,1,0],
    '2' : [0,0,1]
}


In [53]:
# Generate original sequence from given file containing kmers and label.
generate_original_sequence(DATA_PATH, TARGET_PATH)

In [125]:
# Generate one hot encoding representation from given seq.
# @param seq : A sequence and its label in format {seq}<tab>{label}.
# @param mode : Encoding mode, 'one-hot' and 'int'. Default mode is 'one-hot'.
# @return : one hot encoding vectors.
def one_hot_encoding_from_seq(seq, mode='one-hot'):
    vector = []
    labels = []
    arr = seq.strip().split('\t')
    seq = arr[0].strip()
    if mode == 'one-hot':
        label = LABELS[arr[1]]
    elif mode == 'int':
        label = LABELS_INTEGER[arr[1]]
    else:
        label = LABELS[arr[1]]
    for c in seq.strip():
        if mode == 'one-hot':
            ec = ONE_HOT_ENCODING[c]
        elif mode == 'int':
            ec = INTEGER_ENCODING[c]
        else:
            ec = ONE_HOT_ENCODING[c]
        vector.append(ec)
    return vector, label

# Reads sequences from file and convert into one hot encoding.
# @param source_file_path : File from which seqs are read.
# @param mode : Encoding mode, 'one-hot' and 'int'
# @return : vectors representating a set of sequences.
def read_one_hot_encoding_from_file(source_file_path, mode):
    f = open(source_file_path, 'r')
    m = []
    l = []
    line_count = 0
    for line in f:
        line_count += 1
        vector, label = one_hot_encoding_from_seq(line, mode)
        m.append(vector)
        l.append(label)
    print('finished reading {} lines.'.format(line_count))
    return m, l

In [126]:
# Generate one hot encoding from sequences.
features, labels = read_one_hot_encoding_from_file(TARGET_PATH, mode='one-hot')

finished reading 300 lines.


In [124]:
print('size of set of features : {}'.format(len(features)))
print('length of features : {}'.format(len(features[0])))
print('size of label : {}'.format(len(labels)))

size of set of features : 300
length of features : 102
size of label : 300


In [127]:
np_features = np.array(features)
np_labels = np.array(labels)
print(np_features.shape)
print(np_labels.shape)

(300, 102, 4)
(300, 3)


In [128]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [113]:
scaler = StandardScaler()
features = scaler.fit_transform(features)
features[:2]

ValueError: Found array with dim 3. StandardScaler expected <= 2.

In [117]:
kmeans = KMeans(init="random", n_clusters=3, n_init=10, random_state=42)
kmeans.fit(features)

ValueError: Found array with dim 3. Estimator expected <= 2.

In [106]:
kmeans.inertia_

22973.063146886525

In [107]:
print('cluster center : {}'.format(len(kmeans.cluster_centers_)))
kmeans.cluster_centers_

cluster center : 3


array([[ 1.59895416e+00, -1.42809356e+00, -7.17657702e-01,
        -8.75225992e-01,  8.73266559e-01,  8.62461884e-01,
        -1.29854417e+00, -6.92755363e-01,  1.32041575e+00,
         2.36102815e-01, -1.07563836e+00, -1.19153203e+00,
         1.65046451e+00, -8.37819745e-02, -3.66864419e-02,
         6.53023148e-02,  1.00576374e+00,  2.42259718e-01,
        -3.52008532e-01, -1.83976619e-01,  7.70761774e-01,
        -3.01041443e-01,  1.42243416e+00, -5.12341513e-01,
        -1.31104554e-01,  8.63186152e-01, -6.76057763e-02,
        -8.11372010e-01,  3.38715637e-01, -7.96297966e-01,
         4.59702205e-01,  9.94392614e-01, -7.23460230e-01,
         4.37405077e-01,  3.02207251e-01, -6.72576746e-01,
         8.78773782e-01, -3.22902277e-01,  1.72065212e+00,
        -4.05582920e-01, -9.49071165e-01, -7.31772978e-01,
        -4.74034958e-02,  8.20458986e-01,  1.05538069e-01,
        -1.11827781e+00, -1.06088809e+00,  1.41502985e+00,
        -7.05873156e-01, -2.65440275e-01, -2.67159080e-0

In [108]:
kmeans.n_iter_

11

In [109]:
print(labels[299])
print(kmeans.labels_[299])

2
1
