In [1]:
#  Data Preprocessing
import Bio

In [22]:
# Generate K-Mer from sequence
# Short example nucleotide sequence; it is passed to create_k_mer at the end of this cell.
s = "GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACT"

# Generate array of k-mer
# from https://github.com/jerryji1993/DNABERT/blob/master/motif/motif_utils.py
#
# @param sequence : sequence to process.
# @param k : length of each k-mer. If k is not positive, the sequence is returned unchanged.
# @param n_k_mer : how many k-mers are kept, counted from the start. To keep all k-mers, pass -1.
def create_k_mer(sequence, k, n_k_mer):
    """Split a sequence into overlapping k-mers joined by single spaces.

    @param sequence : sequence to tokenize.
    @param k : window length. When k is not positive the sequence is
               returned unchanged.
    @param n_k_mer : keep only the first n_k_mer k-mers. Any value that is
                     not positive keeps all of them.
    @return : the space-separated k-mers (an empty string when the sequence
              is shorter than k), or the original sequence when k is not
              positive.
    """
    # Guard clause: no tokenization requested.
    if not k > 0:
        return sequence

    # One window per start position, sliding a single character at a time.
    tokens = []
    for start in range(len(sequence) + 1 - k):
        tokens.append(sequence[start:start + k])

    # Optionally truncate to the leading n_k_mer windows.
    if n_k_mer > 0:
        tokens = tokens[:n_k_mer]
    return ' '.join(tokens)

# Demo: print the first 10 overlapping 6-mers of the example sequence.
kmer = create_k_mer(sequence=s, k=6, n_k_mer=10)
print(kmer)

GTTCTC TTCTCT TCTCTA CTCTAA TCTAAA CTAAAC TAAACG AAACGA AACGAA ACGAAC


In [23]:
# FASTA input files, one per SARS-CoV-2 lineage.
# B.1.1.7 (Alpha)
ALPHA_FASTA_PATH = "complete-nucl-sars_cov_2-B.1.1.7-human_origin.fasta"
# B.1.351 (Beta)
BETA_FASTA_PATH = "complete-nucl-sars_cov_2-B.1.351-human_origin.fasta"
# B.1.617.2 (Delta)
DELTA_FASTA_PATH = "complete-nucl-sars_cov_2-B.1.617.2-human_origin.fasta"
# P.1 (Gamma) is named alongside the lineages above, but no path is defined for it in this cell.

# Integer class labels written next to each sequence in the fine tuning files.
ALPHA_CLASS, BETA_CLASS, DELTA_CLASS = 1, 2, 3

# Size of each k-mer.
K_MER = 6
# Number of k-mers kept per sequence in the sample and prediction files.
N_K_MER = 100
# Number of sequences written per sample and prediction file.
N_SAMPLES = 10

In [24]:
# Create fine tuning file from fasta file.
from Bio import SeqIO
import os

# Generate file for fine tuning using FASTA file.
# @param fasta_file : Original fasta file.
# @param label_for_this_file : What label for this fine tuning file.
# @param output_file_path : What and where the fine tuning is named and stored. 
#                           If file path exists, existing file will be removed.
# @param n_samples : How many sequences are written to the fine tuning file.
#                    To write all sequences, pass -1.
# @param k_mer : Size of k-mer. If k-mers are not required, pass -1.
# @param n_k_mer : How many k-mers are written to the file for each sequence in the fasta file.
#                  To write all k-mers, pass -1.
def generate_sample_fine_tuning_file(fasta_file, label_for_this_file, output_file_path, n_samples, k_mer, n_k_mer):
    """Write a labelled fine tuning file built from the records of a FASTA file.

    Every selected record becomes one output line: the k-mer text produced
    by create_k_mer, a tab character, the label, and a newline.

    @param fasta_file : FASTA file to read (parsed with SeqIO.parse).
    @param label_for_this_file : label written on every line (converted with str()).
    @param output_file_path : where the fine tuning file is written.
                              An existing file at this path is removed first.
    @param n_samples : maximum number of records to write, taken from the
                       start of the file. A negative value (e.g. -1) writes
                       every record.
    @param k_mer : size of each k-mer. A non-positive value writes the raw sequence.
    @param n_k_mer : number of k-mers kept per record. A non-positive value keeps all.
    """
    records = list(SeqIO.parse(fasta_file, 'fasta'))

    # Truncate only for a non-negative limit. The earlier
    # `records[0:n_samples]` with n_samples == -1 (the documented "all
    # sequences" value) silently dropped the last record.
    if n_samples >= 0:
        records = records[:n_samples]

    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    label = str(label_for_this_file)
    # The with-block closes the file even if a write raises.
    with open(output_file_path, 'w') as output_file:
        for r in records:
            output_file.write(create_k_mer(str(r.seq), k_mer, n_k_mer) + '\t' + label + '\n')

In [None]:
# Generate Raw K-Mer from Fasta files.
# n_samples=-1 and n_k_mer=-1 are the documented "write everything" values.
generate_sample_fine_tuning_file(
    fasta_file=ALPHA_FASTA_PATH,
    label_for_this_file=ALPHA_CLASS,
    output_file_path='raw_alpha_' + str(K_MER) + '.txt',
    n_samples=-1,
    k_mer=K_MER,
    n_k_mer=-1,
)
generate_sample_fine_tuning_file(
    fasta_file=BETA_FASTA_PATH,
    label_for_this_file=BETA_CLASS,
    output_file_path='raw_beta_' + str(K_MER) + '.txt',
    n_samples=-1,
    k_mer=K_MER,
    n_k_mer=-1,
)
generate_sample_fine_tuning_file(
    fasta_file=DELTA_FASTA_PATH,
    label_for_this_file=DELTA_CLASS,
    output_file_path='raw_delta_' + str(K_MER) + '.txt',
    n_samples=-1,
    k_mer=K_MER,
    n_k_mer=-1,
)

In [None]:
# Generate fine tuning file
# One sample file per lineage, limited to N_SAMPLES records of N_K_MER k-mers each.
generate_sample_fine_tuning_file(
    fasta_file=ALPHA_FASTA_PATH,
    label_for_this_file=ALPHA_CLASS,
    output_file_path='fine_tuning_sample_alpha_' + str(K_MER) + '.txt',
    n_samples=N_SAMPLES,
    k_mer=K_MER,
    n_k_mer=N_K_MER,
)
generate_sample_fine_tuning_file(
    fasta_file=BETA_FASTA_PATH,
    label_for_this_file=BETA_CLASS,
    output_file_path='fine_tuning_sample_beta_' + str(K_MER) + '.txt',
    n_samples=N_SAMPLES,
    k_mer=K_MER,
    n_k_mer=N_K_MER,
)
generate_sample_fine_tuning_file(
    fasta_file=DELTA_FASTA_PATH,
    label_for_this_file=DELTA_CLASS,
    output_file_path='fine_tuning_sample_delta_' + str(K_MER) + '.txt',
    n_samples=N_SAMPLES,
    k_mer=K_MER,
    n_k_mer=N_K_MER,
)

In [25]:
# Generate data for predictions.
# @param fasta_file : Path to fasta file.
# @param output_file_path : What and where the prediction file is named and stored. 
#                           If file path exists, existing file will be removed.
# @param seq_index : Zero-based index of the first sequence to read.
# @param n_samples : How many sequences are used to create the prediction file.
# @param k_mer : Size of k-mer.
# @param n_k_mer : How many k-mers are written to the file for each sequence in the fasta file.
def generate_data_to_predict(fasta_file, output_file_path, seq_index, n_samples, k_mer, n_k_mer):
    """Write an unlabelled prediction file built from the records of a FASTA file.

    Every selected record becomes one output line holding the k-mer text
    produced by create_k_mer.

    @param fasta_file : FASTA file to read (parsed with SeqIO.parse).
    @param output_file_path : where the prediction file is written.
                              An existing file at this path is removed first.
    @param seq_index : zero-based index of the first record to use.
    @param n_samples : number of records to write, counted from seq_index.
                       A negative value writes every record from seq_index on.
    @param k_mer : size of each k-mer. A non-positive value writes the raw sequence.
    @param n_k_mer : number of k-mers kept per record. A non-positive value keeps all.
    """
    records = list(SeqIO.parse(fasta_file, 'fasta'))

    # Skip to seq_index first, then apply the count. The earlier slice
    # `records[seq_index:n_samples]` treated n_samples as an end index, so
    # seq_index == n_samples produced an empty file, and seq_index was ignored
    # altogether when the file held n_samples records or fewer.
    records = records[seq_index:]
    if n_samples >= 0:
        records = records[:n_samples]

    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    # The with-block closes the file even if a write raises.
    with open(output_file_path, 'w') as output_file:
        for r in records:
            output_file.write(create_k_mer(str(r.seq), k_mer, n_k_mer) + '\n')

In [None]:
# Generate data for prediction
# One prediction file per lineage. The original three calls all read
# ALPHA_FASTA_PATH (one with the misspelled name 'prediction_ample_alpha_'),
# and the third call overwrote the first call's output.
# seq_index=N_SAMPLES starts reading after the first N_SAMPLES records, which
# are the ones written to the fine tuning sample files.
generate_data_to_predict(ALPHA_FASTA_PATH, 'prediction_sample_alpha_'+str(K_MER)+'.txt', N_SAMPLES, N_SAMPLES, K_MER, N_K_MER)
generate_data_to_predict(BETA_FASTA_PATH, 'prediction_sample_beta_'+str(K_MER)+'.txt', N_SAMPLES, N_SAMPLES, K_MER, N_K_MER)
generate_data_to_predict(DELTA_FASTA_PATH, 'prediction_sample_delta_'+str(K_MER)+'.txt', N_SAMPLES, N_SAMPLES, K_MER, N_K_MER)