# Packages

In [1]:
pip install primer3-py

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import glob
from collections import Counter
import tensorflow.compat.v1 as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from matplotlib import cm
from sklearn.manifold import TSNE
import os
import shutil
import time
import copy
import primer3


# Functions

## Basic Functions

In [3]:
def S3_to_sageMaker(sequence_type, variant_Name):
    '''
    :msg: open one CSV object of the project S3 bucket
    :param sequence_type: --- {str} --- first part of the object name (callers in this file pass 'seq' or 'seqName')
    :param variant_Name: --- {str} --- variant part of the object name (e.g. 'alpha')
    :return: the 'Body' entry of the S3 get_object response; callers in this file hand it to pd.read_csv
    '''
    # boto3 is not part of the notebook's import cell, so it is imported here;
    # without this the function raises NameError on a freshly started kernel.
    import boto3

    bucket = 'sars-cov-2-harry'
    key = 'seq_and_seqName/{}_GISAID_{}.csv'.format(sequence_type, variant_Name)

    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=key)
    return obj['Body']


def sys_path(basic_path):
    '''
    :msg: create the working directory tree of the pipeline; paths that already exist are left untouched
    :param basic_path: --- {str} --- root prefix; it is concatenated as-is, so it should end with a path separator
    :return: None
    '''
    def _ensure_dir(target):
        # Only create the directory (and its parents) when nothing exists at that path yet.
        if not os.path.exists(target):
            os.makedirs(target)

    shared_dirs = ['Seq_and_SeqName', 'index', 'model', 'Final_result']
    per_direction_dirs = ['filter', 'posPool', 'maxPool', 'filter_seq', 'featsVector', 'dataDNAFeatures',
                          'feature', 'Seq_appearance', 'result']
    reverse_only_dirs = ['model', 'amplicon_length', 'CG_Check', 'exist_primer_check', 'forward_primer',
                         'second_data']
    cg_check_dirs = ['new_primers', 'exist_primers']

    for variant in ['alpha', 'beta', 'gamma', 'delta', 'omicron']:
        _ensure_dir(basic_path + 'Variant_virus/GISAID_' + variant)
        for name in shared_dirs:
            _ensure_dir(basic_path + name)
        for direction in ['forward', 'reverse']:
            direction_root = basic_path + variant + '/' + direction + '/'
            for name in per_direction_dirs:
                _ensure_dir(direction_root + name)
            if direction != 'reverse':
                continue
            # Extra folders that only exist on the reverse side.
            for name in reverse_only_dirs:
                _ensure_dir(direction_root + name)
                if name == 'CG_Check':
                    for leaf in cg_check_dirs:
                        _ensure_dir(direction_root + name + '/' + leaf)


def readFASTA(fa):
    '''
    :msg: read a xxx.fasta file into memory
    :param fa: --- {str} --- the path of the xxx.fasta file
    :return: --- {dict} --- a dictionary with key = seqName (first whitespace-separated token after '>')
                            and value = the concatenated sequence lines of that record
    :raises ValueError: when sequence data appears before the first '>' header
    '''
    seqDict = {}
    seqName = None
    # `with` guarantees the handle is closed even if parsing fails half-way.
    with open(fa) as FA:
        for line in FA:
            if line.startswith('>'):
                # Drop only the leading marker (same rule as readFASTA_iter) so a
                # '>' inside the name is preserved.
                seqName = line[1:].split()[0]
                seqDict[seqName] = ''
                continue
            chunk = line.strip()
            if not chunk:
                # Blank lines carry no sequence data.
                continue
            if seqName is None:
                raise ValueError('sequence data found before the first FASTA header in {}'.format(fa))
            seqDict[seqName] += chunk
    return seqDict


def readFASTA_iter(fa):
    '''
    :msg: read a xxx.fasta file record by record
    :param fa: --- {str} --- the path of the xxx.fasta file
    :return: --- {generator} --- yields a (seqName, sequence) tuple for each record of the xxx.fasta file;
                                 seqName is the first whitespace-separated token after '>'
    '''
    with open(fa, 'r') as FA:
        seqName, pieces = '', []
        # Iterating the handle stops only at the real end of file, so a blank
        # line inside the file no longer ends the parse early.
        for raw in FA:
            line = raw.strip('\n')
            if line.startswith('>'):
                if seqName:
                    yield (seqName, ''.join(pieces))
                seqName = line[1:].split()[0]
                pieces = []
            else:
                # Collect pieces and join once per record instead of repeated string +=.
                pieces.append(line)
        # Flush the last record, which has no following header to trigger it.
        if seqName:
            yield (seqName, ''.join(pieces))


def getSeq(fa, querySeqName, start=1, end=0):
    '''
    :msg: return one named record of a xxx.fasta file, optionally cut to a sub-range
    :param fa: --- {str} --- the path of the xxx.fasta file
    :param querySeqName: --- {str} --- the name of the record to look up
    :param start: --- {int} --- 1-based position where the returned piece begins (defaults to 1);
                                a negative value counts from the end of the sequence
    :param end: --- {int} --- position where the piece stops (defaults to 0, meaning "up to the end")
    :return: --- {str} --- the requested piece, or None when no record carries that name
    '''
    # Negative starts are shifted by one so the `start - 1` offset below still counts from the end.
    if start < 0:
        start += 1
    offset = start - 1
    for name, fullSeq in readFASTA_iter(fa):
        if name != querySeqName:
            continue
        if end == 0:
            return fullSeq[offset:]
        piece = fullSeq[offset:end]
        print('The start position and end position is {} / {}'.format(offset, end))
        return piece


def getReverseComplement(sequence):
    '''
    :msg: build the reverse complement of a sequence
    :param sequence: --- {str} --- a sequence written with the letters A/T/C/G (any case)
    :return: --- {str} --- the upper-cased sequence with A<->T and C<->G swapped, read backwards;
                           any other character is kept (upper-cased)
    '''
    pairing = str.maketrans('ATCG', 'TAGC')
    complemented = sequence.upper().translate(pairing)
    return complemented.upper()[::-1]


def getGC(sequence):
    '''
    :msg: get the GC content of a sequence
    :param sequence: --- {str} --- a sequence (case-insensitive)
    :return: --- {float} --- the share of G and C characters in the sequence, between 0 and 1;
                             0.0 for an empty sequence
    '''
    # An empty sequence has no bases at all; report 0.0 instead of dividing by zero.
    if not sequence:
        return 0.0
    sequence = sequence.upper()
    return (sequence.count("G") + sequence.count("C")) / len(sequence)


def readSeqByWindow(sequence, winSize, stepSize):
    '''
    :msg: walk over a sequence with a sliding window
    :param sequence: --- {str} --- the sequence to slice
    :param winSize: --- {int} --- the Window size
    :param stepSize: --- {int} --- the Step size; nothing is produced when it is not positive
    :return: --- {generator} --- yields the content of each window in turn; the windows at the end
                                 may be shorter than winSize
    '''
    if stepSize <= 0:
        return False
    total = len(sequence)
    left = 0
    while True:
        right = left + winSize
        # Stop once the window start has moved past the last position worth reading.
        if right - stepSize >= total:
            break
        yield sequence[left:right]
        left += stepSize



## Model Functions

In [4]:
def oneHot(array, size):
    '''
    :msg: one-hot encode a list of class indices
    :param array: --- {sequence} --- class indices (each entry is converted with int())
    :param size: --- {int} --- the length of every encoded row
    :return: --- {np.ndarray} --- one row per entry, all zeros except a 1 at the entry's index
    '''
    def _encode(label):
        row = np.zeros(size)
        row[int(label)] = 1
        return row

    return np.array([_encode(array[pos]) for pos in range(len(array))])


def weight_variable(shape):
    '''
    :msg: declare a weight variable from its shape alone
    :param shape: --- the shape handed to tf.truncated_normal
    :return: a tf.Variable initialised from a truncated normal distribution with stddev 0.1
    '''
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def bias_variable(shape):
    '''
    :msg: declare a bias variable from its shape alone
    :param shape: --- the shape handed to tf.constant
    :return: a tf.Variable whose entries all start at 0.1
    '''
    return tf.Variable(tf.constant(0.1, shape=shape))


def getBatch_run(data, labels, size, run, vector, sampleSize):
    '''
    :msg: build the `run`-th consecutive batch, following the sample order given by `vector`
    :param data: --- {sequence of str} --- the sequences
    :param labels: --- {sequence} --- one label per sequence
    :param size: --- {int} --- the batch size
    :param run: --- {int} --- the batch number (0-based); the batch covers vector[run * size : run * size + size],
                              clipped to len(data)
    :param vector: --- {sequence of int} --- indices into data / labels giving the visiting order
    :param sampleSize: --- {int} --- the length of every encoded row (shorter sequences stay zero-padded)
    :return: --- {tuple} --- (encoded rows, labels) as two numpy arrays; bases are encoded as
                             C=0.25, T=0.50, G=0.75, A=1.0 and anything else as 0.0
    '''
    base_value = {'C': 0.25, 'T': 0.50, 'G': 0.75, 'A': 1.0}

    first = run * size
    last = min(first + size, len(data))
    picked = [vector[pos] for pos in range(first, last)]

    encoded, chosen_labels = [], []
    for item in picked:
        row = np.zeros(sampleSize)
        for col, base in enumerate(data[item]):
            row[col] = base_value.get(base, 0.0)
        encoded.append(row)
        chosen_labels.append(labels[item])
    return np.array(encoded), np.array(chosen_labels)


def getBatch(data, labels, size, sampleSize):
    '''
    :msg: draw a random batch (without replacement) and encode it
    :param data: --- {sequence of str} --- the sequences
    :param labels: --- {sequence} --- one label per sequence
    :param size: --- {int} --- how many samples to draw with random.sample
    :param sampleSize: --- {int} --- the length of every encoded row (shorter sequences stay zero-padded)
    :return: --- {tuple} --- (encoded rows, labels) as two numpy arrays; bases are encoded as
                             C=0.25, T=0.50, G=0.75, A=1.0 and anything else as 0.0
    '''
    base_value = {'C': 0.25, 'T': 0.50, 'G': 0.75, 'A': 1.0}

    population = list(range(len(data)))
    picked = random.sample(population, size)

    encoded, chosen_labels = [], []
    for item in picked:
        row = np.zeros(sampleSize)
        for col, base in enumerate(data[item]):
            row[col] = base_value.get(base, 0.0)
        encoded.append(row)
        chosen_labels.append(labels[item])
    return np.array(encoded), np.array(chosen_labels)



## Other Functions

### Get Data

In [5]:
def get_data(path):
    '''
    :msg: collect every FASTA record of each variant and store names and sequences as CSV files
    :param path: --- {str} --- root prefix; FASTA files are read from path + 'Variant_virus/GISAID_<variant>/'
                               and the CSV files are written to path + 'Seq_and_SeqName/'
    :return: None
    '''
    print('------------------ Processing get_data ------------------')

    for Name in ['alpha', 'beta', 'gamma', 'delta', 'omicron']:
        print('\nNow running the --- {} --- variant\n'.format(Name))
        records = []
        fasta_files = glob.glob(path + 'Variant_virus/GISAID_' + Name + '/*')
        for file_no, fasta in enumerate(fasta_files, start=1):
            for record in readFASTA_iter(fasta):
                records.append(record)
                # Progress line every 1000 records read for this variant.
                if len(records) % 1000 == 0:
                    print('{} ... No.{} file with {} sequence'.format(Name, file_no, len(records)))

        seqName_list = [name for name, _ in records]
        seq_list = [seq for _, seq in records]

        print('\n{} : {} sequence\n'.format(Name, len(seqName_list)))
        print('Saving the seqName_list...\n')
        seqName_target = path + 'Seq_and_SeqName/seqName_GISAID_' + Name + '.csv'
        pd.DataFrame(seqName_list).to_csv(seqName_target, header=None, index=None)
        print('\nSaving the seq_list...')
        seq_target = path + 'Seq_and_SeqName/seq_GISAID_' + Name + '.csv'
        pd.DataFrame(seq_list).to_csv(seq_target, header=None, index=None)


def mixed_data(basic_path, delta_num, other_percent=1):
    '''
    :msg: draw a random subset of every variant and store the mixed names, sequences and labels
    :param basic_path: --- {str} --- prefix of the three output files
                                     ('mix_seqName.csv', 'mix_mix_sequence.csv', 'mix_label.csv')
    :param delta_num: --- {int} --- base sample count; it is doubled before use
    :param other_percent: --- {float} --- factor applied to the (doubled) count for alpha, beta and gamma
                                          (defaults to 1); delta and omicron always use the doubled count
    :return: --- {int} --- the length of the longest sequence seen across all variants
    '''
    # (variant, class label, whether the sample count is scaled by other_percent)
    variant_plan = [
        ('alpha', 1, True),
        ('beta', 2, True),
        ('gamma', 3, True),
        ('delta', 4, False),
        ('omicron', 0, False),
    ]

    vectorSize = 0
    delta_num = delta_num * 2
    mix_seqName, mix_sequence, mix_label = [], [], []

    for Name, label, scaled in variant_plan:
        seqName = pd.read_csv(S3_to_sageMaker('seqName', Name), header=None).values.ravel()  # sequence names
        sequence = pd.read_csv(S3_to_sageMaker('seq', Name), header=None).values.ravel()  # sequences

        for entry in sequence:
            if len(entry) > vectorSize:
                vectorSize = len(entry)
        print('{}    vector size : {}'.format(Name, vectorSize))

        rand = np.random.randint(100000)
        keep = int(delta_num * other_percent) if scaled else delta_num

        # Re-seeding with the same value before each shuffle applies the same
        # permutation to the names and to the sequences, keeping them aligned.
        np.random.seed(rand)
        np.random.shuffle(seqName)
        mix_seqName.append(seqName[:keep])

        np.random.seed(rand)
        np.random.shuffle(sequence)
        mix_sequence.append(sequence[:keep])

        seq_labels = np.array([label] * len(seqName[:keep]))
        np.random.seed(rand)
        np.random.shuffle(seq_labels)
        mix_label.append(seq_labels)

    pd.DataFrame(mix_seqName).to_csv(basic_path + 'mix_seqName.csv', header=None, index=None)
    pd.DataFrame(mix_sequence).to_csv(basic_path + 'mix_mix_sequence.csv', header=None, index=None)
    pd.DataFrame(mix_label).to_csv(basic_path + 'mix_label.csv', header=None, index=None)

    return vectorSize


def train_valid_data(basic_path, n_splits=2):
    '''
    :msg: split the mixed data set into a training part and a validation part
    :param basic_path: --- {str} --- prefix of the three 'mix_*.csv' input files and of the 'index/' output folder
    :param n_splits: --- {int} --- the number of folds given to StratifiedKFold (defaults to 2)
    :return: None (six CSV files are written to basic_path + 'index/')
    '''
    mix_seqName = pd.read_csv(basic_path + 'mix_seqName.csv', header=None).values.ravel()
    mix_sequence = pd.read_csv(basic_path + 'mix_mix_sequence.csv', header=None).values.ravel()
    mix_label = pd.read_csv(basic_path + 'mix_label.csv', header=None).values.ravel()

    skf = StratifiedKFold(n_splits=n_splits)
    # Only the last fold produced by the splitter is kept.
    train_index, test_index = list(skf.split(mix_sequence, mix_label))[-1]

    outputs = [
        ('train_sequence', mix_sequence[train_index]),
        ('valid_sequence', mix_sequence[test_index]),
        ('train_seqName', mix_seqName[train_index]),
        ('valid_seqName', mix_seqName[test_index]),
        ('train_label', mix_label[train_index]),
        ('valid_label', mix_label[test_index]),
    ]
    for stem, values in outputs:
        pd.DataFrame(values).to_csv(basic_path + 'index/' + stem + '.csv', header=None, index=None)



### Feature

In [6]:

def posPool(path, vectorSize, max_Pooling_window_size):
    '''
    :msg: max-pool every filter output row window by window and remember where each maximum sits
    :param path: --- {str} --- prefix; inputs are read from path + 'filter/*.csv', results are written to
                               path + 'maxPool/' and path + 'posPool/'
    :param vectorSize: --- {int} --- the length used to derive the number of windows
    :param max_Pooling_window_size: --- {int} --- the number of columns per window
    :return: --- {int} --- the number of windows per row, int(vectorSize / max_Pooling_window_size) + 1
    '''
    numberWindows = int(vectorSize / max_Pooling_window_size) + 1

    for csv_path in glob.glob(path + 'filter/*.csv'):
        # The filter index is the text after the last '_' of the file name (extension removed).
        filterIndex = csv_path.split('/')[-1].split('.')[0].split('_')[-1]
        data = pd.read_csv(csv_path, header=None).values

        sizeData = np.shape(data)
        print(sizeData)
        n_rows, n_cols = sizeData[0], sizeData[1]

        pooled = np.zeros(shape=(n_rows, numberWindows))
        positions = np.zeros(shape=(n_rows, numberWindows))

        for row in range(n_rows):
            best, best_col = -1e6, -1
            filled, slot = 0, 0
            for col in range(n_cols):
                value = data[row][col]
                if value > best:
                    best, best_col = value, col
                filled += 1
                # Close the window when it is full or when the row ends.
                window_done = filled == max_Pooling_window_size
                if window_done or col == n_cols - 1:
                    pooled[row][slot] = best
                    positions[row][slot] = best_col
                    best, best_col = -1e6, -1
                    filled = 0
                    slot += 1

        pd.DataFrame(pooled).to_csv(path + 'maxPool/maxPool_' + str(filterIndex) + '.csv', header=None, index=None)
        pd.DataFrame(positions).to_csv(path + 'posPool/posPool_' + str(filterIndex) + '.csv', header=None, index=None)

    return numberWindows


def posPool_top(path, vectorSize, max_Pooling_window_size):
    '''
    :msg: for every filter output row keep the largest values of the whole row and their columns
    :param path: --- {str} --- prefix; inputs are read from path + 'filter/*.csv', results are written to
                               path + 'maxPool/' and path + 'posPool/'
    :param vectorSize: --- {int} --- the length used to derive how many values are kept
    :param max_Pooling_window_size: --- {int} --- the divisor used to derive how many values are kept
    :return: --- {int} --- the number of values kept per row, int(vectorSize / max_Pooling_window_size) + 1
    '''
    numberWindows = int(vectorSize / max_Pooling_window_size) + 1

    for csv_path in glob.glob(path + 'filter/*.csv'):
        filterIndex = csv_path.split('/')[-1].split('.')[0].split('_')[-1]
        data = pd.read_csv(csv_path, header=None).values

        sizeData = np.shape(data)
        print(sizeData)
        n_rows = sizeData[0]

        pooled = np.zeros(shape=(n_rows, numberWindows))
        positions = np.zeros(shape=(n_rows, numberWindows))

        for i in range(n_rows):
            print(i)

            remaining = list(data[i])
            top_values, top_cols = [], []
            for _ in range(numberWindows):
                peak = np.max(remaining)
                where = remaining.index(peak)
                # A picked entry is overwritten with 0 so the next round finds the next candidate.
                remaining[where] = 0
                top_values.append(peak)
                top_cols.append(where)

            pooled[i, :] = top_values
            positions[i, :] = top_cols

        pd.DataFrame(pooled).to_csv(path + 'maxPool/maxPool_' + str(filterIndex) + '.csv', header=None, index=None)
        pd.DataFrame(positions).to_csv(path + 'posPool/posPool_' + str(filterIndex) + '.csv', header=None, index=None)

    return numberWindows


def posPool_combination(path, vectorSize, max_Pooling_window_size, top_in_window):
    '''
    :msg: split every filter output row into windows and keep the largest values of each window
          together with their column positions
    :param path: --- {str} --- prefix; inputs are read from path + 'filter/*.csv', results are written to
                               path + 'maxPool/' and path + 'posPool/'
    :param vectorSize: --- {int} --- the length used to derive the number of output columns
    :param max_Pooling_window_size: --- {int} --- the number of columns per window
    :param top_in_window: --- {int} --- how many values are kept per window
    :return: --- {int} --- the number of output columns per row,
                           (int(vectorSize / max_Pooling_window_size) + 1) * top_in_window
    '''
    files = glob.glob(path + 'filter/*.csv')
    numberWindows = (int(vectorSize / max_Pooling_window_size) + 1) * top_in_window
    for file in files:
        filterIndex = file.split('/')[-1].split('.')[0].split('_')[-1]
        data = pd.read_csv(file, header=None).values

        sizeData = np.shape(data)

        print(sizeData)

        maxPool = np.zeros(shape=(sizeData[0], numberWindows))
        posPool = np.zeros(shape=(sizeData[0], numberWindows))

        for i in range(0, sizeData[0]):
            print(i)

            max_number = []
            max_index = []

            window_values = []
            window_no = 0
            for j in range(0, sizeData[1]):
                window_values.append(data[i][j])
                is_last_column = j == sizeData[1] - 1
                # Close the window when it is full or when the row ends.
                if len(window_values) == max_Pooling_window_size or is_last_column:
                    for _ in range(top_in_window):
                        number = np.max(window_values)
                        offset = window_values.index(number)
                        # A picked entry is overwritten with 0 so the next round finds the next candidate.
                        window_values[offset] = 0
                        max_number.append(number)
                        max_index.append(offset + window_no * max_Pooling_window_size)

                    window_no += 1
                    window_values = []

            # The row may yield fewer picks than numberWindows (e.g. when the column
            # count is an exact multiple of the window size); the remaining output
            # columns then stay 0 instead of raising IndexError.
            filled = min(numberWindows, len(max_number))
            if filled:
                maxPool[i, :filled] = max_number[:filled]
                posPool[i, :filled] = max_index[:filled]

        pd.DataFrame(maxPool).to_csv(path + 'maxPool/maxPool_' + str(filterIndex) + '.csv', header=None, index=None)
        pd.DataFrame(posPool).to_csv(path + 'posPool/posPool_' + str(filterIndex) + '.csv', header=None, index=None)

    return numberWindows

       
def creatFeatVector(path, numberWindows, vectorSize, numberFilters):
    '''
    :msg: cut the sub-sequence around every pooled position out of the sequences and store the distinct
          sub-sequences of each position file
    :param path: --- {str} --- prefix; positions are read from path + 'posPool/*.csv', sequences from
                               path + 'filter_seq/filter_seq.csv', results are written to path + 'featsVector/'
    :param numberWindows: --- {int} --- at most this many position columns are used per row
    :param vectorSize: --- {int} --- positions at or beyond this index are treated as padding
    :param numberFilters: --- {int} --- the length of the stored sub-sequences; for an even value the cut is
                                        one character longer and the last character is removed before saving
    :return: None (one 'featsVector_<index>.csv' per position file; its first row is always a fixed all-'N' row)
    '''
    pos_path = path + 'posPool/'
    path_seq = path + 'filter_seq/'
    files = glob.glob(pos_path + '/*.csv')
    if not files:
        return

    # An even length has no centre, so the cut is widened by one and trimmed again when saving.
    trim_last = numberFilters % 2 == 0
    if trim_last:
        numberFilters += 1
    padding = int((numberFilters - 1) / 2)
    valid_bases = set('CTGA')

    # The sequences are the same for every position file, so they are read once.
    sequences = pd.read_csv(path_seq + 'filter_seq.csv', header=None).values.ravel()

    for file in files:
        filterIndex = file.split('/')[-1].split('.')[0].split('_')[-1]
        print('Processing...   Loop 1 -- Index {}'.format(filterIndex))

        posMatrix = pd.read_csv(file, header=None).values
        sizePosMatrix = np.shape(posMatrix)
        used_columns = min(sizePosMatrix[1], numberWindows)

        featsVector = []
        seen = set()  # constant-time duplicate check; featsVector keeps first-seen order
        for i in range(sizePosMatrix[0]):
            seq = sequences[i]
            usable = min(len(seq), vectorSize)
            for j in range(used_columns):
                start = int(posMatrix[i][j]) - padding
                stop = start + numberFilters
                # A cut that leaves the sequence would contain padding; such cuts are not features.
                if start < 0 or stop > usable:
                    continue
                feature = seq[start:stop]
                if feature in seen or not valid_bases.issuperset(feature):
                    continue
                seen.add(feature)
                featsVector.append(feature)

        new_featsVector = ['NNNNNNNNNNNNNNNNNNNNN']
        if trim_last:
            new_featsVector.extend(feats[:-1] for feats in featsVector)
        else:
            # Odd lengths need no trimming; without this branch every feature was dropped.
            new_featsVector.extend(featsVector)

        pd.DataFrame(new_featsVector).to_csv(path + 'featsVector/featsVector_' + str(filterIndex) + '.csv',
                                             header=None, index=None)



def getFeature(basic_path, path, variant_Name, Number):
    '''
    :msg: keep the candidate features that occur exactly once in at least one sampled sequence of a variant
    :param basic_path: --- {str} --- not used by this function
    :param path: --- {str} --- prefix; candidates are read from path + 'featsVector/*.csv', results are
                               written to path + 'feature/'
    :param variant_Name: --- {str} --- the variant whose sequences are fetched through S3_to_sageMaker
    :param Number: --- {int} --- how many (randomly shuffled) sequences are checked
    :return: None (a 'features_<index>.csv' is written for every candidate file with at least one hit)
    '''
    files = glob.glob(path + 'featsVector/*.csv')
    seq_file = S3_to_sageMaker('seq', variant_Name)
    sequences = pd.read_csv(seq_file, header=None).values.ravel()  # get the sequence
    np.random.shuffle(sequences)
    sequences = sequences[:Number]
    total = len(sequences)

    for file in files:
        filterIndex = file.split('/')[-1].split('.')[0].split('_')[-1]
        print('Processing...   Loop 1 -- Index {}'.format(filterIndex))

        vector = pd.read_csv(file, header=None).values.ravel()
        print('Sequence Size: {}'.format(total))
        print('featVector Size: {}'.format(len(vector)))

        featureList = []
        for seq_no, seq in enumerate(sequences, start=1):
            if seq_no % 100 == 0:
                print('Calculating the features in sequence No.{}      with {} Total'.format(seq_no, total))
            for feature in vector:
                if feature not in seq:
                    continue
                # Only a single occurrence inside the sequence qualifies the feature.
                if seq.count(feature) == 1 and feature not in featureList:
                    featureList.append(feature)

        if featureList:
            pd.DataFrame(featureList).to_csv(path + 'feature/features_' + str(filterIndex) + '.csv', header=None, index=None)



### Appearance

In [7]:
def sameFeature(path):
    '''
    :msg: merge the feature files and store (a) the features that occur more than once across the files and
          (b) the distinct features that pass the C+G content filter
    :param path: --- {str} --- prefix; inputs are read from path + 'feature/*', the two result files
                               'Repeat_feature_List.csv' and 'nonRepeat_feature_List.csv' are written to path
    :return: None
    '''
    # Start from an empty list so a folder without feature files yields empty outputs
    # instead of a NameError.
    feature_data = []
    for file in glob.glob(path + 'feature/*'):
        feature_data.extend(pd.read_csv(file, header=None).values.ravel())

    # Occurrences per feature across all files. This counter must not be reused for the
    # per-character counting below, otherwise the repeat list ends up holding single letters.
    feature_counts = Counter(feature_data)
    RepeatList = [feature for feature, occurrences in feature_counts.items() if occurrences != 1]

    nonRepeatList = []
    for feature in set(feature_data):
        # Both C and G have to be present, and their share has to lie strictly between 30% and 70%.
        if 'C' in feature and 'G' in feature:
            CG_content = feature.count('C') + feature.count('G')
            if len(feature) * 0.3 < CG_content < len(feature) * 0.7:
                nonRepeatList.append(feature)

    pd.DataFrame(RepeatList).to_csv(path + 'Repeat_feature_List.csv', header=None, index=None)
    pd.DataFrame(nonRepeatList).to_csv(path + 'nonRepeat_feature_List.csv', header=None, index=None)


def get_appearance(feature_file, seq_file, high_type=1, accuracy=0.95):
    '''
    :msg: measure in which share of the sequences each feature occurs exactly once and keep the features
          whose share is high enough (or low enough)
    :param feature_file: --- the feature CSV (anything pd.read_csv accepts)
    :param seq_file: --- the sequence CSV (anything pd.read_csv accepts); at most 5000 randomly
                         chosen sequences are checked
    :param high_type: --- {int} --- 1 keeps features with share >= accuracy, any other value keeps
                                    features with share <= 1 - accuracy (defaults to 1)
    :param accuracy: --- {float} --- the threshold of the comparison (defaults to 0.95)
    :return: --- {DataFrame} --- one row per kept feature (feature as index) holding its share
    '''
    features = pd.read_csv(feature_file, header=None).values.ravel()
    seq = pd.read_csv(seq_file, header=None).values.ravel()

    np.random.shuffle(seq)
    seq = pd.DataFrame(seq[:5000])

    total_num = len(seq)
    want_high = high_type == 1
    featureDic = {}
    for number, feature in enumerate(features, start=1):
        print('feature ---- No. {}      with {} Total'.format(number, len(features)))

        hits = seq[0].apply(lambda x: x.count(feature))
        share = len(hits[hits == 1]) / total_num

        if want_high:
            if share >= accuracy:
                featureDic[feature] = share
        elif share <= 1 - accuracy:
            featureDic[feature] = share

    return pd.Series(featureDic).to_frame()


def calculateAppearance(basic_path, path, variant_Name):
    '''
    :msg: filter the candidate features against each variant in turn; every pass writes the variant's
          appearance table to 'Seq_appearance/' and the surviving features to 'result/temp_feature.csv'
    :param basic_path: --- {str} --- root folder that contains 'Variant_virus/'
    :param path: --- {str} --- working folder holding 'nonRepeat_feature_List.csv', 'Seq_appearance/' and 'result/'
    :param variant_Name: --- {str} --- the variant the features are selected for
    :return: None, all results are written to disk
    '''
    files = glob.glob(basic_path + 'Variant_virus/*')
    first_pass = True

    # NOTE(review): the indices below assume a specific ordering of the glob result, but glob does not
    # guarantee any order -- confirm on the target file system.
    order = [3, 1, 0, 2, 4]  # original order = ['alpha', 'beta', 'gamma', 'delta', 'omicron']
    for i in order:
        save_fileName = os.path.basename(files[i]).split('.')[0].replace('GISAID_', '')
        seq_file = S3_to_sageMaker('seq', save_fileName)

        print('Get the appearance in  -- {} --  virus'.format(save_fileName))

        # Pick the threshold for this variant. With high_type=1 an accuracy of -1 is always met,
        # so that pass only records the appearance without dropping any feature.
        if variant_Name == 'omicron':
            if save_fileName == 'omicron':
                high_type, accuracy = 1, 0.80
            elif save_fileName == 'delta':
                high_type, accuracy = 1, -1
            else:
                high_type, accuracy = 0, 0.90
        else:
            if save_fileName == 'omicron':
                high_type, accuracy = 1, -1
            elif save_fileName == variant_Name:
                high_type, accuracy = 1, 0.95
            else:
                high_type, accuracy = 0, 0.95

        # The first pass starts from the full candidate list, later passes from the previous survivors.
        if first_pass:
            feature_file = path + 'nonRepeat_feature_List.csv'
        else:
            feature_file = path + 'result/temp_feature.csv'

        featureDF = get_appearance(feature_file, seq_file, high_type=high_type, accuracy=accuracy)
        appearance_file = path + 'Seq_appearance/feature_' + save_fileName + '.csv'
        featureDF.to_csv(appearance_file)

        # Read the feature names back from the table just written and store them for the next pass.
        temp_feature = pd.read_csv(appearance_file)['Unnamed: 0'].values.ravel()
        pd.DataFrame(temp_feature).to_csv(path + 'result/temp_feature.csv', header=None, index=None)
        first_pass = False


def commonFeatureAppearance(path):
    '''
    :msg: collect the features that are listed in every file of 'Seq_appearance/' and save their appearance
          per variant, sorted by the omicron appearance (highest first), as 'Appearance_DataFrame.csv'
    :param path: --- {str} --- working folder that contains 'Seq_appearance/'
    :return: None, the table is written to path + 'Appearance_DataFrame.csv'
    '''
    files = glob.glob(path + 'Seq_appearance/*')
    if not files:
        raise FileNotFoundError('No appearance files found in ' + path + 'Seq_appearance/')

    Name_list = []
    lookups = []  # one {feature: appearance} mapping per file, in the same order as Name_list
    data = None
    for file in files:
        Name_list.append(os.path.basename(file).split('.')[0])

        # The files carry a header row; read without a header it shows up as a row whose first cell
        # is empty, and the notna filter drops it.
        table = pd.read_csv(file, header=None)
        table = table[table[0].notna()]

        lookup = {}
        for feature, appearance in zip(table[0], table[1]):
            lookup.setdefault(feature, appearance)  # keep the first row if a feature is listed twice
        lookups.append(lookup)

        # Keep only the features seen in every file so far, in the order of the first file.
        if data is None:
            data = list(table[0])
        else:
            data = [x for x in data if x in lookup]

    Appearance_list = []
    for seq in data:
        print('\nFor the sequence: {}\n'.format(seq))
        Appearance_list.append([lookup[seq] for lookup in lookups])

    # Passing the columns to the constructor keeps the table valid when no feature is shared.
    Appearance_DF = pd.DataFrame(Appearance_list, index=data, columns=Name_list)

    Appearance_DF = Appearance_DF[
        ['feature_alpha', 'feature_beta', 'feature_gamma', 'feature_delta', 'feature_omicron']]

    Appearance_DF = Appearance_DF.sort_values(by=['feature_omicron'], ascending=[False])
    Appearance_DF.to_csv(path + 'Appearance_DataFrame.csv')



### Forward --> Reverse

In [8]:
def get_forward_primers(path):
    '''
    :msg: gather the primers listed in the first (unnamed) column of every CSV in 'forward_primer/',
          drop the duplicates and save them as 'forward_primer.csv'
    :param path: --- {str} --- working folder that contains 'forward_primer/'
    :return: None, the primer list is written to path + 'forward_primer.csv'
    '''
    print('------------------ Processing get_data ------------------')

    # Sorted so the order of the saved list does not depend on the file system.
    files = sorted(glob.glob(path + 'forward_primer/*'))
    forward_primer_list = []
    print('Start to collect forward primers...')
    for file in files:
        forward_primer_list.extend(pd.read_csv(file)['Unnamed: 0'].values.ravel())
    count = len(forward_primer_list)

    print('Saving the forward primers...')
    # dict.fromkeys removes duplicates while keeping the first-seen order.
    forward_primer_list = list(dict.fromkeys(forward_primer_list))
    print('Finished !     ({}/{})\n'.format(len(forward_primer_list), count))
    pd.DataFrame(forward_primer_list).to_csv(path + 'forward_primer.csv', header=None, index=None)


def CG_content_chect(path, min_CG=0.35, max_CG=0.65):
    '''
    :msg: keep the forward primers that contain both 'C' and 'G' and whose combined C+G count lies strictly
          between min_CG and max_CG times the primer length
    :param path: --- {str} --- working folder that contains 'forward_primer.csv'
    :param min_CG: --- {float} --- exclusive lower bound of the C+G share (defaults to 0.35)
    :param max_CG: --- {float} --- exclusive upper bound of the C+G share (defaults to 0.65)
    :return: None, the retained primers are written to path + 'forward_primer_CG_check.csv'
    '''
    print('Checking the CG content of the forward primers obtained....')

    features = pd.read_csv(path + 'forward_primer.csv', header=None).values.ravel()

    def passes_check(primer):
        # A primer missing either base is rejected regardless of its share.
        if 'C' not in primer or 'G' not in primer:
            return False
        cg_total = primer.count('C') + primer.count('G')
        return len(primer) * min_CG < cg_total < len(primer) * max_CG

    Feature_List = list({primer for primer in features if passes_check(primer)})
    print('Finished !     ({}/{})        min={}  max={}'.format(len(Feature_List), len(features), min_CG, max_CG))
    pd.DataFrame(Feature_List).to_csv(path + 'forward_primer_CG_check.csv', header=None, index=None)


def get_after_primer_data(path, forward_primer, sequence, sample_size=6000):
    '''
    :msg: draw a random sample of the sequences and return, for every sampled sequence that contains the
          forward primer, the part that follows the first occurrence of the primer
    :param path: --- {str} --- unused, kept so existing callers keep working
    :param forward_primer: --- {str} --- the primer that is searched for
    :param sequence: --- {array-like of str} --- the sequences to sample from (left unmodified)
    :param sample_size: --- {int} --- maximum number of sequences that are scanned (defaults to 6000)
    :return: 1-D array with the text after the forward primer, one entry per matching sequence
    '''
    print('\nNow processing with forward primer:  {}\n'.format(forward_primer))
    in_count, out_count = 0, 0
    second_half_list = []

    # permutation() shuffles a copy, so the caller's array keeps its order.
    sampled = np.random.permutation(sequence)[:sample_size]
    for i, seq in enumerate(sampled):
        if i % 5000 == 0:
            print('No. {} sequence... with {} in the sequence     /     with {} out of the sequence'.format(
                i, in_count, out_count))
        # partition() splits at the first occurrence only, so a primer that appears again further
        # downstream does not truncate the returned tail.
        _, primer_found, second_half = seq.partition(forward_primer)
        if primer_found:
            in_count += 1
            second_half_list.append(second_half)
        else:
            out_count += 1
    print('\n{} : {} sequence\n'.format(forward_primer, len(second_half_list)))
    return pd.DataFrame(second_half_list).values.ravel()


def exist_primer_check(path, forward_primer, sequence):
    '''
    :msg: look for already known primers that could act as reverse primer: a primer qualifies when it
          occurs exactly once in at least 99% of the given sequences
    :param path: --- {str} --- working folder that contains 'forward_primer.csv' and 'exist_primer_check/'
    :param forward_primer: --- {str} --- the forward primer being processed (skipped as a candidate)
    :param sequence: --- {array-like of str} --- the sequences that are searched
    :return: None, the qualifying primers and their appearance are written to
             path + 'exist_primer_check/<forward_primer>.csv' when at least one is found
    '''
    primers = pd.read_csv(path + 'forward_primer.csv', header=None).values.ravel()
    print('Start the the exist primer check...        {} in total\n'.format(len(primers)))
    print('The processing forward primer is    {}'.format(forward_primer))

    total = len(sequence)
    min_num = int(total * 0.99)
    reverse_primers = {}

    # Without any sequence there is nothing to match and the ratio below would divide by zero.
    if total > 0:
        for primer in primers:
            if primer == forward_primer:
                continue
            available_count = sum(1 for seq in sequence if seq.count(primer) == 1)
            if available_count >= min_num:
                reverse_primers[primer] = available_count / total

    if reverse_primers:
        print('Yes')
        pd.Series(reverse_primers).to_frame().to_csv(path + 'exist_primer_check/' + forward_primer + '.csv')
    else:
        print('No')
    print('\nComplete the exist primer check !\n')


def generate_random_sequence(sequence, number):
    '''
    :msg: build random sequences that share the average A/T/C/G composition of the given sequences and
          are as long as the longest of them
    :param sequence: --- {iterable of str} --- the sequences whose base composition is imitated
    :param number: --- {int} --- how many random sequences each of the two returned lists holds
    :return: two lists (random_data_T, random_data_V) with `number` shuffled sequences each
    '''
    ''' Get the percentage of ATCG base pair in the sequence '''
    vectorSize = max((len(seq) for seq in sequence), default=0)

    percent_A, percent_T, percent_C, percent_G = [], [], [], []
    for seq in sequence:
        counter = Counter(seq)
        a, t, c, g = counter['A'], counter['T'], counter['C'], counter['G']
        total = a + t + c + g
        if total == 0:
            # No A/T/C/G at all: the sequence has no composition to contribute.
            continue

        percent_A.append(a / total)
        percent_T.append(t / total)
        percent_C.append(c / total)
        percent_G.append(g / total)

    mean_A = np.mean(percent_A) if percent_A else 0.0
    mean_T = np.mean(percent_T) if percent_T else 0.0
    mean_C = np.mean(percent_C) if percent_C else 0.0
    mean_G = np.mean(percent_G) if percent_G else 0.0

    ''' generate a new random sequence '''
    # Round the expected base count, not the fraction: truncating the fraction itself to an int
    # gives 0 for every base and leaves a template made of 'N' only.
    ATCG_random_list = []
    for base, mean in (('A', mean_A), ('T', mean_T), ('C', mean_C), ('G', mean_G)):
        ATCG_random_list += [base] * int(round(vectorSize * float(mean)))

    print('\nAverage "A" in the sequence      A --    {}'.format(mean_A))
    print('Average "T" in the sequence      T --    {}'.format(mean_T))
    print('Average "C" in the sequence      C --    {}'.format(mean_C))
    print('Average "G" in the sequence      G --    {}\n'.format(mean_G))

    # Rounding can leave the template slightly too long or too short: shuffle first so the bases
    # that are cut off are random, and pad a short template with 'N'.
    np.random.shuffle(ATCG_random_list)
    if len(ATCG_random_list) > vectorSize:
        ATCG_random_list = ATCG_random_list[:vectorSize]
    elif len(ATCG_random_list) < vectorSize:
        ATCG_random_list = ATCG_random_list + ['N'] * (vectorSize - len(ATCG_random_list))

    random_data_T = []
    for _ in range(number):
        np.random.shuffle(ATCG_random_list)
        random_data_T.append(''.join(ATCG_random_list))

    random_data_V = []
    for _ in range(number):
        np.random.shuffle(ATCG_random_list)
        random_data_V.append(''.join(ATCG_random_list))

    return random_data_T, random_data_V


def reverse_sameFeature(path):
    '''
    :msg: pool the features of every file in 'feature/' and save the features that occur more than once
          ('Repeat_feature_List.csv') and every distinct feature ('nonRepeat_feature_List.csv')
    :param path: --- {str} --- working folder that contains 'feature/'
    :return: None, both lists are written below path
    '''
    # Sorted so the order of the saved lists does not depend on the file system.
    feature_files = sorted(glob.glob(path + 'feature/*'))
    if not feature_files:
        raise FileNotFoundError('No feature files found in ' + path + 'feature/')

    feature_data = []
    for file in feature_files:
        feature_data.extend(pd.read_csv(file, header=None).values.ravel())

    CountFeature_dict = Counter(feature_data)

    RepeatList = [key for key, value in CountFeature_dict.items() if value != 1]
    # The Counter keys are the distinct features in first-seen order.
    nonRepeatList = list(CountFeature_dict)

    pd.DataFrame(RepeatList).to_csv(path + 'Repeat_feature_List.csv', header=None, index=None)
    pd.DataFrame(nonRepeatList).to_csv(path + 'nonRepeat_feature_List.csv', header=None, index=None)


def reverse_CG_content_chect(path, min_CG=0.35, max_CG=0.65):
    '''
    :msg: overwrite 'nonRepeat_feature_List.csv' with the features that contain both 'C' and 'G' and whose
          combined C+G count lies strictly between min_CG and max_CG times the feature length
    :param path: --- {str} --- working folder that contains 'nonRepeat_feature_List.csv'
    :param min_CG: --- {float} --- exclusive lower bound of the C+G share (defaults to 0.35)
    :param max_CG: --- {float} --- exclusive upper bound of the C+G share (defaults to 0.65)
    :return: 1 when at least one feature is kept, 2 when none is left
    '''
    feature_file = path + 'nonRepeat_feature_List.csv'
    candidates = pd.read_csv(feature_file, header=None).values.ravel()

    survivors = set()
    for candidate in candidates:
        # A feature missing either base is rejected regardless of its share.
        if 'C' not in candidate or 'G' not in candidate:
            continue
        cg_total = candidate.count('C') + candidate.count('G')
        if len(candidate) * min_CG < cg_total < len(candidate) * max_CG:
            survivors.add(candidate)

    Feature_List = list(survivors)
    pd.DataFrame(Feature_List).to_csv(feature_file, header=None, index=None)

    return 1 if Feature_List else 2


def reverse_calculateAppearance(basic_path, path, variant_Name):
    '''
    :msg: filter the candidate features against each variant in turn; every pass writes the variant's
          appearance table to 'Seq_appearance/' and the surviving features to 'result/temp_feature.csv'.
          The filtering stops as soon as no feature is left
    :param basic_path: --- {str} --- root folder that contains 'Variant_virus/'
    :param path: --- {str} --- working folder holding 'nonRepeat_feature_List.csv', 'Seq_appearance/' and 'result/'
    :param variant_Name: --- {str} --- the variant the features are selected for
    :return: 1 when features survived the last pass, 2 when the list became empty (0 if no pass ran)
    '''
    files = glob.glob(basic_path + 'Variant_virus/*')
    p_value = 0  # 0: no pass yet, 1: features left from the previous pass, 2: no feature left

    # NOTE(review): the indices below assume a specific ordering of the glob result, but glob does not
    # guarantee any order -- confirm on the target file system.
    order = [3, 1, 0, 2, 4]  # original order = ['alpha', 'beta', 'gamma', 'delta', 'omicron']
    for i in order:
        if p_value == 2:
            # Nothing left to test; skip the remaining variants (and their sequence downloads).
            break

        save_fileName = os.path.basename(files[i]).split('.')[0].replace('GISAID_', '')
        seq_file = S3_to_sageMaker('seq', save_fileName)

        print('Get the appearance in  -- {} --  virus'.format(save_fileName))

        # Pick the threshold for this variant. With high_type=1 an accuracy of -1 is always met,
        # so that pass only records the appearance without dropping any feature.
        if variant_Name == 'omicron':
            if save_fileName == 'omicron':
                high_type, accuracy = 1, 0.80
            elif save_fileName == 'delta':
                high_type, accuracy = 1, -1
            else:
                high_type, accuracy = 0, 0.90
        else:
            if save_fileName == 'omicron':
                high_type, accuracy = 1, -1
            elif save_fileName == variant_Name:
                high_type, accuracy = 1, 0.95
            else:
                high_type, accuracy = 0, 0.95

        # The first pass starts from the full candidate list, later passes from the previous survivors.
        if p_value == 0:
            feature_file = path + 'nonRepeat_feature_List.csv'
        else:
            feature_file = path + 'result/temp_feature.csv'

        featureDF = get_appearance(feature_file, seq_file, high_type=high_type, accuracy=accuracy)
        appearance_file = path + 'Seq_appearance/feature_' + save_fileName + '.csv'
        featureDF.to_csv(appearance_file)

        # Read the feature names back from the table just written and store them for the next pass.
        temp_feature = pd.read_csv(appearance_file)['Unnamed: 0'].values.ravel()
        p_value = 2 if len(temp_feature) == 0 else 1
        pd.DataFrame(temp_feature).to_csv(path + 'result/temp_feature.csv', header=None, index=None)

    return p_value


def reverse_commonFeatureAppearance(path, forward_primer):
    '''
    :msg: collect the features that are listed in every file of 'Seq_appearance/' and save their appearance
          per variant, sorted by the omicron appearance (highest first), under 'result/'
    :param path: --- {str} --- working folder that contains 'Seq_appearance/' and 'result/'
    :param forward_primer: --- {str} --- the forward primer, used as prefix of the output file name
    :return: None, the table is written to path + 'result/<forward_primer>_Appearance_DataFrame.csv'
    '''
    files = glob.glob(path + 'Seq_appearance/*')
    if not files:
        raise FileNotFoundError('No appearance files found in ' + path + 'Seq_appearance/')

    Name_list = []
    lookups = []  # one {feature: appearance} mapping per file, in the same order as Name_list
    data = None
    for file in files:
        Name_list.append(os.path.basename(file).split('.')[0])

        # The files carry a header row; read without a header it shows up as a row whose first cell
        # is empty, and the notna filter drops it.
        table = pd.read_csv(file, header=None)
        table = table[table[0].notna()]

        lookup = {}
        for feature, appearance in zip(table[0], table[1]):
            lookup.setdefault(feature, appearance)  # keep the first row if a feature is listed twice
        lookups.append(lookup)

        # Keep only the features seen in every file so far, in the order of the first file.
        if data is None:
            data = list(table[0])
        else:
            data = [x for x in data if x in lookup]

    Appearance_list = []
    for seq in data:
        print('\nFor the sequence: {}\n'.format(seq))
        Appearance_list.append([lookup[seq] for lookup in lookups])

    # Passing the columns to the constructor keeps the table valid when no feature is shared.
    Appearance_DF = pd.DataFrame(Appearance_list, index=data, columns=Name_list)

    Appearance_DF = Appearance_DF[
        ['feature_alpha', 'feature_beta', 'feature_gamma', 'feature_delta', 'feature_omicron']]

    Appearance_DF = Appearance_DF.sort_values(by=['feature_omicron'], ascending=[False])
    Appearance_DF.to_csv(path + 'result/' + forward_primer + '_Appearance_DataFrame.csv')


def get_available_primers(path):
    '''
    :msg: delete the temporary feature list and copy the result files whose forward primer is listed in
          'forward_primer_CG_check.csv' into the matching 'CG_Check/' sub-folder
    :param path: --- {str} --- working folder that contains 'forward_primer_CG_check.csv', 'result/',
                               'exist_primer_check/' and 'CG_Check/'
    :return: None, the selected files are copied below path + 'CG_Check/'
    '''
    print('Get available primers with 40%-60% CG content...')
    leftover = path + 'result/temp_feature.csv'
    if os.path.exists(leftover):
        os.remove(leftover)

    features = pd.read_csv(path + 'forward_primer_CG_check.csv', header=None).values.ravel()

    # (glob pattern, character that ends the primer in the file name, destination folder)
    copy_plan = (
        ('result/*', '_', 'CG_Check/new_primers/'),
        ('exist_primer_check/*', '.', 'CG_Check/exist_primers/'),
    )
    for pattern, delimiter, destination in copy_plan:
        for source in glob.glob(path + pattern):
            forward_primer = source.split('/')[-1].split(delimiter)[0]
            if forward_primer in features:
                shutil.copyfile(source, path + destination + forward_primer + '.csv')

    print('Finished ! \n')


def calculate_CG_content_and_melting_temperature(primers):
    '''
    :msg: compute the C+G share and the melting temperature
          Tm = 64.9 + 41 * (G + C - 16.4) / (A + T + G + C) of a primer
    :param primers: --- {str} --- the primer sequence
    :return: (CG_content, Tm) when the primer consists of exactly the bases A, T, C and G with each of
             them present at least once, otherwise (-1, -1)
    '''
    primers = list(primers)
    counter = Counter(primers)

    # Compare against the actual base set: four distinct symbols are not enough, a primer such as
    # 'ACTN' would otherwise reach the formula with a missing base count.
    if set(counter) != {'A', 'T', 'C', 'G'}:
        return -1, -1

    gc_count = counter['G'] + counter['C']
    base_count = counter['A'] + counter['T'] + gc_count
    Tm = 64.9 + 41 * (gc_count - 16.4) / base_count
    CG_content = gc_count / len(primers)
    return CG_content, Tm


def _collect_amplicon_records(path, files, sequence_full, allowed_reverse=None):
    """Build the flat record list of every forward/reverse primer pair found in ``files``.

    Each file is named after its forward primer and lists candidate reverse
    primers in its ``'Unnamed: 0'`` column.  For every pair that occurs in at
    least one sequence, ten values are appended in this order: forward primer,
    reverse primer, forward CG, reverse CG, forward Tm, reverse Tm,
    |Tm difference|, mean / max / min amplicon length.

    ``allowed_reverse`` optionally restricts the reverse primers to a whitelist.
    """
    records = []
    for count, file in enumerate(files, start=1):
        # os.path.basename copes with the platform separator returned by glob.
        forward_primer = os.path.basename(file).split('.')[0]
        print('No.{} out of {} file:          {}'.format(count, len(files), forward_primer))

        sequence = get_after_primer_data(path, forward_primer, sequence_full)
        reverse_primers = pd.read_csv(file)['Unnamed: 0'].values.ravel()
        for reverse_primer in reverse_primers:
            if allowed_reverse is not None and reverse_primer not in allowed_reverse:
                continue

            # Amplicon = text before the first reverse-primer hit + both primers.
            length = [len(seq.split(reverse_primer)[0]) + len(forward_primer) + len(reverse_primer)
                      for seq in sequence if reverse_primer in seq]
            if not length:
                continue

            f_CG, f_Tm = calculate_CG_content_and_melting_temperature(forward_primer)
            r_CG, r_Tm = calculate_CG_content_and_melting_temperature(reverse_primer)
            records.extend([forward_primer, reverse_primer,
                            f_CG, r_CG,
                            f_Tm, r_Tm, abs(f_Tm - r_Tm),
                            np.mean(length), np.max(length), np.min(length)])
    return records


def length_of_amplicon(path, sequence_full):
    """Measure the amplicon length of every primer pair and store the statistics.

    Two groups of forward-primer files are processed:

    * ``CG_Check/exist_primers/*`` -> ``amplicon_length/exist_primers.csv``
      (reverse primers must also be listed in ``forward_primer_CG_check.csv``)
    * ``CG_Check/new_primers/*``   -> ``amplicon_length/new_primers.csv``

    The output files are *flat*: one value per row, no header, ten consecutive
    rows per primer pair (see ``_collect_amplicon_records`` for the order).

    Parameters
    ----------
    path : str
        Working folder of one variant, ending with a path separator.
    sequence_full : sequence of str
        Passed through unchanged to ``get_after_primer_data``.
    """
    # path = '/Users/harry/Documents/Sars-Cov-2 Project/v4_Dataset/'

    print('Get the length of amplicon between the primers...\n')
    # A set gives constant-time membership tests inside the nested loops.
    feature_CG_Check = set(pd.read_csv(path + 'forward_primer_CG_check.csv', header=None).values.ravel())

    print('(1/2) Exist primers: ')
    primer_design = _collect_amplicon_records(path, glob.glob(path + 'CG_Check/exist_primers/*'),
                                              sequence_full, allowed_reverse=feature_CG_Check)
    pd.DataFrame(primer_design).to_csv(path + 'amplicon_length/exist_primers.csv', header=None, index=None)

    print('\n(2/2) new primers: ')
    primer_design = _collect_amplicon_records(path, glob.glob(path + 'CG_Check/new_primers/*'),
                                              sequence_full)
    pd.DataFrame(primer_design).to_csv(path + 'amplicon_length/new_primers.csv', header=None, index=None)


def reshape_file(path):
    """Turn the flat amplicon files into tables with one primer pair per row.

    ``amplicon_length/new_primers.csv`` and ``amplicon_length/exist_primers.csv``
    are read as header-less files, flattened, regrouped ten values per row and
    written back *in place* with the column names in ``Name_list``.

    Two situations are handled explicitly:

    * an empty input file (no primer pair was found) becomes a header-only table
      instead of raising ``pandas.errors.EmptyDataError``;
    * a file that already starts with the ``Name_list`` header is left untouched,
      so running this function twice does not turn the header into a data row.

    Parameters
    ----------
    path : str
        Working folder of one variant, ending with a path separator.
    """
    # path = '/Users/harry/Documents/Sars-Cov-2 Project/v4_Dataset/'

    Name_list = ['Forward primer', 'Reverse primer', 'Forward CG content', 'Reverse CG content',
                 'Forward Melting Temperature (Tm)', 'Reverse Melting Temperature (Tm)', 'Tm difference',
                 'amplicon_avg', 'amplicon_max', 'amplicon_min']

    for file_name in ('new_primers.csv', 'exist_primers.csv'):
        file = path + 'amplicon_length/' + file_name
        try:
            raw = pd.read_csv(file, header=None)
        except pd.errors.EmptyDataError:
            # Nothing was collected for this group: keep the schema, write no rows.
            pd.DataFrame(columns=Name_list).to_csv(file, index=None)
            continue

        if raw.shape[1] == len(Name_list) and list(raw.iloc[0]) == Name_list:
            # Already reshaped by an earlier run.
            continue

        table = pd.DataFrame(raw.values.ravel().reshape(-1, len(Name_list)), columns=Name_list)
        table.to_csv(file, index=None)


def _reverse_complement_with_tm(primer):
    """Return ``(reverse complement, Tm)`` of one primer.

    Characters other than A, C, G and T are skipped.  ``Tm`` is
    ``64.9 + 41 * (G + C - 16.4) / (A + T + G + C)`` when all four bases occur
    in the result, otherwise the sentinel ``-1``.
    """
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    new_primer = [complement[base] for base in reversed(list(primer)) if base in complement]

    if len(set(new_primer)) == 4:
        counter = Counter(new_primer)
        # new_primer holds only A/C/G/T, so its length equals A + T + G + C.
        Tm = 64.9 + 41 * (counter['G'] + counter['C'] - 16.4) / len(new_primer)
    else:
        Tm = -1
    return ''.join(new_primer), Tm


def _write_reverse_complement_table(source_file, target_file):
    """Rewrite the reverse-primer columns of ``source_file`` and save as ``target_file``."""
    Final_primers = pd.read_csv(source_file)
    converted = [_reverse_complement_with_tm(primer)
                 for primer in Final_primers['Reverse primer'].values.ravel()]

    Final_primers['Reverse primer'] = [sequence for sequence, _ in converted]
    Final_primers['Reverse Melting Temperature (Tm)'] = [Tm for _, Tm in converted]
    Final_primers['Tm difference'] = abs(Final_primers['Forward Melting Temperature (Tm)'] -
                                         Final_primers['Reverse Melting Temperature (Tm)'])
    Final_primers.to_csv(target_file, index=None)


def reverse(path):
    """Replace every reverse primer by its reverse complement and refresh its Tm.

    Reads ``amplicon_length/exist_primers.csv`` and ``amplicon_length/new_primers.csv``
    (tables with the columns ``'Reverse primer'``, ``'Forward Melting Temperature (Tm)'``,
    ``'Reverse Melting Temperature (Tm)'`` and ``'Tm difference'``) and writes the
    converted tables to ``amplicon_length/r_exist_primers.csv`` and
    ``amplicon_length/r_new_primers.csv``.

    Parameters
    ----------
    path : str
        Working folder of one variant, ending with a path separator.
    """
    # path = '/Users/harry/Documents/Sars-Cov-2 Project/v5_Dataset/'
    print('(1/2) Exist primers: ')
    _write_reverse_complement_table(path + 'amplicon_length/exist_primers.csv',
                                    path + 'amplicon_length/r_exist_primers.csv')
    print('Finished !')

    print('\n(2/2) new primers')
    _write_reverse_complement_table(path + 'amplicon_length/new_primers.csv',
                                    path + 'amplicon_length/r_new_primers.csv')
    print('Finished !')


def _primer_pair_tm_if_valid(forward_primer, reverse_primer):
    """Return ``(forward Tm, reverse Tm)`` if the pair passes every design rule, else ``None``.

    Rules, checked in this order:

    1. each primer has at least one C or G among its last three bases;
    2. neither homodimer has ``dg <= -9000``;
    3. the heterodimer does not have ``dg <= -9000``;
    4. the two melting temperatures differ by at most 5;
    5. both melting temperatures lie within [50, 60].
    """
    for primer in (forward_primer, reverse_primer):
        if 'C' not in primer[-3:] and 'G' not in primer[-3:]:
            return None

    if primer3.calcHomodimer(forward_primer).dg <= -9000 or primer3.calcHomodimer(reverse_primer).dg <= -9000:
        return None
    if primer3.calcHeterodimer(forward_primer, reverse_primer).dg <= -9000:
        return None

    # Each Tm is needed by several rules and by the caller: compute it once.
    forward_tm = primer3.calcTm(forward_primer)
    reverse_tm = primer3.calcTm(reverse_primer)
    if abs(forward_tm - reverse_tm) > 5:
        return None
    # The lower bound is checked on the forward primer itself here (the earlier
    # code compared the reverse primer's Tm with 50 in the forward-primer rule).
    if forward_tm > 60 or forward_tm < 50:
        return None
    if reverse_tm > 60 or reverse_tm < 50:
        return None
    return forward_tm, reverse_tm


def _filter_primer_table(source_file, target_file):
    """Drop the primer pairs of ``source_file`` that break a rule and save the rest."""
    primers = pd.read_csv(source_file)
    drop_list = []

    for i in range(len(primers)):
        temperatures = _primer_pair_tm_if_valid(primers['Forward primer'][i], primers['Reverse primer'][i])
        if temperatures is None:
            drop_list.append(i)
            continue

        # Surviving pairs get their Tm columns replaced by the primer3 values.
        forward_tm, reverse_tm = temperatures
        primers.loc[i, 'Forward Melting Temperature (Tm)'] = forward_tm
        primers.loc[i, 'Reverse Melting Temperature (Tm)'] = reverse_tm
        primers.loc[i, 'Tm difference'] = abs(forward_tm - reverse_tm)

    primers = primers.drop(index=drop_list)
    primers.to_csv(target_file, index=None)


def primer_design_rules(path):
    """Filter the reverse-complemented primer tables with primer3-based design rules.

    ``amplicon_length/r_exist_primers.csv`` and ``amplicon_length/r_new_primers.csv``
    are filtered (see ``_primer_pair_tm_if_valid`` for the rules) and written to
    ``amplicon_length/r_exist_primers_deltaG.csv`` and
    ``amplicon_length/r_new_primers_deltaG.csv``.  For the pairs that are kept,
    the three Tm columns are overwritten with values from ``primer3.calcTm``.

    Parameters
    ----------
    path : str
        Working folder of one variant, ending with a path separator.
    """
    # path = '/Users/harry/Documents/Sars-Cov-2 Project/v5_Dataset/'
    print('(1/2) Exist primers: ')
    _filter_primer_table(path + 'amplicon_length/r_exist_primers.csv',
                         path + 'amplicon_length/r_exist_primers_deltaG.csv')
    print('Finished !')

    print('\n(2/2) new primers')
    _filter_primer_table(path + 'amplicon_length/r_new_primers.csv',
                         path + 'amplicon_length/r_new_primers_deltaG.csv')
    print('Finished !')



# main

In [9]:
# Mini-batch sizes used when training the forward and the reverse CNN.
forward_batchSize, reverse_batchSize = 50, 50

In [10]:
# Root folder of the whole run; sys_path() creates the folder tree below it.
basic_path = './'
sys_path(basic_path)

variant_list = ['alpha', 'beta', 'gamma', 'delta', 'omicron']  # Can change the order

# Passed to creatFeatVector() in the forward-primer cell.
primer_length = 21

# generate_forward_number caps how many sequences per variant are pushed through the
# trained forward model; train_model_number_forward is handed to mixed_data() as delta_num.
generate_forward_number = 2000
train_model_number_forward = 2000

# train_model_number_reverse caps how many sequences are used to train each reverse model.
generate_reverse_number = 2000
train_model_number_reverse = 20000
# get_data(basic_path) # Run only once
# vectorSize is used as the input width of the CNN placeholders built further down.
vectorSize = mixed_data(basic_path, delta_num=train_model_number_forward, other_percent=1)

'-------------------------------------------------------------------------------------------------------------'
'----------------------------------------  Forward Method  ---------------------------------------------------'
'-------------------------------------------------------------------------------------------------------------'

# Exactly one forward_method is active; the settings of the other two methods are
# still defined because later cells reference them by name.
forward_method = 'Pooling'  # ['Pooling', 'Top', 'Combination']
forward_max_Pooling_window_size = 148
'-----------------------------------------------------------------'
# forward_method = 'Top'
# With 'Top', the pooling window is re-derived as int(vectorSize / forward_top_number) + 1.
forward_top_number = 175
'-----------------------------------------------------------------'
# forward_method = 'Combination'
# forward_max_Pooling_window_size = 500
# With 'Combination', this value is passed to posPool_combination().
forward_top_in_window = 10

'-------------------------------------------------------------------------------------------------------------'
'----------------------------------------  Reverse Method  ---------------------------------------------------'
'-------------------------------------------------------------------------------------------------------------'

# reverse_method = 'Pooling'  # ['Pooling', 'Top', 'Combination']
# reverse_max_Pooling_window_size = 148
'-----------------------------------------------------------------'
# Active reverse method: the pooling window is derived from reverse_top_number
# inside the reverse-primer cell.
reverse_method = 'Top'
reverse_top_number = 300
'-----------------------------------------------------------------'
# reverse_method = 'Combination'
# reverse_max_Pooling_window_size = 500
reverse_top_in_window = 10

In [12]:
# Create the train/validation index files, then load them as flat arrays.
train_valid_data(basic_path, n_splits=2)
if forward_method == 'Top':
    # 'Top' mode derives the pooling window from the requested number of windows.
    forward_max_Pooling_window_size = int(vectorSize / forward_top_number) + 1


def _read_index_column(file_name):
    """Load a header-less CSV from the ``index`` folder as a flat 1-D array."""
    return pd.read_csv(basic_path + 'index/' + file_name, header=None).values.ravel()


seq_T = _read_index_column('train_sequence.csv')
label_T = _read_index_column('train_label.csv')

seq_V = _read_index_column('valid_sequence.csv')
label_V = _read_index_column('valid_label.csv')

# NOTE(review): the "Test" arrays are loaded from the training files - confirm this is intended.
seq_Test = _read_index_column('train_sequence.csv')
label_Test = _read_index_column('train_label.csv')

# Shuffle each (sequences, labels) pair in place. Re-seeding with the same value
# before both shuffles applies the same permutation to the two arrays.
for _sequences, _labels in ((seq_T, label_T), (seq_V, label_V), (seq_Test, label_Test)):
    rand = np.random.randint(100000)
    np.random.seed(rand)
    np.random.shuffle(_sequences)
    np.random.seed(rand)
    np.random.shuffle(_labels)

## forward model

In [None]:
# Training log of the forward model. The handle is left open on purpose: the
# training cell appends one line per epoch and closes it afterwards.
f = open(basic_path + 'model/outputVector.txt', 'w')
f.write('1\n')
f.write('1\n')

# Parameters
batchSize = forward_batchSize
labelSize = int(np.max(label_T) + 1)  # number of classes = highest label id + 1
limit = 1.01  # stop threshold compared with `best`; presumably > 1 so that only iterMax ends training
iterMax = 50  # maximum number of epochs
# ------------------------------------------------
w1 = 12  # number of convolution filters
wd1 = 21  # width of each convolution filter
h1 = forward_max_Pooling_window_size  # 31029/148 ~ 210 ---> 31079/148 ~ 210
w4 = 256  # units of the fully connected layer
# ------------------------------------------------
# initialize variables
iter = 0
train_accuracy = 0.0
valid_accuracy = 0.0
test_accuracy = 0.0
# best validation accuracy
best = 0
validWindow = [0, 0, 0]
repeatWindow = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
validBest = 1e6  # lowest validation loss seen so far (starts at a large value)
yResult = []
yTest = []

oneHot_labels_T = oneHot(label_T, labelSize)
oneHot_labels_V = oneHot(label_V, labelSize)
oneHot_labels_Test = oneHot(label_Test, labelSize)
runs = int(len(oneHot_labels_T) / batchSize)  # full batches per epoch; a trailing partial batch is not counted

'----------------------------------------------------------------------------------------------------------'

# Tensorflow CNN model
print('\nStrat to build the CNN model...')
tf.disable_v2_behavior()
sess = tf.InteractiveSession()

x = tf.placeholder(tf.float32, [None, vectorSize])  # input variable
keep_prob = tf.placeholder(tf.float32)  # keep between 0.50 to 1.0
y_ = tf.placeholder(tf.float32, [None, labelSize])  # expected outputs variable
x_image0 = tf.reshape(x, [-1, 1, vectorSize, 1])  # arrange the tensor as an image (1*30145) 1 channel
x_image = tf.transpose(x_image0, perm=[0, 3, 2, 1])  # arrange the tensor into 1 channels (1*30145)

# 1 LAYER
# Convolution along the sequence axis (filter height 1, width wd1, 1 input channel,
# w1 filters), followed by non-overlapping max pooling over windows of h1 positions.
W_conv1 = weight_variable([1, wd1, 1, w1])
b_conv1 = bias_variable([w1])
h_conv1 = tf.nn.relu(tf.nn.conv2d(x_image, W_conv1, strides=[1, 1, 1, 1], padding='SAME') + b_conv1)
h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 1, h1, 1], strides=[1, 1, h1, 1], padding='SAME')

# Rectifier LAYER
# coef = number of values per sample after pooling; the pooled tensor is flattened to that width.
coef = int(h_pool1.get_shape()[1] * h_pool1.get_shape()[2] * h_pool1.get_shape()[3])  # 1 * 209.34 * 12 ~ 2512
h_pool2_flat = tf.reshape(h_pool1, [-1, coef])
W_fc1 = weight_variable([coef, w4])
b_fc1 = bias_variable([w4])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# Rectifier-Dropout LAYER
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
W_fc2 = weight_variable([w4, labelSize])
b_fc2 = bias_variable([labelSize])
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2  # logits; softmax is applied inside the loss

# Loss Function
# Mean softmax cross-entropy plus an L2 penalty (weight 0.001) on the convolution filters.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(logits=y_conv, labels=y_) + 0.001 * tf.nn.l2_loss(W_conv1))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
trueResult = tf.argmax(y_conv, 1)  # predicted class ids
trueTest = tf.argmax(y_, 1)  # expected class ids
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print('\nFinished the model building!')

### new model

In [None]:
# Train the forward CNN. After every epoch the model is evaluated; whenever the
# smoothed validation accuracy improves or the validation loss drops, a
# checkpoint and the current predictions are saved.
sess.run(tf.initialize_all_variables())
saver = tf.train.Saver()
plt.ion()

while (best < limit) and (iter < iterMax):
    # One epoch: visit the training samples in a fresh random order.
    # The list is shuffled once after it is built; shuffling after every append
    # costs O(n^2) per epoch and yields no better permutation.
    indexBatch = list(range(len(oneHot_labels_T)))
    random.shuffle(indexBatch)
    for run in range(0, runs):
        xa, ya = getBatch_run(seq_T, oneHot_labels_T, batchSize, run, indexBatch, vectorSize)
        train_step.run(feed_dict={x: xa, y_: ya, keep_prob: 0.5})

    # Training metrics are estimated on a single batch, validation metrics on the whole set.
    xa, ya = getBatch(seq_T, oneHot_labels_T, batchSize, vectorSize)
    train_accuracy = accuracy.eval(feed_dict={x: xa, y_: ya, keep_prob: 1.0})

    xaV, yaV = getBatch(seq_V, oneHot_labels_V, oneHot_labels_V.shape[0], vectorSize)
    valid_accuracy = accuracy.eval(feed_dict={x: xaV, y_: yaV, keep_prob: 1.0})
    cross_entropyVal = cross_entropy.eval(feed_dict={x: xaV, y_: yaV, keep_prob: 1.0})
    cross_entropyTrain = cross_entropy.eval(feed_dict={x: xa, y_: ya, keep_prob: 1.0})

    # Sliding windows over the most recent validation accuracies (oldest value out, newest in).
    validWindow.pop(0)
    validWindow.append(valid_accuracy)
    validWindowValue = sum(validWindow) / len(validWindow)
    repeatWindow.pop(0)
    repeatWindow.append(valid_accuracy)

    if validWindowValue > best or cross_entropyVal < validBest:
        validBest = cross_entropyVal
        best = validWindowValue

        # NOTE(review): the "test" figures below are computed on the validation set (seq_V).
        xaT, yaT = getBatch(seq_V, oneHot_labels_V, oneHot_labels_V.shape[0], vectorSize)
        test_accuracy = accuracy.eval(feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
        save_path = saver.save(sess, basic_path + "model/CNN_model.ckpt")

        results = correct_prediction.eval(feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
        yResult = trueResult.eval(feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
        yTest = trueTest.eval(feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})

        temp = 1.0 - best
        trueAcc = str(temp)
        print(trueAcc)
        with open(basic_path + 'model/outputVector_1.txt', 'w') as fOut:
            fOut.write('1\n')
            fOut.write('1\n')
            fOut.write(trueAcc + '\n')

    # Tab-separated epoch summary.
    log = "%d\t%g\t%g\t%g\t%g\t%g\t%g" % (iter, train_accuracy, valid_accuracy, best,
                                          test_accuracy, cross_entropyVal, cross_entropyTrain)
    print(log)
    f.write(log + '\n')
    iter = iter + 1

plt.ioff()

# `f` is the training log opened in the model-building cell.
f.close()
np.savetxt(basic_path + 'model/results.txt', yResult, fmt='%i', delimiter=' ')
np.savetxt(basic_path + 'model/test.txt', yTest, fmt='%i', delimiter=' ')
name = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' / CNN model'
with open(basic_path + 'model/log3.txt', 'a') as f:
    f.write(name + '\n')

temp = 1.0 - best
trueAcc = str(temp)
print(trueAcc)
with open(basic_path + 'model/outputVector_2.txt', 'w') as f:
    f.write('1\n')
    f.write('1\n')
    f.write(trueAcc + '\n')

# Report the activation shapes of the convolution and pooling layers on the validation set.
xaT, yaT = getBatch(seq_V, oneHot_labels_V, oneHot_labels_V.shape[0], vectorSize)

units = sess.run(h_conv1, feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
print(units.shape)
units = sess.run(h_pool1, feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
print(units.shape)


### existing model

In [None]:
# sess.run(tf.initialize_all_variables())
# saver = tf.train.Saver()
# saver.restore(sess, basic_path + 'model/CNN_model.ckpt')

# print('\n\nLoad CNN model complete!\n\n')

### forward primer

In [None]:
# For every variant: sample sequences, push them through the trained forward CNN,
# export the activation of each convolution filter and derive candidate primers.
# NOTE(review): S3_to_sageMaker() uses boto3, which the import cell at the top of
# the notebook does not import - confirm it is available before running this cell.
for variant_Name in variant_list:
    path = basic_path + variant_Name + '/forward/'
    seqName_file = S3_to_sageMaker('seqName', variant_Name)
    seqName = pd.read_csv(seqName_file, header=None).values.ravel()  # get the sequence name

    seq_file = S3_to_sageMaker('seq', variant_Name)
    sequence = pd.read_csv(seq_file, header=None).values.ravel()  # get the sequence

    # Shuffle names and sequences with the same seed so that seqName[i] still
    # belongs to sequence[i] (two independent shuffles break that pairing).
    rand = np.random.randint(100000)
    np.random.seed(rand)
    np.random.shuffle(seqName)
    np.random.seed(rand)
    np.random.shuffle(sequence)
    seqName = seqName[:generate_forward_number]
    sequence = sequence[:generate_forward_number]
    seq_labels = np.zeros(len(sequence), dtype=int)  # placeholder labels, all class 0

    # Encode each base as a number; any other character (and the padding) stays 0.
    base_code = {'C': 0.25, 'T': 0.50, 'G': 0.75, 'A': 1.0}
    print('\nStrat to transfer the sequence ...\n')
    data = np.zeros((len(sequence), vectorSize))
    for seq_count, seq in enumerate(sequence, start=1):
        if seq_count % 100 == 0:
            print('Transferring...  No.{} with total {}'.format(seq_count, len(seq_labels)))
        # Sequences longer than the model input are truncated instead of raising IndexError.
        seq = seq[:vectorSize]
        data[seq_count - 1, :len(seq)] = [base_code.get(base, 0.0) for base in seq]

    print('\nTransfer finished!\n')
    pd.DataFrame(sequence).to_csv(path + 'filter_seq/filter_seq.csv', header=None, index=None)
    oneHotLabels = oneHot(seq_labels, labelSize)

    # ----------------------------------------------------------------------------------------------------------
    # Filter files: one CSV per convolution filter, rows = sequences, columns = positions.
    print('the Filter files')
    units = sess.run(h_conv1, feed_dict={x: data, y_: oneHotLabels, keep_prob: 1.0})
    print('\nThe first h_conv1 layer size:      h_conv1 = {}\n'.format(units.shape))

    for filterIndex in range(units.shape[3]):
        print('Loop 1 : Generating the  {}  Filter file'.format(filterIndex))
        # Slice instead of copying element by element in Python; the float64 cast
        # keeps the numbers written to the CSV as they were with the np.zeros buffer.
        Mat = units[:, 0, :vectorSize, filterIndex].astype(np.float64)
        pd.DataFrame(Mat).to_csv(path + 'filter/filter_' + str(filterIndex) + '.csv', header=None, index=None)

    if forward_method == 'Pooling':
        numberWindows = posPool(path, vectorSize, forward_max_Pooling_window_size)
    elif forward_method == 'Top':
        numberWindows = posPool_top(path, vectorSize, forward_max_Pooling_window_size)
    elif forward_method == 'Combination':
        numberWindows = posPool_combination(path, vectorSize, forward_max_Pooling_window_size,
                                            forward_top_in_window)
    else:
        # Fail with a clear message instead of a NameError on numberWindows below.
        raise ValueError("forward_method must be 'Pooling', 'Top' or 'Combination', got {!r}".format(
            forward_method))

    creatFeatVector(path, numberWindows, vectorSize, primer_length)
    getFeature(basic_path, path, variant_Name, Number=2000)

    sameFeature(path)
    calculateAppearance(basic_path, path, variant_Name)
    commonFeatureAppearance(path)

sess.close()

In [28]:
# Hand each variant's forward-stage appearance table over to its reverse stage.
for variant_Name in variant_list:
    variant_root = basic_path + variant_Name
    old_path = variant_root + '/forward/' + 'Appearance_DataFrame.csv'
    new_path = variant_root + '/reverse/forward_primer/' + 'Appearance_DataFrame.csv'
    shutil.copyfile(src=old_path, dst=new_path)


## reverse primer

In [None]:
variant_list = ['delta', 'omicron']  # Can change the order

for variant_Name in variant_list:
    path = basic_path + variant_Name + '/reverse/'

    get_forward_primers(path)
    CG_content_chect(path, min_CG=0.4, max_CG=0.6)

    seq_file = S3_to_sageMaker('seq', variant_Name)
    sequence_full = pd.read_csv(seq_file, header=None).values.ravel()  # get the sequence

    primers = pd.read_csv(path + 'forward_primer_CG_check.csv', header=None).values.ravel()
    files = glob.glob(path + 'result/*')
    exist_primer_list = []
    for file in files:
        file = file.split('/')[-1].split('_')[0]
        if file == 'temp':
            pass
        else:
            exist_primer_list.append(file)
            
    temp_list = []
    for forward_primer in primers:
        if forward_primer not in exist_primer_list:
            temp_list.append(forward_primer)
            print(forward_primer)

    primers = temp_list
    
    count = 0

    for forward_primer in primers:
        sequence_original = get_after_primer_data(path, forward_primer, sequence_full)
        exist_primer_check(path, forward_primer, sequence_original)

        count += 1
        # file = path + 'second_data/' + forward_primer + '.csv'
        print('No.{} file, the forward primer is    {}'.format(count, forward_primer))
        # sequence_original = pd.read_csv(file).values.ravel()

        rand = np.random.randint(100000)
        np.random.seed(rand)
        np.random.shuffle(sequence_original)


        sequence = sequence_original[:train_model_number_reverse]
        number = len(sequence)
        random_sequence_T, random_sequence_V = generate_random_sequence(sequence, number)

        seq_T = sequence
        label_T = np.array([0 for x in range(len(seq_T))] + [1 for x in range(len(random_sequence_T))])
        seq_V = sequence
        label_V = np.array([0 for x in range(len(seq_V))] + [1 for x in range(len(random_sequence_V))])

        seq_T = np.array(list(seq_T) + random_sequence_T)
        seq_V = np.array(list(seq_V) + random_sequence_V)

        rand = np.random.randint(100000)
        np.random.seed(rand)
        np.random.shuffle(seq_T)
        np.random.seed(rand)
        np.random.shuffle(label_T)

        rand = np.random.randint(100000)
        np.random.seed(rand)
        np.random.shuffle(seq_V)
        np.random.seed(rand)
        np.random.shuffle(label_V)

        '----------------------------------------------------------------------------------------------------------'

        f = open(path + 'model/outputVector.txt', 'w')
        f.write('1\n')
        f.write('1\n')

        # Parameters
        kfoldIndex = 0
        batchSize = reverse_batchSize

        vectorSize = 0  # 31029 / 31079
        for i in range(len(sequence_original)):
            if len(sequence_original[i]) > vectorSize:
                vectorSize = len(sequence_original[i])

        if reverse_method == 'Top':
            reverse_max_Pooling_window_size = int(vectorSize / reverse_top_number) + 1

        labelSize = 2  # int(np.max(label_T) + 1)
        limit = 1.01
        iterMax = 1
        # ------------------------------------------------
        w1 = 12
        wd1 = 21
        h1 = reverse_max_Pooling_window_size  # 31029/148 ~ 210 ---> 31079/148 ~ 210
        w4 = 256

        # ------------------------------------------------
        # initialize variables
        iter = 0
        train_accuracy = 0.0
        valid_accuracy = 0.0
        test_accuracy = 0.0
        # best validation accuracy
        best = 0
        validWindow = [0, 0, 0]
        repeatWindow = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        validBest = 1e6
        yResult = []
        yTest = []

        oneHot_labels_T = oneHot(label_T, labelSize)
        oneHot_labels_V = oneHot(label_V, labelSize)
        runs = int(len(oneHot_labels_T) / batchSize)

        '----------------------------------------------------------------------------------------------------------'

        # Tensorflow CNN model
        print('\nStrat to build the CNN model...')
        tf.disable_v2_behavior()
        sess = tf.InteractiveSession()

        x = tf.placeholder(tf.float32, [None, vectorSize])  # input variable
        keep_prob = tf.placeholder(tf.float32)  # keep between 0.50 to 1.0
        y_ = tf.placeholder(tf.float32, [None, labelSize])  # expected outputs variable
        x_image0 = tf.reshape(x, [-1, 1, vectorSize, 1])  # arrange the tensor as an image (1*30145) 1 channel
        x_image = tf.transpose(x_image0, perm=[0, 3, 2, 1])  # arrange the tensor into 1 channels (1*30145)

        # 1 LAYER
        W_conv1 = weight_variable([1, wd1, 1, w1])
        b_conv1 = bias_variable([w1])
        h_conv1 = tf.nn.relu(tf.nn.conv2d(x_image, W_conv1, strides=[1, 1, 1, 1], padding='SAME') + b_conv1)
        h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 1, h1, 1], strides=[1, 1, h1, 1], padding='SAME')

        # Rectifier LAYER
        coef = int(
            h_pool1.get_shape()[1] * h_pool1.get_shape()[2] * h_pool1.get_shape()[3])  # 1 * 209.34 * 12 ~ 2512
        h_pool2_flat = tf.reshape(h_pool1, [-1, coef])
        W_fc1 = weight_variable([coef, w4])
        b_fc1 = bias_variable([w4])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

        # Rectifier-Dropout LAYER
        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
        W_fc2 = weight_variable([w4, labelSize])
        b_fc2 = bias_variable([labelSize])
        y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

        # Loss Function
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=y_conv, labels=y_) + 0.001 * tf.nn.l2_loss(W_conv1))
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
        correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
        trueResult = tf.argmax(y_conv, 1)
        trueTest = tf.argmax(y_, 1)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        print('\nFinished the model building!')

        '----------------------------------------------------------------------------------------------------------'

        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver()

        while ((best < limit) & (iter < iterMax)):
            indexBatch = []
            for iB in range(0, len(oneHot_labels_T)):
                indexBatch.append(iB)
                random.shuffle(indexBatch)
            for run in range(0, runs):
                xa, ya = getBatch_run(seq_T, oneHot_labels_T, batchSize, run, indexBatch, vectorSize)
                train_step.run(feed_dict={x: xa, y_: ya, keep_prob: 0.5})

            xa, ya = getBatch(seq_T, oneHot_labels_T, batchSize, vectorSize)
            train_accuracy = accuracy.eval(feed_dict={x: xa, y_: ya, keep_prob: 1.0})

            xaV, yaV = getBatch(seq_V, oneHot_labels_V, oneHot_labels_V.shape[0], vectorSize)
            valid_accuracy = accuracy.eval(feed_dict={x: xaV, y_: yaV, keep_prob: 1.0})
            cross_entropyVal = cross_entropy.eval(feed_dict={x: xaV, y_: yaV, keep_prob: 1.0})
            cross_entropyTrain = cross_entropy.eval(feed_dict={x: xa, y_: ya, keep_prob: 1.0})

            validWindowValue = 0
            tempValid = validWindow
            for i in range(0, len(validWindow) - 1):
                tempValid[i] = validWindow[i + 1]
            for i in range(0, len(validWindow)):
                validWindow[i] = tempValid[i]
            validWindow[len(validWindow) - 1] = valid_accuracy
            for i in range(0, len(validWindow)):
                validWindowValue = validWindowValue + validWindow[i]
            validWindowValue = validWindowValue / len(validWindow)
            tempValid = repeatWindow
            for i in range(0, len(repeatWindow) - 1):
                tempValid[i] = repeatWindow[i + 1]
            for i in range(0, len(repeatWindow)):
                repeatWindow[i] = tempValid[i]
            repeatWindow[len(repeatWindow) - 1] = valid_accuracy
            if np.var(repeatWindow) == 0 and iter > 10:
                iter = iter
            if validWindowValue > best or cross_entropyVal < validBest:
                validBest = cross_entropyVal
                best = validWindowValue

                xaT, yaT = getBatch(seq_V, oneHot_labels_V, oneHot_labels_V.shape[0], vectorSize)
                test_accuracy = accuracy.eval(feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
                if kfoldIndex == 0:
                    save_path = saver.save(sess, path + 'model/CNN_model.ckpt')

                results = correct_prediction.eval(feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
                yResult = trueResult.eval(feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
                yTest = trueTest.eval(feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})

                fOut = open(path + 'model/outputVector_1.txt', 'w')
                fOut.write('1\n')
                fOut.write('1\n')
                temp = 1.0 - best
                trueAcc = str(temp)
                print(trueAcc)
                fOut.write(trueAcc + '\n')
                fOut.close()
            log = "%d	%d	%g	%g	%g	%g	%g	%g" % (
                iter, kfoldIndex, train_accuracy, valid_accuracy, best,
                test_accuracy, cross_entropyVal, cross_entropyTrain)
            print(log)
            f.write(log + '\n')
            iter = iter + 1

        f.close()
        np.savetxt(path + 'model/results.txt', yResult, fmt='%i', delimiter=' ')
        np.savetxt(path + 'model/test.txt', yTest, fmt='%i', delimiter=' ')
        f = open(path + 'model/log3.txt', 'a')
        name = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' / CNN model'
        f.write(name + '\n')
        f.close()

        f = open(path + 'model/outputVector_2.txt', 'w')
        f.write('1\n')
        f.write('1\n')
        temp = 1.0 - best
        trueAcc = str(temp)
        print(trueAcc)
        f.write(trueAcc + '\n')
        f.close()

        xaT, yaT = getBatch(seq_V, oneHot_labels_V, oneHot_labels_V.shape[0], vectorSize)

        units = sess.run(h_conv1, feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
        print(units.shape)
        units = sess.run(h_pool1, feed_dict={x: xaT, y_: yaT, keep_prob: 1.0})
        print(units.shape)

        '----------------------------------------------------------------------------------------------------------'

        sequence = sequence_original[train_model_number_reverse:generate_reverse_number + train_model_number_reverse]
        seq_labels = np.array([0 for x in range(len(sequence))])

        seq_count = 0
        outData, outLabels = [], []
        print('\nStrat to transfer the sequence ...\n')
        for i in range(len(sequence)):
            seq_count += 1
            if seq_count % 100 == 0:
                print('Transferring...  No.{} with total {}'.format(seq_count, len(seq_labels)))
            sample = np.zeros(vectorSize)
            for j in range(0, len(sequence[i])):
                if sequence[i][j] == 'C':
                    sample[j] = 0.25
                elif sequence[i][j] == 'T':
                    sample[j] = 0.50
                elif sequence[i][j] == 'G':
                    sample[j] = 0.75
                elif sequence[i][j] == 'A':
                    sample[j] = 1.0
                else:
                    sample[j] = 0.0
            outData.append(sample)
            outLabels.append(seq_labels[i])

        print('\nTransfer finished!\n')
        pd.DataFrame(sequence).to_csv(path + 'filter_seq/filter_seq.csv', header=None, index=None)
        data = np.array(outData)
        seq_labels = np.array(outLabels)
        oneHotLabels = oneHot(seq_labels, labelSize)

        '----------------------------------------------------------------------------------------------------------'
        'Filter files'

        print('the Filter files')
        units = sess.run(h_conv1, feed_dict={x: data, y_: oneHotLabels, keep_prob: 1.0})
        print('\nThe first h_conv1 layer size:      h_conv1 = {}\n'.format(units.shape))

        sampleSize = int(data.shape[0])
        Mat = np.zeros((sampleSize, vectorSize))

        for filterIndex in range(units.shape[3]):
            print('Loop 1 : Generating the  {}  Filter file'.format(filterIndex))
            for testSize in range(sampleSize):
                for inputSize in range(vectorSize):
                    Mat[testSize][inputSize] = units[testSize][0][inputSize][filterIndex]
            pd.DataFrame(Mat).to_csv(path + '/filter/filter_' + str(filterIndex) + '.csv', header=None, index=None)


        '----------------------------------------------------------------------------------------------------------'
        '----------------------------------------------------------------------------------------------------------'
        '----------------------------------------------------------------------------------------------------------'

        if reverse_method == 'Pooling':
            numberWindows = posPool(path, vectorSize, reverse_max_Pooling_window_size)
        elif reverse_method == 'Top':
            numberWindows = posPool_top(path, vectorSize, reverse_max_Pooling_window_size)
        elif reverse_method == 'Combination':
            numberWindows = posPool_combination(path, vectorSize, reverse_max_Pooling_window_size,
                                                reverse_top_in_window)

        creatFeatVector(path, numberWindows, vectorSize, primer_length)
        getFeature(basic_path, path, variant_Name, Number=100)

        reverse_sameFeature(path)
        p_value = reverse_CG_content_chect(path, min_CG=0.4, max_CG=0.6)

        if p_value == 2:
            pass
        else:
            p_value = reverse_calculateAppearance(basic_path, path, variant_Name)
            if p_value == 2:
                pass
            else:
                reverse_commonFeatureAppearance(path, forward_primer)

    sess.close()

## Final

In [None]:
# Run the reverse-primer stage once per variant, in the order given by `variant_list`.
# NOTE(review): `variant_list` and `basic_path` are not defined in this cell; they must
# already exist in the kernel — confirm an earlier cell sets them.
# NOTE(review): the helpers called below appear to read notebook-level names such as
# `variant_Name` in addition to their `path` argument, so the loop variable's name is
# presumably load-bearing — confirm before renaming anything here.
for variant_Name in variant_list:
    # Per-variant working directory for this stage; every helper below receives it.
    path = basic_path + variant_Name + '/reverse/'
    
    # S3_to_sageMaker returns the body of the S3 object
    # 'seq_and_seqName/seq_GISAID_<variant>.csv'; pandas reads directly from it.
    seq_file = S3_to_sageMaker('seq', variant_Name)
    sequence_full = pd.read_csv(seq_file, header=None).values.ravel()  # header-less CSV flattened to a 1-D array

    # Preparation helpers (defined elsewhere in the notebook). Only length_of_amplicon
    # is handed the sequences; the other two get just the working directory.
    # NOTE(review): these presumably exchange data through files under `path`, so the
    # call order matters — verify against the helper definitions.
    get_available_primers(path)
    length_of_amplicon(path, sequence_full)
    reshape_file(path)

    # Main reverse-stage computation followed by the primer design-rule step; both
    # are defined elsewhere in the notebook and are called only for their side effects.
    reverse(path)
    primer_design_rules(path)

In [None]:
# Gather each variant's finished primer tables into the shared Final_result/
# directory. A variant can have an "exist" table, a "new" table, both, or
# neither; a table that was never written is skipped without error.
for variant_Name in variant_list:
    # Same order as before: the "exist" table first, then the "new" one.
    for result_kind in ('exist', 'new'):
        src_csv = '{}{}/reverse/amplicon_length/r_{}_primers_deltaG.csv'.format(
            basic_path, variant_Name, result_kind)
        dst_csv = '{}Final_result/{}_{}_primers_result.csv'.format(
            basic_path, result_kind, variant_Name)
        if os.path.exists(src_csv):
            shutil.copyfile(src_csv, dst_csv)