functions.py

import numpy as np
import random

def quan_detector(dataset,label):
    """
    the percent of zeros(n) promoter
    """
    n = len(dataset[1])
    N = len(dataset)
    p = [0]*n
    count = 0.
    pos_count = 0.
    neg_count = 0.
    for i in range(N):
        d = dataset[i]
        if all(d == p):
            count += 1.
            if label[i][0] == 1:
                pos_count += 1.
            else:
                neg_count += 1.
    return count/N, pos_count,neg_count
#quan_detector([[0,0,0],[0,1,0],[0,2,0],[1,0,1]])

def most_repeared_promoter(dataset,label):
    N = len(dataset)
    n = len(dataset[1])
    zeros = ','.join(map(str,['0']*n))
    # print len(zeros)-n
    dict_count = {}
    for i in range(N):
        str_prom = ','.join(map(str,dataset[i]))
        if str_prom not in dict_count.keys():
            dict_count[str_prom] = [1,0,0]
            if label[i][0] == 1:
                dict_count[str_prom][1] = 1
            else:
                dict_count[str_prom][2] = 1
        else:
            dict_count[str_prom][0] += 1
            if label[i][0] == 1:
                dict_count[str_prom][1] += 1
            else:
                dict_count[str_prom][2] += 1
    if zeros in dict_count.keys():
        dict_count.pop(zeros) # remove without
    count = np.array(dict_count.values())[:,0]
    max_count = max(count)
    for k,v in dict_count.items():
        if v[0] == max_count:
            idx_temp = k
    idx = idx_temp.split(',')
    return idx, max_count, dict_count[idx_temp]

###############################################################
#########          Dataset generation            ##############
###############################################################
def indx(lab):
    #     lab = np.argmax(lab,axis=1)
    p = []  # positive samples index-- ALS
    n = []  # negative samples index-- Non-ALS
    for i in range(len(lab)):
        if lab[i] == 0:
            p.append(i)
        else:
            n.append(i)
    return p, n


def dataset(X, Y, test_ratio):
    lab = np.argmax(Y, axis=1)
    pos_s, neg_s = indx(lab)

    N = len(lab)
    idx = range(N)

    N_te = int(N * test_ratio) / 5 * 5  # number of test samples
    N_tr = N - N_te  # number of training samples

    pos_s_te = int(N_te * 0.5)
    neg_s_te = int(N_te * 0.5)

    random.shuffle(pos_s)
    random.shuffle(neg_s)

    pos_idx_te = pos_s[:pos_s_te]
    neg_idx_te = neg_s[:neg_s_te]

    te_idx = pos_idx_te + neg_idx_te
    tr_idx = list(set(idx) - set(te_idx))

    random.shuffle(te_idx)
    random.shuffle(tr_idx)

    tr_X = X[tr_idx]
    tr_Y = Y[tr_idx]

    te_X = X[te_idx]
    te_Y = Y[te_idx]
    return tr_X, tr_Y, te_X, te_Y