In [None]:
from joblib import logger
import pandas as pd
import transforms_distributed as tfd
# import tc_distributed as tcd
import tc_distributed_pro as tcdp
from dask.distributed import LocalCluster, Client, progress
from dask_jobqueue import HTCondorCluster
import logging, coloredlogs
import sys, os
import copy
import pathlib
import warnings

In [None]:
# --- Run configuration: dataset, output locations, logging and search limits ---
dataset = 'data/SpectF.csv'
# dataset = 'data/R/Openml_586.csv'
# Output names are derived from the file stem, so switching `dataset` does not
# require re-tuning hard-coded slice offsets.
dataset_name = pathlib.Path(dataset).stem
datapath = 'result/' + dataset_name

logging.basicConfig(level=logging.ERROR)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger_new = logging.getLogger(__name__)

# FileHandler opens its target as soon as it is constructed, so the directory
# has to exist beforehand (previously only the result directory was created).
log_file = pathlib.Path('log') / ('output_' + dataset_name + '.log')
log_file.parent.mkdir(parents=True, exist_ok=True)

# getLogger() hands back the same logger object every time this cell runs.
# Detach any file handler already attached to it so that re-running the cell
# does not write every record to the log several times.
for stale_handler in [h for h in logger_new.handlers if isinstance(h, logging.FileHandler)]:
    logger_new.removeHandler(stale_handler)
    stale_handler.close()

output_file_handler = logging.FileHandler(str(log_file))
output_file_handler.setFormatter(formatter)
logger_new.addHandler(output_file_handler)
coloredlogs.install(level='DEBUG')

pathlib.Path(datapath).mkdir(parents=True, exist_ok=True)
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# One weight per operator (unary followed by binary), each initialised to `repeat`.
repeat = 4
una_oprs = tfd.unary_operators
bina_oprs = tfd.binary_operators
weights = [repeat] * (len(una_oprs) + len(bina_oprs))

# Load the data set.
dat = tcdp.load_data(dataset, logger=logger_new, art='C')

# Tunable limits: inflation, cur_limit, total_limit, num_best_features.
# A new generation is capped at min(inflation * cur_size, cur_limit) features,
# which gives the generations an upside-down pyramid shape.
inflation = 10
# All columns except one; presumably the last column is the label -- TODO confirm.
cur_size = dat.shape[1] - 1
cur_limit = 150
# total_limit can be much larger, depending on the machine running the notebook.
total_limit = 300
num_best_features = round(2 * dat.shape[1])

# Loop state consumed by the generation loop.
cur_dat = dat
prev_gen = None

In [None]:
# Grow `repeat` generations of features, then pick the best ones and score them.
# The tcdp helpers are opaque from this notebook; the comments below describe
# only how their results are wired together here.
for generation in range(1, repeat + 1):
    # Build the next generation from the current data.
    cur_dat, gen = tcdp.updateDat(
        cur_dat,
        prev_gen=prev_gen,
        oprs_weights=weights,
        art='C',
        logger=logger_new,
    )
    # Cap on the current generation: grows by `inflation` but never past `cur_limit`.
    generation_cap = min(inflation * cur_size, cur_limit)
    cur_gen, cur_size = tcdp.constrainFeaturesNum(
        cur_dat, generation_cap, art='C', logger=logger_new
    )

    # Accumulate this generation into the archive, re-attach the last column of
    # `dat` (presumably the label -- TODO confirm), then cap the archive as well.
    prev_gen = pd.concat([prev_gen, gen], axis=1)
    prev_gen[dat.columns[-1]] = dat.iloc[:, -1]
    prev_gen, prev_size = tcdp.constrainFeaturesNum(
        prev_gen, total_limit, art='C', logger=logger_new
    )

    # Input for the next round; a snapshot is written as gen1.csv, gen2.csv, ...
    cur_dat = tcdp.addInitalFeatures(cur_gen, prev_gen, dat, logger=logger_new)
    cur_dat.to_csv(f"{datapath}/gen{generation}.csv")

# Merge archive and last generation, filter them, and restore the last column of `dat`.
combined = pd.concat([prev_gen, cur_dat], axis=1)
total_dat = tcdp.dropHighCorrelation(combined, logger=logger_new)
total_dat[dat.columns[-1]] = dat.iloc[:, -1]

best_features = tcdp.bestFeatures(total_dat, num_best_features, art='C', logger=logger_new)
best_features.to_csv(f"{datapath}/final.csv")

# Relative change of the fitness against the untouched input data.
init_fitness, cur_fitness = tcdp.scoreCompare(dat, best_features, art='C', logger=logger_new)
increase = (cur_fitness - init_fitness) / init_fitness
logger_new.debug("compared with the initial one, the fitness increased by %s", str(increase))

## The First Version

In [None]:
# --- First-version run configuration: dataset, result directory name, logging ---
dataset = 'data/php0iVrYT.csv'
# Output names are derived from the file stem instead of hard-coded slice offsets.
dataset_name = pathlib.Path(dataset).stem
datapath = 'result/' + dataset_name

logging.basicConfig(level=logging.ERROR)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger_new = logging.getLogger(__name__)

# FileHandler opens its target as soon as it is constructed, so make sure the
# log directory exists first.
log_file = pathlib.Path('log') / ('output2_' + dataset_name + '.log')
log_file.parent.mkdir(parents=True, exist_ok=True)

# getLogger() returns the same logger object on every call with this name.
# Detach any file handler already attached to it so that records from this run
# go to exactly one file and a re-run of the cell does not duplicate them.
for stale_handler in [h for h in logger_new.handlers if isinstance(h, logging.FileHandler)]:
    logger_new.removeHandler(stale_handler)
    stale_handler.close()

output_file_handler = logging.FileHandler(str(log_file))
output_file_handler.setFormatter(formatter)
logger_new.addHandler(output_file_handler)
coloredlogs.install(level='DEBUG')

# Optional: also send records to stdout through a plain stream handler.
# stdout_handler = logging.StreamHandler(sys.stdout)
# stdout_handler.setFormatter(formatter)
# logger_new.addHandler(stdout_handler)

# Optional: start a local dask cluster.
# cluster = LocalCluster(n_workers=4, threads_per_worker=2, memory_limit='2GB')
# client = Client(cluster)

In [None]:
# First-version feature construction loop.
# NOTE(review): this cell calls tcdp.update_dat / pyramid_cur / feature_selection,
# whereas the cells above use tcdp.updateDat / constrainFeaturesNum -- confirm
# that the older entry points still exist in tc_distributed_pro.
una_oprs = tfd.unary_operators
bina_oprs = tfd.binary_operators

pathlib.Path(datapath).mkdir(parents=True, exist_ok=True)
repeat = 5
# One weight per operator (unary followed by binary), each initialised to `repeat`.
weights = [repeat] * (len(una_oprs) + len(bina_oprs))
# Load the data set.
dat = tcdp.load_data(dataset, logger=logger_new, art='C')
cur_dat = dat
prev_gen = None
# total_limit can be much larger, depending on the machine running the notebook.
total_limit = 400
prev_size = 0

for i in range(repeat):
    # Snapshot the input of this round (gen0.csv is the data as loaded).
    cur_dat.to_csv(datapath + '/gen' + str(i) + '.csv')
    cur_dat = tcdp.update_dat(cur_dat, prev_gen=prev_gen, oprs_weights=weights, art='C', logger=logger_new)
    cur_gen, prev_size = tcdp.pyramid_cur(cur_dat, prev_size, art='C', logger=logger_new)

    # Accumulate the new generation; pd.concat ignores the initial None.
    prev_gen = pd.concat([prev_gen, cur_gen], axis=1)
    # Temporarily attach the last column of `dat` (presumably the label -- TODO
    # confirm) so that feature selection can use it.
    prev_gen[dat.columns[-1]] = dat.iloc[:, -1]
    while prev_gen.shape[1] > total_limit:
        # Fixed: this branch previously called `.info` on the imported
        # joblib.logger module and referenced the undefined names `prev_limit`
        # and `art`, so it raised as soon as the limit was exceeded.
        logger_new.info(
            'The number of prev_gen columns %d exceed prev_limit %d, columns selection first',
            prev_gen.shape[1],
            total_limit,
        )
        # NOTE(review): the loop only terminates if feature_selection returns
        # fewer columns than it was given -- confirm.
        prev_gen = tcdp.feature_selection(prev_gen, art='C', logger=logger_new)
    # Remove the temporarily attached column again. This assumes it is still
    # the last column after feature selection -- TODO confirm.
    prev_gen = prev_gen.drop(prev_gen.columns[-1], axis=1)

    # Add the initial features back in: on odd rounds the whole of `dat` is
    # appended, on even rounds only its last column.
    if i % 2:
        cur_dat = pd.concat([cur_gen, dat], axis=1)
    else:
        cur_gen[dat.columns[-1]] = dat.iloc[:, -1]
        cur_dat = cur_gen


In [None]:
# Reload the checkpointed frames from disk. The first column of each CSV is
# discarded; presumably it is the index column written by `to_csv` -- TODO confirm.
cur_dat = pd.read_csv('result/curr_php0iVrYT.csv')
cur_dat = cur_dat.drop(columns=cur_dat.columns[0])

# Everything except the last column of the reloaded frame.
feature_columns = cur_dat.columns[:-1]
cur_gen = pd.DataFrame(cur_dat.iloc[:, :-1], columns=feature_columns)

prev_dat = pd.read_csv('result/prev_php0iVrYT.csv')
prev_gen = prev_dat.drop(columns=prev_dat.columns[0])

In [None]:
# Run three more update rounds, overwriting the checkpoint files after each one.
# NOTE(review): `tcd` is not bound by the import cell of this notebook (the
# `tc_distributed` import is commented out there), so this cell raises
# NameError on a fresh kernel until that import is restored.
curr_checkpoint = 'result/curr_' + dataset[5:]
prev_checkpoint = 'result/prev_' + dataset[5:]

for _ in range(3):
    cur_dat, prev_gen = tcd.update_dat(cur_dat, prev_gen=prev_gen, logger=logger_new)
    cur_dat.to_csv(curr_checkpoint)
    prev_gen.to_csv(prev_checkpoint)

logger_new.warning('+Successfully finish all steps')