In [1]:
import random
from deap import base, creator, tools, algorithms
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.utils import shuffle

import multiprocessing

from tqdm import tqdm


In [2]:
# Load the two pickled label matrices for one dataset and stack them into a
# feature matrix X with a binary target y (0 = simp rows, 1 = src rows).
ds_name = "SemEval_2007-1249"

# NOTE(review): simp_path is relative while src_path is absolute; both are kept
# exactly as they were. Confirm which root directory is intended.
simp_path = f"workspace/datasets/__all_LFs/{ds_name}_simp_labels.pkl"
src_path = f"/workspace/datasets/__all_LFs/{ds_name}_src_labels.pkl"

# Context managers close the file handles deterministically.
# NOTE: pickle.load executes arbitrary code; only load files you trust.
with open(simp_path, "rb") as fh:
    simp_labels = pickle.load(fh)
with open(src_path, "rb") as fh:
    src_labels = pickle.load(fh)

X = np.concatenate([simp_labels, src_labels])
y = np.array([0] * len(simp_labels) + [1] * len(src_labels))
print(X.shape)
print(y.shape)

(2416, 1249)
(2416,)


In [3]:
def _encode_categorical(entry):
    """Expand every vote in ``entry`` into two indicator columns.

    Mapping: -1 -> (0, 0), 0 -> (1, 0), 1 -> (0, 1). Any other value
    contributes no columns.
    """
    encoded = []
    for vote in entry:
        if vote == -1:
            encoded.extend((0, 0))
        elif vote == 0:
            encoded.extend((1, 0))
        elif vote == 1:
            encoded.extend((0, 1))
    return encoded


def load_data(datasets, make_categorical_data):
    """Load, merge and shuffle the simp/src label matrices of several datasets.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names; each name is substituted into the two pickle paths.
    make_categorical_data : bool
        If true, every numeric -1/0/1 value is replaced by two indicator
        columns (see ``_encode_categorical``); otherwise rows are used as-is.

    Returns
    -------
    X : numpy.ndarray
        One row per loaded entry, shuffled with ``random_state=42``.
    y : numpy.ndarray
        0 for rows from the simp file, 1 for rows from the src file,
        shuffled consistently with ``X``.
    KAT : str
        ``'_KAT'`` when categorical encoding was requested, else ``''``.
    """
    KAT = '_KAT' if make_categorical_data else ''

    data_merged = []
    labels = []

    for d_s in datasets:
        # Use the loop variable (not the global ds_name) so that every
        # requested dataset is actually loaded.
        # NOTE(review): simp_path is relative while src_path is absolute;
        # both roots are kept as they were. Confirm which one is intended.
        simp_path = f"workspace/datasets/__all_LFs/{d_s}_simp_labels.pkl"
        src_path = f"/workspace/datasets/__all_LFs/{d_s}_src_labels.pkl"

        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        with open(simp_path, "rb") as fh:
            simp_labels = pickle.load(fh)
        with open(src_path, "rb") as fh:
            src_labels = pickle.load(fh)

        # simp rows first, then src rows, matching the label order below.
        for group in (simp_labels, src_labels):
            for entry in group:
                if make_categorical_data:
                    data_merged.append(_encode_categorical(entry))
                else:
                    data_merged.append(entry.tolist())

        # One label per row: the src count must come from src_labels,
        # otherwise y gets out of step with X when the two files differ in size.
        labels.extend([0] * len(simp_labels) + [1] * len(src_labels))

    X, y = shuffle(data_merged, labels, random_state=42)
    X = np.array(X)
    y = np.array(y)
    return X, y, KAT

In [4]:
# Rebuild X and y through load_data without the categorical expansion; the
# returned name suffix is not needed here and is discarded.
X, y, _ = load_data([ds_name], make_categorical_data=False)

In [5]:
# Sanity check: report the dimensions of the feature matrix and the target.
for arr in (X, y):
    print(arr.shape)

(2416, 1249)
(2416,)


In [6]:
# Two-objective fitness. In DEAP a positive weight marks an objective to be
# maximized and a negative weight one to be minimized, so the first value
# returned by the evaluation function is maximized and the second minimized.
# NOTE(review): the name "FitnessMax" is misleading for a mixed max/min
# fitness; also confirm that the registered evaluation function returns its
# two values in the order these weights expect.
creator.create("FitnessMax", base.Fitness, weights=(1.0,-5.0))
#creator.create("FitnessMulti", base.Fitness, weights=(.1, -1.0))

# An individual is a plain list that carries a FitnessMax fitness attribute.
creator.create("Individual", list, fitness=creator.FitnessMax)

In [7]:
toolbox = base.Toolbox()
# Gene generator: a random 0 or 1.
toolbox.register("attr_bool", random.randint, 0, 1)
# Full-size individual: one gene per column of X.
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])
# Small 10-gene individual.
# NOTE(review): not referenced anywhere else in the visible notebook.
toolbox.register("individual_test", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=10)

# Population: a list of full-size individuals; the size is given at call time.
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [8]:
# Classifier and 5-fold splitter used by the fitness functions; both are
# seeded with random_state=42.
clf_rf = RandomForestClassifier(random_state=42)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# NOTE(review): consider moving this import to the import cell at the top.
from sklearn.model_selection import cross_val_score

In [9]:
def eval_dim_vec(individual):
    """Two-objective fitness of a 0/1 column mask over the global ``X``.

    Parameters
    ----------
    individual : sequence of int
        One 0/1 gene per column of ``X``; non-zero genes select a column.

    Returns
    -------
    (mean_acc, ratio) : tuple
        ``mean_acc`` is the mean cross-validated score of ``clf_rf`` on the
        selected columns (using ``kfold``); ``ratio`` is the fraction of
        columns that were selected.

    The order matches the fitness weights ``(1.0, -5.0)``: the first value is
    maximized, the second minimized. The previous first ``return`` emitted
    ``(ratio, mean_acc)`` and made the intended return unreachable, which
    would reward keeping many columns and penalize accuracy.
    """
    sel_indices = np.array(individual).nonzero()[0]
    # Fraction of columns kept by this mask.
    ratio = len(sel_indices) / len(individual)

    if len(sel_indices) == 0:
        # No column selected: the classifier cannot be fitted on zero
        # features, so report the worst accuracy instead of raising.
        return 0.0, ratio

    can_X = np.take(X, sel_indices, axis=1)
    cv_scores = cross_val_score(clf_rf, can_X, y, cv=kfold)
    mean_acc = np.mean(cv_scores)

    return mean_acc, ratio
     

In [10]:
def eval_dim_vec_s_objective(individual):
    """Single-objective fitness of a 0/1 column mask over the global ``X``.

    Combines the mean cross-validated score of ``clf_rf`` on the selected
    columns with the fraction of selected columns into one weighted sum and
    returns it as a 1-tuple.

    NOTE(review): the ratio term enters with a positive weight, so a larger
    share of kept columns raises the score -- confirm this is intended.
    """
    acc_weight, ratio_weight = 1, 20

    kept = np.array(individual).nonzero()[0]
    reduced_X = np.take(X, kept, axis=1)

    fold_scores = cross_val_score(clf_rf, reduced_X, y, cv=kfold)
    avg_score = np.mean(fold_scores)
    # Share of columns that remain after applying the mask.
    kept_ratio = len(kept) / len(individual)

    return (acc_weight * avg_score + ratio_weight * kept_ratio,)

In [11]:
def show_pareto_front(pareto_optimal_inds):
    """Evaluate every given individual and scatter-plot the two fitness values.

    Each individual is passed to ``eval_dim_vec`` (with a progress bar); the
    first returned value goes on the x-axis and the second on the y-axis.
    Returns the list of evaluation results in input order.
    """
    fitness_vals = []
    with tqdm(total=len(pareto_optimal_inds)) as progress:
        for candidate in pareto_optimal_inds:
            result = eval_dim_vec(candidate)
            fitness_vals.append(result)
            progress.update(1)

    first_values = [fit[0] for fit in fitness_vals]
    second_values = [fit[1] for fit in fitness_vals]
    plt.scatter(first_values, second_values)
    plt.show()

    return fitness_vals

In [12]:
# Genetic operators used by the evolutionary algorithm.
# Fitness function: eval_dim_vec returns two values per individual.
toolbox.register("evaluate", eval_dim_vec)
# Crossover: two-point crossover on the gene lists.
toolbox.register("mate", tools.cxTwoPoint)
# Mutation: bit flip with per-gene probability indpb=0.05.
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
# NOTE(review): selBest is combined with a two-objective fitness here; check
# whether a Pareto-based selector (e.g. tools.selNSGA2) is what is intended.
toolbox.register("select", tools.selBest)

In [13]:
def main(ngen=20, mu=200, lambda_=100, cxpb=0.5, mutpb=0.2, processes=4, seed=None):
    """Run the (mu + lambda) evolutionary search with parallel evaluation.

    Parameters
    ----------
    ngen : int
        Number of generations.
    mu : int
        Size of the initial population and number of individuals kept per
        generation.
    lambda_ : int
        Number of offspring produced per generation.
    cxpb, mutpb : float
        Crossover and mutation probabilities.
    processes : int
        Number of worker processes used to evaluate individuals.
    seed : int or None
        If given, seeds Python's ``random`` module before the population is
        created; ``None`` leaves the random state untouched.

    Returns
    -------
    (pop, logbook, hof)
        Final population, the statistics logbook, and the Pareto front
        collected during the run.
    """
    if seed is not None:
        random.seed(seed)

    pop = toolbox.population(n=mu)
    hof = tools.ParetoFront()

    # Per-generation statistics, computed per objective (axis=0).
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)

    # The context manager shuts the worker processes down even when the run
    # is interrupted, so no pool is leaked.
    with multiprocessing.Pool(processes=processes) as pool:
        toolbox.register("map", pool.map)
        try:
            pop, logbook = algorithms.eaMuPlusLambda(
                pop, toolbox, mu, lambda_, cxpb, mutpb, ngen, stats,
                halloffame=hof,
            )
        finally:
            # Do not leave the toolbox pointing at a pool that is shut down.
            toolbox.register("map", map)

    return pop, logbook, hof

In [14]:
if __name__ == "__main__":
    # Run the search for the configured dataset and visualize the outcome.
    print(ds_name)
    pop, log, hof = main()

    show_pareto_front(hof)

    # Per-generation statistics from the logbook, one curve set per statistic.
    gen, avg, min_, max_ = log.select("gen", "avg", "min", "max")
    curves = ((avg, "average"), (min_, "minimum"), (max_, "maximum"))
    for series, curve_label in curves:
        plt.plot(gen, series, label=curve_label)
    plt.xlabel("Generation")
    plt.ylabel("Fitness")
    plt.legend(loc="lower right")
    plt.show()

SemEval_2007-1249
gen	nevals	avg                    	std                    	min                    	max                    
0  	200   	[0.79018193 0.50083667]	[0.00692071 0.01334666]	[0.7632548  0.46997598]	[0.80381568 0.53963171]
1  	67    	[0.7945923  0.50239392]	[0.00403131 0.01278645]	[0.78849991 0.47157726]	[0.80713259 0.5316253 ]
2  	74    	[0.79678577 0.50345476]	[0.00334225 0.01262782]	[0.7922249  0.46036829]	[0.80713259 0.5316253 ]
3  	68    	[0.79842875 0.50481585]	[0.00289292 0.01242979]	[0.79470766 0.46036829]	[0.80713259 0.5316253 ]
4  	65    	[0.79955457 0.50570456]	[0.00255305 0.01370463]	[0.7963674  0.46036829]	[0.80713259 0.53803042]
5  	69    	[0.80070286 0.50696958]	[0.00224129 0.01309122]	[0.79760194 0.46036829]	[0.80795733 0.53803042]
6  	61    	[0.80181829 0.50801041]	[0.00205064 0.0128185 ]	[0.79926424 0.46036829]	[0.80795733 0.53803042]
7  	71    	[0.80265832 0.50730985]	[0.00186693 0.01224726]	[0.80009154 0.4723779 ]	[0.80836456 0.53803042]
8  	76    	[0.80338

KeyboardInterrupt: 

In [None]:
# Re-evaluate and plot the individuals currently stored in hof.
show_pareto_front(hof)

In [None]:
# Pickle file for this dataset's selected individual.
opt_lfs_path = f"/workspace/datasets/gen_opt_lfs/{ds_name}_lfs.pkl"

In [None]:
# Persist the first entry of hof as a NumPy array; the context manager makes
# sure the file is flushed and closed.
with open(opt_lfs_path, "wb") as fh:
    pickle.dump(np.array(hof[0]), fh)

In [None]:
# Read the stored array back to verify the round trip; the handle is closed
# by the context manager and the value stays the cell's displayed result.
with open(opt_lfs_path, "rb") as fh:
    opt_lfs = pickle.load(fh)
opt_lfs