In [34]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from statsmodels.regression.linear_model import OLS
import math
import warnings

<h1>1. Preprocesamiento</h1>

<h2>1.1 Normalización</h2>

In [2]:
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Lift pandas' display truncation: None removes the row and column limits.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Raw dataset. A forward slash is used in the path because it works on every
# OS and avoids the invalid '\C' escape sequence of the backslash form.
df_0 = pd.read_csv('Data/Concrete_Data_Yeh.csv')

# axis=0 makes sklearn rescale each column (feature) independently to unit
# norm; the result is a plain ndarray, so the column names are re-attached.
df_p = preprocessing.normalize(df_0, axis=0)
df = pd.DataFrame(df_p, columns=df_0.columns)
df.describe()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,0.029208,0.020274,0.020141,0.030946,0.022451,0.03106,0.030993,0.018259,0.028241
std,0.010856,0.023672,0.023786,0.00364,0.021616,0.002482,0.003212,0.02526,0.013172
min,0.010596,0.0,0.0,0.020759,0.0,0.025571,0.023798,0.0004,0.001837
25%,0.019984,0.0,0.0,0.028105,0.0,0.029754,0.029285,0.002799,0.018694
50%,0.028349,0.006036,0.0,0.031531,0.023158,0.030903,0.03123,0.011197,0.027158
75%,0.036359,0.03922,0.043969,0.032724,0.036908,0.032863,0.033013,0.022393,0.036375
max,0.056096,0.098607,0.074372,0.042098,0.116515,0.036554,0.039768,0.145956,0.065126


<h2>1.2 Selección por GA con AIC</h2>

<h3>1.2.1 Definición de función a optimizar</h3>

In [36]:
## Definición funcion a optimizar

def aic_criterion(df, var_objetivo):
    """Return the AIC of an OLS fit of ``var_objetivo`` on all other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column plus the explanatory columns to use.
    var_objetivo : str
        Name of the target column in ``df``.

    Returns
    -------
    float
        ``aic`` attribute of the fitted statsmodels OLS result.

    Notes
    -----
    Both splits use ``shuffle=False``, so they are contiguous row slices: the
    first 20% of the rows is set aside, and the model is fitted on the first
    60% of the remaining rows. The held-out parts are not used here.
    No constant column is added to the regressors.
    """
    objetivo = df[var_objetivo]
    predictores = df[df.drop(columns=[var_objetivo]).columns.tolist()]

    # First split: discard the leading 20% (validation), keep the trailing 80%.
    _, X_resto, _, y_resto = train_test_split(
        predictores, objetivo, test_size=0.80, random_state=42, shuffle=False)

    # Second split: the leading 60% of what is left is the training set.
    X_entrenamiento, _, y_entrenamiento, _ = train_test_split(
        X_resto, y_resto, test_size=0.40, shuffle=False)

    modelo = OLS(y_entrenamiento, X_entrenamiento).fit()

    return modelo.aic

<h3>1.2.2 Definición de operadores de cruce y mutación</h3>

In [5]:
## Generar operadores de cruce y mutacion

def cruce(cromosoma_0, cromosoma_1, tipo_cruce):
    """Cross two parent chromosomes and return two offspring.

    Random draws use NumPy's global ``np.random`` state, so a preceding
    ``np.random.seed(...)`` makes the crossover reproducible.

    Parameters
    ----------
    cromosoma_0, cromosoma_1 : array-like, 1-D
        Parent chromosomes of equal length.
    tipo_cruce : str
        ``'punto unico'`` (single cut point), ``'dos puntos'`` (the segment
        between two cut points is swapped) or ``'uniforme'`` (each gene is
        taken from either parent with equal probability).

    Returns
    -------
    tuple of numpy.ndarray
        ``(descendencia_0, descendencia_1)``; the second offspring is the
        complement of the first (it takes each gene from the other parent).

    Raises
    ------
    ValueError
        If the parents differ in length or ``tipo_cruce`` is not recognised.
    """
    cromosoma_0 = np.asarray(cromosoma_0)
    cromosoma_1 = np.asarray(cromosoma_1)
    largo_cromosoma = len(cromosoma_0)

    if len(cromosoma_1) != largo_cromosoma:
        raise ValueError('cromosoma_0 and cromosoma_1 must have the same length')

    if tipo_cruce == 'punto unico':

        # Cut point in [0, largo); 0 simply swaps the two parents.
        punto = np.random.randint(largo_cromosoma)

        descendencia_0 = np.concatenate((cromosoma_0[:punto], cromosoma_1[punto:]), axis=0)
        descendencia_1 = np.concatenate((cromosoma_1[:punto], cromosoma_0[punto:]), axis=0)

    elif tipo_cruce == 'dos puntos':

        # Two distinct cut points, ordered so that punto_0 < punto_1.
        punto_0, punto_1 = sorted(np.random.choice(largo_cromosoma, size=2, replace=False))

        descendencia_0 = np.concatenate((cromosoma_0[:punto_0],
                                         cromosoma_1[punto_0:punto_1],
                                         cromosoma_0[punto_1:]), axis=0)
        descendencia_1 = np.concatenate((cromosoma_1[:punto_0],
                                         cromosoma_0[punto_0:punto_1],
                                         cromosoma_1[punto_1:]), axis=0)

    elif tipo_cruce == 'uniforme':

        # True -> first offspring takes the gene from cromosoma_0.
        toma_de_0 = np.random.randint(2, size=largo_cromosoma).astype(bool)

        descendencia_0 = np.where(toma_de_0, cromosoma_0, cromosoma_1)
        descendencia_1 = np.where(toma_de_0, cromosoma_1, cromosoma_0)

    else:
        raise ValueError("tipo_cruce must be 'punto unico', 'dos puntos' or 'uniforme', "
                         "got {!r}".format(tipo_cruce))

    return (descendencia_0, descendencia_1)

    
    
def mutacion(cromosoma_0, df, tipo_dato):
    """Return a copy of ``cromosoma_0`` with one randomly chosen gene mutated.

    Random draws use NumPy's global ``np.random`` state, so a preceding
    ``np.random.seed(...)`` makes the mutation reproducible.

    Parameters
    ----------
    cromosoma_0 : array-like, 1-D
        Chromosome to mutate; it is not modified.
    df : pandas.DataFrame
        Only read when ``tipo_dato == 'flotante'``: the standard deviation of
        the column at the mutated gene's position sets the noise scale.
    tipo_dato : str
        ``'binario'`` flips the gene (1 -> 0, anything else -> 1);
        ``'flotante'`` adds zero-mean Gaussian noise to the gene.

    Returns
    -------
    numpy.ndarray
        The mutated chromosome (float dtype for ``'flotante'``).

    Raises
    ------
    ValueError
        If ``tipo_dato`` is not recognised.
    """
    if tipo_dato not in ('binario', 'flotante'):
        raise ValueError("tipo_dato must be 'binario' or 'flotante', got {!r}".format(tipo_dato))

    # np.array copies, so the caller's chromosome is left untouched.
    cromosoma_mutado = np.array(cromosoma_0)
    gen_mutacion = np.random.randint(len(cromosoma_mutado))

    if tipo_dato == 'binario':

        cromosoma_mutado[gen_mutacion] = 0 if cromosoma_mutado[gen_mutacion] == 1 else 1

    else:

        std_gen = np.std(np.asarray(df.iloc[:, gen_mutacion]))

        # Cast first so the noise is not truncated on an integer chromosome.
        cromosoma_mutado = cromosoma_mutado.astype(float)
        cromosoma_mutado[gen_mutacion] += np.random.normal(0, std_gen)

    return (cromosoma_mutado)

<h3>1.2.3 Definición de estrategias de selección</h3>

In [85]:
## Generar funciones de seleccion

def seleccion(df, factor_seleccion, tipo_seleccion, tamano_torneo, tamano_elitismo):
    """Flag the individuals that survive to the next generation.

    Fitness is minimised (lower is better) by every strategy. Children
    (``clase == 'C'``) are chosen by roulette or tournament until the selected
    share reaches ``factor_seleccion``; parents (``clase == 'P'``) survive only
    through elitism.

    Parameters
    ----------
    df : pandas.DataFrame
        Population with at least the columns ``'clase'`` and ``'fitness'``.
        It is not modified.
    factor_seleccion : float
        Minimum fraction of children to select, in ``[0, 1]``.
    tipo_seleccion : str
        ``'ruleta'`` or ``'torneo'``. Any other value selects no children.
    tamano_torneo : int
        Number of participants drawn (with replacement) per tournament.
    tamano_elitismo : int
        Number of best parents that are always kept; ``0`` disables elitism.

    Returns
    -------
    pandas.DataFrame
        Children followed by parents, with ``'seleccionado'`` set to 0/1 and an
        ``'index'`` column holding each row's label in ``df``.

    Raises
    ------
    ValueError
        If ``factor_seleccion`` is outside ``[0, 1]`` (the selection loop could
        never finish), or ``tamano_torneo < 1`` for tournament selection.
    """
    if not 0 <= factor_seleccion <= 1:
        raise ValueError('factor_seleccion must be between 0 and 1')

    # assign() returns a new frame, so the caller's frame is left untouched.
    df = df.assign(seleccionado=0)

    # reset_index() without drop keeps the previous row labels in 'index'.
    df_c = df[df['clase'] == 'C'].reset_index()
    df_p = df[df['clase'] == 'P'].reset_index()

    if tipo_seleccion == 'ruleta' and len(df_c) > 0:

        fitness = df_c['fitness']
        max_fitness = fitness.max()

        # Turn "lower is better" into strictly positive weights where the
        # best individual weighs the most.
        if max_fitness >= 0:
            peso = max_fitness + 1 - fitness
        else:
            peso = -fitness

        prob_acum = (peso / peso.sum()).cumsum()
        prob_acum_lag = prob_acum.shift(1).fillna(0)

        while df_c['seleccionado'].mean() < factor_seleccion:

            # The individual whose cumulative interval contains rand wins.
            rand = np.random.uniform(0, 1)
            ganador = (prob_acum_lag < rand) & (rand <= prob_acum)
            df_c.loc[ganador, 'seleccionado'] = 1

    elif tipo_seleccion == 'torneo' and len(df_c) > 0:

        if tamano_torneo < 1:
            raise ValueError('tamano_torneo must be at least 1')

        while df_c['seleccionado'].mean() < factor_seleccion:

            # Only not-yet-selected children compete, so every round adds at
            # least one new survivor.
            candidatos = df_c.index[df_c['seleccionado'] != 1].to_numpy()
            participantes = np.random.choice(candidatos, tamano_torneo)

            en_torneo = df_c.index.isin(participantes)
            mejor_fitness = df_c.loc[en_torneo, 'fitness'].min()
            df_c.loc[(df_c['fitness'] == mejor_fitness) & en_torneo, 'seleccionado'] = 1

    if tamano_elitismo > 0:

        # Keep the tamano_elitismo parents with the lowest fitness.
        elite = df_p.sort_values('fitness', kind='stable').index[:int(tamano_elitismo)]
        df_p.loc[elite, 'seleccionado'] = 1

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return (pd.concat([df_c, df_p]))

<h3>1.2.4 Función Conjunta Final</h3>

In [105]:
# Generar función iterable

def ga_optimizacion(df
                    , var_objetivo
                    , df_parametros
                    , tamano_campeones
                    , factor_mutacion
                    , tipo_dato
                    , factor_seleccion
                    , tipo_seleccion
                    , tamano_torneo
                    , tamano_elitismo
                    , tipo_cruce):
    """Run one generation of the genetic algorithm for variable selection.

    Steps: shuffle the parents and pair them up, create two children per pair
    with ``cruce``, score every individual with ``aic_criterion`` (a gene equal
    to 1 means the column is included in the regression), keep the survivors
    chosen by ``seleccion`` and mark which of them are due for mutation.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing ``var_objetivo`` and every candidate column.
    var_objetivo : str
        Name of the target column in ``df``.
    df_parametros : pandas.DataFrame
        Population, one individual per row and one gene column per candidate
        variable. Bookkeeping columns left by a previous call are ignored, so
        the returned frame can be passed back in. It is not modified.
    tamano_campeones : int
        Currently unused.
    factor_mutacion : float
        Survivors whose shuffled 1-based position is below
        ``len(survivors) * factor_mutacion`` get ``con_mutacion = 1``.
    tipo_dato : str
        Currently unused (see the note in the mutation step).
    factor_seleccion, tipo_seleccion, tamano_torneo, tamano_elitismo
        Forwarded to ``seleccion``.
    tipo_cruce : str
        Forwarded to ``cruce``.

    Returns
    -------
    pandas.DataFrame
        The survivors, all relabelled ``clase == 'P'``, with the gene columns
        plus the bookkeeping columns ``index``, ``clase``, ``fitness``,
        ``fila``, ``grupo_cruce``, ``seleccionado`` and ``con_mutacion``.
    """
    columnas_control = ['index', 'clase', 'fitness', 'fila', 'grupo_cruce',
                        'seleccionado', 'con_mutacion']

    # drop() returns a new frame: the caller's population is not modified and
    # bookkeeping columns from a previous generation are not read as genes.
    df_parametros = df_parametros.drop(columns=columnas_control, errors='ignore')
    columnas_genes = list(df_parametros.columns)

    # Mark the incoming individuals as parents
    df_parametros['clase'] = 'P'
    df_parametros['fitness'] = 10 ** 10  # placeholder, overwritten below

    # New generation by crossover: shuffle, then pair consecutive rows
    df_parametros = df_parametros.sample(frac=1).reset_index(drop=True)
    df_parametros['fila'] = df_parametros.index + 1
    df_parametros['grupo_cruce'] = ((df_parametros['fila'] + 1) // 2 - 1).astype(float)

    genes_padres = df_parametros[columnas_genes].to_numpy()
    hijos = []

    # With an odd population the last parent has no partner and is skipped.
    for k in range(0, len(genes_padres) - 1, 2):

        grupo = float(k // 2)
        descend_0, descend_1 = cruce(genes_padres[k], genes_padres[k + 1], tipo_cruce)

        # Trailing values follow the column order: clase, fitness, fila, grupo_cruce.
        hijos.append(list(descend_0) + ['C', 0, 0, grupo])
        hijos.append(list(descend_1) + ['C', 0, 0, grupo])

    if hijos:
        df_hijos = pd.DataFrame(hijos, columns=df_parametros.columns)
        df_parametros = pd.concat([df_parametros, df_hijos], ignore_index=True)

    # Fitness of every individual (parents and children)
    fitness = []

    for _, genes in df_parametros[columnas_genes].iterrows():

        param_selec = list(genes[genes == 1].index) + [var_objetivo]
        fitness.append(aic_criterion(df[param_selec], var_objetivo))

    # Assign the whole column at once instead of a chained .iloc write.
    df_parametros['fitness'] = fitness

    # Select the best individuals, including elitism
    df_parametros['seleccionado'] = 0

    df_parametros = seleccion(df_parametros
                              , factor_seleccion
                              , tipo_seleccion
                              , tamano_torneo
                              , tamano_elitismo)

    df_parametros = df_parametros[df_parametros['seleccionado'] == 1].reset_index(drop=True)

    df_parametros['clase'] = 'P'

    # Mutation: shuffle the survivors and flag the leading share
    df_parametros = df_parametros.sample(frac=1).reset_index(drop=True)
    df_parametros['fila'] = df_parametros.index + 1
    umbral_mutacion = len(df_parametros) * factor_mutacion
    df_parametros['con_mutacion'] = (df_parametros['fila'] < umbral_mutacion).astype(int)

    # NOTE(review): the flagged individuals are only marked, not mutated.
    # The previous loop over them computed a value and discarded it; calling
    # mutacion(genes, df, tipo_dato) on each flagged row is still to be done.

    return (df_parametros)

In [106]:
# Seed NumPy's global random state before the run.
np.random.seed(1984)

# One GA generation. `df_parametros` (and the ratio/exponential columns of
# `df`) are built in the cells of sections 1.2.5 and 1.2.6 further down, so
# those cells must have been executed before this one.
df_prueba = ga_optimizacion(
    df=df,
    var_objetivo='csMPa',
    df_parametros=df_parametros,
    tamano_campeones=10,
    factor_mutacion=0.1,
    tipo_dato='binario',
    factor_seleccion=0.9,
    tipo_seleccion='ruleta',
    tamano_torneo=5,
    tamano_elitismo=10,
    tipo_cruce='punto unico',
)

In [108]:
# Preview the first rows of the frame returned by ga_optimizacion.
df_prueba.head()

Unnamed: 0,index,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,cement-water,cement-coarseaggregate,cement-fineaggregate,cement-age,slag-cement,slag-water,slag-coarseaggregate,slag-fineaggregate,slag-age,flyash-cement,flyash-water,flyash-coarseaggregate,flyash-fineaggregate,flyash-age,water-cement,water-coarseaggregate,water-fineaggregate,water-age,superplasticizer-cement,superplasticizer-water,superplasticizer-coarseaggregate,superplasticizer-fineaggregate,superplasticizer-age,coarseaggregate-cement,coarseaggregate-water,coarseaggregate-fineaggregate,coarseaggregate-age,fineaggregate-cement,fineaggregate-water,fineaggregate-coarseaggregate,fineaggregate-age,age-cement,age-water,age-coarseaggregate,age-fineaggregate,exp_cement,exp_slag,exp_flyash,exp_water,exp_superplasticizer,exp_coarseaggregate,exp_fineaggregate,exp_age,exp_cement-water,exp_cement-coarseaggregate,exp_cement-fineaggregate,exp_cement-age,exp_slag-cement,exp_slag-water,exp_slag-coarseaggregate,exp_slag-fineaggregate,exp_slag-age,exp_flyash-cement,exp_flyash-water,exp_flyash-coarseaggregate,exp_flyash-fineaggregate,exp_flyash-age,exp_water-cement,exp_water-coarseaggregate,exp_water-fineaggregate,exp_water-age,exp_superplasticizer-cement,exp_superplasticizer-water,exp_superplasticizer-coarseaggregate,exp_superplasticizer-fineaggregate,exp_superplasticizer-age,exp_coarseaggregate-cement,exp_coarseaggregate-water,exp_coarseaggregate-fineaggregate,exp_coarseaggregate-age,exp_fineaggregate-cement,exp_fineaggregate-water,exp_fineaggregate-coarseaggregate,exp_fineaggregate-age,exp_age-cement,exp_age-water,exp_age-coarseaggregate,exp_age-fineaggregate,clase,fitness,fila,grupo_cruce,seleccionado,con_mutacion
0,723,1,1,0,0,1,1,1,0,0,1,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,0,0,1,1,1,1,1,0,0,0,1,0,0,1,1,1,0,1,1,0,0,0,0,1,0,1,0,0,1,1,1,0,1,1,1,0,0,1,1,0,0,1,0,0,0,1,0,0,1,P,-2093.96445,1,111.0,1,1
1,592,0,0,1,1,0,0,1,0,1,1,1,0,1,0,0,0,1,1,1,0,1,1,1,1,0,0,1,1,1,0,1,0,1,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,1,0,1,P,-3868.963277,2,46.0,1,1
2,936,1,1,1,0,0,1,1,0,1,1,0,0,1,1,0,0,0,0,1,1,0,1,1,1,0,0,1,0,1,0,0,0,1,0,1,1,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,0,1,1,1,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0,1,1,0,1,1,1,1,0,0,1,0,1,1,0,1,P,-2097.540985,3,218.0,1,1
3,589,1,1,1,0,0,0,1,1,0,0,0,0,1,1,1,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,1,1,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,1,1,0,1,1,1,0,1,1,1,1,0,0,0,1,P,-2092.111131,4,44.0,1,1
4,954,1,0,1,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0,1,1,1,0,1,0,1,0,1,1,0,1,1,1,1,0,1,0,1,0,1,0,1,0,0,0,0,0,1,1,1,1,0,0,1,0,1,1,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,P,-2091.356419,5,227.0,1,1


In [None]:
# NOTE(review): leftover, never-executed draft of the tournament loop that
# lives inside seleccion(). At notebook level df_c, n, factor_seleccion and
# tamano_torneo are not defined, and the lambda reads x.aic whereas
# seleccion() compares against x.fitness. Presumably safe to delete - confirm.
while (sum(df_c['seleccionado']) / len(df_c)) < factor_seleccion:
                
    rand = list(np.random.randint(0, n, tamano_torneo))
            
    df_c['en_torneo'] = df_c.apply(lambda x: 1 if x.fila in rand else 0, axis = 1)
    df_c['min_fitness'] = min(df_c[df_c['en_torneo'] == 1]['fitness'])
    df_c['seleccionado'] =  df_c.apply(lambda x: 1 if ((x.min_fitness == x.aic)
                                                       & (x.en_torneo == 1)) else x.seleccionado, axis = 1)
            
df_c = df_c.drop(columns=['fila','en_torneo','min_fitness'])

<h3>1.2.5 Generación de nuevas variables (relaciones y exponenciales)</h3>

In [55]:
## Ratio features

# Always start from the original columns so that re-running this cell does not
# build ratios of ratios on top of an already expanded `df`.
df_base = df[list(df_0.columns)]

columnas = [c for c in df_base.columns if c != 'csMPa']

# One ratio per ordered pair of distinct columns (name equality, not substring).
relaciones = {i + '-' + j: df_base[i] / df_base[j]
              for i in columnas
              for j in columnas if j != i}

# Build all new columns at once instead of inserting them one by one.
df_relaciones = pd.concat([df_base, pd.DataFrame(relaciones)], axis=1)

## Keep only the variables with well-defined values

# Division by zero yields inf (x/0) or NaN (0/0); drop any column containing either.
columnas_finitas = np.isfinite(df_relaciones).all()
df_relaciones = df_relaciones.loc[:, columnas_finitas]

## Exponential features

columnas = [c for c in df_relaciones.columns if c != 'csMPa']

exponenciales = {'exp_' + i: df_relaciones[i].map(math.exp) for i in columnas}

df = pd.concat([df_relaciones, pd.DataFrame(exponenciales)], axis=1)

<h3>1.2.6 Generación de la población inicial de parámetros (cromosomas binarios)</h3>

In [56]:
# Candidate variables: every column of df except the target.
parametros = df.drop(columns=['csMPa']).columns.tolist()

# Random initial population: 500 individuals, one 0/1 gene per candidate variable.
df_parametros = pd.DataFrame(
    np.random.randint(2, size=(500, len(parametros))),
    columns=parametros,
)