In [10]:
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import re
import csv

In [11]:
# Cargamos el dataset, el cual está comprimido en un .zip
ruta_zip = '../data/RepoRT_classified_CCinformation.zip'
nombre_tsv = 'RepoRT_classified_CCinformation.tsv'

with zipfile.ZipFile(ruta_zip, 'r') as archivo_zip:
    with archivo_zip.open(nombre_tsv) as archivo_tsv:
        dataset = pd.read_csv(archivo_tsv, sep='\t')

  dataset = pd.read_csv(archivo_tsv, sep='\t')


In [12]:
# Ponemos todos los datos de las columnas del mismo tipo
dataset['name'] = dataset['name'].astype('str')
dataset['comment'] = dataset['comment'].astype('str')

In [13]:
class Column:
    def __init__(self, name, usp_code, length, id, particle_size, temperature, flowrate, t0):
        self.name = name
        self.usp_code = usp_code
        self.length = length
        self.id = id
        self.particle_size = particle_size
        self.temperature = temperature
        self.flowrate = flowrate
        self.t0 = t0
    
    def __eq__(self, value):
        if not isinstance(value, Column):
            return False
        return (
            self.name == value.name and
            self.usp_code == value.usp_code and
            self.length == value.length and
            self.id == value.id and
            self.particle_size == value.particle_size and
            self.temperature == value.temperature and
            self.flowrate == value.flowrate and
            self.t0 == value.t0
        )
    
    def __hash__(self):
        return hash((
            self.name,
            self.usp_code,
            self.length,
            self.id,
            self.particle_size,
            self.temperature,
            self.flowrate,
            self.t0
        ))
    
    def __repr__(self):
        return f"Config(eluyente1={self.eluyente1}, eluyente2={self.eluyente2}, columna={repr(self.columna)})"
    def __str__(self):
        return f"Config(eluyente1={self.eluyente1}, eluyente2={self.eluyente2}, columna={repr(self.columna)})"

In [14]:
class Config:
    def __init__(self, eluyente1, eluyente2, columna:Column):
        self.eluyente1 = eluyente1
        self.eluyente2 = eluyente2
        self.columna = columna

    def __eq__(self, value):
        if not isinstance(value, Config):
            return False
        return (
            self.eluyente1 == value.eluyente1 and
            self.eluyente2 == value.eluyente2 and
            self.columna == value.columna
        )
    
    def __hash__(self):
        return hash((self.eluyente1, self.eluyente2, self.columna))
    
    def __repr__(self):
        return (f"Config(eluyente1={self.eluyente1}, eluyente2={self.eluyente2}, "
                    f"columna=Column(name={self.columna.name}, usp_code={self.columna.usp_code}, "
                    f"length={self.columna.length}, id={self.columna.id}, "
                    f"particle_size={self.columna.particle_size}, temperature={self.columna.temperature}, "
                    f"flowrate={self.columna.flowrate}, t0={self.columna.t0}))")
    
    def __str__(self):
            # Accediendo a los atributos de la columna desde el objeto columna
            return (f"Config(eluyente1={self.eluyente1}, eluyente2={self.eluyente2}, "
                    f"columna=Column(name={self.columna.name}, usp_code={self.columna.usp_code}, "
                    f"length={self.columna.length}, id={self.columna.id}, "
                    f"particle_size={self.columna.particle_size}, temperature={self.columna.temperature}, "
                    f"flowrate={self.columna.flowrate}, t0={self.columna.t0}))")

In [15]:
eluent_columns = [
    f"eluent.{i}.{compound} 0"
    for i in [1, 2]  # Para eluent.1 y eluent.2
    for compound in [
        "h2o", "meoh", "acn", "iproh", "acetone", "hex", "chcl3", "ch2cl2", "hept",
        "formic", "acetic", "trifluoroacetic", "phosphor", "nh4ac", "nh4form",
        "nh4carb", "nh4bicarb", "nh4f", "nh4oh", "trieth", "triprop", "tribut",
        "nndimethylhex", "medronic", "pH"
    ]
]

# Lista original de columnas
columns_to_extract = [
    "id",
    "column.name", "column.usp.code_0", "column.usp.code_L1", "column.usp.code_L10",
    "column.usp.code_L109", "column.usp.code_L11", "column.usp.code_L114", "column.usp.code_L122",
    "column.usp.code_L3", "classyfire.class", "rt", "alternative_parents",
    "column.usp.code_L43", "column.usp.code_L68", "column.usp.code_L7", "column.length",
    "column.id", "column.particle.size", "column.temperature", "column.flowrate", "column.t0"
]

# Generar nombres de columnas para "t 0", ..., "t 17"
t_columns = [f"t {i}" for i in range(18)]

# Combinar todas las listas
columns_to_extract += eluent_columns + t_columns

data = dataset[columns_to_extract]
# print(data.columns)

In [16]:
def fscore_peor_caso(lista:list):
    if not lista:
        return 0
    return min(lista)

def fscore_caso_medio(lista:list):
    if not lista:
        return 0
    return sum(lista)/len(lista)

def fscore_mejor_caso(lista:list):
    if not lista:
        return 0
    return max(lista)

In [None]:
def alpha(rta, rtb, t0a, t0b):
    if rtb == t0b:
        return float('inf')
    else:
        return (rta - t0a) / (rtb - t0b)

# def t0(length, flowrate):
#     return length/flowrate

def diff(rta, rtb, total):
    return (rtb - rta) / total

In [18]:
# Obtiene los tipos de experimentos que hay, y añade una columan experiment para posteriormente agrupar.
def get_types_of_experiments(data):
    # data['experiment'] = data['id'].apply(lambda x: x.split('_')[0])
    data.loc[:, 'experiment'] = data['id'].apply(lambda x: x.split('_')[0])
    return data['experiment'].unique()

In [19]:
def get_family_name(family):
    match = re.search(r'\((.*?)\)', family)
    if match:
        family_name = family.split('(')[0].strip()
        return family_name
    return family

In [20]:
# Obtiene un conjunto de posibles padres (familia) del compuesto, uniendo el nombre de la columna class con alternative_parents.
def get_posible_families(experiment_data):
    alternative_parents = [set(item.split(",")) for item in experiment_data["alternative_parents"]]
    class_attr = experiment_data["classyfire.class"]
    return [parent_set.union({cls}) for parent_set, cls in zip(alternative_parents, class_attr)]

In [21]:
# Retorna una lista de datasets separados por experimento, tras haberlos filtrado por la familia.
def filter_by_family(experiment_data, families, family_name, family):
    return experiment_data[[family_name in conjunto or family in conjunto for conjunto in families]]

In [22]:
# Filtra el dataset según su configuración 
def filter_by_config(data_family, config):
    usp_columns = [
        'column.usp.code_0', 'column.usp.code_L1', 'column.usp.code_L10',
        'column.usp.code_L109', 'column.usp.code_L11', 'column.usp.code_L114',
        'column.usp.code_L122', 'column.usp.code_L3', 'column.usp.code_L43',
        'column.usp.code_L68', 'column.usp.code_L7'
    ]
    # Crear una lista de condiciones para las columnas usp_code
    usp_conditions = [
        (data_family[usp_col] == 1) if usp_col == config.columna.usp_code else (data_family[usp_col] == 0)
        for usp_col in usp_columns
    ]

    # Combinar todas las condiciones usando '&'
    combined_usp_condition = usp_conditions[0]
    for condition in usp_conditions[1:]:
        combined_usp_condition &= condition

    # Crear la condición general
    combined_condition = (
        (data_family['column.name'] == config.columna.name) &
        (data_family['column.length'] == config.columna.length) &
        (data_family['column.id'] == config.columna.id) &
        (data_family['column.particle.size'] == config.columna.particle_size) &
        (data_family['column.temperature'] == config.columna.temperature) &
        (data_family['column.flowrate'] == config.columna.flowrate) &
        (data_family['column.t0'] == config.columna.t0) &
        combined_usp_condition  # Condiciones combinadas de usp_code
    )

    # Filtrar los datos que cumplen la condición
    return data_family[combined_condition]

In [23]:
# Devuelve una lista de objetos de configuración
def create_config_objects(dataset):
    def get_uspcode(row):
        usp_columns = [
            'column.usp.code_0', 'column.usp.code_L1', 'column.usp.code_L10',
            'column.usp.code_L109', 'column.usp.code_L11', 'column.usp.code_L114',
            'column.usp.code_L122', 'column.usp.code_L3', 'column.usp.code_L43',
            'column.usp.code_L68', 'column.usp.code_L7'
        ]
        # Busca la primera columna con valor 1 y devuelve su nombre
        return next((col for col in usp_columns if row[col] == 1), None)

    config_objects = set()

    for _, row in dataset.iterrows():
        # Crear el objeto Column
        columna = Column(
            name=row['column.name'],
            usp_code=get_uspcode(row),  # Llama a la función para determinar el USP Code
            length=row['column.length'],
            id=row['column.id'],
            particle_size=row['column.particle.size'],
            temperature=row['column.temperature'],
            flowrate=row['column.flowrate'],
            t0=row['column.t0']
        )
            
        # Crear el objeto Config
            
        eluyente1 = next((key for key in dataset.columns if key.startswith('eluent.1.') and row[key] == 100), None)
        eluyente2 = next((key for key in dataset.columns if key.startswith('eluent.2.') and row[key] == 100), None)

        config = Config(eluyente1=eluyente1, eluyente2=eluyente2, columna=columna)
        config_objects.add(config)

    return list(config_objects)

In [24]:
# Dada los tipos de experimentos separa
def process_experiments(data, experiments, family_name, family):
    result_datasets = []
    configs = []  # Lista de configuraciones
    for exp in experiments:
        experiment_data = data[data['experiment'] == exp]
        families = get_posible_families(experiment_data)
        data_family = filter_by_family(experiment_data, families, family_name, family)
            
        # Creamos un objeto configuración y lo añadimos a una lista
        exp_configs = create_config_objects(data_family)
        configs.extend(exp_configs)
            
        datasets_by_config = [filter_by_config(data_family, config) for config in exp_configs]
        result_datasets.extend(datasets_by_config)
        
    return result_datasets, configs

In [36]:
def calculate_alpha_results(dataset):
    resultados_dataset = []
    primer_elemento = dataset.iloc[0]
    #t0_value = t0(primer_elemento['column.length'], primer_elemento['column.flowrate'])
        
    for i in range(1, len(dataset)):
        elemento_n_menos_1 = dataset.iloc[i - 1]
        elemento_n = dataset.iloc[i]
        resultado = alpha(elemento_n_menos_1['rt'], elemento_n['rt'], elemento_n_menos_1['column.t0'], elemento_n['column.t0'])#t0_value)
        resultados_dataset.append(resultado)
    return resultados_dataset


In [26]:
def calculate_diff_results(dataset):
    resultados_dataset = []
    ultimo_elemento = dataset.iloc[len(dataset)-1]
    total = ultimo_elemento['rt']
        
    for i in range(1, len(dataset)):
        elemento_n_menos_1 = dataset.iloc[i - 1]
        elemento_n = dataset.iloc[i]
        resultado = diff(elemento_n_menos_1['rt'], elemento_n['rt'], total)
        resultados_dataset.append(resultado)
    return resultados_dataset


In [27]:
def calculate_results(result_datasets, is_alpha):
    resultados = []
    for dataset in result_datasets:
        if dataset.empty:
            continue
        if(is_alpha):
            resultados.append(calculate_alpha_results(dataset))
        else:
            resultados.append(calculate_diff_results(dataset))
    return resultados

In [28]:
def build_results_list(configs, resultados, fscore):
    lista_tuplas = []
    for i in range(min(len(configs), len(resultados))):
        # Aplicamos la función fscore a los resultados de cada configuración
        score = fscore(resultados[i])
        lista_tuplas.append((configs[i].__str__(), score.__str__()))  # Emparejamos config con su puntaje
    return lista_tuplas

In [41]:
def main_with_alpha(data, family, fscore):    
    # Obtener los tipos de experimentos y el nombre de la familia
    types_of_experiments = get_types_of_experiments(data)
    family_name = get_family_name(family)

    # Procesar experimentos y obtener datasets y configuraciones
    result_datasets, configs = process_experiments(data, types_of_experiments, family_name, family)

    #save_excel(result_datasets, "../data/result_datasets.xlsx")

    # Ordenar datasets por RT
    for dataset in result_datasets:
        dataset.sort_values(by="rt", inplace=True)

    # Calcular resultados de alpha
    resultados = calculate_results(result_datasets, is_alpha=True)

    # Construir la lista de resultados finales
    lista_tuplas = build_results_list(configs, resultados, fscore)

    return lista_tuplas

In [30]:
def main_with_diff(data, family, fscore):
    # Obtener los tipos de experimentos y el nombre de la familia
    types_of_experiments = get_types_of_experiments(data)
    family_name = get_family_name(family)

    # Procesar experimentos y obtener datasets y configuraciones
    result_datasets, configs = process_experiments(data, types_of_experiments, family_name, family)

    #save_excel(result_datasets, "../data/result_datasets.xlsx")

    # Ordenar datasets por RT
    for dataset in result_datasets:
        dataset.sort_values(by="rt", inplace=True)

    # Calcular resultados de alpha
    resultados = calculate_results(result_datasets, is_alpha=False)

    # Construir la lista de resultados finales
    lista_tuplas = build_results_list(configs, resultados, fscore)

    return lista_tuplas

In [42]:
peor_resultado_alpha = main_with_alpha(data, "Organooxygen compounds (CHEMONTID:0000323)", fscore=fscore_peor_caso)
medio_resultado_alpha = main_with_alpha(data, "Organooxygen compounds (CHEMONTID:0000323)", fscore=fscore_caso_medio)
mejor_resultado_alpha = main_with_alpha(data, "Organooxygen compounds (CHEMONTID:0000323)", fscore=fscore_mejor_caso)

In [32]:
peor_resultado_diff = main_with_diff(data, "Organooxygen compounds (CHEMONTID:0000323)", fscore=fscore_peor_caso)
medio_resultado_diff = main_with_diff(data, "Organooxygen compounds (CHEMONTID:0000323)", fscore=fscore_caso_medio)
mejor_resultado_diff = main_with_diff(data, "Organooxygen compounds (CHEMONTID:0000323)", fscore=fscore_mejor_caso)

In [43]:
peor_segundos_terminos_alpha = [tupla[1] for tupla in peor_resultado_alpha]
medio_segundos_terminos_alpha = [tupla[1] for tupla in medio_resultado_alpha]
mejor_segundos_terminos_alpha = [tupla[1] for tupla in mejor_resultado_alpha]
print(peor_segundos_terminos_alpha)
print(medio_segundos_terminos_alpha)
print(mejor_segundos_terminos_alpha)

['0.02134778826206251', '0.27219796215429404', '0.9347753371915736', '0.12392256605906461', '0.10525977674654972', '0.859777500023406', '0.7963512940178193', '-1.1062460730357622', '0.45329763287167124', '0.7065688562854285', '0.7171809630599754', '0', '0.6781979082864038', '-0.1111111111111111', '0.2356020942408377', '0.8332514493465658', '0.020042442820089598', '0.07231555880204531', '0', '-0.028928336620644236', '0.7353901996370236', '0.6961409905803706', '0.6491895232293295', '0', '0.058834693490225846', '0.029426534096615463', '-0.02980233700037708', '0.5660639777468708', '0.23002891502059197', '0.5745784695201038', '-0.07347331258286918', '-0.007610732300013264', '0.9046942800788954', '0.10881186411450251', '0.3649706457925636', '-5.000000000000089', '0', '-1.9412011511625689', '-0.4328832035697412', '0.9037100283691849', '0.5604278549553948', '0.7639077340569878', '0.7590711175616837', '0.09460610232207078', '0', '-0.5160493827160494', '0.24623115577889454', '0.01878549584971602

In [34]:
peor_segundos_terminos_diff = [tupla[1] for tupla in peor_resultado_diff]
medio_segundos_terminos_diff = [tupla[1] for tupla in medio_resultado_diff]
mejor_segundos_terminos_diff = [tupla[1] for tupla in mejor_resultado_diff]
print(peor_segundos_terminos_diff)
print(medio_segundos_terminos_diff)
print(mejor_segundos_terminos_diff)

['0.006042296072507558', '0.0', '0.06250000000000006', '0.0', '0.002503661594395906', '0.0013612122990968062', '0.006060606060606039', '0.0', '0.00011725163746807784', '0.0', '0.0014825796886583655', '0', '0.23752969121140144', '0.0024937655860348875', '0.0003814973772054897', '0.05131298134916961', '0.9471285323609845', '0.7016574585635359', '0', '0.7634146341463415', '0.2156804733727811', '0.015706806282722457', '0.3340122199592668', '0', '0.004626247869491122', '0.8567375886524823', '0.004277563670196437', '0.00573613766730402', '0.0', '0.0038216560509554457', '0.0026539278131634783', '0.0', '0.03978494623655917', '0.02369077306733168', '0.0061117578579744135', '0.0025929127052722388', '0', '0.0035639286871958402', '0.0018150286917499115', '0.0', '0.010721396844846081', '0.002923976608187202', '0.008746355685131267', '0.013926747142497286', '0', '0.7257683215130024', '0.12328767123287664', '0.0071465033180194044', '0.0', '0', '0', '0.0', '0.027777777777777714', '0.0', '0', '0', '0',