Modelo 1: Predicción a partir de datos de la base de datos.

In [1]:
import zipfile
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset: a tab-separated file stored inside a .zip archive.
ruta_zip = '../data/RepoRT_classified_CCinformation.zip'
nombre_tsv = 'RepoRT_classified_CCinformation.tsv'

# Both the archive and the member file are closed as soon as the table has been read.
with zipfile.ZipFile(ruta_zip, 'r') as archivo_zip, archivo_zip.open(nombre_tsv) as archivo_tsv:
    dataset = pd.read_csv(archivo_tsv, sep='\t')


  dataset = pd.read_csv(archivo_tsv, sep='\t')


In [3]:
# Cast every value of the free-text columns to str so each column holds a single type.
for text_column in ('name', 'comment'):
    dataset[text_column] = dataset[text_column].astype('str')

Estructura de los datos

In [22]:
class Column:
    """Chromatographic column described by its name, USP code and parameters.

    Instances behave as value objects: equality and hashing are derived from
    all eight attributes, so columns can be de-duplicated in a ``set`` or used
    as dictionary keys.

    Missing numeric values (NaN) are treated as equal to each other. A plain
    ``==`` reports ``NaN != NaN``, which would make every column with a missing
    parameter unique and defeat de-duplication.
    """

    def __init__(self, name, usp_code, length, id, particle_size, temperature, flowrate, t0):
        """Store the column attributes exactly as received (no validation or conversion)."""
        self.name = name
        self.usp_code = usp_code
        self.length = length
        self.id = id
        self.particle_size = particle_size
        self.temperature = temperature
        self.flowrate = flowrate
        self.t0 = t0

    def _key(self):
        """Return the attribute tuple shared by ``__eq__`` and ``__hash__``.

        NaN floats are replaced by ``None`` so that two missing values compare
        equal and hash alike (``value != value`` is only true for NaN).
        """
        attributes = (
            self.name,
            self.usp_code,
            self.length,
            self.id,
            self.particle_size,
            self.temperature,
            self.flowrate,
            self.t0,
        )
        return tuple(
            None if isinstance(attribute, float) and attribute != attribute else attribute
            for attribute in attributes
        )

    def __eq__(self, value):
        """Return True when ``value`` is a Column with the same attributes."""
        if not isinstance(value, Column):
            return False
        return self._key() == value._key()

    def __hash__(self):
        """Hash the same key used for equality, keeping ``==`` and ``hash`` consistent."""
        return hash(self._key())

In [23]:
class Config:
    """Experimental configuration: the two eluents plus the chromatographic column.

    Two configurations are equal when both eluents and the column compare
    equal. The hash is built from the same three attributes, so instances can
    be stored in sets.
    """

    def __init__(self, eluyente1, eluyente2, columna:Column):
        """Keep the eluent identifiers and the column object as given."""
        self.eluyente1, self.eluyente2 = eluyente1, eluyente2
        self.columna = columna

    def __eq__(self, value):
        """Compare attribute by attribute; anything that is not a Config is unequal."""
        if not isinstance(value, Config):
            return False
        # Bail out on the first differing eluent; the column decides the rest.
        if not (self.eluyente1 == value.eluyente1):
            return False
        if not (self.eluyente2 == value.eluyente2):
            return False
        return self.columna == value.columna

    def __hash__(self):
        """Hash the (eluent 1, eluent 2, column) triple."""
        identity = (self.eluyente1, self.eluyente2, self.columna)
        return hash(identity)

Filtramos el dataset y nos quedamos con los datos que nos importan

In [24]:
# "<compound> 0" columns for both eluents (eluent.1 and eluent.2).
eluent_columns = [
    f"eluent.{i}.{compound} 0"
    for i in (1, 2)
    for compound in (
        "h2o", "meoh", "acn", "iproh", "acetone", "hex", "chcl3", "ch2cl2", "hept",
        "formic", "acetic", "trifluoroacetic", "phosphor", "nh4ac", "nh4form",
        "nh4carb", "nh4bicarb", "nh4f", "nh4oh", "trieth", "triprop", "tribut",
        "nndimethylhex", "medronic", "pH",
    )
]

# Columns named "t 0" through "t 17".
t_columns = [f"t {i}" for i in range(18)]

# Column description, one-hot USP codes, compound class and retention time,
# followed by the eluent and "t" columns built above.
columns_to_extract = [
    "column.name", "column.usp.code_0", "column.usp.code_L1", "column.usp.code_L10",
    "column.usp.code_L109", "column.usp.code_L11", "column.usp.code_L114", "column.usp.code_L122",
    "column.usp.code_L3", "classyfire.class", "rt", "alternative_parents",
    "column.usp.code_L43", "column.usp.code_L68", "column.usp.code_L7", "column.length",
    "column.id", "column.particle.size", "column.temperature", "column.flowrate", "column.t0",
    *eluent_columns,
    *t_columns,
]

# Keep only the selected columns and show which ones were retained.
data = dataset[columns_to_extract]
print(data.columns)

Index(['column.name', 'column.usp.code_0', 'column.usp.code_L1',
       'column.usp.code_L10', 'column.usp.code_L109', 'column.usp.code_L11',
       'column.usp.code_L114', 'column.usp.code_L122', 'column.usp.code_L3',
       'classyfire.class', 'rt', 'alternative_parents', 'column.usp.code_L43',
       'column.usp.code_L68', 'column.usp.code_L7', 'column.length',
       'column.id', 'column.particle.size', 'column.temperature',
       'column.flowrate', 'column.t0', 'eluent.1.h2o 0', 'eluent.1.meoh 0',
       'eluent.1.acn 0', 'eluent.1.iproh 0', 'eluent.1.acetone 0',
       'eluent.1.hex 0', 'eluent.1.chcl3 0', 'eluent.1.ch2cl2 0',
       'eluent.1.hept 0', 'eluent.1.formic 0', 'eluent.1.acetic 0',
       'eluent.1.trifluoroacetic 0', 'eluent.1.phosphor 0', 'eluent.1.nh4ac 0',
       'eluent.1.nh4form 0', 'eluent.1.nh4carb 0', 'eluent.1.nh4bicarb 0',
       'eluent.1.nh4f 0', 'eluent.1.nh4oh 0', 'eluent.1.trieth 0',
       'eluent.1.triprop 0', 'eluent.1.tribut 0', 'eluent.1.nndimeth

In [29]:
def create_config_objects(dataset):
    """Collect the distinct configurations (eluents + column) present in ``dataset``.

    For every row:

    * the USP code is the name of the first ``column.usp.code_*`` column whose
      value is 1, or ``None`` when none is set;
    * each eluent is the name of the first column starting with ``eluent.1.``
      (respectively ``eluent.2.``) whose value is 100, or ``None`` when there
      is none.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the ``column.*`` and ``column.usp.code_*`` columns read
        below. Eluent candidates are discovered from the column names.

    Returns
    -------
    set of Config
        One entry per distinct configuration.
    """
    column_fields = [
        'column.name', 'column.length', 'column.id', 'column.particle.size',
        'column.temperature', 'column.flowrate', 'column.t0'
    ]
    usp_columns = [
        'column.usp.code_0', 'column.usp.code_L1', 'column.usp.code_L10',
        'column.usp.code_L109', 'column.usp.code_L11', 'column.usp.code_L114',
        'column.usp.code_L122', 'column.usp.code_L3', 'column.usp.code_L43',
        'column.usp.code_L68', 'column.usp.code_L7'
    ]

    config_objects = set()
    if len(dataset.index) == 0:
        return config_objects

    # The eluent candidates do not depend on the row: compute them once.
    eluent1_columns = [key for key in dataset.columns if key.startswith('eluent.1.')]
    eluent2_columns = [key for key in dataset.columns if key.startswith('eluent.2.')]

    def first_column_equal_to(row, candidates, target):
        """Return the first candidate column whose value in ``row`` equals ``target``."""
        return next((col for col in candidates if row[col] == target), None)

    # Rows that agree on every column read below yield the same Config, so only
    # one representative per distinct combination has to be visited.
    relevant_columns = column_fields + usp_columns + eluent1_columns + eluent2_columns
    unique_rows = dataset[relevant_columns].drop_duplicates()

    for _, row in unique_rows.iterrows():
        columna = Column(
            name=row['column.name'],
            usp_code=first_column_equal_to(row, usp_columns, 1),
            length=row['column.length'],
            id=row['column.id'],
            particle_size=row['column.particle.size'],
            temperature=row['column.temperature'],
            flowrate=row['column.flowrate'],
            t0=row['column.t0']
        )
        config_objects.add(Config(
            eluyente1=first_column_equal_to(row, eluent1_columns, 100),
            eluyente2=first_column_equal_to(row, eluent2_columns, 100),
            columna=columna
        ))

    return config_objects


In [30]:
# Build the set of distinct configurations found in the filtered dataset.
config_objects = create_config_objects(data)

# Report how many distinct configurations were found.
print(len(config_objects))


136


De los 164346 registros del dataset solo resultan 136 configuraciones distintas, definiendo una configuración como la combinación de:

* Eluyentes utilizados (eluyente 1 y eluyente 2).

* Columna cromatográfica (Todos los parámetros de la columna cromatográfica son los mismos).

Funciones de scoring

Factor de selectividad.

$$
\alpha = \frac{t_{R,A} - t_0}{t_{R,B} - t_0}
$$

* $t_{R,A}$: Tiempo de retención del metabolito A.
* $t_{R,B}$: Tiempo de retención del metabolito B.
* $t_0$: Tiempo de retención del soluto no retenido (tiempo muerto).
$$
\text{Tiempo muerto} = \frac{\text{longitud de la columna}}{\text{Flowrate}}
$$

In [None]:
def alpha(data, rt_col="rt", t0_col="column.t0"):
    """Selectivity factor for each pair of consecutive compounds, ordered by RT.

    The rows of ``data`` are sorted by retention time and, for every pair of
    neighbours, the ratio of adjusted retention times is computed::

        alpha = (rt_later - t0_later) / (rt_earlier - t0_earlier)

    NOTE(review): the later-eluting compound is placed in the numerator (the
    role of "A" in the notebook formula), which gives alpha >= 1 when every
    rt is above t0 -- confirm this is the intended orientation.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the retention-time and dead-time columns.
    rt_col : str, default "rt"
        Name of the retention-time column.
    t0_col : str, default "column.t0"
        Name of the dead-time column.

    Returns
    -------
    numpy.ndarray
        ``len(data) - 1`` factors (empty when ``data`` has fewer than two
        rows). A pair whose earlier compound has ``rt == t0`` yields
        ``inf``/``nan`` rather than raising.
    """
    ordered = data.sort_values(rt_col, kind="stable")
    # Adjusted retention time (rt - t0) of every compound, in elution order.
    adjusted = (ordered[rt_col] - ordered[t0_col]).to_numpy(dtype=float)
    return adjusted[1:] / adjusted[:-1]

In [40]:
def filter_by_config(data_family, config):
    """Return the rows of ``data_family`` that belong to ``config``.

    A row belongs to the configuration when all of the following hold:

    * every column parameter (name, length, id, particle size, temperature,
      flow rate, t0) equals the one stored in ``config.columna``; a missing
      (NaN) parameter matches rows where that value is missing too;
    * the one-hot ``column.usp.code_*`` columns are 1 for
      ``config.columna.usp_code`` and 0 for every other code;
    * the first ``eluent.1.*`` / ``eluent.2.*`` column equal to 100 is
      ``config.eluyente1`` / ``config.eluyente2`` (or no such column equals
      100 when the configured eluent is ``None``).

    Parameters
    ----------
    data_family : pandas.DataFrame
        Rows to filter; must contain the columns listed above.
    config : Config
        Configuration to match (only its attributes are read).

    Returns
    -------
    pandas.DataFrame
        The matching subset of ``data_family``.
    """
    usp_columns = [
        'column.usp.code_0', 'column.usp.code_L1', 'column.usp.code_L10',
        'column.usp.code_L109', 'column.usp.code_L11', 'column.usp.code_L114',
        'column.usp.code_L122', 'column.usp.code_L3', 'column.usp.code_L43',
        'column.usp.code_L68', 'column.usp.code_L7'
    ]
    columna = config.columna

    def matches(column_name, expected):
        """Row mask for ``data_family[column_name] == expected``, with NaN matching NaN."""
        series = data_family[column_name]
        # NaN is the only value that differs from itself; '==' would match nothing.
        if isinstance(expected, float) and expected != expected:
            return series.isna()
        return series == expected

    def eluent_condition(prefix, selected):
        """Row mask for 'the first ``prefix`` column equal to 100 is ``selected``'."""
        candidates = [key for key in data_family.columns if key.startswith(prefix)]
        hits = data_family[candidates] == 100
        if selected is None:
            return ~hits.any(axis=1).astype(bool)
        # Only the first column at 100 names the eluent, so no earlier
        # candidate may be at 100.
        earlier = candidates[:candidates.index(selected)]
        return hits[selected] & ~hits[earlier].any(axis=1).astype(bool)

    conditions = [
        matches('column.name', columna.name),
        matches('column.length', columna.length),
        matches('column.id', columna.id),
        matches('column.particle.size', columna.particle_size),
        matches('column.temperature', columna.temperature),
        matches('column.flowrate', columna.flowrate),
        matches('column.t0', columna.t0),
    ]
    # One-hot USP code: the configured code must be set and all the others unset.
    conditions.extend(
        data_family[usp_col] == (1 if usp_col == columna.usp_code else 0)
        for usp_col in usp_columns
    )
    # The eluents are part of the configuration as well.
    conditions.append(eluent_condition('eluent.1.', config.eluyente1))
    conditions.append(eluent_condition('eluent.2.', config.eluyente2))

    combined_condition = conditions[0]
    for condition in conditions[1:]:
        combined_condition = combined_condition & condition

    return data_family[combined_condition]


In [None]:
def main(data, family, fscore = None):
    """Select the rows that belong to ``family`` and split them per configuration.

    A row belongs to ``family`` when the family name is either its
    ``classyfire.class`` value or one of the comma-separated entries of its
    ``alternative_parents`` value. The selected rows, the number of distinct
    configurations and the number of per-configuration datasets are printed.

    ``fscore`` is accepted but not used yet, and nothing is returned.
    """
    # One boolean per row: is ``family`` among the row's parents or its own class?
    belongs_to_family = [
        family in (set(parents.split(",")) | {cls})
        for parents, cls in zip(data["alternative_parents"], data["classyfire.class"])
    ]
    data_family = data[belongs_to_family]

    print(data_family)

    configs = create_config_objects(data_family)

    print(len(configs))

    # One filtered dataset per distinct configuration.
    datasets_by_config = []
    for config in configs:
        datasets_by_config.append(filter_by_config(data_family, config))

    print(f"Cantidad de datasets generados: {len(datasets_by_config)}")

    # Next step (not implemented yet): sort by RT into a list.
    

# Draft of the remaining steps (pairwise alpha and score computation), disabled
# by wrapping it in a string literal. The literal starts at column 0, so it is
# NOT part of main(): it is a bare module-level expression that is only echoed
# as the cell output.
'''
    # Calcular alpha en pares siguiendo el orden
    alpha(data_family.drop(columns=["alternative_parents", "classyfire.class"]))
    
    # Calcular la función de score
    score = fscore()
    
    return score
'''

'\n    # Calcular alpha en pares siguiendo el orden\n    alpha(data_family.drop(columns=["alternative_parents", "classyfire.class"]))\n    \n    # Calcular la función de score\n    score = fscore()\n    \n    return score\n'

In [41]:
# Run the pipeline for the flavonoid family, using the column-filtered dataset.
main(data, "Flavonoids (CHEMONTID:0000334)")

                        column.name  column.usp.code_0  column.usp.code_L1  \
0        Waters ACQUITY UPLC HSS T3                0.0                 1.0   
1        Waters ACQUITY UPLC HSS T3                0.0                 1.0   
2        Waters ACQUITY UPLC HSS T3                0.0                 1.0   
10       Waters ACQUITY UPLC HSS T3                0.0                 1.0   
14       Waters ACQUITY UPLC HSS T3                0.0                 1.0   
...                             ...                ...                 ...   
163723  Waters ACQUITY UPLC BEH C18                0.0                 1.0   
163931  Waters ACQUITY UPLC BEH C18                0.0                 1.0   
163933  Waters ACQUITY UPLC BEH C18                0.0                 1.0   
164081  Waters ACQUITY UPLC BEH C18                0.0                 1.0   
164082  Waters ACQUITY UPLC BEH C18                0.0                 1.0   

        column.usp.code_L10  column.usp.code_L109  column.usp.c