In [1]:
from Ej4gen import get_users, get_cars
import pandas as pd
from pymemcache.client import base
from pymemcache import serde
from abc import ABC, abstractmethod
import pymongo
import sqlite3
import psycopg2
import duckdb
import matplotlib.pyplot as plt
import time
import numpy as np
from sqlalchemy import create_engine, text
from functools import partial
import re
import warnings
from memory_profiler import memory_usage
import math
from copy import copy

# Silence every warning so the benchmark progress output stays readable
warnings.simplefilter('ignore')

# Shared memcached client used by all DB wrappers; values are (de)serialised with pickle
MEMCACHED_ADDRESS = ('127.0.0.1', 11211)
cache = base.Client(MEMCACHED_ADDRESS, serde=serde.pickle_serde)

In [2]:
# Abstract base class with the functionality shared by every database backend
class DB(ABC):
    """Common benchmark interface over one table (or collection) of a database.

    Subclasses implement the underscore-prefixed hooks; the public methods add
    the operation-id guard, the memcached read cache and the connection policy.

    Attributes:
        dbname: Name of the database as passed by the caller.
        table_name: Table / collection the instance operates on.
        oid: Counter incremented by every accepted insert/read/update. A call
            that passes an ``oid`` lower than the current counter is ignored,
            which makes a repeated invocation of the same bound call a no-op.
        close_conn_after_op: When true, a connection is opened and closed around
            each operation instead of being kept alive for the object's lifetime.
    """

    def __init__(self, dbname, table_name, unique_cols, query_type="sql", close_conn_after_op=False):
        """Store the configuration and drop any pre-existing table.

        Args:
            dbname: Database name.
            table_name: Table / collection name.
            unique_cols: Columns indexed by default in ``create_index`` (None -> []).
            query_type: ``"sql"`` or any other value for dict-style queries; it
                only selects the default "select all" query used by ``read``.
            close_conn_after_op: Connection policy, see the class docstring.
        """
        self.dbname = dbname
        self.table_name = table_name
        self._unique_cols = unique_cols or []
        self.oid = 0
        self._cached_queries = set()
        self._query_type = query_type
        self.close_conn_after_op = close_conn_after_op
        if not self.close_conn_after_op:
            # Keep-alive mode: the connection opened here is reused until __del__
            self._open_connection()
        self._with_connection(self._delete_table)

    def _with_connection(self, operation, *args):
        """Run ``operation(*args)`` under the configured connection policy.

        In keep-alive mode the operation runs on the already open connection.
        Otherwise a connection is opened first and always closed afterwards.
        The open happens outside the ``try`` so that a failure to connect is
        reported as such instead of being masked by an error in the close hook.
        """
        if not self.close_conn_after_op:
            return operation(*args)
        self._open_connection()
        try:
            return operation(*args)
        finally:
            self._close_connection()

    @abstractmethod
    def _delete_table(self):
        pass

    def _get_cache_key(self, query):
        """Build the memcached key for ``query``.

        The key contains the concrete class name (so every backend can be tested
        at once), the database and the table, so that two instances of the same
        backend working on different tables never share cache entries. Runs of
        whitespace are replaced by ``#`` because memcached keys cannot contain
        them. Keys that would exceed memcached's 250-byte limit, or that contain
        non-ASCII characters, are replaced by a SHA-256 digest.
        """
        raw_key = "/".join((self.__class__.__name__, str(self.dbname), str(self.table_name), str(query)))
        key = re.sub(r'\s+', '#', raw_key)
        encoded = key.encode("utf-8")
        if len(encoded) > 250 or len(encoded) != len(key):
            import hashlib  # only needed on this rare fallback path
            key = self.__class__.__name__ + "/" + hashlib.sha256(encoded).hexdigest()
        return key

    def _clear_cache(self):
        """Delete from memcached every key this instance has stored."""
        for key in list(self._cached_queries):
            cache.delete(key)
        self._cached_queries = set()

    # Interface insert
    def insert(self, data_csv, data_json, *_, oid=None, batch_size=None, **__):
        """Insert the dataset, invalidating the read cache.

        Args:
            data_csv: Tabular form of the data (consumed by the SQL backends).
            data_json: List-of-dicts form of the data (consumed by MongoDB).
            oid: Operation id; the call is skipped (returning None) when it is
                lower than the current counter.
            batch_size: Rows per batch. Defaults to the whole dataset.

        Raises:
            ValueError: If an explicit ``batch_size`` is smaller than 1.
        """
        if oid is None or oid >= self.oid:
            self.oid += 1
            self._clear_cache()
            if batch_size is None:
                # Without an explicit batch size everything is inserted at once.
                # At least 1 so that an empty dataset does not produce range(..., 0).
                sized = data_json if data_json is not None else data_csv
                batch_size = max(1, len(sized) if sized is not None else 0)
            elif batch_size < 1:
                raise ValueError(f"batch_size must be >= 1, got {batch_size}")
            return self._insert(data_csv, data_json, batch_size)

    @abstractmethod
    def _insert(self, data_csv, data_json, batch_size):
        pass

    def read(self, *_, oid=None, query=None, use_cache=True, **__):
        """Run ``query`` and return its result, optionally served from memcached.

        ``query`` is keyword-only; positional arguments are ignored. A falsy
        query falls back to a "select all". The result is always stored in the
        cache, but the cache is only consulted when ``use_cache`` is true, so an
        uncached read does not pay for a memcached round trip.

        Returns:
            The backend-specific result, or None when ``oid`` is stale.
        """
        if oid is None or oid >= self.oid:
            self.oid += 1
            # Use a select all as the default query
            query = query or (f'SELECT * FROM {self.table_name};' if self._query_type == "sql" else {})
            key = self._get_cache_key(query)
            if use_cache:
                cached_data = cache.get(key)
                if cached_data is not None:
                    return cached_data
            data = self._with_connection(self._read, query)
            cache.set(key, data)
            self._cached_queries.add(key)
            return data

    @abstractmethod
    def _read(self, query):
        pass

    def update(self, *_, oid=None, **__):
        """Run the backend's update workload, invalidating the read cache.

        The call is skipped when ``oid`` is lower than the current counter.
        """
        if oid is None or oid >= self.oid:
            self.oid += 1
            self._clear_cache()
            self._update()

    @abstractmethod
    def _update(self):
        pass

    def to_df(self):
        """Return the result of the backend's ``_to_df`` hook."""
        return self._with_connection(self._to_df)

    @abstractmethod
    def _to_df(self):
        pass

    @abstractmethod
    def _open_connection(self):
        pass

    @abstractmethod
    def _close_connection(self):
        pass

    def create_index(self, index_cols=None):
        """Index ``index_cols`` (default: the unique columns given at construction)."""
        if index_cols is None:
            index_cols = self._unique_cols
        return self._with_connection(self._create_index, index_cols)

    @abstractmethod
    def _create_index(self, index_cols):
        pass

    def __del__(self):
        # A finalizer may run on a half-built object (when __init__ raised) or
        # while the interpreter is shutting down, so every attribute access is
        # defensive and failures are deliberately ignored: nothing useful can
        # be done with an error raised while finalizing.
        try:
            if getattr(self, "_cached_queries", None):
                self._clear_cache()
            if not getattr(self, "close_conn_after_op", True):
                self._close_connection()
        except Exception:
            pass

In [3]:
# Backend used to time operations against MongoDB
class MongoDB(DB):
    """MongoDB implementation of the benchmark interface (``table_name`` is a collection)."""

    def __init__(self, dbname, table_name, unique_cols=None, close_conn_after_op=False):
        super().__init__(dbname, table_name, unique_cols, query_type="mongo", close_conn_after_op=close_conn_after_op)

    def _delete_table(self):
        self.db.drop_collection(self.table_name)

    def _insert(self, _, data_json, batch_size):
        """Insert ``data_json`` in batches of ``batch_size`` documents.

        When ``close_conn_after_op`` is set, a fresh connection is opened and
        closed around every batch.
        """
        for start in range(0, len(data_json), batch_size):
            batch = data_json[start:start + batch_size]
            if self.close_conn_after_op:
                self._open_connection()
                try:
                    self.collection.insert_many(batch)
                finally:
                    self._close_connection()
            else:
                self.collection.insert_many(batch)

    def _read(self, query):
        """Run ``query`` and return the matching documents as a list.

        A list is treated as an aggregation pipeline (used for the join);
        anything else is passed to ``find`` as a filter.
        """
        if isinstance(query, list):
            return list(self.collection.aggregate(query))
        return list(self.collection.find(query))

    def _update(self):
        """
        Append a '0' to the 'dni' field of every document in the collection.

        All documents are read first; the updates are then sent one at a time so
        that the cost of the connection policy becomes visible in the timings.
        """
        # Snapshot every document id together with its 'dni'. The try/finally
        # guarantees the temporary connection is released even if the read fails.
        if self.close_conn_after_op:
            self._open_connection()
        try:
            bulk_updates = [
                pymongo.UpdateOne(
                    {"_id": document["_id"]},  # filter by document id (always indexed)
                    {"$set": {"dni": str(document.get('dni', '')) + '0'}},
                )
                for document in self.collection.find({}, {"dni": 1})
            ]
        finally:
            if self.close_conn_after_op:
                self._close_connection()

        # Execute the operations one by one to measure the impact of the connection policy
        for op in bulk_updates:
            if self.close_conn_after_op:
                self._open_connection()
                try:
                    self.collection.bulk_write([op])
                finally:
                    self._close_connection()
            else:
                self.collection.bulk_write([op])

    def _close_connection(self):
        # getattr: this can be called before any connection was ever opened
        # (for example from __del__ after a failed constructor).
        client = getattr(self, "_client", None)
        if client is not None:
            client.close()
            self._client = None

    def _open_connection(self):
        # Release a client that is still open (nested open, e.g. to_df -> read)
        # instead of silently overwriting and leaking it.
        self._close_connection()
        self._client = pymongo.MongoClient('mongodb://localhost:27017/')
        self.db = self._client[self.dbname]
        self.collection = self.db[self.table_name]

    def _to_df(self):
        # Uses the default query to fetch the whole collection
        return pd.DataFrame(self.read())

    def _create_index(self, index_cols):
        """Create an ascending unique index on each column of ``index_cols``."""
        for col in index_cols or []:
            self.collection.create_index([(col, pymongo.ASCENDING)], unique=True)


# Backend used to time operations against PostgreSQL
class PostgresqlDB(DB):
    """PostgreSQL implementation of the benchmark interface (via SQLAlchemy).

    ``self.conn`` holds the SQLAlchemy engine and ``self.cursor`` a connection
    checked out from it.
    """

    # pandas dtype name -> PostgreSQL column type (anything else becomes TEXT).
    # int64 maps to BIGINT because INTEGER is only 32 bits wide in PostgreSQL.
    _TYPE_MAPPING = {
        'int64': 'BIGINT',
        'float64': 'FLOAT',
        'object': 'TEXT',
        'bool': 'BOOLEAN',
        'datetime64[ns]': 'TIMESTAMP',
        'timedelta64[ns]': 'INTERVAL',
    }

    def __init__(self, dbname, table_name, unique_cols=None, close_conn_after_op=False):
        super().__init__(dbname, table_name, unique_cols, close_conn_after_op=close_conn_after_op)

    def _delete_table(self):
        # Clean the table before inserting
        self.cursor.execute(text(f"DROP TABLE IF EXISTS {self.table_name};"))
        self.cursor.commit()

    def _create_table(self, data_csv):
        """
        Create the table from the DataFrame's column names and dtypes.

        No unique constraints are declared here; indexes are added separately
        through ``create_index``.
        """
        columns_with_types = [
            f"{col} {self._TYPE_MAPPING.get(str(dtype), 'TEXT')}"  # TEXT by default
            for col, dtype in data_csv.dtypes.items()
        ]

        create_table_query = f"""
        CREATE TABLE {self.table_name} (
            {', '.join(columns_with_types)}
        );
        """

        self.cursor.execute(text(create_table_query))
        self.cursor.commit()

    def _execute_and_commit(self, statement, parameters):
        """Execute one statement with ``parameters`` and commit it.

        When ``close_conn_after_op`` is set, a connection is opened before and
        closed after the statement.
        """
        if self.close_conn_after_op:
            self._open_connection()
            try:
                self.cursor.execute(statement, parameters)
                self.cursor.commit()
            finally:
                self._close_connection()
        else:
            self.cursor.execute(statement, parameters)
            self.cursor.commit()

    def _insert(self, data_csv, _, batch_size):
        """
        Create the table and insert the rows of the pandas DataFrame in batches
        of ``batch_size`` rows, committing after every batch.
        """
        if self.close_conn_after_op:
            self._open_connection()
            try:
                self._create_table(data_csv)
            finally:
                self._close_connection()
        else:
            self._create_table(data_csv)

        # Insert statement with one named placeholder per column
        columns = ', '.join(data_csv.columns)
        placeholders = ', '.join([f':{col}' for col in data_csv.columns])
        insert_statement = text(f"""
        INSERT INTO {self.table_name} ({columns})
        VALUES ({placeholders})
        """)

        data = data_csv.to_dict('records')

        for start in range(0, len(data), batch_size):
            self._execute_and_commit(insert_statement, data[start:start + batch_size])

    def _read(self, query):
        return pd.read_sql_query(query, self.conn)

    def _update(self):
        """
        Append a '0' to the 'dni' field of every row, one UPDATE per row.
        """
        select_query = text(f"SELECT dni FROM {self.table_name};")

        # Fetch every DNI first
        if self.close_conn_after_op:
            self._open_connection()
            try:
                dni_list = self.cursor.execute(select_query).fetchall()
            finally:
                self._close_connection()
        else:
            dni_list = self.cursor.execute(select_query).fetchall()

        # Each DNI is updated individually so the connection policies can be compared
        update_query = text(f"UPDATE {self.table_name} SET dni = :updated_dni WHERE dni = :original_dni;")
        for dni in dni_list:
            self._execute_and_commit(update_query, {'updated_dni': dni[0] + '0', 'original_dni': dni[0]})

    def _close_connection(self):
        # getattr: this can be called before any connection was ever opened
        cursor = getattr(self, "cursor", None)
        if cursor is not None:
            cursor.close()
            self.cursor = None
        # Closing the connection only returns it to the engine's pool; dispose the
        # engine as well so the underlying database connection is really released.
        engine = getattr(self, "conn", None)
        if engine is not None:
            engine.dispose()
            self.conn = None

    def _open_connection(self):
        # Release anything still open instead of overwriting (and leaking) it
        self._close_connection()
        self.conn = create_engine(f'postgresql+psycopg2://postgres:postgres@localhost:5432/{self.dbname}')
        self.cursor = self.conn.connect()

    def _to_df(self):
        return pd.read_sql_query(f'SELECT * FROM {self.table_name};', self.conn)

    def _create_index(self, index_cols=None):
        """Create a (non-unique) index on each column of ``index_cols``."""
        for col in index_cols or []:
            index_query = f"CREATE INDEX IF NOT EXISTS idx_{self.table_name}_{col} ON {self.table_name} ({col});"
            self.cursor.execute(text(index_query))
        self.cursor.commit()


# Backend used to time operations against SQLite
class Sqlite3DB(DB):
    """SQLite implementation of the benchmark interface (file ``<dbname>.sqlite3``)."""

    # pandas dtype name -> SQLite column type (anything else becomes TEXT)
    _TYPE_MAPPING = {
        'int64': 'INTEGER',
        'float64': 'REAL',
        'object': 'TEXT',
        'bool': 'BOOLEAN',
        'datetime64[ns]': 'TEXT',
    }

    def __init__(self, dbname, table_name, unique_cols=None, close_conn_after_op=False):
        super().__init__(dbname + ".sqlite3", table_name, unique_cols, close_conn_after_op=close_conn_after_op)

    def _delete_table(self):
        # Clean the table before inserting, then apply the change
        self.conn.execute(f"DROP TABLE IF EXISTS {self.table_name};")
        self.conn.commit()

    def _create_table(self, data_csv):
        """
        Create the table from the DataFrame's column names and dtypes.

        No unique constraints are declared here; indexes are added separately
        through ``create_index``.
        """
        columns_with_types = [
            f"{col} {self._TYPE_MAPPING.get(str(dtype), 'TEXT')}"  # TEXT by default
            for col, dtype in data_csv.dtypes.items()
        ]

        create_table_query = f"""
        CREATE TABLE {self.table_name} (
            {', '.join(columns_with_types)}
        );
        """

        self.conn.execute(create_table_query)
        self.conn.commit()

    def _execute_and_commit(self, query, parameters, many=False):
        """Execute ``query`` (``executemany`` when ``many``) and commit it.

        When ``close_conn_after_op`` is set, a connection is opened before and
        closed after the statement.
        """
        if self.close_conn_after_op:
            self._open_connection()
            try:
                (self.cursor.executemany if many else self.cursor.execute)(query, parameters)
                self.conn.commit()
            finally:
                self._close_connection()
        else:
            (self.cursor.executemany if many else self.cursor.execute)(query, parameters)
            self.conn.commit()

    def _insert(self, data_csv, _, batch_size):
        """Create the table and insert the DataFrame rows in batches of ``batch_size``."""
        if self.close_conn_after_op:
            self._open_connection()
            try:
                self._create_table(data_csv)
            finally:
                self._close_connection()
        else:
            self._create_table(data_csv)

        columns = ', '.join(data_csv.columns)
        placeholders = ', '.join(['?' for _ in data_csv.columns])
        insert_query = f"""
        INSERT INTO {self.table_name} ({columns})
        VALUES ({placeholders})
        """

        rows = [tuple(x) for x in data_csv.to_numpy()]

        for start in range(0, len(rows), batch_size):
            self._execute_and_commit(insert_query, rows[start:start + batch_size], many=True)

    def _read(self, query):
        self.cursor.execute(query)
        return self.cursor.fetchall()

    def _update(self):
        """
        Append a '0' to the 'dni' field of every row, one UPDATE per row.
        """
        select_query = f"SELECT dni FROM {self.table_name};"

        # Fetch every DNI first
        if self.close_conn_after_op:
            self._open_connection()
            try:
                self.cursor.execute(select_query)
                dni_list = self.cursor.fetchall()
            finally:
                self._close_connection()
        else:
            self.cursor.execute(select_query)
            dni_list = self.cursor.fetchall()

        # Each DNI is updated individually so the connection policies can be compared
        update_query = f"UPDATE {self.table_name} SET dni = :updated_dni WHERE dni = :original_dni;"
        for dni in dni_list:
            self._execute_and_commit(update_query, {'updated_dni': dni[0] + '0', 'original_dni': dni[0]})

    def _close_connection(self):
        # getattr: this can be called before any connection was ever opened
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()
            self.conn = None

    def _open_connection(self):
        # Release a connection that is still open instead of overwriting (and leaking) it
        self._close_connection()
        self.conn = sqlite3.connect(self.dbname)
        self.cursor = self.conn.cursor()

    def _to_df(self):
        """Return the whole table as a DataFrame.

        The query runs on its own cursor so the column names always belong to
        the rows that were fetched. Taking them from ``self.cursor.description``
        after ``self.read()`` is unreliable: the rows may come from the cache,
        leaving the cursor describing whatever statement it executed last.
        """
        cursor = self.conn.execute(f'SELECT * FROM {self.table_name};')
        rows = cursor.fetchall()
        return pd.DataFrame(rows, columns=[desc[0] for desc in cursor.description])

    def _create_index(self, index_cols=None):
        """Create a (non-unique) index on each column of ``index_cols``."""
        for col in index_cols or []:
            index_query = f"CREATE INDEX IF NOT EXISTS idx_{self.table_name}_{col} ON {self.table_name} ({col});"
            self.conn.execute(index_query)
        self.conn.commit()


# Backend used to time operations against DuckDB
class DuckDB(DB):
    """DuckDB implementation of the benchmark interface.

    "Indexes" are implemented as UNIQUE constraints declared at table creation,
    so ``create_index`` drops the table and re-inserts the last inserted data.
    """

    # pandas dtype name -> DuckDB column type (anything else becomes TEXT).
    # int64 maps to BIGINT because INTEGER is only 32 bits wide in DuckDB.
    _TYPE_MAPPING = {
        'int64': 'BIGINT',
        'float64': 'DOUBLE',
        'object': 'TEXT',
        'bool': 'BOOLEAN',
        'datetime64[ns]': 'TIMESTAMP',
    }

    def __init__(self, dbname, table_name, unique_cols=None, close_conn_after_op=False):
        super().__init__(dbname, table_name, unique_cols, close_conn_after_op=close_conn_after_op)

    def _delete_table(self):
        self.conn.execute(f"DROP TABLE IF EXISTS {self.table_name};")

    def _create_table(self, data_csv, index_cols=None):
        """
        Create the table from the DataFrame's column names and dtypes, adding a
        UNIQUE constraint for every column in ``index_cols``.
        """
        columns_with_types = [
            f"{col} {self._TYPE_MAPPING.get(str(dtype), 'TEXT')}"  # TEXT by default
            for col, dtype in data_csv.dtypes.items()
        ]

        if index_cols:
            columns_with_types.append(", ".join([f"UNIQUE({col})" for col in index_cols]))

        create_table_query = f"""
        CREATE TABLE {self.table_name} (
            {', '.join(columns_with_types)}
        );
        """

        self.conn.execute(create_table_query)

    def _insert(self, data_csv, _, batch_size, index_cols=None):
        """Create the table and insert the DataFrame rows in batches of ``batch_size``.

        The DataFrame is remembered in ``self.data`` so that ``_create_index``
        can rebuild the table later.
        """
        if self.close_conn_after_op:
            self._open_connection()
            try:
                self._create_table(data_csv, index_cols)
            finally:
                self._close_connection()
        else:
            self._create_table(data_csv, index_cols)
        self.data = data_csv

        columns = ', '.join(data_csv.columns)
        placeholders = ', '.join(['?' for _ in data_csv.columns])
        insert_query = f"""
        INSERT INTO {self.table_name} ({columns})
        VALUES ({placeholders})
        """

        rows = [tuple(x) for x in data_csv.to_numpy()]

        for start in range(0, len(rows), batch_size):
            batch = rows[start:start + batch_size]
            if self.close_conn_after_op:
                self._open_connection()
                try:
                    self.conn.executemany(insert_query, batch)
                    self.conn.commit()
                finally:
                    self._close_connection()
            else:
                self.conn.executemany(insert_query, batch)
                self.conn.commit()

    def _read(self, query):
        self.cursor.execute(query)
        return self.cursor.fetchall()

    def _update(self):
        """
        Append a '0' to the 'dni' field of every row, one UPDATE per row.
        """
        select_query = f"SELECT dni FROM {self.table_name};"

        # Fetch every DNI first
        if self.close_conn_after_op:
            self._open_connection()
            try:
                self.cursor.execute(select_query)
                dni_list = self.cursor.fetchall()
            finally:
                self._close_connection()
        else:
            self.cursor.execute(select_query)
            dni_list = self.cursor.fetchall()

        # Each DNI is updated individually so the connection policies can be compared
        update_query = f"UPDATE {self.table_name} SET dni = ? WHERE dni = ?;"
        for dni in dni_list:
            parameters = (dni[0] + '0', dni[0])
            if self.close_conn_after_op:
                self._open_connection()
                try:
                    self.cursor.execute(update_query, parameters)
                    self.conn.commit()
                finally:
                    self._close_connection()
            else:
                self.cursor.execute(update_query, parameters)
                self.conn.commit()

    def _close_connection(self):
        # getattr: this can be called before any connection was ever opened.
        # The cursor is closed explicitly as well as the connection it came from.
        cursor = getattr(self, "cursor", None)
        if cursor is not None:
            cursor.close()
            self.cursor = None
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()
            self.conn = None

    def _open_connection(self):
        # Release a connection that is still open (nested open, e.g.
        # create_index -> _insert) instead of overwriting and leaking it.
        self._close_connection()
        self.conn = duckdb.connect(self.dbname)
        self.cursor = self.conn.cursor()

    def _to_df(self):
        """Return the whole table as a DataFrame.

        The query is executed here so the column names always belong to the
        rows that were fetched. Taking them from ``self.cursor.description``
        after ``self.read()`` is unreliable: the rows may come from the cache,
        leaving the cursor describing whatever statement it executed last.
        """
        self.cursor.execute(f'SELECT * FROM {self.table_name};')
        rows = self.cursor.fetchall()
        return pd.DataFrame(rows, columns=[desc[0] for desc in self.cursor.description])

    def _create_index(self, index_cols=None):
        """Rebuild the table with UNIQUE constraints on ``index_cols``.

        Raises:
            RuntimeError: If no data has been inserted yet, since there is
                nothing to rebuild the table from.
        """
        data = getattr(self, "data", None)
        if data is None:
            raise RuntimeError("create_index() requires a previous insert(): the table is rebuilt from the inserted data")
        self._delete_table()
        # max(1, ...) keeps the batch size valid when the stored DataFrame is empty
        self._insert(data, None, max(1, data.shape[0]), index_cols)

In [4]:
class Measurements:
    """Runs and plots the insert / read / update benchmarks for a set of backends.

    Attributes:
        classes: Mapping ``label -> DB subclass`` of the backends to compare.
        sizes: Dataset sizes to benchmark; ``sizes[0]`` is also the dataset used
            for the batch-size comparison of inserts.
        dbname, table_name: Database and table handed to every backend.
    """

    def __init__(self, classes, dbname, table_name, unique_cols, sizes):
        self.classes = classes
        self.sizes = sizes
        self.dbname = dbname
        self.table_name = table_name
        self._unique_cols = unique_cols

    def _get_data(self, n, dnis=None):
        """Generate ``n`` users, or ``n`` cars owned by ``dnis``, depending on ``table_name``."""
        return get_users(n) if self.table_name == 'users' else get_cars(([] if dnis is None else dnis), n)

    def _measure_time(self, function):
        """Call ``function`` once and return ``(wall_clock_seconds, cpu_seconds)``."""
        t1 = time.perf_counter(), time.process_time()
        function()
        t2 = time.perf_counter(), time.process_time()
        return t2[0] - t1[0], t2[1] - t1[1]

    def _measure_time_and_memory(self, function):
        """Call ``function`` once and return ``(wall_clock, cpu, mean_memory)``.

        The memory of the *timing* call is what gets sampled. Doing it the other
        way round would add the start-up time of ``memory_usage`` (several
        seconds) to every time measurement.
        """
        # A list is used to get the timings out of the profiled callable
        result = []
        memory = memory_usage(lambda: result.append(self._measure_time(function)))
        return result[0][0], result[0][1], np.average(memory)

    def _get_inserts(self):
        """Time inserts by batch size (on ``sizes[0]``) and by dataset size."""
        n = self.sizes[0]
        data_csv, data_json = self._get_data(n)
        results = {cl: {"batches": {"close": [], "keep alive": []}, "standard": {"result": []}} for cl in self.classes}

        # Timings by batch size
        for cl, cl_class in self.classes.items():
            for close_conn in [False, True]:
                print(f"\n{cl} {'close     ' if close_conn else 'keep alive'} {' ' * (20 - len(cl) - len(str(n)))}{n}", end="")
                i = 1
                while i < 2 * len(data_json):
                    cl_obj = cl_class(self.dbname, self.table_name, self._unique_cols, close_conn_after_op=close_conn)
                    print(f" | {i if i < len(data_json) else 'full'} ", end="")
                    t_real, t_cpu, memory = self._measure_time_and_memory(partial(cl_obj.insert, data_csv, data_json, oid=cl_obj.oid, batch_size=i))
                    results[cl]["batches"]['close' if close_conn else 'keep alive'].append((t_real, t_cpu, memory))
                    i *= 2
                    t_str = f"{t_real:.5f}"
                    print(f"{t_str:>10}s", end="")
            # The first point of the size series is the full-batch insert measured
            # with a kept-alive connection, matching the policy used for the
            # remaining sizes below.
            results[cl]["standard"]["result"].append(results[cl]["batches"]["keep alive"][-1])

        # Timings by dataset size
        for size in self.sizes[1:]:
            data_csv, data_json = self._get_data(size)
            for cl, cl_class in self.classes.items():
                # Keep-alive: everything is inserted in a single batch, so the policy makes no difference
                cl_obj = cl_class(self.dbname, self.table_name, self._unique_cols, close_conn_after_op=False)
                print(f"\n{cl}            {' ' * (20 - len(cl) - len(str(size)))}{size} | full ", end="")
                t_real, t_cpu, memory = self._measure_time_and_memory(partial(cl_obj.insert, data_csv, data_json, oid=cl_obj.oid))
                results[cl]["standard"]["result"].append((t_real, t_cpu, memory))
                t_str = f"{t_real:.5f}"
                print(f"{t_str:>10}s", end="")
        print()

        return results

    def plot_inserts(self):
        """Run the insert benchmarks and plot them by batch size and by dataset size."""
        results = self._get_inserts()
        print("By batches")
        batch_sizes = [2**n for n in range(0, math.ceil(math.log2(self.sizes[0])))] + [self.sizes[0]]
        self._plot_results({cl: results[cl]["batches"] for cl in results}, ["time_real", "time_cpu", "memory"], x_axis=batch_sizes, log_base=2, x_label='Tamaño del batch')
        print("By sizes")
        self._plot_results({cl: results[cl]["standard"] for cl in results}, ["time_real", "time_cpu", "memory"])

    @staticmethod
    def _point_query(db, column, value):
        """Build the single-row lookup on ``db``'s own table for ``column == value``."""
        if db._query_type == "sql":
            return f"SELECT * FROM {db.table_name} WHERE {column} = '{value}'"
        return {column: value}

    @staticmethod
    def _join_query(db, column, value):
        """Build the users-cars join for ``column == value`` (SQL JOIN or Mongo $lookup)."""
        if db._query_type == "sql":
            return f"""
                                    SELECT users.*, cars.*
                                    FROM users
                                    JOIN cars ON users.dni = cars.dni
                                    WHERE users.{column} = '{value}'
                                    """
        return [
            { "$match": { column: value } },
            { "$lookup": {
                "from": "cars",
                "localField": "dni",
                "foreignField": "dni",
                "as": "cars"
            }}
        ]

    @staticmethod
    def _mean_times(times):
        """Average a list of ``(real, cpu)`` pairs component-wise."""
        return np.mean([t[0] for t in times]), np.mean([t[1] for t in times])

    def _get_reads(self):
        """Time point reads and join reads per backend, connection policy and index/cache setup."""
        table_name = self.table_name
        try:
            results = {cl: {"standard": {}, "mix": {}} for cl in self.classes}

            for n in self.sizes:
                self.table_name = "users"
                data_csv_users, data_json_users = self._get_data(n)
                self.table_name = "cars"
                data_csv_cars, data_json_cars = self._get_data(n, dnis=[el["dni"] for el in data_json_users])
                # Denormalised variant: every user document embeds the list of its cars
                data_mix = {el["dni"]: copy(el) for el in data_json_users}
                for car in data_json_cars:
                    data_mix[car["dni"]].setdefault("cars", []).append(car)
                data_mix = list(data_mix.values())

                # Read time per database, configuration (cache, index) and query
                for cl, cl_class in self.classes.items():
                    print(f"\n{cl} {' ' * (20 - len(cl) - len(str(n)))}{n}", end="")
                    for conn in ["keep alive", "close"]:
                        close_conn = conn == "close"
                        # Keep-alive for efficiency: nothing is measured on this object
                        cl_obj2 = cl_class(self.dbname, "cars", ["vin", "plate"], close_conn_after_op=False)
                        cl_obj2.insert(data_csv_cars, data_json_cars, oid=cl_obj2.oid)
                        cl_obj = cl_class(self.dbname, "users", ["dni"], close_conn_after_op=close_conn)
                        # The embedded-document collection only exists for document
                        # stores; reset it so a previous backend's object is never reused.
                        cl_obj3 = None
                        if cl_obj._query_type == "mongo":
                            cl_obj3 = cl_class(self.dbname, "mix", [], close_conn_after_op=close_conn)
                            cl_obj3.insert(None, data_mix, oid=cl_obj3.oid)
                        column = cl_obj._unique_cols[0]
                        cl_obj.insert(data_csv_users, data_json_users, oid=cl_obj.oid)
                        values = list(data_csv_cars[column][-100:])
                        for cache_type in ["no index", "index", "cache"]:
                            use_cache = cache_type == "cache"
                            if cache_type == "index":
                                cl_obj.create_index()
                                cl_obj2.create_index()
                                if cl_obj3 is not None:
                                    cl_obj3.create_index()

                            print(f" | standard {conn} ({cache_type}) ", end="")
                            t = time.time()
                            times = []
                            for value in values:
                                # `query` must be passed by keyword: DB.read ignores positional arguments
                                query = self._point_query(cl_obj, column, value)
                                times.append(self._measure_time(partial(cl_obj.read, query=query, oid=cl_obj.oid, use_cache=use_cache)))
                            t_str = f"{time.time() - t:.5f}"
                            print(f"{t_str:>10}s", end="")
                            results[cl]["standard"].setdefault(f"{conn} ({cache_type})", []).append(self._mean_times(times))

                            print(f" | mix {conn} ({cache_type}) ", end="")
                            t = time.time()
                            times = []
                            times2 = []
                            for value in values:
                                query = self._join_query(cl_obj, column, value)
                                times.append(self._measure_time(partial(cl_obj.read, query=query, oid=cl_obj.oid, use_cache=use_cache)))
                                if cl_obj3 is not None:
                                    # Same lookup answered from the embedded-document collection
                                    query = {"dni": value}
                                    times2.append(self._measure_time(partial(cl_obj3.read, query=query, oid=cl_obj3.oid, use_cache=use_cache)))
                            t_str = f"{time.time() - t:.5f}"
                            print(f"{t_str:>10}s", end="")
                            results[cl]["mix"].setdefault(f"{conn} ({cache_type}) | join", []).append(self._mean_times(times))
                            if cl_obj3 is not None:
                                results[cl]["mix"].setdefault(f"{conn} ({cache_type}) | db", []).append(self._mean_times(times2))
            print()

            return results
        finally:
            # _get_data switches on self.table_name, so restore the caller's value
            self.table_name = table_name

    def plot_reads(self):
        """Run the read benchmarks and plot the point reads and the join reads."""
        results = self._get_reads()
        print("Normal read")
        self._plot_results({cl: results[cl]["standard"] for cl in results}, ["time_real", "time_cpu"])
        print("Join read")
        self._plot_results({cl: results[cl]["mix"] for cl in results}, ["time_real", "time_cpu"])

    def _get_updates(self):
        """Time the update workload per backend, connection policy and index setup."""
        results = {cl: {} for cl in self.classes}

        # Update time and memory by dataset size
        for n in self.sizes:
            data_csv, data_json = self._get_data(n)

            for cl, cl_class in self.classes.items():
                print(f"\n{cl} {' ' * (20 - len(cl) - len(str(n)))}{n}", end="")
                for conn in ["keep alive", "close"]:
                    cl_obj = cl_class(self.dbname, self.table_name, self._unique_cols, close_conn_after_op=(conn == "close"))
                    cl_obj.insert(data_csv, data_json, oid=cl_obj.oid)
                    for index_type in ["no index", "index"]:
                        if index_type == "index":
                            cl_obj.create_index()

                        print(f" | {conn} ({index_type}) ", end="")
                        t_real, t_cpu, memory = self._measure_time_and_memory(
                            partial(cl_obj.update, oid=cl_obj.oid)
                        )
                        results[cl].setdefault(f"{conn} ({index_type})", []).append((t_real, t_cpu, memory))
                        t_str = f"{t_real:.5f}"
                        print(f"{t_str:>10}s", end="")
        print()

        return results

    def plot_updates(self):
        """Run the update benchmarks and plot them."""
        results = self._get_updates()
        self._plot_results(results, ["time_real", "time_cpu", "memory"])

    # General plotting function
    def _plot_results(self, data, plot_types, x_axis=None, log_base=10, x_label='Tamaño del dataset'):
        """Draw one figure per metric with one subplot per backend.

        Args:
            data: ``{backend: {series_label: [(real, cpu[, memory]), ...]}}``.
            plot_types: Metrics to plot, among "time_real", "time_cpu", "memory".
            x_axis: X values shared by every series (default: ``self.sizes``).
            log_base: Base of the logarithmic x axis.
            x_label: Label of the x axis.
        """
        if x_axis is None:
            x_axis = self.sizes

        # Position of each metric inside a measurement tuple
        metric_index = {"time_real": 0, "time_cpu": 1, "memory": 2}
        titles = {
            "memory": "Uso de Memoria",
            "time_real": "Tiempo Real",
            "time_cpu": "Tiempo de CPU"
        }
        labels = {
            "memory": "Memoria (MB)",
            "time_real": "Tiempo (segundos)",
            "time_cpu": "Tiempo (segundos)"
        }

        for plot_type in plot_types:
            fig, axs = plt.subplots(1, len(self.classes), figsize=(6*len(self.classes), 6))
            if len(self.classes) == 1:
                axs = [axs]

            for ax, db_name in zip(axs, self.classes):
                operations = list(data[db_name].keys())
                for operation in operations:
                    y_values = [el[metric_index.get(plot_type, 0)] for el in data[db_name][operation]]
                    ax.plot(x_axis, y_values, label=operation.capitalize(), marker='o')

                ax.set_title(f'{db_name.capitalize()}', fontsize=16)
                ax.set_xlabel(x_label, fontsize=14)
                # The scale must be set before the ticks: changing the scale
                # resets the axis' tick locators and formatters.
                ax.set_xscale("log", base=log_base)
                ax.set_xticks(x_axis)
                ax.set_xticklabels(x_axis)
                ax.grid(True)
                if len(operations) > 1:
                    ax.legend()

            fig.suptitle(titles[plot_type], fontsize=16)
            fig.supylabel(labels[plot_type], x=0, fontsize=14)
            fig.tight_layout()
            plt.show()

In [5]:
# Backends under test, keyed by the label used in the progress output and plot titles
classes = dict(
    mongo=MongoDB,
    sqlite=Sqlite3DB,
    duckdb=DuckDB,
    postgres=PostgresqlDB,
)

# Benchmark the "users" table of Practica_1 with datasets of 10^3, 10^4 and 10^5 rows
dataset_sizes = [10**exponent for exponent in range(3, 6)]
measurements = Measurements(classes, "Practica_1", "users", ["dni"], dataset_sizes)

In [None]:
# Run the insert benchmarks and plot them by batch size and by dataset size
measurements.plot_inserts()

In [None]:
# Run the read benchmarks and plot the plain reads ("Normal read") and the join reads ("Join read")
measurements.plot_reads()

In [None]:
# Run the update benchmarks and plot real time, CPU time and memory
measurements.plot_updates()