## ClickHouse

#### Замеры времени вставки данных в БД ClickHouse

Справочно: мои результаты при вставке 1_000_000 записей на сервер ClickHouse, расположенный в локальной сети 1 Гбит/с (среднее по трём измерениям):

**1. clickhouse_driver**

1.1. DB API 2.0:
- 1.1.1. Cursor.execute: замеры не проводились, метод катастрофически медленный при большом количестве записей
- 1.1.2. Cursor.executemany при вставке из списка: 0.9053 секунд
- 1.1.3. Cursor.executemany при вставке из генератора: 1.6330 секунд
- 1.1.4. Cursor.executemany при вставке из CSV-генератора: 6.4458 секунд

1.2. API:
- 1.2.1. Client.execute при вставке из списка: 0.8542 секунд
- 1.2.2. Client.execute при вставке из генератора: 1.6399 секунд
- 1.2.3. Client.execute при вставке из CSV-генератора: 6.2741 секунд
- 1.2.4. Client.insert_dataframe при вставке из DataFrame: 0.9247 секунд

**2. clickhouse_connect**

- 2.1. Client.insert при вставке из списка: 2.9978 секунд
- 2.2. Client.insert_df при вставке из DataFrame: 1.1556 секунд
- 2.3. insert_file при вставке из файла: 0.4547 секунд

### 1. clickhouse_driver

In [None]:
import time
import csv

import pandas as pd

import clickhouse_driver
from clickhouse_driver import Client


def init_db(dsn: str, query_drop: str, query_create: str):
    """Reset the ClickHouse database before a benchmark run.

    Opens a DB API connection, executes ``query_drop`` and then
    ``query_create`` on one cursor, and closes the cursor and the
    connection afterwards.

    Args:
        dsn: Connection string passed to ``clickhouse_driver.connect``.
        query_drop: Statement executed first.
        query_create: Statement executed second.
    """
    conn = clickhouse_driver.connect(dsn)
    try:
        cur = conn.cursor()
        try:
            cur.execute(query_drop)
            cur.execute(query_create)
        finally:
            cur.close()
    finally:
        # Release the connection even when one of the statements fails.
        conn.close()


def iter_data(size: int):
    """Lazily produce ``size`` synthetic user records.

    Each record is an ``(id, username, email)`` tuple; ids run from ``0``
    to ``size - 1``.
    """
    yield from (
        (user_id, f"user_{user_id}", f"user_{user_id}@example.org")
        for user_id in range(size)
    )


def iter_csv(filename: str):
    """Yield the rows of a tab-delimited file with a header line as dicts.

    Keys are taken from the header line. The ``id`` value is converted to
    ``int``; every other value is yielded as the string read from the file.

    Args:
        filename: Path to a UTF-8 encoded, tab-delimited file.

    Yields:
        One ``dict`` per data row.
    """
    converters = {"id": int}
    # newline="" leaves line-ending handling to the csv module, so line
    # breaks embedded in quoted fields are returned unchanged.
    with open(filename, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            yield {
                k: (converters[k](v) if k in converters else v)
                for (k, v) in row.items()
            }


# Benchmark size; the alternative sizes are kept for quick switching.
# num_records = 1_000
# num_records = 10_000
# num_records = 100_000
num_records = 1_000_000
# num_records = 10_000_000

# Keyword arguments for clickhouse_driver.Client; also the source of the DSN.
db_params = dict(
    host="192.168.1.55",
    port="19000",
    user="clickhouse",
    password="clickhouse",
    database="test",
)

# DSN for the DB API examples, assembled from the same parameters.
dsn = "clickhouse://{user}:{password}@{host}:{port}/{database}".format(
    **db_params
)

# Statements used to reset the target table before every measurement.
query_drop_table = """
DROP TABLE IF EXISTS users
"""
query_create_table = """
CREATE TABLE users (
id UInt64, username String, email String
)
ENGINE = MergeTree()
ORDER BY id
SETTINGS index_granularity=8192
"""

# Generate the tab-separated source file used by the file-based benchmarks.
csv_file = "/tmp/users.tsv"
fieldnames = ["id", "username", "email"]

# newline="" is what the csv module documents for files handed to
# csv.writer: the writer emits its own line terminators and the file layer
# must not translate them a second time.
with open(csv_file, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(fieldnames)  # header line
    writer.writerows(iter_data(num_records))

#### 1.1. DB API 2.0 

1.1.1. Вставка с использованием Cursor.execute в цикле

> !!! катастрофически медленно, "зависнет" на большом количестве записей

In [None]:
# Baseline: one Cursor.execute call per record (the deliberately slow path).
data_to_insert = iter_data(num_records)
init_db(dsn, query_drop_table, query_create_table)
query = "INSERT INTO users (id, username, email) VALUES"

conn = clickhouse_driver.connect(dsn)
cur = conn.cursor()
try:
    # <measuring time start block>
    start_time = time.perf_counter()
    for record in data_to_insert:
        # Each record is sent as its own single-row insert.
        cur.execute(query, [record])
    end_time = time.perf_counter()
    # <measuring time end block>
finally:
    # Close the cursor and the connection even if an insert fails midway.
    cur.close()
    conn.close()
print(
    f"ClickHouse Cursor.execute: операция вставки заняла {end_time - start_time:.4f} секунд"
)

1.1.2. Вставка с использованием Cursor.executemany из списка

In [None]:
# The rows are materialised up front, so generating them is not timed.
data_to_insert = list(iter_data(num_records))

init_db(dsn, query_drop_table, query_create_table)
query = "INSERT INTO users (id, username, email) VALUES"

conn = clickhouse_driver.connect(dsn)
cur = conn.cursor()
try:
    # <measuring time start block>
    start_time = time.perf_counter()
    cur.executemany(query, data_to_insert)
    end_time = time.perf_counter()
    # <measuring time end block>
finally:
    # Close the cursor and the connection even if the insert fails.
    cur.close()
    conn.close()
print(
    f"ClickHouse Cursor.executemany: операция вставки заняла {end_time - start_time:.4f} секунд"
)

1.1.3. Вставка с использованием Cursor.executemany из генератора

In [None]:
# The generator is consumed inside the timed section, so row generation is
# part of the measurement here.
data_to_insert = iter_data(num_records)

init_db(dsn, query_drop_table, query_create_table)
query = "INSERT INTO users (id, username, email) VALUES"

conn = clickhouse_driver.connect(dsn)
cur = conn.cursor()
try:
    # <measuring time start block>
    start_time = time.perf_counter()
    cur.executemany(query, data_to_insert)
    end_time = time.perf_counter()
    # <measuring time end block>
finally:
    # Close the cursor and the connection even if the insert fails.
    cur.close()
    conn.close()
print(
    f"ClickHouse Cursor.executemany: операция вставки заняла {end_time - start_time:.4f} секунд"
)

1.1.4. Вставка с использованием Cursor.executemany из CSV-генератора

In [None]:
init_db(dsn, query_drop_table, query_create_table)
query = "INSERT INTO users (id, username, email) VALUES"

conn = clickhouse_driver.connect(dsn)
cur = conn.cursor()
try:
    # Reading and converting the file rows happens lazily inside the timed
    # section, so file parsing is part of the measurement.
    # <measuring time start block>
    start_time = time.perf_counter()
    cur.executemany(query, iter_csv(csv_file))
    end_time = time.perf_counter()
    # <measuring time end block>
finally:
    # Close the cursor and the connection even if the insert fails.
    cur.close()
    conn.close()
print(
    f"ClickHouse Cursor.executemany при вставке из CSV-генератора: операция вставки заняла {end_time - start_time:.4f} секунд"
)

#### 1.2. API

1.2.1. Вставка с использованием Client.execute из списка

In [None]:
# The rows are materialised up front, so generating them is not timed.
data_to_insert = list(iter_data(num_records))

init_db(dsn, query_drop_table, query_create_table)
query = "INSERT INTO users (id, username, email) VALUES"

client = Client(**db_params)
try:
    # <measuring time start block>
    start_time = time.perf_counter()
    client.execute(query, data_to_insert)
    end_time = time.perf_counter()
    # <measuring time end block>
finally:
    # Disconnect even if the insert fails.
    client.disconnect_connection()
print(
    f"ClickHouse Client.execute: операция вставки заняла {end_time - start_time:.4f} секунд"
)

1.2.2. Вставка с использованием Client.execute из генератора

In [None]:
# The generator is consumed inside the timed section, so row generation is
# part of the measurement here.
data_to_insert = iter_data(num_records)

init_db(dsn, query_drop_table, query_create_table)
query = "INSERT INTO users (id, username, email) VALUES"

client = Client(**db_params)
try:
    # <measuring time start block>
    start_time = time.perf_counter()
    client.execute(query, data_to_insert)
    end_time = time.perf_counter()
    # <measuring time end block>
finally:
    # Disconnect even if the insert fails.
    client.disconnect_connection()
print(
    f"ClickHouse Client.execute: операция вставки заняла {end_time - start_time:.4f} секунд"
)

1.2.3. Вставка с использованием Client.execute из CSV-генератора

In [None]:
init_db(dsn, query_drop_table, query_create_table)
query = "INSERT INTO users (id, username, email) VALUES"
client = Client(**db_params)

try:
    # Reading and converting the file rows happens lazily inside the timed
    # section, so file parsing is part of the measurement.
    # <measuring time start block>
    start_time = time.perf_counter()
    client.execute(query, iter_csv(csv_file))
    end_time = time.perf_counter()
    # <measuring time end block>
finally:
    # Disconnect even if the insert fails.
    client.disconnect_connection()
# The label names the method that was actually called (Client.execute).
print(
    f"ClickHouse Client.execute из CSV-генератора: операция вставки заняла {end_time - start_time:.4f} секунд"
)

1.2.4. Вставка с использованием Client.insert_dataframe

In [None]:
init_db(dsn, query_drop_table, query_create_table)
query = "INSERT INTO users (id, username, email) VALUES"

client = Client(**db_params)

# The DataFrame is loaded before the timer starts, so reading the file is
# not part of the measurement.
df = pd.read_csv(csv_file, delimiter="\t")

try:
    # <measuring time start block>
    start_time = time.perf_counter()
    client.insert_dataframe(
        query=query, dataframe=df, settings={"use_numpy": True}
    )
    end_time = time.perf_counter()
    # <measuring time end block>
finally:
    # Disconnect even if the insert fails.
    client.disconnect_connection()
print(
    f"ClickHouse Client.insert_dataframe: операция вставки заняла {end_time - start_time:.4f} секунд"
)

### 2. clickhouse_connect

In [None]:
import time
import csv

import pandas as pd

import clickhouse_connect


def init_db(dsn: str, query_drop: str, query_create: str):
    """Run the drop and the create statement on a short-lived client.

    The client comes from ``clickhouse_connect.get_client`` and is used as a
    context manager for the duration of the two commands.

    Args:
        dsn: Connection string passed to ``clickhouse_connect.get_client``.
        query_drop: Statement executed first.
        query_create: Statement executed second.
    """
    with clickhouse_connect.get_client(dsn=dsn) as ch_client:
        for statement in (query_drop, query_create):
            ch_client.command(statement)


def iter_csv(filename: str):
    """Yield the rows of a tab-delimited file with a header line as dicts.

    Keys are taken from the header line. The ``id`` value is converted to
    ``int``; all other values are yielded as the strings read from the file.

    Args:
        filename: Path to a UTF-8 encoded, tab-delimited file.

    Yields:
        One ``dict`` per data row.
    """
    converters = {"id": int}
    # newline="" hands line-ending handling to the csv module, which keeps
    # line breaks inside quoted fields exactly as they are stored.
    with open(filename, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            yield {
                k: (converters[k](v) if k in converters else v)
                for (k, v) in row.items()
            }


# Benchmark size; the alternative sizes are kept for quick switching.
# num_records = 1_000
# num_records = 10_000
# num_records = 100_000
num_records = 1_000_000
# num_records = 10_000_000

# Rows are built once here, so creating them is never part of a timed section.
data_to_insert = [
    (user_id, f"user_{user_id}", f"user_{user_id}@example.org")
    for user_id in range(num_records)
]

# Connection string and the client shared by the cells of this section.
dsn = "clickhouse://clickhouse:clickhouse@192.168.1.55:18123/test"
client = clickhouse_connect.get_client(dsn=dsn)

# Statements used to reset the target table before every measurement.
query_drop_table = """
DROP TABLE IF EXISTS users
"""
query_create_table = """
CREATE TABLE users (
id UInt64, username String, email String
)
ENGINE = MergeTree()
ORDER BY id
SETTINGS index_granularity=8192
"""

# Generate the tab-separated source file used by the file-based benchmark.
csv_file = "/tmp/users.tsv"
fieldnames = ["id", "username", "email"]

# newline="" is what the csv module documents for files handed to
# csv.writer: the writer emits its own line terminators and the file layer
# must not translate them a second time.
with open(csv_file, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(fieldnames)  # header line
    # Write the rows already built in this cell; this keeps the section
    # runnable on its own, without helpers defined in another section.
    writer.writerows(data_to_insert)

2.1. Вставка с использованием Client.insert

In [None]:
init_db(dsn, query_drop_table, query_create_table)

# A fresh client per run keeps this cell independent of whether `client`
# was already closed by a previously executed cell; the context manager
# closes it again even if the insert fails. Creating the client is not timed.
with clickhouse_connect.get_client(dsn=dsn) as client:
    # <measuring time start block>
    start_time = time.perf_counter()
    client.insert(table="users", data=data_to_insert, column_names=fieldnames)
    end_time = time.perf_counter()
    # <measuring time end block>
print(
    f"ClickHouse Client.insert: операция вставки заняла {end_time - start_time:.4f} секунд"
)

2.2. Вставка с использованием Client.insert_df

In [None]:
init_db(dsn, query_drop_table, query_create_table)

# The DataFrame is built before the timer starts, so its construction is
# not part of the measurement.
df = pd.DataFrame(data_to_insert, columns=fieldnames)

# A fresh client per run keeps this cell independent of whether `client`
# was already closed by a previously executed cell; the context manager
# closes it again even if the insert fails. Creating the client is not timed.
with clickhouse_connect.get_client(dsn=dsn) as client:
    # <measuring time start block>
    start_time = time.perf_counter()
    client.insert_df(table="users", df=df)
    end_time = time.perf_counter()
    # <measuring time end block>
print(
    f"ClickHouse Client.insert_df: операция вставки заняла {end_time - start_time:.4f} секунд"
)

2.3. Вставка с использованием insert_file

In [None]:
from clickhouse_connect.driver.tools import insert_file

init_db(dsn, query_drop_table, query_create_table)

# A fresh client per run keeps this cell independent of whether `client`
# was already closed by a previously executed cell; the context manager
# also closes it when the run is over. Creating the client is not timed.
with clickhouse_connect.get_client(dsn=dsn) as client:
    # <measuring time start block>
    start_time = time.perf_counter()
    insert_file(client=client, table="users", file_path=csv_file)
    end_time = time.perf_counter()
    # <measuring time end block>
print(
    f"ClickHouse insert_file: операция вставки заняла {end_time - start_time:.4f} секунд"
)