# Mongo vs ClickHouse 

In [9]:
import random as rd

# Placeholder review text used as the "comment" payload of every generated
# document: 50 random filler words joined by single spaces.
test_comment = " ".join(
    rd.choice(
        [
            "lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
            "adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
            "incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua",
        ]
    )
    for _ in range(50)
)
# Turn it into a sentence (leading capital, trailing period), repeat the
# sentence six times and keep the first 300 characters.
test_comment = ((test_comment.capitalize() + ".") * 6)[:300]


In [10]:
from collections.abc import Iterator
from datetime import datetime
from uuid import uuid4

def generate_review_batch(
    n_batches: int = 10_000,
    batch_size: int = 1000,
    comment=None,
) -> Iterator[list[dict]]:
    """Yield batches of synthetic review documents for the insert benchmarks.

    This is a generator: batches are built one at a time as the caller
    iterates, and a progress line is printed before each batch is built.

    Args:
        n_batches: Number of batches to yield.
        batch_size: Number of documents in each batch.
        comment: Text stored under the "comment" key of every document.
            When None, the notebook-level ``test_comment`` is used.

    Yields:
        A list of ``batch_size`` dicts with the keys "user_id" and "film_id"
        (random UUID strings), "comment" and "timestamp" (``datetime.now()``
        at the moment the document is built).
    """
    if comment is None:
        # Looked up when iteration starts, so test_comment only has to exist
        # by the time the generator is consumed.
        comment = test_comment

    for i in range(n_batches):
        # The ":_" format keeps the original "0 / 10_000" look for the default.
        print(f"{i} / {n_batches:_}")
        yield [
            {
                "user_id": str(uuid4()),
                "film_id": str(uuid4()),
                "comment": comment,
                "timestamp": datetime.now(),
            }
            for _ in range(batch_size)
        ]

## Mongo

In [None]:
from pymongo import MongoClient

In [None]:
# Connect to the MongoDB instance on localhost:27019 and pick the
# database / collection that the insert and read benchmarks operate on.
client = MongoClient(host='localhost', port=27019)
db = client.get_database('test_database')
collection = db.get_collection('test_collection')

In [None]:
import time

# Insert benchmark (MongoDB): bulk-insert every generated batch and report
# the total and per-batch time.
#
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring intervals, so the result is
# not affected by system clock adjustments during the run.
#
# NOTE: the timed region covers the whole loop, so it includes the time
# generate_review_batch() spends building each batch and printing progress,
# not only the insert_many() calls.
start = time.perf_counter()
counter = 0
for batch in generate_review_batch():
    collection.insert_many(batch)
    counter += 1
end = time.perf_counter()

print(f"Batch {counter} inserted")
print(f"Insertion took {end - start:.4f} seconds")
print(f"average insertion time: {(end - start) / counter:.4f} seconds")

In [49]:
import timeit

def read_test():
    """Read up to 1000 documents from the Mongo collection in one pass.

    The cursor is limited to 1000 documents with a batch size of 1000 and is
    fully consumed; the fetched documents are discarded.
    """
    documents = collection.find().batch_size(1000).limit(1000)
    # Exhaust the cursor; only the time taken matters, not the result.
    list(documents)

# Run the read 1000 times and report the mean duration of a single run.
print(
    f"Average select time: {timeit.timeit(read_test, number=1000) / 1000} sec"
)

Average select time: 0.03981100379099371 sec


### Insert batch test:
* Batch size = 1000; 10000 batches were inserted
* Insertion took 181.6199 seconds
* average insertion time: 0.0182 seconds

### Read batch test
* Batch size = 1000.
* Number of reads = 1000
* Average select time: 0.005184496374975424 sec

## ClickHouse

In [4]:
from clickhouse_driver import Client

In [5]:
# Open a ClickHouse connection to localhost.
# NOTE: this rebinds the name `client`, which the Mongo section above used
# for its MongoClient; from here on `client` is the ClickHouse client.
client = Client(host="localhost")

In [6]:
# Create the `collection` database on the `company_cluster` cluster.
# IF NOT EXISTS makes the statement safe to re-run.
client.execute('CREATE DATABASE IF NOT EXISTS collection ON CLUSTER company_cluster;')

[('clickhouse-node1', 9000, 0, '', 3, 0),
 ('clickhouse-node3', 9000, 0, '', 2, 0),
 ('clickhouse-node2', 9000, 0, '', 1, 0),
 ('clickhouse-node4', 9000, 0, '', 0, 0)]

In [7]:
# Create the benchmark table `collection.test_collection` on the
# `company_cluster` cluster (IF NOT EXISTS makes it safe to re-run).
# The columns match the keys of the dicts produced by
# generate_review_batch(): user_id, film_id, comment, timestamp.
client.execute(
"""CREATE TABLE IF NOT EXISTS collection.test_collection ON CLUSTER company_cluster (
    user_id UUID,
    film_id UUID,
    comment TEXT,
    timestamp TIMESTAMP
)
Engine=MergeTree()
ORDER BY (user_id, film_id, timestamp);""")

[('clickhouse-node1', 9000, 0, '', 3, 0),
 ('clickhouse-node3', 9000, 0, '', 2, 0),
 ('clickhouse-node4', 9000, 0, '', 1, 0),
 ('clickhouse-node2', 9000, 0, '', 0, 0)]

In [11]:
import time

# Insert benchmark (ClickHouse): bulk-insert every generated batch into
# collection.test_collection and report the total and per-batch time.
#
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring intervals, so the result is
# not affected by system clock adjustments during the run.
#
# NOTE: the timed region covers the whole loop, so it includes the time
# generate_review_batch() spends building each batch and printing progress,
# not only the INSERT calls.
start = time.perf_counter()
counter = 0
for batch in generate_review_batch():
    client.execute("INSERT INTO collection.test_collection (user_id, film_id, comment, timestamp) VALUES", batch)
    counter += 1
end = time.perf_counter()

print(f"Batch {counter} inserted")
print(f"Insertion took {end - start:.4f} seconds")
print(f"average insertion time: {(end - start) / counter:.4f} seconds")

0 / 10_000


ServerException: Code: 81.
DB::Exception: Database ugc_analytics doesn't exist. Stack trace:

0. DB::Exception::Exception(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int, bool) @ 0xa82d07a in /usr/bin/clickhouse
1. DB::DatabaseCatalog::assertDatabaseExistsUnlocked(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) const @ 0x13519e5a in /usr/bin/clickhouse
2. DB::DatabaseCatalog::getDatabase(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) const @ 0x1351c8f8 in /usr/bin/clickhouse
3. DB::Context::resolveStorageID(DB::StorageID, DB::Context::StorageNamespace) const @ 0x134b755e in /usr/bin/clickhouse
4. ? @ 0x13d1351f in /usr/bin/clickhouse
5. DB::executeQuery(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::shared_ptr<DB::Context>, bool, DB::QueryProcessingStage::Enum) @ 0x13d11e55 in /usr/bin/clickhouse
6. DB::TCPHandler::runImpl() @ 0x147f050f in /usr/bin/clickhouse
7. DB::TCPHandler::run() @ 0x14804259 in /usr/bin/clickhouse
8. Poco::Net::TCPServerConnection::start() @ 0x1745e52f in /usr/bin/clickhouse
9. Poco::Net::TCPServerDispatcher::run() @ 0x17460981 in /usr/bin/clickhouse
10. Poco::PooledThread::run() @ 0x17611609 in /usr/bin/clickhouse
11. Poco::ThreadImpl::runnableEntry(void*) @ 0x1760ed00 in /usr/bin/clickhouse
12. ? @ 0x400086d609 in ?
13. clone @ 0x40009a9293 in ?


In [None]:
import timeit

def read_test():
    """Read up to 1000 rows from the ClickHouse benchmark table.

    Selects all four columns of collection.test_collection, the table this
    notebook creates and fills in the insert benchmark. The rows returned by
    the driver are discarded; only the elapsed time is of interest.
    """
    # The previous query targeted ugc_analytics.ugc_film_views and a
    # progress_sec column, neither of which is created in this notebook.
    client.execute(
        "SELECT user_id, film_id, comment, timestamp "
        "FROM collection.test_collection LIMIT 1000"
    )

# Run the read 1000 times and report the mean duration of a single run.
print(f"Average select time: {timeit.timeit(read_test, number=1000) / 1000} sec")
