# Mongo vs ClickHouse 

In [None]:
import random as rd

# Vocabulary the placeholder review text is drawn from.
lorem_words = [
    "lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing",
    "elit", "sed", "do", "eiusmod", "tempor", "incididunt", "ut", "labore",
    "et", "dolore", "magna", "aliqua",
]

# One sentence of 50 randomly picked words, capitalized and ended with a period.
sentence = " ".join(rd.choice(lorem_words) for _ in range(50)).capitalize() + "."

# Repeat the sentence six times and truncate to 300 characters so that every
# generated review carries a comment of the same length.
test_comment = (sentence * 6)[:300]


In [None]:
from collections.abc import Iterator
from datetime import datetime
from random import choice
from uuid import uuid4

def _uuid_strings(count):
    """Return ``count`` freshly generated UUID4 values rendered as strings."""
    return [str(uuid4()) for _ in range(count)]


# ID pools the generated reviews draw from: 100 users and 1000 films.
users = _uuid_strings(100)
films = _uuid_strings(1000)

def generate_review_batch(batch_count: int = 10_000, batch_size: int = 1000) -> Iterator[list[dict]]:
    """Lazily yield batches of synthetic film reviews for the insert benchmarks.

    This is a generator: each iteration builds and yields one fresh batch, so
    only a single batch is held in memory at a time.

    Args:
        batch_count: Number of batches to yield.
        batch_size: Number of review dicts in each batch.

    Yields:
        A list of ``batch_size`` dicts with the keys ``user_id`` and
        ``film_id`` (random picks from the module-level ``users`` / ``films``
        pools), ``comment`` (the shared ``test_comment`` text) and
        ``timestamp`` (``datetime.now()`` at creation time).
    """
    for i in range(batch_count):
        # Progress report every 100 batches; the ``:_`` format keeps the
        # default total rendered as "10_000".
        if i % 100 == 0:
            print(f"{i} / {batch_count:_}")
        yield [
            {
                "user_id": choice(users),
                "film_id": choice(films),
                "comment": test_comment,
                "timestamp": datetime.now(),
            }
            for _ in range(batch_size)
        ]

## Mongo

In [1]:
from pymongo import MongoClient
import pymongo

In [3]:
# Connect to the MongoDB server on localhost:27019 and select the database
# and collection used by the benchmark.
client = MongoClient(host='localhost', port=27019)
db = client.get_database('test_database')
collection = db.get_collection('test_collection')

# Drop the collection so documents from an earlier run are not carried over.
collection.drop()

In [None]:
# Ascending single-field index on user_id, created before the data is loaded.
collection.create_index([("user_id", pymongo.ASCENDING)])

In [None]:
import time

# Time the insertion of every generated batch into MongoDB.
# perf_counter() is used rather than time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, so the result is not
# distorted by system clock adjustments during the run.
start = time.perf_counter()
counter = 0
for batch in generate_review_batch():
    collection.insert_many(batch)
    counter += 1
end = time.perf_counter()

print(f"Batch {counter} inserted")
print(f"Insertion took {end - start:.4f} seconds")
# Average is per batch, not per document.
print(f"average insertion time: {(end - start) / counter:.4f} seconds")

In [None]:
import timeit

def read_test():
    """Read 1000 documents from the Mongo collection and materialize them."""
    query = collection.find()
    # batch_size matches the limit, and list() drains the cursor so the
    # documents are actually fetched.
    list(query.batch_size(1000).limit(1000))


total_read_time = timeit.timeit(read_test, number=1000)
print(f"Average select time: {total_read_time / 1000} sec")

In [None]:
import timeit

# Stage 1: count the documents belonging to each user_id.
count_per_user = {"$group": {"_id": "$user_id", "count": {"$sum": 1}}}
# Stage 2: order users by that count, largest first.
most_active_first = {"$sort": {"count": -1}}
# Stage 3: keep only the first ten users.
top_ten = {"$limit": 10}

pipeline = [count_per_user, most_active_first, top_ten]
def aggregate_test():
    """Run the aggregation ``pipeline`` on the Mongo collection and drain the cursor."""
    cursor = collection.aggregate(pipeline=pipeline)
    # Materialize the results so the aggregation is fully executed and read.
    list(cursor)


# This figure is the aggregation time, so it is labelled as such rather than
# as a "select" time.
print(f"Average aggregate time: {timeit.timeit(aggregate_test, number=10) / 10} sec")

### Insert batch test:
* Batch size = 1000; 10000 batches were inserted
* Insertion took 169.2879 seconds
* average insertion time: 0.0169 seconds

### Read batch test
* Batch size = 1000.
* Number of reads = 1000
* Average select time: 0.006814309166977182 sec

### Aggregation test
* Number of runs = 10
* Average aggregate time: 3.495923020900227 sec

## ClickHouse

In [None]:
from clickhouse_driver import Client

In [None]:
# Connect to ClickHouse on localhost.
# NOTE: this rebinds `client`, which the Mongo section bound to a MongoClient.
client = Client(host="localhost")

In [None]:
# Create the benchmark database (named "collection") if it does not exist yet;
# the statement is issued with ON CLUSTER company_cluster.
client.execute('CREATE DATABASE IF NOT EXISTS collection ON CLUSTER company_cluster;')

In [None]:
# DDL for the benchmark table: one row per review with the same four fields
# as the generated dicts, stored in a MergeTree ordered by
# (user_id, film_id, timestamp).
create_table_ddl = """CREATE TABLE IF NOT EXISTS collection.test_collection ON CLUSTER company_cluster (
    user_id UUID,
    film_id UUID,
    comment TEXT,
    timestamp TIMESTAMP
)
Engine=MergeTree()
ORDER BY (user_id, film_id, timestamp);"""

client.execute(create_table_ddl)

In [None]:
import time

# Time the insertion of every generated batch into ClickHouse.
# perf_counter() is used rather than time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, so the result is not
# distorted by system clock adjustments during the run.
start = time.perf_counter()
counter = 0
for batch in generate_review_batch():
    # Each batch (a list of dicts) is sent as the rows of one INSERT.
    client.execute("INSERT INTO collection.test_collection (user_id, film_id, comment, timestamp) VALUES", batch)
    counter += 1
end = time.perf_counter()

print(f"Batch {counter} inserted")
print(f"Insertion took {end - start:.4f} seconds")
# Average is per batch, not per row.
print(f"average insertion time: {(end - start) / counter:.4f} seconds")

In [None]:
import timeit

def read_test():
    """Select 1000 rows from the ClickHouse table and discard the result."""
    client.execute("SELECT * FROM collection.test_collection LIMIT 1000")


total_read_time = timeit.timeit(read_test, number=1000)
print(f"Average select time: {total_read_time / 1000} sec")


In [None]:
# Rows per user, largest count first, limited to ten users: the SQL
# counterpart of the Mongo aggregation pipeline.
sql_pipeline = """SELECT user_id, COUNT(*) as count FROM collection.test_collection GROUP BY user_id ORDER BY count DESC LIMIT 10 """


def aggregate_test():
    """Run the ``sql_pipeline`` aggregation query on ClickHouse and discard the result."""
    client.execute(sql_pipeline)


# This figure is the aggregation time, so it is labelled as such rather than
# as a "select" time.
print(f"Average aggregate time: {timeit.timeit(aggregate_test, number=10) / 10} sec")

### Insert batch test:
* Batch size = 1000; 10000 batches were inserted
* Insertion took 233.7237 seconds
* average insertion time: 0.0234 seconds

### Read batch test
* Batch size = 1000.
* Number of reads = 1000
* Average select time: 0.012428301582986023 sec

### Aggregation test
* Number of runs = 10
* Average aggregate time: 0.17657093749730848 sec