## Исследование Vertica

## Подготовка базы данных

In [119]:
# Keyword arguments passed to vertica_python.connect(**connection_info)
# by every cell below.
connection_info = dict(
    host='127.0.0.1',
    port=5433,
    user='dbadmin',
    password='',
    database='docker',
    autocommit=True,
)

In [120]:
import vertica_python


# Create the `views` table used by all benchmarks below.
with vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    # IF NOT EXISTS keeps this setup cell re-runnable: a plain CREATE TABLE
    # raises an error as soon as the table already exists.
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS views (
        id IDENTITY,
        user_id INTEGER NOT NULL,
        movie_id VARCHAR(256) NOT NULL,
        viewed_frame INTEGER NOT NULL
    );
    """)

## Подготовка скрипта для генерации данных и тестирования быстродействия

In [43]:
import uuid
from random import randint


def generate_progress_data(count: int):
    """Build a list of ``count`` random view-progress rows.

    Each row is a ``(user_id, movie_id, viewed_frame)`` tuple, in the column
    order used by the INSERT statements in this notebook.

    Args:
        count: number of rows to generate.

    Returns:
        A list of ``count`` tuples. ``user_id`` and ``viewed_frame`` are
        random integers in ``[1, 1_000_000_000]``; ``movie_id`` is the string
        form of a random UUID4.
    """
    upper_bound = 1_000_000_000
    # Tuple elements are evaluated left to right: user_id is drawn first,
    # then the UUID is created, then viewed_frame is drawn.
    return [
        (randint(1, upper_bound), str(uuid.uuid4()), randint(1, upper_bound))
        for _ in range(count)
    ]

In [40]:
import time
import statistics

class TimerCode:
    """Context manager that records durations between checkpoints.

    Usage: enter the context, call ``setup_start_time()`` right before the
    code to measure and ``checkpoint()`` right after it. On exit the median,
    average and sum of all recorded durations are printed.

    Attributes:
        start: clock reading from which the next duration is measured.
        durations: list of recorded durations, in seconds.
    """

    def __init__(self):
        # perf_counter is monotonic and high-resolution, so system clock
        # adjustments cannot distort (or make negative) the measurements.
        self.start = time.perf_counter()
        self.durations = []

    def setup_start_time(self):
        """Restart the clock without recording a duration."""
        self.start = time.perf_counter()

    def checkpoint(self):
        """Record the time elapsed since the last start and restart the clock."""
        now = time.perf_counter()
        self.durations.append(now - self.start)
        self.start = now

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        """Print the summary; never suppresses an exception from the body."""
        if not self.durations:
            # Without this guard statistics.median() raises on the empty list
            # and hides the exception that stopped the body before its first
            # checkpoint.
            print("No checkpoints recorded")
            return
        total = sum(self.durations)
        print(f"Median - {statistics.median(self.durations)}")
        print(f"Average - {total / len(self.durations)}")
        print(f"Summary - {total}")

## Тестирование вставки данных

In [45]:
# Benchmark: insert a batch of 100 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Rows are generated and the cursor is created before the clock is
    # restarted, so only the executemany call is measured.
    chunk = generate_progress_data(100)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()
    

Median - 0.1892249584197998
Average - 0.1892249584197998
Summary - 0.1892249584197998


In [58]:
# Benchmark: insert a batch of 250 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Data generation and cursor creation happen before the clock restart,
    # so they are excluded from the measurement.
    chunk = generate_progress_data(250)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()

Median - 0.09809112548828125
Average - 0.09809112548828125
Summary - 0.09809112548828125


In [59]:
# Benchmark: insert a batch of 500 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Data generation and cursor creation happen before the clock restart,
    # so they are excluded from the measurement.
    chunk = generate_progress_data(500)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()

Median - 0.10187697410583496
Average - 0.10187697410583496
Summary - 0.10187697410583496


In [60]:
# Benchmark: insert a batch of 850 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Data generation and cursor creation happen before the clock restart,
    # so they are excluded from the measurement.
    chunk = generate_progress_data(850)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()

Median - 0.12445902824401855
Average - 0.12445902824401855
Summary - 0.12445902824401855


In [46]:
# Benchmark: insert a batch of 1_000 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Data generation and cursor creation happen before the clock restart,
    # so they are excluded from the measurement.
    chunk = generate_progress_data(1_000)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()

Median - 0.14134716987609863
Average - 0.14134716987609863
Summary - 0.14134716987609863


In [47]:
# Benchmark: insert a batch of 10_000 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Data generation and cursor creation happen before the clock restart,
    # so they are excluded from the measurement.
    chunk = generate_progress_data(10_000)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()

Median - 1.0215561389923096
Average - 1.0215561389923096
Summary - 1.0215561389923096


In [51]:
# Benchmark: insert a batch of 50_000 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Data generation and cursor creation happen before the clock restart,
    # so they are excluded from the measurement.
    chunk = generate_progress_data(50_000)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()

Median - 0.35663485527038574
Average - 0.35663485527038574
Summary - 0.35663485527038574


In [52]:
# Benchmark: insert a batch of 100_000 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Data generation and cursor creation happen before the clock restart,
    # so they are excluded from the measurement.
    chunk = generate_progress_data(100_000)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()

Median - 0.6463849544525146
Average - 0.6463849544525146
Summary - 0.6463849544525146


In [53]:
# Benchmark: insert a batch of 1_000_000 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Data generation and cursor creation happen before the clock restart,
    # so they are excluded from the measurement.
    chunk = generate_progress_data(1_000_000)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()

Median - 6.648721933364868
Average - 6.648721933364868
Summary - 6.648721933364868


In [64]:
# Benchmark: insert a batch of 5_000_000 generated rows with one executemany call.

import vertica_python

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    # Data generation and cursor creation happen before the clock restart,
    # so they are excluded from the measurement.
    chunk = generate_progress_data(5_000_000)
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.executemany(
        "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
        chunk,
    )
    timer.checkpoint()

Median - 43.139312982559204
Average - 43.139312982559204
Summary - 43.139312982559204


## Тестирование чтения данных

На момент исследования количество записей в базе

```
with vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    cursor.execute("""
        SELECT count(*) FROM views;
    """)
    print(cursor.fetchall())
    
```

Output - 11314205
    

In [73]:
# Benchmark: time the count(*) query with LIMIT 1000.
# NOTE(review): count(*) without GROUP BY yields a single row, so LIMIT
# presumably does not bound how many rows are read - confirm that this query
# measures what is intended.

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    # Restart the clock after connection/cursor setup; only execute() is timed.
    timer.setup_start_time()
    cursor.execute("SELECT count(*) FROM views LIMIT 1000;")
    timer.checkpoint()

Median - 0.0627140998840332
Average - 0.0627140998840332
Summary - 0.0627140998840332


In [74]:
# Benchmark: time the count(*) query with LIMIT 10000.
# NOTE(review): count(*) without GROUP BY yields a single row, so LIMIT
# presumably does not bound how many rows are read - confirm that this query
# measures what is intended.

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    # Restart the clock after connection/cursor setup; only execute() is timed.
    timer.setup_start_time()
    cursor.execute("SELECT count(*) FROM views LIMIT 10000;")
    timer.checkpoint()

Median - 0.2651090621948242
Average - 0.2651090621948242
Summary - 0.2651090621948242


In [75]:
# Benchmark: time the count(*) query with LIMIT 100000.
# The query previously used LIMIT 10000 (copied from the previous cell), which
# did not match this cell's stated size of 100_000.
# NOTE(review): count(*) without GROUP BY yields a single row, so LIMIT
# presumably does not bound how many rows are read - confirm that this query
# measures what is intended.

with TimerCode() as timer:
    with vertica_python.connect(**connection_info) as connection:
        cursor = connection.cursor()
        # Restart the clock after connection/cursor setup; only execute() is timed.
        timer.setup_start_time()
        cursor.execute("""SELECT count(*) FROM views LIMIT 100000;""")
        timer.checkpoint()

Median - 0.12449908256530762
Average - 0.12449908256530762
Summary - 0.12449908256530762


In [78]:
# Benchmark: time the count(*) query with LIMIT 1000000.
# NOTE(review): count(*) without GROUP BY yields a single row, so LIMIT
# presumably does not bound how many rows are read - confirm that this query
# measures what is intended.

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    # Restart the clock after connection/cursor setup; only execute() is timed.
    timer.setup_start_time()
    cursor.execute("SELECT count(*) FROM views LIMIT 1000000;")
    timer.checkpoint()

Median - 0.08140301704406738
Average - 0.08140301704406738
Summary - 0.08140301704406738


In [79]:
# Benchmark: time the count(*) query with LIMIT 2000000.
# NOTE(review): count(*) without GROUP BY yields a single row, so LIMIT
# presumably does not bound how many rows are read - confirm that this query
# measures what is intended.

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    # Restart the clock after connection/cursor setup; only execute() is timed.
    timer.setup_start_time()
    cursor.execute("SELECT count(*) FROM views LIMIT 2000000;")
    timer.checkpoint()

Median - 0.09993600845336914
Average - 0.09993600845336914
Summary - 0.09993600845336914


In [80]:
# Benchmark: time the count(*) query with LIMIT 5000000.
# NOTE(review): count(*) without GROUP BY yields a single row, so LIMIT
# presumably does not bound how many rows are read - confirm that this query
# measures what is intended.

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    # Restart the clock after connection/cursor setup; only execute() is timed.
    timer.setup_start_time()
    cursor.execute("SELECT count(*) FROM views LIMIT 5000000;")
    timer.checkpoint()

Median - 0.09772968292236328
Average - 0.09772968292236328
Summary - 0.09772968292236328


In [81]:
# Benchmark: time the count(*) query with LIMIT 10000000.
# NOTE(review): count(*) without GROUP BY yields a single row, so LIMIT
# presumably does not bound how many rows are read - confirm that this query
# measures what is intended.

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    # Restart the clock after connection/cursor setup; only execute() is timed.
    timer.setup_start_time()
    cursor.execute("SELECT count(*) FROM views LIMIT 10000000;")
    timer.checkpoint()

Median - 0.1652541160583496
Average - 0.1652541160583496
Summary - 0.1652541160583496


## Тестируем чтение + вычисление длины строки

In [95]:
# Benchmark: read LENGTH(movie_id) for up to 1_000 rows.
# NOTE(review): only execute() is timed and no fetch call follows, so the
# transfer of result rows is presumably not part of the measurement - confirm.

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.execute("SELECT LENGTH(movie_id) FROM views LIMIT 1000;")
    timer.checkpoint()

Median - 0.037361860275268555
Average - 0.037361860275268555
Summary - 0.037361860275268555


In [85]:
# Benchmark: read LENGTH(movie_id) for up to 100_000 rows.
# NOTE(review): only execute() is timed and no fetch call follows, so the
# transfer of result rows is presumably not part of the measurement - confirm.

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.execute("SELECT LENGTH(movie_id) FROM views LIMIT 100000;")
    timer.checkpoint()

Median - 0.13355493545532227
Average - 0.13355493545532227
Summary - 0.13355493545532227


In [93]:
# Benchmark: read LENGTH(movie_id) for every row (no LIMIT).
# NOTE(review): only execute() is timed and no fetch call follows, so the
# transfer of result rows is presumably not part of the measurement - confirm.

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    timer.setup_start_time()
    cursor.execute("SELECT LENGTH(movie_id) FROM views;")
    timer.checkpoint()

Median - 0.18950200080871582
Average - 0.18950200080871582
Summary - 0.18950200080871582


## Аналитические задачи

In [131]:
# Find the maximum value of viewed_frame (movie_timestamp).

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    timer.setup_start_time()
    # `result` keeps whatever execute() returns; this cell does not read it.
    result = cursor.execute("SELECT max(viewed_frame) FROM views;")
    timer.checkpoint()

Median - 0.13187384605407715
Average - 0.13187384605407715
Summary - 0.13187384605407715


In [130]:
# Find the median value of viewed_frame (movie_timestamp).
# The median is requested as an analytic function (OVER() with an empty window).

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    timer.setup_start_time()
    # `result` keeps whatever execute() returns; this cell does not read it.
    result = cursor.execute("SELECT median(viewed_frame) OVER() FROM views;")
    timer.checkpoint()

Median - 1.0892741680145264
Average - 1.0892741680145264
Summary - 1.0892741680145264


In [132]:
# Find the sum of the viewed_frame column.
# The sum is requested as an analytic function (OVER() with an empty window).

with TimerCode() as timer, vertica_python.connect(**connection_info) as connection:
    cursor = connection.cursor()
    timer.setup_start_time()
    # `result` keeps whatever execute() returns; this cell does not read it.
    result = cursor.execute("SELECT Sum(viewed_frame) OVER() FROM views;")
    timer.checkpoint()

Median - 0.15050792694091797
Average - 0.15050792694091797
Summary - 0.15050792694091797


## Тестирование обработки данных под нагрузкой

In [None]:
import threading


def insert_data():
    """Insert generated rows into ``views`` in a loop.

    Performs 100 iterations; each one generates 5_000_000 rows and sends them
    with a single ``executemany`` call over one shared connection.
    """
    with vertica_python.connect(**connection_info) as connection:
        for _ in range(100):
            chunk = generate_progress_data(5_000_000)
            cursor = connection.cursor()
            cursor.executemany(
                "INSERT INTO views(user_id, movie_id, viewed_frame) VALUES (%s, %s, %s)",
                chunk,
            )
            print('Данные успено добавлены!')


def select_data(stop_event=None):
    """Time ``count(*)`` queries against ``views``.

    Args:
        stop_event: optional ``threading.Event``. When omitted, exactly one
            query is timed (the original behaviour). When given, the query is
            repeated, one duration per query, until the event is set; at
            least one query is always executed.
    """
    with TimerCode() as timer:
        with vertica_python.connect(**connection_info) as connection:
            cursor = connection.cursor()
            timer.setup_start_time()
            while True:
                cursor.execute("""SELECT count(*) FROM views LIMIT 10000;""")
                # checkpoint() also restarts the clock for the next query.
                timer.checkpoint()
                if stop_event is None or stop_event.is_set():
                    break


def run():
    """Measure select latency while the insert workload is running.

    Calling ``insert_data()`` and then ``select_data()`` one after the other
    would time the select only after all inserts had finished, i.e. without
    any load. The inserts therefore run in a background thread and the
    selects are repeated in the calling thread until the inserts are done.
    """
    inserts_done = threading.Event()

    def _insert_worker():
        try:
            insert_data()
        finally:
            # Always release the reader, even if the insert workload fails.
            inserts_done.set()

    # Each function opens its own connection, so no connection is shared
    # between the two threads.
    # NOTE(review): row generation is pure-Python work in the same process and
    # may compete with the reader thread for the interpreter - consider a
    # separate process if that distorts the timings.
    writer = threading.Thread(target=_insert_worker, name="views-insert", daemon=True)
    writer.start()
    select_data(inserts_done)
    writer.join()


run()