# In [None]:
import sqlite3
import time
import matplotlib.pyplot as plt
import numpy as np


def measure_query_time(db_path, query):
    """Return the time, in milliseconds, taken to run ``query``.

    A fresh connection to the SQLite database at ``db_path`` is opened,
    ``query`` is executed and all of its result rows are consumed, and the
    connection is closed again. Only the execution and row consumption are
    timed; connecting and closing are not.

    Args:
        db_path: Path of the SQLite database file.
        query: SQL statement to time.

    Returns:
        Elapsed time in milliseconds as a float.

    Raises:
        sqlite3.Error: If the statement cannot be executed. The connection
            is closed before the exception propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # perf_counter() is monotonic and high resolution, so short queries
        # are not distorted by wall-clock adjustments.
        start_time = time.perf_counter()
        cursor.execute(query)
        # Drain the cursor without keeping the rows, so that the cost of
        # producing every result row falls inside the timed region.
        for _ in cursor:
            pass
        end_time = time.perf_counter()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()
    return (end_time - start_time) * 1000

def create_index(db_path, index_sql):
    """Execute ``index_sql`` and return how long it took, in milliseconds.

    A fresh connection to the SQLite database at ``db_path`` is opened, the
    statement is executed and committed, and the connection is closed. The
    timed region covers the statement and the commit; connecting and closing
    are not timed.

    Args:
        db_path: Path of the SQLite database file.
        index_sql: SQL statement to run (a ``CREATE INDEX`` statement in
            this script).

    Returns:
        Elapsed time in milliseconds as a float.

    Raises:
        sqlite3.Error: If the statement cannot be executed. The connection
            is closed before the exception propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # perf_counter() is monotonic and high resolution, unlike time.time().
        start_time = time.perf_counter()
        cursor.execute(index_sql)
        # The commit is part of making the index durable, so it is timed too.
        conn.commit()
        end_time = time.perf_counter()
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()
    return (end_time - start_time) * 1000

# Sizes of the databases to benchmark; each one selects a file named
# refsnp_chrX/refsnp_chrX-<size>.sqlite3 and is later plotted as the
# number of rows.
database_sizes = [10000, 100000, 1000000, 5000000, 10000000]
results = {}

query = "SELECT count(*) FROM mutations WHERE pos BETWEEN 3000000 and 6000000;"
index_sql = "CREATE INDEX IF NOT EXISTS idx_mutations_pos ON mutations (pos);"
# The index created below stays in the database file. Without removing it
# first, re-running this script would take the "before index" measurement
# with the index already in place.
drop_index_sql = "DROP INDEX IF EXISTS idx_mutations_pos;"

for size in database_sizes:
    db_file = f"refsnp_chrX/refsnp_chrX-{size}.sqlite3"
    print(f"Processing: {db_file}")

    # Start from an un-indexed table so the baseline measurement is valid
    # on every run, not only the first one.
    conn = sqlite3.connect(db_file)
    try:
        conn.execute(drop_index_sql)
        conn.commit()
    finally:
        conn.close()

    # Query time before the index exists.
    time_before_index = measure_query_time(db_file, query)

    # Time needed to build the index.
    time_create_index = create_index(db_file, index_sql)

    # Query time once the index exists.
    time_after_index = measure_query_time(db_file, query)

    results[size] = {
        "before_index_time": time_before_index,
        "create_index_time": time_create_index,
        "after_index_time": time_after_index,
    }

# Draw the graph: query time against database size, before and after indexing.
sizes = list(results)
before_index_times, after_index_times = (
    [results[row_count][field] for row_count in sizes]
    for field in ("before_index_time", "after_index_time")
)

plt.figure(figsize=(10, 6))

# One line per measurement series, sharing the same x values and marker.
for series, legend_label in (
    (before_index_times, 'Before Index (O(n))'),
    (after_index_times, 'After Index (O(log n))'),
):
    plt.plot(sizes, series, marker='o', label=legend_label)

# Both axes are logarithmic.
plt.xscale('log')
plt.yscale('log')

plt.title('Query Execution Time vs. Data Size')
plt.xlabel('Number of Rows')
plt.ylabel('Execution Time (ms)')

plt.grid(True)
plt.legend()
plt.show()


# Show the SQL statement that was used to create the index, under a heading.
print("インデックス作成に使用したSQLスクリプト", index_sql, sep="\n")