In [2]:
import os

from datetime import datetime

from pymongo import MongoClient
import psycopg2
from dotenv import load_dotenv

from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

# Load key/value pairs from a .env file into os.environ. override=True makes
# values from the .env file take precedence over variables already set in the
# process environment.
load_dotenv(override=True)

True

In [3]:
# Both connection strings are mandatory: a missing variable raises KeyError
# right here instead of failing later inside a database driver.
mongo_uri, postgres_uri = (
    os.environ[variable]
    for variable in ("MONGODB_CONNECTION_STRING", "POSTGRESQL_URI_MP")
)

In [4]:
# MongoDB handles: client -> 'transactions' database -> 'mempool-hashes' collection
mongo_client = MongoClient(host=mongo_uri)
mongo_db = mongo_client.get_database('transactions')
mongo_collection = mongo_db.get_collection('mempool-hashes')

# PostgreSQL handles: a connection and a cursor opened on it
pg_conn = psycopg2.connect(dsn=postgres_uri)
pg_cursor = pg_conn.cursor()

In [4]:
def process_chunk(skip, limit):
    """Copy one slice of the MongoDB 'mempool-hashes' collection into PostgreSQL.

    The function opens its own MongoDB and PostgreSQL connections (rather than
    using the notebook-level ones) so that it can be run in a separate worker
    process. Every entry of a document's ``hashes`` list becomes one row of
    ``mempool_transactions``; rows whose hash already exists are skipped by
    ``ON CONFLICT (hash) DO NOTHING``. Each document is committed on its own.

    Args:
        skip: Number of documents (in ``_id`` order) to skip before the slice.
        limit: Maximum number of documents in the slice. It is passed straight
            to the MongoDB cursor, where 0 means "no limit".

    Returns:
        The sum of the PostgreSQL cursor's ``rowcount`` after each batch insert.

    Raises:
        KeyError: If a document has no ``hashes`` field, or an entry lacks
            ``hash`` / ``timestamp``.
        Any exception raised by pymongo or psycopg2; connections are closed
        before the exception propagates.
    """
    # Loop-invariant, so build it once instead of once per document.
    insert_query = """
        INSERT INTO mempool_transactions (hash, first_seen) VALUES (%s, %s)
        ON CONFLICT (hash) DO NOTHING
    """

    # Running total of rows reported as inserted.
    total_inserted = 0

    # MongoClient closes itself when the with-block exits, even on error.
    with MongoClient(mongo_uri) as mongo_client:
        mongo_collection = mongo_client['transactions']['mempool-hashes']

        pg_conn = psycopg2.connect(postgres_uri)
        try:
            # A psycopg2 cursor used as a context manager is closed on exit.
            with pg_conn.cursor() as pg_cursor:
                # Sort on _id so skip/limit slices are stable: without an
                # explicit sort the order is unspecified, and slices taken by
                # different workers could overlap or miss documents.
                documents = mongo_collection.find().sort("_id", 1).skip(skip).limit(limit)
                it = tqdm(documents, total=limit, desc="Inserting into PostgreSQL")
                for document in it:
                    # 'timestamp' is divided by 1000 before conversion
                    # (presumably epoch milliseconds -- confirm against the
                    # producer). The result is a naive UTC datetime.
                    # NOTE(review): datetime.utcfromtimestamp is deprecated
                    # since Python 3.12; kept so the stored values stay naive.
                    batch_records = [
                        (entry['hash'], datetime.utcfromtimestamp(entry['timestamp'] / 1000))
                        for entry in document['hashes']
                    ]
                    pg_cursor.executemany(insert_query, batch_records)
                    pg_conn.commit()

                    total_inserted += pg_cursor.rowcount
                    # Refresh the bar after counting so it shows the current total.
                    it.set_postfix(total_inserted=total_inserted)
        finally:
            # `with pg_conn:` would only end the transaction, not close the
            # connection, so close it explicitly.
            pg_conn.close()

    return total_inserted


In [5]:
# Get the total number of documents.
# NOTE(review): estimated_document_count() reads collection metadata and may be
# slightly off; use mongo_collection.count_documents({}) if an exact partition
# is required.
total_documents = mongo_collection.estimated_document_count()

# Number of processes
num_processes = 4

# Chunk size: ceiling division, so the slices cover every document. Floor
# division would silently drop the last (total_documents % num_processes)
# documents.
if total_documents > 0:
    chunk_size = (total_documents + num_processes - 1) // num_processes
    skip_values = list(range(0, total_documents, chunk_size))
else:
    chunk_size = 0
    skip_values = []

# The last slice is clipped to the documents that remain. No slice ever gets
# limit 0, which MongoDB would interpret as "no limit" and make a worker scan
# the whole collection.
limit_values = [min(chunk_size, total_documents - skip) for skip in skip_values]

# Use process_map to call process_chunk in parallel and display progress bars
if skip_values:
    results = process_map(process_chunk, skip_values, limit_values, max_workers=num_processes)
else:
    results = []

# Sum the results to get the total number of records inserted
total_inserted = sum(results)
print(f"Total records inserted: {total_inserted}")

  0%|          | 0/4 [00:00<?, ?it/s]

Inserting into PostgreSQL:   0%|          | 17/33790 [00:21<11:35:35,  1.24s/it, total_inserted=0]