In [1]:
from web3 import Web3
from tqdm import tqdm, trange
import pandas as pd

from datetime import datetime

import matplotlib.pyplot as plt

# Import python dotenv
from dotenv import load_dotenv

import os

# Import mongodb client
from pymongo import MongoClient, UpdateOne, DESCENDING, ASCENDING, InsertOne, DeleteOne

In [None]:

load_dotenv()
# Connect to local node
w3 = Web3(Web3.HTTPProvider('http://localhost:8545'))


# Connect to mongodb
client = MongoClient(os.getenv("MONGODB_CONNECTION_STRING"))

# Get database
mempool = client.transactions.mempool

mempool.estimated_document_count()

In [2]:
# Get the document with the highest value for 'your_field'
document = mempool.find_one(sort=[('ts', DESCENDING)])

# Extract the highest value
max_value = document and document['ts']

print(f"The highest value for 'ts' is: {max_value}")

The highest value for 'ts' is: 2023-06-22 19:42:37.576000


In [3]:
cols_of_interest = [
    "_id",
    "ts",
    "timestamp",
    "gasPrice",
    "formattedDate",    
    "hash",
]

In [4]:
# Get transactions from after a certain time
cursor = mempool.find(sort=[('ts', DESCENDING)], limit=10_000)

df = pd.DataFrame(list(cursor))[cols_of_interest]
df

Unnamed: 0,_id,ts,timestamp,gasPrice,formattedDate,hash
0,0x0b8f5ef16b2e250ede5c051cd1cd1b99933da9a72da8...,2023-06-22 19:42:37.576,1.687463e+12,17.768019,,0x0b8f5ef16b2e250ede5c051cd1cd1b99933da9a72da8...
1,0x6111fc676e94ee03a598c10d45a6b68384c08eb2ed50...,2023-06-22 19:42:35.773,1.687463e+12,16.651814,,0x6111fc676e94ee03a598c10d45a6b68384c08eb2ed50...
2,0x01c46ea15c651d26775852440c580a997e03f1baa6f3...,2023-06-22 19:42:35.311,1.687463e+12,16.651814,,0x01c46ea15c651d26775852440c580a997e03f1baa6f3...
3,0x3c3e8c80a243c618b33e8c76e494eb9869257738dfe3...,2023-06-22 19:42:33.123,1.687463e+12,20.369237,,0x3c3e8c80a243c618b33e8c76e494eb9869257738dfe3...
4,0xd1fc8a9278a37402a882b651dc0541d5fb1a439ef91f...,2023-06-22 19:42:29.991,1.687463e+12,17.768019,,0xd1fc8a9278a37402a882b651dc0541d5fb1a439ef91f...
...,...,...,...,...,...,...
9995,0xbc823c8aca7927010469eb755e7acb2bfd3181352c26...,2023-06-22 17:38:26.614,1.687456e+12,27.832564,,0xbc823c8aca7927010469eb755e7acb2bfd3181352c26...
9996,0x5361ccb22aecec53718a9b67b68b0503b3e4e7f4ab38...,2023-06-22 17:38:26.523,1.687456e+12,30.000000,,0x5361ccb22aecec53718a9b67b68b0503b3e4e7f4ab38...
9997,0x6c0ad75b2c9c55d2f1ef07249faa3384b0e014db6c1b...,2023-06-22 17:38:25.945,1.687456e+12,1.000000,2023-06-20T00:48:49.650Z,0x6c0ad75b2c9c55d2f1ef07249faa3384b0e014db6c1b...
9998,0xa27841ea8033e624ff0b294a0e99fb54a882eda55e80...,2023-06-22 17:38:25.934,1.687456e+12,1.100000,,0xa27841ea8033e624ff0b294a0e99fb54a882eda55e80...


In [5]:
df[~df.formattedDate.isna()].iloc[0]._id

'0x7956311ae4300cae4c32aafe13de940b3032bbea0c6003e94aaf822bd01451a3'

## Update documents without `ts` field

In [6]:
# find documents where 'fieldName' does not exist
results = mempool.find({'ts': {'$exists': False}})

# iterate over the sorted results
docs = list(tqdm(results))

130530it [00:26, 4981.01it/s]


In [8]:
docs[0]

{'_id': ObjectId('648ce10f18e0fbc3c3482f75'),
 'hash': '0x303936c1821da2e23c08fe2171947028ac394fe6d7eb5338d2643365a867b348',
 'type': 2,
 'accessList': [],
 'blockHash': None,
 'blockNumber': None,
 'transactionIndex': None,
 'confirmations': 0,
 'from': '0x4899951FFa90d493c58DC1db8760f15c8Ed2873A',
 'gasPrice': 30.998999649,
 'maxPriorityFeePerGas': {'_hex': '0xb2d05e00', '_isBigNumber': True},
 'maxFeePerGas': {'_hex': '0x0737af3261', '_isBigNumber': True},
 'gasLimit': 361022,
 'to': '0x7a250d5630B4cF539739dF2C5dAcb4c659F2488D',
 'value': {'_hex': '0x7c585087238004', '_isBigNumber': True},
 'nonce': 5,
 'data': '0xb6f9de95000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800000000000000000000000004899951ffa90d493c58dc1db8760f15c8ed2873a00000000000000000000000000000000000000000000000000000000648ce1850000000000000000000000000000000000000000000000000000000000000002000000000000000000000000c02aaa39b223fe8d0a0e5c4

From the above, we can see that we have a `timestamp` field that gives us milliseconds since the Epoch, and a `formattedDate`---both of which are pretty useless. Therefore, I want to convert the `timestamp` into a native MongoDB `Date` object with time for UTC, and then drop `formattedDate`.

In [16]:
# Iterate over all docs in docs and add the timestamp
for doc in tqdm(docs):
    timestamp = doc['timestamp']

    # Create datetime object from timestamp (milliseconds since epoch)
    dt = datetime.fromtimestamp(timestamp / 1000)

    # Remove formatted date
    del doc['formattedDate']

    # Add timestamp
    doc['ts'] = dt


100%|██████████| 130530/130530 [00:00<00:00, 865786.20it/s]


In [17]:
docs[0]

{'_id': ObjectId('648ce10f18e0fbc3c3482f75'),
 'hash': '0x303936c1821da2e23c08fe2171947028ac394fe6d7eb5338d2643365a867b348',
 'type': 2,
 'accessList': [],
 'blockHash': None,
 'blockNumber': None,
 'transactionIndex': None,
 'confirmations': 0,
 'from': '0x4899951FFa90d493c58DC1db8760f15c8Ed2873A',
 'gasPrice': 30.998999649,
 'maxPriorityFeePerGas': {'_hex': '0xb2d05e00', '_isBigNumber': True},
 'maxFeePerGas': {'_hex': '0x0737af3261', '_isBigNumber': True},
 'gasLimit': 361022,
 'to': '0x7a250d5630B4cF539739dF2C5dAcb4c659F2488D',
 'value': {'_hex': '0x7c585087238004', '_isBigNumber': True},
 'nonce': 5,
 'data': '0xb6f9de95000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800000000000000000000000004899951ffa90d493c58dc1db8760f15c8ed2873a00000000000000000000000000000000000000000000000000000000648ce1850000000000000000000000000000000000000000000000000000000000000002000000000000000000000000c02aaa39b223fe8d0a0e5c4

### Update documents without `ts` field

In [22]:
# Before:
mempool.find_one({'_id': docs[0]['_id']})

{'_id': ObjectId('648ce10f18e0fbc3c3482f75'),
 'hash': '0x303936c1821da2e23c08fe2171947028ac394fe6d7eb5338d2643365a867b348',
 'type': 2,
 'accessList': [],
 'blockHash': None,
 'blockNumber': None,
 'transactionIndex': None,
 'confirmations': 0,
 'from': '0x4899951FFa90d493c58DC1db8760f15c8Ed2873A',
 'gasPrice': 30.998999649,
 'maxPriorityFeePerGas': {'_hex': '0xb2d05e00', '_isBigNumber': True},
 'maxFeePerGas': {'_hex': '0x0737af3261', '_isBigNumber': True},
 'gasLimit': 361022,
 'to': '0x7a250d5630B4cF539739dF2C5dAcb4c659F2488D',
 'value': {'_hex': '0x7c585087238004', '_isBigNumber': True},
 'nonce': 5,
 'data': '0xb6f9de95000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800000000000000000000000004899951ffa90d493c58dc1db8760f15c8ed2873a00000000000000000000000000000000000000000000000000000000648ce1850000000000000000000000000000000000000000000000000000000000000002000000000000000000000000c02aaa39b223fe8d0a0e5c4

In [34]:
# 'docs' is your list of documents
operations = [UpdateOne({'_id': doc['_id']}, {'$set': doc}) for doc in docs]

# Bulk write in batches of 1000
for i in trange(99_000, len(operations), 1000):
    result = mempool.bulk_write(operations[i:i+1000], ordered=False)

    print(f"Updated {result.modified_count} documents")

  3%|▎         | 1/32 [00:02<01:04,  2.09s/it]

Updated 0 documents


  6%|▋         | 2/32 [00:14<03:57,  7.92s/it]

Updated 255 documents


  9%|▉         | 3/32 [00:56<11:31, 23.85s/it]

Updated 1000 documents


 12%|█▎        | 4/32 [01:40<14:43, 31.57s/it]

Updated 1000 documents


 16%|█▌        | 5/32 [02:21<15:46, 35.04s/it]

Updated 1000 documents


 19%|█▉        | 6/32 [03:05<16:30, 38.09s/it]

Updated 1000 documents


 22%|██▏       | 7/32 [03:46<16:13, 38.94s/it]

Updated 1000 documents


 25%|██▌       | 8/32 [04:29<16:07, 40.33s/it]

Updated 1000 documents


 28%|██▊       | 9/32 [05:09<15:22, 40.10s/it]

Updated 1000 documents


 31%|███▏      | 10/32 [05:52<15:01, 40.99s/it]

Updated 1000 documents


 34%|███▍      | 11/32 [06:32<14:16, 40.77s/it]

Updated 1000 documents


 38%|███▊      | 12/32 [07:13<13:35, 40.78s/it]

Updated 1000 documents


 41%|████      | 13/32 [07:54<12:56, 40.87s/it]

Updated 1000 documents


 44%|████▍     | 14/32 [08:35<12:20, 41.11s/it]

Updated 1000 documents


 47%|████▋     | 15/32 [09:22<12:04, 42.64s/it]

Updated 1000 documents


 50%|█████     | 16/32 [10:02<11:10, 41.90s/it]

Updated 1000 documents


 53%|█████▎    | 17/32 [10:44<10:30, 42.04s/it]

Updated 1000 documents


 56%|█████▋    | 18/32 [11:24<09:41, 41.53s/it]

Updated 1000 documents


 59%|█████▉    | 19/32 [12:06<09:00, 41.58s/it]

Updated 1000 documents


 62%|██████▎   | 20/32 [12:47<08:16, 41.34s/it]

Updated 1000 documents


 66%|██████▌   | 21/32 [13:28<07:34, 41.29s/it]

Updated 1000 documents


 69%|██████▉   | 22/32 [14:10<06:54, 41.43s/it]

Updated 1000 documents


 72%|███████▏  | 23/32 [14:49<06:06, 40.77s/it]

Updated 1000 documents


 75%|███████▌  | 24/32 [15:34<05:35, 41.89s/it]

Updated 1000 documents


 78%|███████▊  | 25/32 [16:14<04:49, 41.35s/it]

Updated 1000 documents


 81%|████████▏ | 26/32 [16:58<04:12, 42.09s/it]

Updated 1000 documents


 84%|████████▍ | 27/32 [17:41<03:32, 42.51s/it]

Updated 1000 documents


 88%|████████▊ | 28/32 [18:22<02:48, 42.17s/it]

Updated 1000 documents


 91%|█████████ | 29/32 [19:05<02:06, 42.18s/it]

Updated 1000 documents


 94%|█████████▍| 30/32 [19:45<01:23, 41.58s/it]

Updated 1000 documents


 97%|█████████▋| 31/32 [20:26<00:41, 41.60s/it]

Updated 1000 documents


100%|██████████| 32/32 [20:48<00:00, 39.00s/it]

Updated 530 documents





In [38]:
# After:
mempool.find_one({'_id': docs[0]['_id']})

{'_id': ObjectId('648ce10f18e0fbc3c3482f75'),
 'hash': '0x303936c1821da2e23c08fe2171947028ac394fe6d7eb5338d2643365a867b348',
 'type': 2,
 'accessList': [],
 'blockHash': None,
 'blockNumber': None,
 'transactionIndex': None,
 'confirmations': 0,
 'from': '0x4899951FFa90d493c58DC1db8760f15c8Ed2873A',
 'gasPrice': 30.998999649,
 'maxPriorityFeePerGas': {'_hex': '0xb2d05e00', '_isBigNumber': True},
 'maxFeePerGas': {'_hex': '0x0737af3261', '_isBigNumber': True},
 'gasLimit': 361022,
 'to': '0x7a250d5630B4cF539739dF2C5dAcb4c659F2488D',
 'value': {'_hex': '0x7c585087238004', '_isBigNumber': True},
 'nonce': 5,
 'data': '0xb6f9de95000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800000000000000000000000004899951ffa90d493c58dc1db8760f15c8ed2873a00000000000000000000000000000000000000000000000000000000648ce1850000000000000000000000000000000000000000000000000000000000000002000000000000000000000000c02aaa39b223fe8d0a0e5c4

## Update so that all transactions has the transaction hash as `_id`

In [39]:
# Find documents where '_id' and 'hash' are not equal
results = mempool.find({'$expr': {'$ne': ['$_id', '$hash']}})

docs = list(tqdm(results))
len(docs)

74623it [00:39, 1894.90it/s]


74623

In [None]:
# Initialize a list to hold your operations
operations = []

for doc in docs:    
    # Get the old _id
    old_id = doc["_id"]

    # Create a copy of the old document, but replace the _id with the new_id
    new_doc = doc.copy()
    new_doc["_id"] = doc["hash"]

    # Add an InsertOne operation for the new document and a DeleteOne operation for the old document
    operations.append(InsertOne(new_doc))
    operations.append(DeleteOne({"_id": old_id}))


# Perform the operations in batches
it = trange(0, len(operations), 1000)
total_modified = 0
for i in it:
    result = mempool.bulk_write(operations[i:i+1000], ordered=False)
    total_modified += result.modified_count
    it.set_description(f"Updated {result.modified_count} documents, {total_modified} total")

print(f"Updated {total_modified} documents")