# Collect data from the BigQuery database

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

current_path = sys.path[0]
sys.path.append(current_path[:current_path.find('defi-measurement')] + "liquidity-distribution-history")

In [3]:
import os
from pool_state import v3Pool
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from datetime import datetime

import json

import pandas as pd
from prisma import Prisma

import psycopg2
import psycopg2.extras
import pandas as pd


from dotenv import load_dotenv


load_dotenv(override=True)

True

In [4]:
postgres_uri_us = os.getenv("POSTGRESQL_URI_US")

assert postgres_uri_us is not None, "Connection string to Postgres is not set"

In [5]:
pool_symbols = json.load(open("../addresses/pool_tokens.json", "r"))

## Get data for the 10 biggest pools by TVL

In [26]:
pool_addresses = [
    "0x88e6a0c2ddd26feeb64f039a2c41296fcb3f5640",  # USDC-ETH  0.05%
    "0xcbcdf9626bc03e24f779434178a73a0b4bad62ed",  # WBTC-ETH  0.30%
    "0x5777d92f208679db4b9778590fa3cab3ac9e2168",  # DAI-USDC  0.01%
    "0x4585fe77225b41b697c938b018e2ac67ac5a20c0",  # WBTC-ETC  0.05%
    "0xc63b0708e2f7e69cb8a1df0e1389a98c35a76d52",  # FRAX-USDC 0.05%
    "0x8ad599c3a0ff1de082011efddc58f1908eb6e6d8",  # USDC-ETH  0.30%
    "0x11b815efb8f581194ae79006d24e0d814b7697f6",  # ETH-USDT  0.05%
    "0x3416cf6c708da44db2624d63ea0aaef7113527c6",  # USDC-USDT 0.01%
    "0x7379e81228514a1d2a6cf7559203998e20598346",  # ETH/sETH2 0.30%
    "0x6c6bc977e13df9b0de53b251522280bb72383700",  # DAI-USDC  0.05%
]

In [27]:
# it = tqdm(pool_addresses[:1])
# for pool_address in it:
#     it.set_description(pool_address)
#     pool = v3Pool(pool_address, initialize=True, update=True, connStr=postgres_uri, chunk_length=5e3)

## New approach---Just import the whole database

In [6]:
def seed_db(
    bq_table: str,
    pg_table: str,
    start_index: int,
    max_results: int,
    total_rows: int,
) -> None:
    # BigQuery client
    proj_id = "mimetic-design-338620"
    bq_dataset = 'uniswap'

    # Postgres connection
    conn = psycopg2.connect(postgres_uri_us)
    cur = conn.cursor()
    
    it = tqdm(total=total_rows, initial=start_index)

    while True:
        it.set_description(f"Inserting row {start_index:_} - {start_index + max_results:_}")
        rows = pd.read_gbq(
            f"SELECT * FROM `{bq_dataset}.{bq_table}` LIMIT {max_results} OFFSET {start_index}",
            project_id=proj_id,
            dialect='standard',
            # progress_bar_type='tqdm'
        ).to_dict('records')

        if not rows:
            break

        # Insert the rows into Postgres only if row is not already present
        psycopg2.extras.execute_values(
            cur,
            f"""
            INSERT INTO {pg_table} VALUES %s
            """,
            # ON CONFLICT (block_number, transaction_index, log_index) DO NOTHING
            [tuple(x.values()) for x in rows],
            template=None,
            page_size=100
        )
        conn.commit()

        # Update the start index for the next batch of rows
        start_index += max_results
        it.update(max_results)

    # Close the Postgres connection
    cur.close()
    conn.close()

In [8]:
# Define your BigQuery table and Postgres table
bq_table = 'swap'
pg_table = 'swaps'

# Fetch the data from BigQuery in chunks
start_index = 21_690_000
max_results = 100_000  # adjust this value based on your system's memory
total_rows = 33_447_421

seed_db(bq_table, pg_table, start_index, max_results, total_rows)

Inserting row 33_490_000 - 33_590_000: : 33490000it [4:23:14, 819.83it/s]                              

In [6]:
# Define your BigQuery table and Postgres table
bq_table = 'MintBurnV3-labeled'
pg_table = 'mb'

# Fetch the data from BigQuery in chunks
start_index = 3_000
max_results = 100_000  # adjust this value based on your system's memory
total_rows = 1_356_519

seed_db(bq_table, pg_table, start_index, max_results, total_rows)

Inserting row 1_403_000 - 1_503_000: : 1403000it [22:08, 1053.65it/s]                           


In [7]:
# Define your BigQuery table and Postgres table
bq_table = 'V3Factory_PoolCreated'
pg_table = 'factory'

# Fetch the data from BigQuery in chunks
start_index = 0
max_results = 1_000  # adjust this value based on your system's memory
total_rows = 13_397

seed_db(bq_table, pg_table, start_index, max_results, total_rows)

Inserting row 14_000 - 15_000: : 14000it [00:30, 465.99it/s]                         


In [10]:
# Define your BigQuery table and Postgres table
bq_table = 'ethereum_uniswap_v3_pool_evt_initialize'
pg_table = 'initialize'

# Fetch the data from BigQuery in chunks
start_index = 0
max_results = 1_000  # adjust this value based on your system's memory
total_rows = 13_360

seed_db(bq_table, pg_table, start_index, max_results, total_rows)

Inserting row 14_000 - 15_000: : 14000it [00:27, 507.37it/s]                         


## Remove the duplicate rows in the `swaps` table

In [8]:
import psycopg2

def remove_duplicates(pg_table, columns):
    # establish a connection
    conn = psycopg2.connect(postgres_uri_us)
    conn.autocommit = False  # start a new transaction

    # create a cursor
    cur = conn.cursor()

    batch_size = 100_000  # number of rows to delete in each batch
    row_count = batch_size  # initial value to enter the loop

    col_str = ", ".join(columns)  # columns as a string

    # create the index if it doesn't exist
    cur.execute(f"""
        CREATE INDEX IF NOT EXISTS idx_swaps_columns ON {pg_table} ({col_str});
    """)
    conn.commit()  # commit the index creation

    # loop until there's no more duplicates
    while row_count == batch_size:
        # find the duplicates
        cur.execute(f"""
            DELETE FROM {pg_table}
            WHERE ctid IN (
                SELECT ctid
                FROM (
                    SELECT ctid,
                        ROW_NUMBER() OVER(PARTITION BY {col_str} ORDER BY ctid) AS rn
                    FROM {pg_table}
                ) t
                WHERE t.rn > 1
                LIMIT %s
            )
        """, (batch_size,))
        
        row_count = cur.rowcount  # get the number of deleted rows

        # commit the deletion
        conn.commit()

        # print the progress
        print(f"Deleted {row_count:_} rows in this iteration")

    # close the cursor and the connection
    cur.close()
    conn.close()


remove_duplicates('swaps', ['block_number', 'transaction_index', 'log_index'])

In [9]:
remove_duplicates('mb', ['block_number', 'transaction_index', 'log_index'])

Deleted 100_000 rows in this iteration
Deleted 100_000 rows in this iteration
Deleted 100_000 rows in this iteration
Deleted 100_000 rows in this iteration
Deleted 100_000 rows in this iteration
Deleted 100_000 rows in this iteration
Deleted 100_000 rows in this iteration
Deleted 100_000 rows in this iteration
Deleted 100_000 rows in this iteration
Deleted 23_317 rows in this iteration


In [11]:
remove_duplicates('factory', ['pool'])

Deleted 13_397 rows in this iteration
