In [18]:
import os

import io

from tqdm import tqdm
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv

from sqlalchemy import create_engine, String, Integer, Float, DateTime, Boolean, Text
import pandas as pd

load_dotenv(override=True)

True

In [19]:
# Read in the environment variables
postgres_uri = os.environ["POSTGRESQL_URI_MP"]
blobstorage_uri = os.environ["AZURE_STORAGE_CONNECTION_STRING"]


In [7]:
blob_service_client = BlobServiceClient.from_connection_string(blobstorage_uri)

container_name = "uniswap-v3-pair-call-swap"
container_client = blob_service_client.get_container_client(container_name)

list(container_client.list_blobs())

[{'name': '01H7AZNSA0TM847GXK5Y4VEA37.csv', 'container': 'uniswap-v3-pair-call-swap', 'snapshot': None, 'version_id': None, 'is_current_version': None, 'blob_type': <BlobType.BLOCKBLOB: 'BlockBlob'>, 'metadata': {}, 'encrypted_metadata': None, 'last_modified': datetime.datetime(2023, 8, 9, 14, 51, 32, tzinfo=datetime.timezone.utc), 'etag': '0x8DB98E81F3ADB3D', 'size': 192706097, 'content_range': None, 'append_blob_committed_block_count': None, 'is_append_blob_sealed': None, 'page_blob_sequence_number': None, 'server_encrypted': True, 'copy': {'id': None, 'source': None, 'status': None, 'progress': None, 'completion_time': None, 'status_description': None, 'incremental_copy': None, 'destination_snapshot': None}, 'content_settings': {'content_type': 'text/csv', 'content_encoding': None, 'content_language': None, 'content_md5': None, 'content_disposition': None, 'cache_control': None}, 'lease': {'status': 'unlocked', 'state': 'available', 'duration': None}, 'blob_tier': None, 'rehydrate_p

In [16]:
df_all = None


for blob in tqdm(list(container_client.list_blobs())):
    assert blob and blob.name

    if blob.name.endswith('.csv'):

        # download the blob to memory
        blob_bytes = container_client.download_blob(blob).readall()

        # convert the blob to a dataframe
        df = pd.read_csv(io.BytesIO(blob_bytes), encoding='utf-8')

        # append the dataframe to the master dataframe
        if df_all is None:
            df_all = df
        else:
            df_all = pd.concat([df_all, df], ignore_index=True, axis=0)

assert df_all is not None

print(df_all.shape)

df_all.head()

100%|██████████| 9/9 [02:09<00:00, 14.41s/it]

(2761006, 13)





Unnamed: 0,contract_address,call_success,call_tx_hash,call_trace_address,call_block_time,call_block_number,amountSpecified,data,output_amount0,output_amount1,recipient,sqrtPriceLimitX96,zeroForOne
0,0xabd055069a6b04db7d1547f88dd01cf14ff09cfd,True,0x0d0c1e9262a6c2523a8c663da2d6832142d49684a189...,[0],2023-08-08 16:25:11.000 UTC,17871378.0,9059324407173505842245,0x00000000000000000000000000000000000000000000...,9059324407173505842245,-266149977598488744,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,4295128740,True
1,0xe566e99d65b17974fd9db02e25e24ea8020f7a0e,True,0xd573c5d225343dbfcd5eaed7a3adde15a41ed09c7d63...,[1 0 0 0],2023-08-08 16:25:11.000 UTC,17871378.0,1000000000000000000000000,0x000000000000000000000000cdf7028ceab81fa0c697...,-80249553,1000000000000000000000000,0x655edce464cc797526600a462a8154650eee4b77,1461446703485210103287273052203988822378723970341,False
2,0x60594a405d53811d3bc4766596efd80fd545a270,True,0xd395030f2d3649d9b893fc7778a2c340c48fa37772dc...,[1],2023-08-08 16:25:11.000 UTC,17871378.0,100000000000000000000,0x00000000000000000000000000000000000000000000...,100000000000000000000,-54220858765559472,0x5f1972c05f96f670e105ffb5b1b2a6736abab1e3,4295128740,True
3,0xa80838d2bb3d6ebaed1978fa23b38f91775d8378,True,0x562f63847b88fd85a1450e9a4d7fcd2eda9ce8e17f89...,[8],2023-08-08 16:25:11.000 UTC,17871378.0,68918171301562960,0x00000000000000000000000000000000000000000000...,-191535322390282127469,68918171301562960,0xe9e5be2be1a4dd1e792ed7a89b0c7edd75aec743,1461446703485210103287273052203988822378723970341,False
4,0x9b6aa9a920d67413c030cd8add4c66021a733910,True,0x1759c94915a453caef644c3914b9a996882a66bf8b33...,[2],2023-08-08 16:25:11.000 UTC,17871378.0,61613429920162908832035680,0x000000000000000000000000e4c9725db696982afb58...,-1994909894777407768,61613429920162908832035680,0x827179dd56d07a7eea32e3873493835da2866976,1461446703485210103287273052203988822378723970341,False


In [22]:
df_all = df_all.drop_duplicates(keep='first').drop(columns=['data'])

print(df_all.shape)

df_all.dtypes

(2698106, 12)


contract_address       object
call_success             bool
call_tx_hash           object
call_trace_address     object
call_block_time        object
call_block_number     float64
amountSpecified        object
output_amount0         object
output_amount1         object
recipient              object
sqrtPriceLimitX96      object
zeroForOne               bool
dtype: object

In [23]:
# Convert dtypes: call_block_time -> datetime w/ timezone, call_block_number -> int64
df_all = df_all.astype({'call_block_time': 'datetime64[ns, UTC]', 'call_block_number': 'int64'})

df_all.dtypes

contract_address                   object
call_success                         bool
call_tx_hash                       object
call_trace_address                 object
call_block_time       datetime64[ns, UTC]
call_block_number                   int64
amountSpecified                    object
output_amount0                     object
output_amount1                     object
recipient                          object
sqrtPriceLimitX96                  object
zeroForOne                           bool
dtype: object

In [35]:
# Check if any column contains duplicate values
df_counts = df_all.groupby("call_tx_hash").count().sort_values(by="contract_address", ascending=False)

single_swap_txs = df_counts[df_counts["contract_address"] == 1].index.values

single_swap_txs.shape

(1968632,)

In [37]:
df_valid = df_all[df_all.call_tx_hash.isin(single_swap_txs)]

df_valid.shape

(1968632, 12)

In [38]:
# Create a connection to PostgreSQL
engine = create_engine(postgres_uri)

# Create the table
df_valid.to_sql("swap_limit_price", engine, if_exists='replace', index=False, chunksize=1000,)


1968632