In [254]:
import os

import numpy as np
import pandas as pd
import networkx as nx

import arrow
from tqdm import tqdm

from dotenv import load_dotenv
from coinbase.wallet.client import Client

# Read the local .env file so the Coinbase credentials are available via os.environ.
load_dotenv('.env')
# Coinbase client used by get_usd_value as a fallback price source.
# Indexing os.environ raises KeyError here if either variable is not set.
client = Client(os.environ['COINBASE_KEY'], os.environ['COINBASE_SECRET'])

### Config

In [255]:
# When True, build_eth_to_usd_lookup() is run below and (re)writes ./memory/eth_to_usd.npy.
SETUP_ETH_TO_USD = True  # Should be True for first run, thereafter can be set to False
# When set, create_timed_data only processes the first TEST_LIMIT rows of the base data.
TEST_LIMIT = None # Set to None for production run

# Project names to process; each name is used to build the data and memory file paths.
# NOTE(review): every entry is commented out, so the loops over `projects` do nothing
# on a fresh run, while the saved outputs show four projects being processed.
# Uncomment the projects you want before running.
projects = [
#     'bayc',
#     'coolcats',
#     'cryptoadz',
#     'cyberkongz',
#     'hashmasks',
#     'mayc',
#     'meebits',
#     'mekaverse',
#     'svs'
]

### Store base data as a dataframe

In [256]:
def create_base_data(project):
    """Load the collated transfer CSV for ``project`` as a DataFrame.

    The file is read from ``./data/collated/<project>.csv``. Its first line is
    skipped (presumably the header row) and the columns are renamed to the
    fixed schema below. Both address columns are stripped of surrounding
    whitespace.

    Parameters
    ----------
    project : str
        Project name; used to build the CSV path.

    Returns
    -------
    pandas.DataFrame
        One row per line of the CSV, with cleaned address columns.
    """
    csv_path = './data/collated/' + project + '.csv'  # Change if needed

    transfers = pd.read_csv(
        csv_path,
        delimiter=',',
        skiprows=1,
        names=[
            "row",
            "tx_hash",
            "token_address",
            "from_address",
            "to_address",
            "token_id",
            "blk_number",
            "blk_timestamp",
            "eth_value",
        ],
    )

    # Normalise both address columns the same way.
    for address_column in ("from_address", "to_address"):
        transfers[address_column] = transfers[address_column].apply(lambda address: address.strip())

    return transfers

### Transaction data

In [257]:
def get_transaction_data(project):
    """Read the balance CSV for ``project`` into a DataFrame.

    The file is read from ``./data/balances/<project>.csv`` with pandas'
    default CSV settings.
    """
    balances_csv = "./data/balances/{}.csv".format(project)
    balances = pd.read_csv(balances_csv)
    return balances

# (block, account) pairs for which no balance row was found. Appended to by
# lookup_account_value so the misses can be inspected after a run.
errors = []

def lookup_account_value(df, block, account):
    """Return the ``eth_value`` recorded for ``account`` at ``block``.

    Parameters
    ----------
    df : pandas.DataFrame
        Balance table with ``block``, ``address`` and ``eth_value`` columns.
    block
        Block number to match against the ``block`` column.
    account : str
        Address to match against the ``address`` column.

    Returns
    -------
    The ``eth_value`` of the first matching row. ``0`` is returned for the
    zero address (without touching ``df``) and when no row matches; a miss is
    also recorded in the module-level ``errors`` list.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the expected columns. This used to be swallowed
        and reported as a miss, which hid schema problems as silent zeros.
    """
    if account == '0x0000000000000000000000000000000000000000':
        return 0

    # No per-call df.infer_objects(): it copied the whole table on every
    # lookup and is not needed for the equality filters below.
    matches = df.loc[(df['block'] == block) & (df['address'] == account), 'eth_value']

    if matches.empty:
        errors.append((block, account))
        return 0

    return matches.iat[0]

### Setup ETH/USD data

In [258]:
def build_eth_to_usd_lookup():
    """Build and save a per-date ETH→USD rate table from the balance files.

    The result is what one ETH is worth in USD.

    For every project in ``projects`` the balance CSV is loaded, rows with a
    zero (or non-numeric) ``eth_value`` are dropped, and the first remaining
    values per ``date`` give the rate ``usd_value / eth_value``. When several
    projects cover the same date, the first project's rate is kept.

    The table (columns ``date`` and ``eth_to_usd``) is printed and written to
    ``./memory/eth_to_usd.npy``. ``np.save`` stores the DataFrame as a pickled
    object array, so it must be read back with ``allow_pickle=True``.
    """
    column_names = ["date", "eth_to_usd"]
    per_project_rates = []

    for project in tqdm(projects):
        df_transactions = get_transaction_data(project)

        # Coerce unparsable values to 0 so they are filtered out below.
        for value_column in ('eth_value', 'usd_value'):
            df_transactions[value_column] = (
                pd.to_numeric(df_transactions[value_column], errors='coerce')
                .fillna(0)
                .astype('float64')
            )

        df_daily = (
            df_transactions[df_transactions['eth_value'] != 0]
            .groupby('date', as_index=False)
            .first()
        )
        if df_daily.empty:
            continue

        # Vectorised rate computation instead of appending one row at a time
        # (DataFrame.append was quadratic and no longer exists in pandas 2.x).
        per_project_rates.append(pd.DataFrame({
            'date': df_daily['date'],
            'eth_to_usd': df_daily['usd_value'] / df_daily['eth_value'],
        }))

    if per_project_rates:
        df_eth_to_usd = pd.concat(per_project_rates, ignore_index=True)
    else:
        df_eth_to_usd = pd.DataFrame(columns=column_names)

    # Keep one rate per date; rows are in project order, so the first project wins.
    df_eth_to_usd = df_eth_to_usd.groupby('date', as_index=False).first()
    print(df_eth_to_usd)

    np.save("./memory/eth_to_usd.npy", df_eth_to_usd)

In [259]:
# Rebuild the ETH/USD lookup file only when requested in the config cell.
if SETUP_ETH_TO_USD:
    build_eth_to_usd_lookup()

100%|███████████████████████████████████████| 277/277 [00:00<00:00, 1166.13it/s]
100%|█████████████████████████████████████████| 94/94 [00:00<00:00, 1171.22it/s]
100%|███████████████████████████████████████| 212/212 [00:00<00:00, 1142.34it/s]
100%|█████████████████████████████████████████| 85/85 [00:00<00:00, 1123.21it/s]

           date  eth_to_usd
0    2021-01-28     1240.62
1    2021-01-29     1333.61
2    2021-01-30     1380.04
3    2021-01-31     1380.00
4    2021-02-01     1313.95
..          ...         ...
304  2021-11-28     4098.53
305  2021-11-29     4298.38
306  2021-11-30     4449.42
307  2021-12-01     4636.43
308  2021-12-02     4586.87

[309 rows x 2 columns]





### Helper function to get eth_to_usd

In [260]:
# Load the saved rate table back into a DataFrame used by get_eth_to_usd.
# allow_pickle=True unpickles arbitrary objects: only load files you created yourself.
np_data = np.load('./memory/eth_to_usd.npy', allow_pickle=True)
df_eth_to_usd = pd.DataFrame(data=np_data, columns=['date', 'eth_to_usd'])

def get_eth_to_usd(date):
    """Look up the ETH→USD rate for the calendar day of ``date``.

    ``date`` must provide ``strftime`` (e.g. a ``datetime``); it is formatted
    as ``YYYY-MM-DD`` and matched against the ``date`` column of the
    module-level ``df_eth_to_usd`` table.

    Raises
    ------
    IndexError
        If the table has no row for that day.
    """
    day = date.strftime("%Y-%m-%d")
    same_day = df_eth_to_usd['date'] == day
    rates = df_eth_to_usd.loc[same_day, 'eth_to_usd']
    return rates.values[0]

# Convert ETH value to USD at specified date
def get_usd_value(date, eth_value):
    """Convert ``eth_value`` to USD using the rate for the day of ``date``.

    Parameters
    ----------
    date : datetime-like
        Moment of the transfer; only its calendar day is used.
    eth_value : number
        Amount in ETH.

    Returns
    -------
    ``eth_value`` unchanged when it equals 0 (no lookup is made), otherwise
    ``rate * eth_value``. The rate comes from the local lookup table; when the
    day is missing there, a message is printed and the Coinbase spot price for
    that day is requested instead.
    """
    if eth_value == 0:
        return eth_value

    try:
        return get_eth_to_usd(date) * eth_value
    except IndexError:
        print("Date not in values: " + str(date))
        # The API expects the date as YYYY-MM-DD, the same format as the
        # local lookup, rather than a raw datetime object.
        spot = client.get_spot_price(
            currency_pair='ETH-USD',
            date=date.strftime("%Y-%m-%d"),
        )
        return float(spot['amount']) * eth_value

### Build time-based dataframes

In [261]:
def create_timed_data(df, df_transactions):
    """Enrich each transfer with dates, USD values and wallet balances.

    Parameters
    ----------
    df : pandas.DataFrame
        Base transfer data as returned by ``create_base_data``; the columns
        ``blk_timestamp``, ``from_address``, ``to_address``, ``token_id``,
        ``blk_number`` and ``eth_value`` are read.
    df_transactions : pandas.DataFrame
        Balance table passed through to ``lookup_account_value``.

    Returns
    -------
    pandas.DataFrame
        One row per processed transfer with the columns listed in
        ``column_names``. ``days_since_mint`` is counted from the date of the
        first processed row, which is treated as the mint.

    Notes
    -----
    When ``TEST_LIMIT`` is set, only the first ``TEST_LIMIT`` rows are
    processed.
    """
    column_names = [
        "date",
        "days_since_mint",
        "from_address",
        "to_address",
        "token_id",
        "blk_number",
        "eth_value",
        "usd_value",
        "from_value",
        "to_value",
        "from_value_usd",
        "to_value_usd"
    ]

    if TEST_LIMIT:
        df = df.head(TEST_LIMIT)

    # Rows are collected in a list and turned into a DataFrame once at the
    # end; appending to a DataFrame per row is quadratic and DataFrame.append
    # no longer exists in pandas 2.x.
    records = []
    mint_date = None

    # The total is taken after truncation so the progress bar is accurate
    # when TEST_LIMIT is in effect.
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        date = arrow.get(row['blk_timestamp']).datetime

        from_address = str(row['from_address'])
        to_address = str(row['to_address'])
        blk_number = row['blk_number']
        eth_value = row['eth_value']
        usd_value = get_usd_value(date, eth_value)

        # The first processed row defines the mint date (so it gets 0 days).
        if mint_date is None:
            mint_date = date
        days_since_mint = (date - mint_date).days

        from_value = lookup_account_value(df_transactions, blk_number, from_address)
        to_value = lookup_account_value(df_transactions, blk_number, to_address)

        records.append({
            'date': date,
            'days_since_mint': days_since_mint,
            'from_address': from_address,
            'to_address': to_address,
            'token_id': row['token_id'],
            'blk_number': blk_number,
            'eth_value': eth_value,
            'usd_value': usd_value,
            'from_value': from_value,
            'to_value': to_value,
            'from_value_usd': get_usd_value(date, from_value),
            'to_value_usd': get_usd_value(date, to_value),
        })

    df_time = pd.DataFrame(records, columns=column_names)
    return df_time.infer_objects()

### Build graph objects from time-based dataframes

In [262]:
def build_graph_from_timed(df_time, old_graph=None):
    """Apply the transfers in ``df_time`` to a directed multigraph of wallets.

    Nodes are wallet addresses and each edge is the latest transfer of one
    token (``token_id`` attribute) weighted by its USD value (``weight``).

    Parameters
    ----------
    df_time : pandas.DataFrame
        Transfers to apply in row order. The columns ``from_address``,
        ``to_address``, ``token_id``, ``usd_value``, ``from_value``,
        ``to_value``, ``from_value_usd`` and ``to_value_usd`` are read.
    old_graph : networkx.MultiDiGraph, optional
        Graph to continue from. It is modified in place and returned; pass a
        copy if the earlier state must be preserved. A new graph is created
        when omitted.

    Returns
    -------
    networkx.MultiDiGraph
        The updated graph. Each touched node carries ``eth_value`` and
        ``usd_value`` (from its most recent row) and a ``trades`` counter.
    """
    # Building a network per block
    # we will use a weighted and directed graph.
    graph = old_graph if old_graph is not None else nx.MultiDiGraph()

    # Index the edges already in the graph by token once, so the previous
    # sale of a token can be found without rescanning every edge per row.
    edges_by_token = {}
    for u, v, key, token in graph.edges(keys=True, data='token_id'):
        edges_by_token.setdefault(token, []).append((u, v, key))

    # loop over the pandas dataframe.
    for _, row in tqdm(df_time.iterrows(), total=df_time.shape[0]):
        from_address = row['from_address']
        to_address = row['to_address']
        token_id = row['token_id']

        # make sure both addresses are in the graph (no-op for existing nodes).
        graph.add_node(from_address)
        graph.add_node(to_address)

        # set the balance attributes; for a self-transfer the receiver values win.
        sender = graph.nodes[from_address]
        sender['eth_value'] = row['from_value']
        sender['usd_value'] = row['from_value_usd']
        receiver = graph.nodes[to_address]
        receiver['eth_value'] = row['to_value']
        receiver['usd_value'] = row['to_value_usd']

        # keep track of how many trades a wallet has done. The set makes a
        # self-transfer count once for that wallet.
        for address in {from_address, to_address}:
            node = graph.nodes[address]
            node['trades'] = node.get('trades', 0) + 1

        # if this NFT has already been sold, remove the old sale. The edge key
        # is required: without it a MultiDiGraph removes the most recently
        # added edge between the two wallets, which may belong to another token.
        for u, v, key in edges_by_token.pop(token_id, ()):
            graph.remove_edge(u, v, key=key)

        # add an edge for the transaction, weighted by usd_value, and remember
        # its key so the next sale of this token can remove exactly this edge.
        new_key = graph.add_edge(from_address, to_address, weight=row['usd_value'], token_id=token_id)
        edges_by_token[token_id] = [(from_address, to_address, new_key)]

    return graph

### Build time-based snapshots

In [267]:
def build_snapshots(df_time):
    """Build cumulative graph snapshots over ten date quantiles.

    The rows of ``df_time`` are binned into ten quantiles of ``date``. For each
    bucket, in order, that bucket's transfers are applied on top of the
    previous snapshot and a set of NetworkX metrics is computed.

    Parameters
    ----------
    df_time : pandas.DataFrame
        Timed transfer data. A ``date_quantile`` column is added to it in
        place.

    Returns
    -------
    tuple
        ``(df_snapshots, snapshots)``: the per-bucket metrics sorted by
        ``time_bucket``, and the list of graphs, one per bucket.
    """
    column_names = [
        "time_bucket",
        "time_bucket_label",
        "number_of_nodes",
        "reciprocity",
        "assortativity",
        "assortativity_base",
        "assortativity_out_out",
        "assortativity_in_in",
        "assortativity_in_out",
        "centrality_degree",
        "centrality_closeness",
    ]

    df_time['date_quantile'], bins = pd.qcut(df_time['date'], 10, labels=False, retbins=True)
    # Rows whose date could not be binned carry NaN and belong to no bucket.
    time_buckets = np.unique(df_time['date_quantile'].dropna().to_numpy())

    snapshots = []
    records = []

    for time_bucket in time_buckets:
        graph_selection = df_time[df_time['date_quantile'] == time_bucket]

        # Continue from a copy of the previous snapshot. build_graph_from_timed
        # updates the graph it is given, so without the copy every list entry
        # would be the same object and all snapshots would equal the last one.
        previous = snapshots[-1].copy() if snapshots else None
        graph_snapshot = build_graph_from_timed(graph_selection, old_graph=previous)
        snapshots.append(graph_snapshot)

        records.append({
            "time_bucket": time_bucket,
            # Left edge of this bucket; looked up by bucket number so the label
            # stays correct even if some bucket has no rows.
            "time_bucket_label": bins[int(time_bucket)],
            "number_of_nodes": graph_snapshot.number_of_nodes(),
            "reciprocity": nx.reciprocity(graph_snapshot),
            "assortativity": nx.degree_assortativity_coefficient(graph_snapshot),
            "assortativity_base": nx.degree_pearson_correlation_coefficient(graph_snapshot.to_undirected(), weight='weight'),
            "assortativity_out_out": nx.degree_pearson_correlation_coefficient(graph_snapshot, x='out', y='out', weight='weight'),
            "assortativity_in_in": nx.degree_pearson_correlation_coefficient(graph_snapshot, x='in', y='in', weight='weight'),
            "assortativity_in_out": nx.degree_pearson_correlation_coefficient(graph_snapshot, x='in', y='out', weight='weight'),
            "centrality_degree": nx.degree_centrality(graph_snapshot),
            "centrality_closeness": nx.closeness_centrality(graph_snapshot),
        })

    # Build the summary once instead of appending a row per bucket
    # (DataFrame.append no longer exists in pandas 2.x).
    df_snapshots = pd.DataFrame(records, columns=column_names)

    return (df_snapshots.sort_values(by=['time_bucket']), snapshots)

In [264]:
# Build the timed transfer table for every configured project and cache it on disk.
for project in projects:
    df_transactions = get_transaction_data(project)
    df_time = create_timed_data(create_base_data(project), df_transactions)
    
    # np.save stores the DataFrame as a pickled object array (column names are not kept).
    # The ./memory/<project>/ directory must already exist; np.save does not create it.
    np.save(f"./memory/{project}/full.npy", df_time)

100%|█████████████████████████████████████| 60658/60658 [13:00<00:00, 77.70it/s]
100%|█████████████████████████████████████| 46497/46497 [09:34<00:00, 80.97it/s]
100%|█████████████████████████████████████| 43112/43112 [07:27<00:00, 96.37it/s]
100%|████████████████████████████████████| 34043/34043 [05:34<00:00, 101.80it/s]


In [268]:
# Reload each project's cached timed data, build the snapshots and write them to disk.
for project in projects:
    # Must match the column order used by create_timed_data, because the .npy
    # file holds only the values, not the column names.
    column_names = [
        "date", 
        "days_since_mint", 
        "from_address", 
        "to_address", 
        "token_id", 
        "blk_number", 
        "eth_value",
        "usd_value",
        "from_value", 
        "to_value",
        "from_value_usd",
        "to_value_usd"
    ]
    
    # allow_pickle=True unpickles arbitrary objects: only load files you created yourself.
    np_data = np.load(f"./memory/{project}/full.npy", allow_pickle=True)
    df_time = pd.DataFrame(data=np_data, columns=column_names)
    
    df_snapshot_summary, g_snapshots = build_snapshots(df_time)
    
    # One GML file per snapshot, numbered in list order.
    # The ./memory/<project>/snapshots/ directory must already exist.
    for i, snapshot in enumerate(g_snapshots):
        nx.write_gml(snapshot, f"./memory/{project}/snapshots/{i}.gml")
        print("Successfully wrote snapshot")
    
    # The metrics table is saved as a pickled object array, like the timed data.
    np.save(f"./memory/{project}/snapshots/summary.npy", df_snapshot_summary)

100%|█████████████████████████████████████| 6066/6066 [00:04<00:00, 1299.26it/s]
100%|██████████████████████████████████████| 6066/6066 [00:14<00:00, 416.78it/s]
100%|██████████████████████████████████████| 6066/6066 [00:27<00:00, 222.66it/s]
100%|██████████████████████████████████████| 6065/6065 [00:36<00:00, 166.29it/s]
100%|██████████████████████████████████████| 6066/6066 [00:43<00:00, 140.60it/s]
100%|██████████████████████████████████████| 6066/6066 [00:48<00:00, 123.82it/s]
100%|██████████████████████████████████████| 6067/6067 [00:54<00:00, 111.04it/s]
100%|██████████████████████████████████████| 6064/6064 [00:58<00:00, 103.78it/s]
100%|███████████████████████████████████████| 6066/6066 [01:02<00:00, 97.73it/s]
100%|███████████████████████████████████████| 6066/6066 [01:10<00:00, 85.60it/s]


Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot


100%|██████████████████████████████████████| 4658/4658 [00:04<00:00, 997.45it/s]
100%|██████████████████████████████████████| 4711/4711 [00:13<00:00, 356.90it/s]
100%|██████████████████████████████████████| 4584/4584 [00:21<00:00, 208.44it/s]
100%|██████████████████████████████████████| 4646/4646 [00:33<00:00, 138.71it/s]
100%|███████████████████████████████████████| 4650/4650 [00:46<00:00, 99.17it/s]
100%|███████████████████████████████████████| 4649/4649 [00:58<00:00, 79.25it/s]
100%|███████████████████████████████████████| 4650/4650 [01:07<00:00, 68.69it/s]
100%|███████████████████████████████████████| 4649/4649 [01:16<00:00, 60.95it/s]
100%|███████████████████████████████████████| 4650/4650 [01:26<00:00, 54.07it/s]
100%|███████████████████████████████████████| 4650/4650 [01:33<00:00, 49.49it/s]


Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot


100%|█████████████████████████████████████| 4317/4317 [00:03<00:00, 1098.03it/s]
100%|██████████████████████████████████████| 4313/4313 [00:10<00:00, 403.80it/s]
100%|██████████████████████████████████████| 4316/4316 [00:17<00:00, 241.87it/s]
100%|██████████████████████████████████████| 4299/4299 [00:26<00:00, 159.95it/s]
100%|██████████████████████████████████████| 4311/4311 [00:31<00:00, 136.99it/s]
100%|██████████████████████████████████████| 4311/4311 [00:36<00:00, 117.50it/s]
100%|██████████████████████████████████████| 4311/4311 [00:42<00:00, 100.79it/s]
100%|███████████████████████████████████████| 4315/4315 [00:49<00:00, 87.64it/s]
100%|███████████████████████████████████████| 4307/4307 [00:51<00:00, 82.85it/s]
100%|███████████████████████████████████████| 4312/4312 [00:57<00:00, 75.26it/s]


Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot


100%|█████████████████████████████████████| 3420/3420 [00:02<00:00, 1295.96it/s]
100%|██████████████████████████████████████| 3438/3438 [00:07<00:00, 438.75it/s]
100%|██████████████████████████████████████| 3362/3362 [00:12<00:00, 262.28it/s]
100%|██████████████████████████████████████| 3397/3397 [00:17<00:00, 190.93it/s]
100%|██████████████████████████████████████| 3405/3405 [00:21<00:00, 157.12it/s]
100%|██████████████████████████████████████| 3404/3404 [00:25<00:00, 133.10it/s]
100%|██████████████████████████████████████| 3404/3404 [00:28<00:00, 120.76it/s]
100%|██████████████████████████████████████| 3405/3405 [00:28<00:00, 118.99it/s]
100%|██████████████████████████████████████| 3403/3403 [00:28<00:00, 119.21it/s]
100%|██████████████████████████████████████| 3405/3405 [00:28<00:00, 119.59it/s]


Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
Successfully wrote snapshot
