In [1]:
from helius import NFTAPI, NameAPI, BalancesAPI, WebhooksAPI, TransactionsAPI
from dotenv import load_dotenv
from pathlib import Path
import requests
import pandas as pd
import numpy as np
import os
import time
import json
import joblib


In [2]:
HELIUS_API_KEY = os.getenv('HELIUM_API_KEY')


In [3]:
VALIDATORS_API_KEY = os.getenv('VALIDATORS_API_KEY')

In [4]:
pd.set_option('display.max_colwidth', None)  
pd.set_option('display.max_columns', None)   
pd.set_option('display.width', 200)    
pd.set_option('display.float_format', '{:.8f}'.format)

In [5]:
# transactions_api  = TransactionsAPI(HELIUS_API_KEY)

# parsed_transaction_history = transactions_api.get_parsed_transaction_history(address="HtXa1PH33GGvH3giqMqatndHcKnzeSkwMaW46DTzDfLd")

# print(parsed_transaction_history)

In [6]:
import requests

# Replace this with your Helius API key
#API_KEY = "YOUR_HELIUS_API_KEY"
BASE_URL = f"https://mainnet.helius-rpc.com/?api-key={HELIUS_API_KEY}"

HEADERS = {"Content-Type": "application/json"}

def get_vote_accounts():
    payload = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "getVoteAccounts"
    }
    response = requests.post(BASE_URL, json=payload, headers=HEADERS)
    if response.status_code == 200:
        data = response.json()
        current_validators = data["result"]["current"]
        delinquent_validators = data["result"]["delinquent"]
        return current_validators, delinquent_validators
    else:
        print("Error fetching vote accounts:", response.text)
        return [], []

def get_stake_accounts_by_pubkey(pubkey):
    payload = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "getProgramAccounts",
        "params": [
            "Stake11111111111111111111111111111111111111",  # Stake Program
            {
                "encoding": "jsonParsed",
                "filters": [
                    {
                        "memcmp": {
                            "offset": 12,
                            "bytes": pubkey
                        }
                    }
                ]
            }
        ]
    }
    response = requests.post(BASE_URL, json=payload, headers=HEADERS)
    if response.status_code == 200:
        return response.json()["result"]
    else:
        print("Error fetching stake accounts:", response.text)
        return []



In [7]:
# # --- Example Usage ---

# # Fetch validators
# current, delinquent = get_vote_accounts()
# print(f"Total active validators: {len(current)}")
# print(f"Total delinquent validators: {len(delinquent)}")

# # Optionally fetch stake accounts by a delegator pubkey
# delegator_pubkey = "HtXa1PH33GGvH3giqMqatndHcKnzeSkwMaW46DTzDfLd"  # Replace this
# stakes = get_stake_accounts_by_pubkey(delegator_pubkey)
# print(f"Stake accounts for {delegator_pubkey}:")
# for s in stakes:
#     print(s["pubkey"], s["account"]["data"]["parsed"]["info"]["stake"])


# Fetching Validator + Stake Data via Helius API

### Vote Data

In [8]:
# JSON-RPC Payload to fetch validator vote accounts
payload = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "getVoteAccounts"
}

response = requests.post(BASE_URL, json=payload)
data = response.json()

# Combine current + delinquent validators
validators = data["result"]["current"] + data["result"]["delinquent"]

# Convert to DataFrame
df = pd.DataFrame(validators)
df.head(n=1)

Unnamed: 0,activatedStake,commission,epochCredits,epochVoteAccount,lastVote,nodePubkey,rootSlot,votePubkey
0,763095337836965,0,"[[782, 359374327, 352476184], [783, 366264519, 359374327], [784, 373162756, 366264519], [785, 380063290, 373162756], [786, 386898445, 380063290]]",True,339981346,HM1KjNaXa4w8K4gCXbieoMh5gUTNeUhg9fvdXMKeBW3L,339981315,HMV14UAuULSwqmZhsKHzaVkYAd94iWpEeURgbUegfQLc


In [9]:
# Add additional fields (can be joined with price data or slashing reports)
df["timestamp"] = pd.Timestamp.utcnow()
df["epoch"] = None  # You can get this via 'getEpochInfo' method

# Preview important fields
# print(df[[
#     "nodePubkey", "commission", "activatedStake", "lastVote", "rootSlot", 
#     "epochCredits", "epochVoteAccount"
# ]])

df = df[[
    "timestamp","nodePubkey", "commission", "activatedStake", "lastVote", "rootSlot", 
    "epochCredits", "epochVoteAccount","votePubkey"
]]


In [10]:
print(df.columns.tolist())


['timestamp', 'nodePubkey', 'commission', 'activatedStake', 'lastVote', 'rootSlot', 'epochCredits', 'epochVoteAccount', 'votePubkey']


In [11]:
expanded_rows = []

for idx, row in df.iterrows():
    node_pubkey = row['nodePubkey']
    votePubkey = row['votePubkey']
    for epoch_info in row['epochCredits']:
        epoch, current_credits, previous_credits = epoch_info
        credits_earned = current_credits - previous_credits
        expanded_rows.append({
            'timestamp': row['timestamp'],
            'nodePubkey': node_pubkey,
            'commission': row['commission'],
            'activatedStake': row['activatedStake'],
            'lastVote': row['lastVote'],
            'rootSlot': row['rootSlot'],
            'epochVoteAccount': row['epochVoteAccount'],
            'epoch': epoch,
            'credits': current_credits,
            'previous_credits': previous_credits,
            'credits_earned': credits_earned,
            'votePubkey' : votePubkey
        })

df_expanded = pd.DataFrame(expanded_rows)
df_expanded.sort_values(by=['nodePubkey', 'epoch'], inplace=True)
print(df_expanded.head())


# Preferred: use .joblib extension
joblib.dump(df_expanded, 'df_expanded.joblib')


                            timestamp                                    nodePubkey  commission  activatedStake   lastVote   rootSlot  epochVoteAccount  epoch    credits  previous_credits  \
5815 2025-05-14 14:49:45.770789+00:00  138KHwTqKNWGLoo8fK5i8UxYtwoC5tC8o7M9rY1CDEjT           0  28931907684368  339981346  339981315              True    782  663485837         656598333   
5816 2025-05-14 14:49:45.770789+00:00  138KHwTqKNWGLoo8fK5i8UxYtwoC5tC8o7M9rY1CDEjT           0  28931907684368  339981346  339981315              True    783  670375331         663485837   
5817 2025-05-14 14:49:45.770789+00:00  138KHwTqKNWGLoo8fK5i8UxYtwoC5tC8o7M9rY1CDEjT           0  28931907684368  339981346  339981315              True    784  677274232         670375331   
5818 2025-05-14 14:49:45.770789+00:00  138KHwTqKNWGLoo8fK5i8UxYtwoC5tC8o7M9rY1CDEjT           0  28931907684368  339981346  339981315              True    785  684174907         677274232   
5819 2025-05-14 14:49:45.770789+00:00  138KHw

['df_expanded.joblib']

In [12]:
df_expanded["epoch"].unique() #.sum()

array([782, 783, 784, 785, 786, 778, 779, 780, 781, 773, 774, 775, 776,
       777, 744, 745, 746, 769, 770, 729, 740, 772, 766])

### Annual Inflation Rate
Purpose: Use this to contextualize staking APY and understand network reward distribution.





In [13]:
def get_inflation_rate():
    payload = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "getInflationRate"
    }
    response = requests.post(BASE_URL, json=payload, headers=HEADERS)
    if response.status_code == 200:
        data = response.json()
        #return data["result"]
        result = data["result"]
        result["timestamp"] = pd.Timestamp.utcnow()  # Add timestamp
        return result 
    else:
        print("Error fetching inflation rate:", response.text)
        return {}

# Fetch and convert to DataFrame
inflation_data = get_inflation_rate()
df_inflation = pd.DataFrame([inflation_data])  # Single row
print(df_inflation) #.head())

joblib.dump(df_inflation, 'df_inflation.joblib')

   epoch  foundation      total  validator                        timestamp
0    786  0.00000000 0.04538333 0.04538333 2025-05-14 14:49:46.456705+00:00


['df_inflation.joblib']

### Staking Rewards - Inflation Rate
This is critical for calculating APY


- amount: Rewards in lamports (e.g., 1,863,991,600 lamports ≈ 1.863 SOL).

- effectiveSlot: Slot when rewards were applied.

- epoch: Epoch number.

- postBalance: Account balance after rewards (in lamports).

- commission: Validator’s commission rate




In [14]:
def get_inflation_reward(vote_accounts, epoch=None):
    payload = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "getInflationReward",
        "params": [vote_accounts, {"commitment": "finalized"}]
    }
    if epoch is not None:
        payload["params"].append({"epoch": epoch})
        
    response = requests.post(BASE_URL, json=payload, headers=HEADERS)
    
    if response.status_code == 200:
        data = response.json()
        result = data["result"]
        
        # Add timestamp to each reward entry
        timestamp = pd.Timestamp.utcnow()
        for entry in result:
            if entry is not None:  # Some entries may be None
                entry["timestamp"] = timestamp
        
        return result
    else:
        print("Error fetching inflation reward:", response.text)
        return []


# Get validator-specific inflation rewards
validator_addresses = df_expanded["votePubkey"].tolist()
rewards = get_inflation_reward(validator_addresses[:100])  # Max 50 addresses per call

inflation_reward = pd.DataFrame(rewards)
#inflation_reward = inflation_reward[inflation_reward["amount"] > 0]
print(inflation_reward)

joblib.dump(inflation_reward, 'inflation_reward.joblib')

    amount  commission  effectiveSlot  epoch  postBalance                        timestamp
0        0           0      339552000    785     31194720 2025-05-14 14:49:47.671765+00:00
1        0           0      339552000    785     31194720 2025-05-14 14:49:47.671765+00:00
2        0           0      339552000    785     31194720 2025-05-14 14:49:47.671765+00:00
3        0           0      339552000    785     31194720 2025-05-14 14:49:47.671765+00:00
4        0           0      339552000    785     31194720 2025-05-14 14:49:47.671765+00:00
..     ...         ...            ...    ...          ...                              ...
95       0           0      339552000    785   3204503094 2025-05-14 14:49:47.671765+00:00
96       0           0      339552000    785   3204503094 2025-05-14 14:49:47.671765+00:00
97       0           0      339552000    785   3204503094 2025-05-14 14:49:47.671765+00:00
98       0           0      339552000    785   3204503094 2025-05-14 14:49:47.671765+00:00

['inflation_reward.joblib']

### Total Network Credits

Use total network credits to normalize your validator’s credits_earned and assess relative performance. Also provides epoch context.



In [15]:
def get_epoch_info():
    payload = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "getEpochInfo"
    }
    response = requests.post(BASE_URL, json=payload, headers=HEADERS)
    if response.status_code == 200:
        data = response.json()
        result = data["result"]
        result["timestamp"] = pd.Timestamp.utcnow()  # Add timestamp
        return result 
    else:
        print("Error fetching epoch info:", response.text)
        return {}

# Fetch and convert to DataFrame
epoch_info = get_epoch_info()
df_epoch = pd.DataFrame([epoch_info])
print(df_epoch) #.head())


joblib.dump(df_epoch, 'df_epoch.joblib')

   absoluteSlot  blockHeight  epoch  slotIndex  slotsInEpoch  transactionCount                        timestamp
0     339981355    318201603    786     429355        432000      405901759601 2025-05-14 14:49:48.257215+00:00


['df_epoch.joblib']

## Circulating SOL Supply



In [16]:
def get_supply():
    payload = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "getSupply",
        "params": [{"commitment": "finalized"}]
    }
    response = requests.post(BASE_URL, json=payload, headers=HEADERS)
    if response.status_code == 200:
        data = response.json()
        return data["result"]["value"]
    else:
        print("Error fetching supply:", response.text)
        return {}

# Fetch and convert to DataFrame
supply_data = get_supply()
df_supply = pd.DataFrame([supply_data])
print(df_supply.head(n=1))



          circulating     nonCirculating  \
0  519440289342827257  81209762491391679   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [17]:
import pandas as pd

# If df_supply is already created:
df_supply['circulating'] = df_supply['circulating'].astype(float)
df_supply['nonCirculating'] = df_supply['nonCirculating'].astype(float)
df_supply['total'] = df_supply['total'].astype(float)
# Optionally convert to billions for readability:
df_supply['circulating_sol'] = df_supply['circulating'] / 1e9
df_supply['nonCirculating_sol'] = df_supply['nonCirculating'] / 1e9
df_supply['total_sol'] = df_supply['total'] / 1e9
# If you want a DataFrame where each nonCirculatingAccount is its own row:
df_accounts = pd.DataFrame(df_supply['nonCirculatingAccounts'][0], columns=['nonCirculatingAccount'])

# Resulting outputs:
df_supply = df_supply[['circulating_sol', 'nonCirculating_sol', 'total_sol']]
print(df_supply)
#print(df_supply[['circulating_sol', 'nonCirculating_sol', 'total_sol']])
#print(df_accounts.head())
joblib.dump(df_supply, 'df_supply.joblib')

     circulating_sol  nonCirculating_sol          total_sol
0 519440289.34282726   81209762.49139167 600650051.83421886


['df_supply.joblib']

###  Transactions Per Second (TPS)



In [18]:
def get_tps():
    payload = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "getRecentPerformanceSamples",
        "params": [1]  # Fetch 1 sample
    }
    response = requests.post(BASE_URL, json=payload, headers=HEADERS)
    if response.status_code == 200:
        data = response.json()
        sample = data["result"][0]
        tps = sample["numTransactions"] / sample["samplePeriodSecs"]
        return tps
    else:
        print("Error fetching TPS:", response.text)
        return 0

# Fetch TPS
tps = get_tps()
df_tps = pd.DataFrame({"timestamp": [pd.Timestamp.utcnow()], "tps": [tps]})
print("TPS DataFrame:")
print(df_tps.head())

joblib.dump(df_tps, 'df_tps.joblib')

TPS DataFrame:
                         timestamp           tps
0 2025-05-14 14:49:55.523736+00:00 3968.15000000


['df_tps.joblib']

### SOL Price (Using CoinGecko)




In [19]:
def get_sol_price():
    url = "https://api.coingecko.com/api/v3/simple/price?ids=solana&vs_currencies=usd"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return data["solana"]["usd"]
    else:
        print("Error fetching SOL price:", response.text)
        return 0

# Fetch SOL price
sol_price = get_sol_price()
df_price = pd.DataFrame({"timestamp": [pd.Timestamp.utcnow()], "sol_price_usd": [sol_price]})
print("SOL Price DataFrame:")
print(df_price.head())

joblib.dump(df_price, 'df_price.joblib')

SOL Price DataFrame:
                         timestamp  sol_price_usd
0 2025-05-14 14:49:56.245822+00:00   177.33000000


['df_price.joblib']

In [20]:
def get_recent_priority_fees():
    payload = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "getRecentPrioritizationFees",
        "params": [[]]  # Empty accounts for global fees
    }
    response = requests.post(BASE_URL, json=payload, headers=HEADERS)
    if response.status_code == 200:
        data = response.json()
        fees = [f["prioritizationFee"] for f in data["result"]]
        avg_fee = sum(fees) / len(fees) if fees else 0  # Micro-lamports
        return avg_fee / 1e6  # Convert to lamports
    else:
        print("Error fetching priority fees:", response.text)
        return 0

# Fetch average priority fee
avg_priority_fee = get_recent_priority_fees()
base_fee = 5000  # Static base fee per signature
avg_fee = base_fee + avg_priority_fee  # Total average fee
df_fees = pd.DataFrame({
    "timestamp": [pd.Timestamp.utcnow()],
    "avg_fee_lamports": [avg_fee],
    "avg_fee_sol": [avg_fee / 1e9],
    "avg_fee_usd": [avg_fee / 1e9 * sol_price]
})
print("Network Fees DataFrame:")
print(df_fees)


joblib.dump(df_fees, 'df_fees.joblib')

Network Fees DataFrame:
                         timestamp  avg_fee_lamports  avg_fee_sol  avg_fee_usd
0 2025-05-14 14:49:56.891130+00:00     5000.00000000   0.00000500   0.00088665


['df_fees.joblib']

In [21]:


# Assuming VALIDATORS is already defined
# Example: VALIDATORS = "your-secret-api-token"

NETWORK = "mainnet"  # Change to 'testnet' or 'pythnet' as needed
BASE_URL = f"https://www.validators.app/api/v1/validators/{NETWORK}.json"

HEADERS = {
    "Token": VALIDATORS_API_KEY
}

def fetch_all_validators(with_history=False):
    params = {}
    if with_history:
        params["with_history"] = "true"
    
    response = requests.get(BASE_URL, headers=HEADERS, params=params)
    
    if response.status_code == 200:
        validators = response.json()
        print(f"Fetched {len(validators)} validators.")
        return pd.DataFrame(validators)
    else:
        print(f"Error fetching validator data: {response.status_code} - {response.text}")
        return pd.DataFrame()

# Usage
df_validators = fetch_all_validators(with_history=True)

# Display the first few rows
print(df_validators.head())

joblib.dump(df_validators, 'df_validators.joblib')


Fetched 1217 validators.
   network                                       account                                     name        keybase_id                   www_url  \
0  mainnet  AAHSdsnRREfdQNzDGRxai8CLXh9EPCoRdwULPqBYd9fb                                StakeArmy         stakearmy     https://stakearmy.com   
1  mainnet  2UBhtRuyr9nvWsUnrbWrvJiYWEU8TVBD4PLYQJKiRa9H                                Luna Labs                    https://www.lunalabs.vc/   
2  mainnet  Ed9WjPnZfAXsPttcqxMwj94qsuXVRyBsyXnDkxFva2Zv  Rakurai | High TPS | High block rewards                      https://www.rakurai.io   
3  mainnet   NLMSHTjmSiRxGJPs3uaqtsFBC2dTGYwK41U18Nmw5kH                          T-STAKE Systems                         https://t-stake.com   
4  mainnet  Frog1Fks1AVN8ywFH3HTFeYojq6LQqoEPzgQFx2Kz5Ch                       Leapfrog 🐸 No Fees  leapfrog_systems  https://leapfrog.systems   

                                                                                                        

['df_validators.joblib']

In [22]:
df_validators.columns.to_list()

['network',
 'account',
 'name',
 'keybase_id',
 'www_url',
 'details',
 'avatar_url',
 'created_at',
 'updated_at',
 'jito',
 'jito_commission',
 'stake_pools_list',
 'is_active',
 'avatar_file_url',
 'active_stake',
 'authorized_withdrawer_score',
 'commission',
 'data_center_concentration_score',
 'delinquent',
 'published_information_score',
 'root_distance_score',
 'security_report_score',
 'skipped_slot_score',
 'skipped_after_score',
 'software_version',
 'software_version_score',
 'stake_concentration_score',
 'consensus_mods_score',
 'vote_latency_score',
 'total_score',
 'vote_distance_score',
 'ip',
 'data_center_key',
 'autonomous_system_number',
 'latitude',
 'longitude',
 'data_center_host',
 'vote_account',
 'epoch_credits',
 'epoch',
 'skipped_slots',
 'skipped_slot_percent',
 'ping_time',
 'url']

# Epochs

In [23]:
BASE_URL = 'https://www.validators.app/api/v1/epochs/mainnet.json'  # Replace with the desired network ('mainnet', 'testnet', 'pythnet')

# Headers with the API token for authentication
HEADERS = {
    'Token': VALIDATORS_API_KEY
}

# Make the API request
def get_epoch_data():
    params = {'per': 50, 'page': 1}  # Fetch up to 50 epochs, page 1 (adjust as needed)
    response = requests.get(BASE_URL, headers=HEADERS, params=params)
    
    if response.status_code == 200:
        data = response.json()
        return data['epochs']
    else:
        print(f"Error fetching epoch data: {response.text}")
        return []

# Get epoch data
epoch_data = get_epoch_data()

# Convert the data to a pandas DataFrame
df_epochs = pd.DataFrame(epoch_data)

# Display the first few rows of the DataFrame
print(df_epochs) #.head())

joblib.dump(df_epochs, 'df_epochs.joblib')

    epoch  starting_slot  slots_in_epoch  network                created_at            total_rewards          total_active_stake
0     786      339552000          432000  mainnet  2025-05-12T15:30:43.000Z                      NaN                         NaN
1     785      339120000          432000  mainnet  2025-05-10T16:10:25.000Z 149125956701354.00000000 390905360209121664.00000000
2     784      338688000          432000  mainnet  2025-05-08T16:51:08.000Z 148872367316200.00000000 390601341255544064.00000000
3     783      338256000          432000  mainnet  2025-05-06T17:20:47.000Z 149356863823391.00000000 391590155188303488.00000000
4     782      337824002          432000  mainnet  2025-05-04T18:04:09.000Z 149294269842450.00000000 392310799006432960.00000000
5     781      337392000          432000  mainnet  2025-05-02T18:48:09.000Z 149600376018593.00000000 391208956635595392.00000000
6     780      336960000          432000  mainnet  2025-04-30T19:23:26.000Z 149328698516297.00000

['df_epochs.joblib']

# Exploratory Data Analysis

In [24]:
df_expanded.head(n=1)


Unnamed: 0,timestamp,nodePubkey,commission,activatedStake,lastVote,rootSlot,epochVoteAccount,epoch,credits,previous_credits,credits_earned,votePubkey
5815,2025-05-14 14:49:45.770789+00:00,138KHwTqKNWGLoo8fK5i8UxYtwoC5tC8o7M9rY1CDEjT,0,28931907684368,339981346,339981315,True,782,663485837,656598333,6887504,ASfKFAKz6fH4eip1jdLGt5Ym954kU9KYnwq2Csn9ogSz


In [25]:
df_epochs.head()

Unnamed: 0,epoch,starting_slot,slots_in_epoch,network,created_at,total_rewards,total_active_stake
0,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,
1,785,339120000,432000,mainnet,2025-05-10T16:10:25.000Z,149125956701354.0,3.9090536020912166e+17
2,784,338688000,432000,mainnet,2025-05-08T16:51:08.000Z,148872367316200.0,3.9060134125554406e+17
3,783,338256000,432000,mainnet,2025-05-06T17:20:47.000Z,149356863823391.0,3.915901551883034e+17
4,782,337824002,432000,mainnet,2025-05-04T18:04:09.000Z,149294269842450.0,3.9231079900643296e+17


In [26]:
print(df_epochs["epoch"].dtype)

int64


In [27]:
print(df_expanded["epoch"].dtype)

int64


In [28]:
df_expanded["activatedStake_SOL"] = df_expanded["activatedStake"] / 1e9
df_expanded["activatedStake_USD"] = df_expanded["activatedStake_SOL"] * sol_price
df_expanded.head(n=1)

Unnamed: 0,timestamp,nodePubkey,commission,activatedStake,lastVote,rootSlot,epochVoteAccount,epoch,credits,previous_credits,credits_earned,votePubkey,activatedStake_SOL,activatedStake_USD
5815,2025-05-14 14:49:45.770789+00:00,138KHwTqKNWGLoo8fK5i8UxYtwoC5tC8o7M9rY1CDEjT,0,28931907684368,339981346,339981315,True,782,663485837,656598333,6887504,ASfKFAKz6fH4eip1jdLGt5Ym954kU9KYnwq2Csn9ogSz,28931.90768437,5130495.18966898


In [29]:
df_merged = pd.merge(df_epochs, df_expanded, on="epoch", how="inner")
df_merged["vote_account"] = df_merged["votePubkey"]
df_merged.head()

Unnamed: 0,epoch,starting_slot,slots_in_epoch,network,created_at,total_rewards,total_active_stake,timestamp,nodePubkey,commission,activatedStake,lastVote,rootSlot,epochVoteAccount,credits,previous_credits,credits_earned,votePubkey,activatedStake_SOL,activatedStake_USD,vote_account
0,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,138KHwTqKNWGLoo8fK5i8UxYtwoC5tC8o7M9rY1CDEjT,0,28931907684368,339981346,339981315,True,691011038,684174907,6836131,ASfKFAKz6fH4eip1jdLGt5Ym954kU9KYnwq2Csn9ogSz,28931.90768437,5130495.18966898,ASfKFAKz6fH4eip1jdLGt5Ym954kU9KYnwq2Csn9ogSz
1,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,13cm6z7ajighVFYN1aR2hPQ3Rhp4QJenDbHGRmps9P1n,0,226823548737864,339981346,339981315,True,668721858,661885711,6836147,F82nmpcZMdHtMVsLtAGByPavdN5WuEX1hjNwzs3UFuwq,226823.54873786,40222619.89768543,F82nmpcZMdHtMVsLtAGByPavdN5WuEX1hjNwzs3UFuwq
2,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,1EWZm7aZYxfZHbyiELXtTgN1yT2vU1HF9d8DWswX2Tp,5,37652834688596,339981346,339981315,True,789722090,782886384,6835706,HG7a8fgjTkQhGFTPukbTdf5FCwxVVjKzkbo6ToNswTXH,37652.8346886,6676977.17532873,HG7a8fgjTkQhGFTPukbTdf5FCwxVVjKzkbo6ToNswTXH
3,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,1KXvrkPXwkGF6NK1zyzVuJqbXfpenPVPP6hoiK9bsK3,0,278687889297847,339981346,339981315,True,663969118,657132918,6836200,1KXz4xKV2viJCGpxqnQqdf2J45vQr5USdmtcJLTaHkm,278687.88929785,49419723.40918721,1KXz4xKV2viJCGpxqnQqdf2J45vQr5USdmtcJLTaHkm
4,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,1MuaDGhuN7KRqvsupUcYmq9u1YRh1pp38hu1WV2WC6S,0,94598521531858,339981346,339981315,True,606500927,599664944,6835983,4z9rbspUBsnZmTQbWSSPETkXmWHfhzQXXc289Z3m6XcJ,94598.52153186,16775155.82324438,4z9rbspUBsnZmTQbWSSPETkXmWHfhzQXXc289Z3m6XcJ


In [30]:
df_merged['total_reward_SOL'] = df_merged['total_rewards'] / 1e9
df_merged['total_active_stake_SOL'] = df_merged['total_active_stake'] / 1e9

In [31]:
df_merged

Unnamed: 0,epoch,starting_slot,slots_in_epoch,network,created_at,total_rewards,total_active_stake,timestamp,nodePubkey,commission,activatedStake,lastVote,rootSlot,epochVoteAccount,credits,previous_credits,credits_earned,votePubkey,activatedStake_SOL,activatedStake_USD,vote_account,total_reward_SOL,total_active_stake_SOL
0,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,138KHwTqKNWGLoo8fK5i8UxYtwoC5tC8o7M9rY1CDEjT,0,28931907684368,339981346,339981315,True,691011038,684174907,6836131,ASfKFAKz6fH4eip1jdLGt5Ym954kU9KYnwq2Csn9ogSz,28931.90768437,5130495.18966898,ASfKFAKz6fH4eip1jdLGt5Ym954kU9KYnwq2Csn9ogSz,,
1,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,13cm6z7ajighVFYN1aR2hPQ3Rhp4QJenDbHGRmps9P1n,0,226823548737864,339981346,339981315,True,668721858,661885711,6836147,F82nmpcZMdHtMVsLtAGByPavdN5WuEX1hjNwzs3UFuwq,226823.54873786,40222619.89768543,F82nmpcZMdHtMVsLtAGByPavdN5WuEX1hjNwzs3UFuwq,,
2,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,1EWZm7aZYxfZHbyiELXtTgN1yT2vU1HF9d8DWswX2Tp,5,37652834688596,339981346,339981315,True,789722090,782886384,6835706,HG7a8fgjTkQhGFTPukbTdf5FCwxVVjKzkbo6ToNswTXH,37652.83468860,6676977.17532873,HG7a8fgjTkQhGFTPukbTdf5FCwxVVjKzkbo6ToNswTXH,,
3,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,1KXvrkPXwkGF6NK1zyzVuJqbXfpenPVPP6hoiK9bsK3,0,278687889297847,339981346,339981315,True,663969118,657132918,6836200,1KXz4xKV2viJCGpxqnQqdf2J45vQr5USdmtcJLTaHkm,278687.88929785,49419723.40918721,1KXz4xKV2viJCGpxqnQqdf2J45vQr5USdmtcJLTaHkm,,
4,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,1MuaDGhuN7KRqvsupUcYmq9u1YRh1pp38hu1WV2WC6S,0,94598521531858,339981346,339981315,True,606500927,599664944,6835983,4z9rbspUBsnZmTQbWSSPETkXmWHfhzQXXc289Z3m6XcJ,94598.52153186,16775155.82324438,4z9rbspUBsnZmTQbWSSPETkXmWHfhzQXXc289Z3m6XcJ,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6586,766,330912000,432000,mainnet,2025-04-03T00:04:01.000Z,151035887828837.00000000,384502142003974336.00000000,2025-05-14 14:49:45.770789+00:00,Syrd2b19zpvDrSTSkUzRUoPm9xpJkPtfKj77oGDCUHN,0,32206236567,339283336,339283305,True,461129027,460324854,804173,Syrd4L1eGcZdhRGoB9wb4aJKKJKv9gMudZLMnXdV7AR,32.20623657,5711.13193043,Syrd4L1eGcZdhRGoB9wb4aJKKJKv9gMudZLMnXdV7AR,151035.88782884,384502142.00397432
6587,746,322272000,432000,mainnet,2025-02-22T04:45:22.000Z,152809126746356.00000000,383879948504408320.00000000,2025-05-14 14:49:45.770789+00:00,9SJdwWQ1YgRdYcQ9NinWUussJNX4MUKdMEy2SpMAfe67,0,1000000,337647080,337647049,True,4472689,4199203,273486,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,0.00100000,0.17733000,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,152809.12674636,383879948.50440830
6588,745,321840000,432000,mainnet,2025-02-20T05:14:55.000Z,153015323340118.00000000,384476729594204992.00000000,2025-05-14 14:49:45.770789+00:00,9SJdwWQ1YgRdYcQ9NinWUussJNX4MUKdMEy2SpMAfe67,0,1000000,337647080,337647049,True,4199203,126181,4073022,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,0.00100000,0.17733000,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,153015.32334012,384476729.59420496
6589,744,321408000,432000,mainnet,2025-02-18T05:42:44.000Z,153020257790256.00000000,383079108788735808.00000000,2025-05-14 14:49:45.770789+00:00,9SJdwWQ1YgRdYcQ9NinWUussJNX4MUKdMEy2SpMAfe67,0,1000000,337647080,337647049,True,126181,0,126181,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,0.00100000,0.17733000,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,153020.25779026,383079108.78873581


In [32]:
df_merged.columns.tolist()

['epoch',
 'starting_slot',
 'slots_in_epoch',
 'network',
 'created_at',
 'total_rewards',
 'total_active_stake',
 'timestamp',
 'nodePubkey',
 'commission',
 'activatedStake',
 'lastVote',
 'rootSlot',
 'epochVoteAccount',
 'credits',
 'previous_credits',
 'credits_earned',
 'votePubkey',
 'activatedStake_SOL',
 'activatedStake_USD',
 'vote_account',
 'total_reward_SOL',
 'total_active_stake_SOL']

In [33]:
# BASE_URL = f"https://www.validators.app/api/v1/validators/{NETWORK}.json"
# def fetch_all_validators(with_history=False):
#     params = {"with_history": "true"} if with_history else {}
#     response = requests.get(BASE_URL, headers={"Token": VALIDATORS_API_KEY}, params=params)
#     return pd.DataFrame(response.json()) if response.status_code == 200 else pd.DataFrame()
# df_validators = fetch_all_validators(with_history=True)



In [34]:
df_validators.head(n=1)

Unnamed: 0,network,account,name,keybase_id,www_url,details,avatar_url,created_at,updated_at,admin_warning,jito,jito_commission,stake_pools_list,is_active,avatar_file_url,active_stake,authorized_withdrawer_score,commission,data_center_concentration_score,delinquent,published_information_score,root_distance_score,security_report_score,skipped_slot_score,skipped_after_score,software_version,software_version_score,stake_concentration_score,consensus_mods_score,vote_latency_score,total_score,vote_distance_score,ip,data_center_key,autonomous_system_number,latitude,longitude,data_center_host,vote_account,epoch_credits,epoch,skipped_slots,skipped_slot_percent,ping_time,url
0,mainnet,AAHSdsnRREfdQNzDGRxai8CLXh9EPCoRdwULPqBYd9fb,StakeArmy,stakearmy,https://stakearmy.com,High Uptime,https://s3.amazonaws.com/keybase_processed_uploads/f0dba3e5b88045f95b11db2ebc38ab05_360_360.jpg,2022-03-22 14:19:12 UTC,2024-11-27 03:40:05 UTC,,True,1000.0,[BlazeStake],True,https://prod-validators.nyc3.digitaloceanspaces.com/3gufbxp1khfyadpw66cogyp5rlpm,23561961743202,0.0,0,0,False,2,2,1,2,2,2.1.21,2,0,0,2.0,13,2,80.76.51.122,48014-AL-Tirana,48014,41.3253,19.8184,,91ciyr81FJnZaoWcDT4PHwwdzgNp21cgH354JbCuxnwR,6833125.0,786.0,0.0,0.0,,https://www.validators.app/api/v1/validators/mainnet/AAHSdsnRREfdQNzDGRxai8CLXh9EPCoRdwULPqBYd9fb


In [35]:
# 1. Separate static validator metadata
static_validator_cols = [
    'vote_account', 'name', 'keybase_id', 'www_url',
    'details', 'avatar_url'
]
df_validators_static = df_validators[static_validator_cols].drop_duplicates()

# 2. Merge with historical data using ONLY vote_account
df_final = pd.merge(
    df_merged,
    df_validators_static,
    on='vote_account',
    how='left'
)


df_final

Unnamed: 0,epoch,starting_slot,slots_in_epoch,network,created_at,total_rewards,total_active_stake,timestamp,nodePubkey,commission,activatedStake,lastVote,rootSlot,epochVoteAccount,credits,previous_credits,credits_earned,votePubkey,activatedStake_SOL,activatedStake_USD,vote_account,total_reward_SOL,total_active_stake_SOL,name,keybase_id,www_url,details,avatar_url
0,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,138KHwTqKNWGLoo8fK5i8UxYtwoC5tC8o7M9rY1CDEjT,0,28931907684368,339981346,339981315,True,691011038,684174907,6836131,ASfKFAKz6fH4eip1jdLGt5Ym954kU9KYnwq2Csn9ogSz,28931.90768437,5130495.18966898,ASfKFAKz6fH4eip1jdLGt5Ym954kU9KYnwq2Csn9ogSz,,,Nodes24.io 💎Jito 0% MEV💥,,https://nodes24.io/,Nodes24 is a reliable staking validator for your crypto,https://nodes24.io/images/apple-touch-icon.png
1,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,13cm6z7ajighVFYN1aR2hPQ3Rhp4QJenDbHGRmps9P1n,0,226823548737864,339981346,339981315,True,668721858,661885711,6836147,F82nmpcZMdHtMVsLtAGByPavdN5WuEX1hjNwzs3UFuwq,226823.54873786,40222619.89768543,F82nmpcZMdHtMVsLtAGByPavdN5WuEX1hjNwzs3UFuwq,,,LAB517,vladmsq,,,https://s3.amazonaws.com/keybase_processed_uploads/a7648404e2054a7f4568e0ef6fcaad05_360_360.jpg
2,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,1EWZm7aZYxfZHbyiELXtTgN1yT2vU1HF9d8DWswX2Tp,5,37652834688596,339981346,339981315,True,789722090,782886384,6835706,HG7a8fgjTkQhGFTPukbTdf5FCwxVVjKzkbo6ToNswTXH,37652.83468860,6676977.17532873,HG7a8fgjTkQhGFTPukbTdf5FCwxVVjKzkbo6ToNswTXH,,,clockchain,clockchain,https://clockchainstake.tk,Clockchain technologies,https://s3.amazonaws.com/keybase_processed_uploads/e74fe161942373aa3f96094b4fa70505_360_360.jpg
3,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,1KXvrkPXwkGF6NK1zyzVuJqbXfpenPVPP6hoiK9bsK3,0,278687889297847,339981346,339981315,True,663969118,657132918,6836200,1KXz4xKV2viJCGpxqnQqdf2J45vQr5USdmtcJLTaHkm,278687.88929785,49419723.40918721,1KXz4xKV2viJCGpxqnQqdf2J45vQr5USdmtcJLTaHkm,,,1000X.sh,1000xstake,https://1000x.sh,Stake with the best,https://s3.amazonaws.com/keybase_processed_uploads/a292977de8e5dd1e12d03eb9c26a7e05_360_360.jpg
4,786,339552000,432000,mainnet,2025-05-12T15:30:43.000Z,,,2025-05-14 14:49:45.770789+00:00,1MuaDGhuN7KRqvsupUcYmq9u1YRh1pp38hu1WV2WC6S,0,94598521531858,339981346,339981315,True,606500927,599664944,6835983,4z9rbspUBsnZmTQbWSSPETkXmWHfhzQXXc289Z3m6XcJ,94598.52153186,16775155.82324438,4z9rbspUBsnZmTQbWSSPETkXmWHfhzQXXc289Z3m6XcJ,,,Mercurial Validator,,https://mercurialvalidator.com,,https://arweave.net/KOfFDGNUYYH2uBe_e-5gi3xXHIBp7JwHac02RZBi-TI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6601,766,330912000,432000,mainnet,2025-04-03T00:04:01.000Z,151035887828837.00000000,384502142003974336.00000000,2025-05-14 14:49:45.770789+00:00,Syrd2b19zpvDrSTSkUzRUoPm9xpJkPtfKj77oGDCUHN,0,32206236567,339283336,339283305,True,461129027,460324854,804173,Syrd4L1eGcZdhRGoB9wb4aJKKJKv9gMudZLMnXdV7AR,32.20623657,5711.13193043,Syrd4L1eGcZdhRGoB9wb4aJKKJKv9gMudZLMnXdV7AR,151035.88782884,384502142.00397432,,,,,
6602,746,322272000,432000,mainnet,2025-02-22T04:45:22.000Z,152809126746356.00000000,383879948504408320.00000000,2025-05-14 14:49:45.770789+00:00,9SJdwWQ1YgRdYcQ9NinWUussJNX4MUKdMEy2SpMAfe67,0,1000000,337647080,337647049,True,4472689,4199203,273486,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,0.00100000,0.17733000,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,152809.12674636,383879948.50440830,,,,,
6603,745,321840000,432000,mainnet,2025-02-20T05:14:55.000Z,153015323340118.00000000,384476729594204992.00000000,2025-05-14 14:49:45.770789+00:00,9SJdwWQ1YgRdYcQ9NinWUussJNX4MUKdMEy2SpMAfe67,0,1000000,337647080,337647049,True,4199203,126181,4073022,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,0.00100000,0.17733000,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,153015.32334012,384476729.59420496,,,,,
6604,744,321408000,432000,mainnet,2025-02-18T05:42:44.000Z,153020257790256.00000000,383079108788735808.00000000,2025-05-14 14:49:45.770789+00:00,9SJdwWQ1YgRdYcQ9NinWUussJNX4MUKdMEy2SpMAfe67,0,1000000,337647080,337647049,True,126181,0,126181,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,0.00100000,0.17733000,5fxz3nazxtC3pW119KPV6qEELuBSSGxvo152b4UyW9Lj,153020.25779026,383079108.78873581,,,,,


In [36]:
df_final_sorted = df_final.sort_values(by='epoch', ascending=False).reset_index(drop=True)

df_final_sorted

%store df_final_sorted

Stored 'df_final_sorted' (DataFrame)


  db[ 'autorestore/' + arg ] = obj


In [37]:
import joblib

# Preferred: use .joblib extension
joblib.dump(df_final_sorted, 'df_final_sorted.joblib')

['df_final_sorted.joblib']

In [38]:
# df_final['name'].isna().sum()

In [39]:
df_final_sorted.columns.tolist()

['epoch',
 'starting_slot',
 'slots_in_epoch',
 'network',
 'created_at',
 'total_rewards',
 'total_active_stake',
 'timestamp',
 'nodePubkey',
 'commission',
 'activatedStake',
 'lastVote',
 'rootSlot',
 'epochVoteAccount',
 'credits',
 'previous_credits',
 'credits_earned',
 'votePubkey',
 'activatedStake_SOL',
 'activatedStake_USD',
 'vote_account',
 'total_reward_SOL',
 'total_active_stake_SOL',
 'name',
 'keybase_id',
 'www_url',
 'details',
 'avatar_url']

In [40]:
latest_epoch = df_final_sorted['epoch'].max()

# Filter out the rows with the latest epoch where total_rewards and total_active_stake are NaN
df_cleaned = df_final_sorted[~((df_final_sorted['epoch'] == latest_epoch) & df_final_sorted['total_rewards'].isna() & df_final_sorted['total_active_stake'].isna())].reset_index(drop=True)

df_cleaned.head(n=5)

joblib.dump(df_cleaned, 'df_cleaned.joblib')


['df_cleaned.joblib']