In [2]:
import requests
import pandas as pd
import numpy as np
import random
from api_encoder import get_raw_data
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Import Scraped Addresses

In [3]:
df = pd.read_csv('./Data/merged.csv')

In [4]:
scraped_df = pd.read_csv('./Data/scraped/non_null_addresses_816.csv')
scraped_df['address'] = scraped_df['address'].astype(str)
scraped_addresses = list(scraped_df['address'].values)
display(scraped_df.head())
print(scraped_df.dtypes)


# filter out addresses that already exist in the original dataset
existing_addresses = df['Address'].unique()
scraped_addresses = [add for add in scraped_addresses if add not in existing_addresses]

Unnamed: 0,id,csdbId,name,status,category,subcategory,address
0,Csdb__ScamDomains__e4b99f,e4b99f,myelherwallel.com,,Phishing,MyEtherWallet,0xD0cC2B24980CBCCA47EF755Da88B220a82291407
1,Csdb__ScamDomains__a3ec5b,a3ec5b,etherswap.org,,Phishing,Ethereum,0x4cdc1cba0aeb5539f2e0ba158281e67e0e54a9b1
2,Csdb__ScamDomains__91a008,91a008,xn--mytherwallet-fvb.com,,Phishing,MyEtherWallet,0x00e01a648ff41346cdeb873182383333d2184dd1
3,Csdb__ScamDomains__82a7f6,82a7f6,myethwallet.net,,Phishing,MyEtherWallet,0x00e01A648Ff41346CDeB873182383333D2184dd1
4,Csdb__ScamDomains__1224a2,1224a2,district-0x.io,,Fake ICO,district0x,0x240e125c20a4cC84Bd6E7F8D1FD07Aff4c06D43d


id              object
csdbId          object
name            object
status         float64
category        object
subcategory     object
address         object
dtype: object


# Fetch All Scraped Data

In [5]:
import datetime
from datetime import timedelta
import numpy as np

def create_Avg_min_between_received_tnx(data):
    # dependent on sent_or_receive, timestamp
    if no_received: return [0]
    else: 
        datetimes = pd.to_datetime(data[received_cond]['timeStamp'], unit='s')
        if len(datetimes) == 1:
            return [0]
        else:
            datetimes_diff = [datetimes.iloc[i] - datetimes.iloc[i - 1] for i in range(1, len(datetimes))]
            minutes_list = [td.total_seconds() / 60 for td in datetimes_diff]
            average_minutes = round(sum(minutes_list) / len(minutes_list), 2)
            return [average_minutes]

def create_Avg_min_between_sent_tnx(data):
    # dependent on sent_or_receive, timestamp
    if no_sent: return [0]
    else: 
        datetimes = pd.to_datetime(data[sent_cond]['timeStamp'], unit='s')
        if len(datetimes) == 1:
            return [0]
        else:
            datetimes_diff = [datetimes.iloc[i] - datetimes.iloc[i - 1] for i in range(1, len(datetimes))]
            minutes_list = [td.total_seconds() / 60 for td in datetimes_diff]
            average_minutes = round(sum(minutes_list) / len(minutes_list), 2)
            return [average_minutes]

def create_Sent_tnx(data):
    # dependent on sent_or_receive
    if no_sent: return [0]
    else: return [data[sent_cond].shape[0]]

def create_Received_Tnx(data):
    # dependent on sent_or_receive
    if no_sent: return [len(data)]
    else: return [len(data) - sum(sent_cond)]

def create_Number_of_Created_Contracts(data):
    # dependent on contractAddress
    return [sum(data['contractAddress'] != '')]

def create_Average_of_Unique_Received_From_Addresses(data):
    # dependent on sent_or_receive
    if no_received: return [0]
    else: return [len(data[received_cond]['from'].unique())]

def create_Average_of_Unique_Sent_To_Addresses(data):
    # dependent on sent_or_receive
    if no_sent: return [0]
    else: return [len(data[sent_cond]['to'].unique())]

def create_min_max_avg_value_received(data):
    # dependent on sent_or_receive, eth_value
    if no_received: 
        min_value_received, max_value_received, avg_val_received = [0], [0], [0]
        return min_value_received, max_value_received, avg_val_received
    else: 
        try: min_value_received = [min(data[received_cond]['eth_value'])]
        except: print('Error occurred at creating min_value_received')
        try: max_value_received = [max(data[received_cond]['eth_value'])]
        except: print('Error occurred at creating max_value_received')
        try: avg_val_received = [data[received_cond]['eth_value'].mean()]
        except: print('Error occurred at creating avg_val_received')
        return min_value_received, max_value_received, avg_val_received

    
def create_min_max_avg_value_sent(data):
    # dependent on sent_or_receive, eth_value
    if no_sent: 
        min_value_sent, max_value_sent, avg_val_sent = [0],[0],[0]
        return min_value_sent, max_value_sent, avg_val_sent
    else: 
        try: min_value_sent = [min(data[sent_cond]['eth_value'])]
        except: print('Error occurred at creating min_value_sent')
        try: max_value_sent = [max(data[sent_cond]['eth_value'])]
        except: print('Error occurred at creating max_value_sent')
        try: avg_val_sent = [data[sent_cond]['eth_value'].mean()]
        except: print('Error occurred at creating avg_val_sent')
        return min_value_sent, max_value_sent, avg_val_sent


def create_total_transactions_including_tnx_to_create_contract(data):
    return [data.shape[0]]
    
def create_total_Ether_sent(data):
    # dependent on sent_or_receive, eth_value
    if no_sent: return [0]
    else: return [round(data[sent_cond]['eth_value'].sum(), 6)]


def create_total_ether_received(data):
    # dependent on sent_or_receive, eth_value
    if no_received: return [0]
    else: return [round(data[received_cond]['eth_value'].sum(), 6)]

def create_total_ether_balance(data):
    return [round(data[received_cond]['eth_value'].sum() - data[sent_cond]['eth_value'].sum(), 6)]
    
''' ######################################################################################## '''

def reconstruct(address, data):

    new_df = pd.DataFrame()
    
    # create address feature
    new_df['Address'] = [address]
    # create target feature
    new_df['Flag'] = [1]
    # create other features
    try: new_df['Avg_min_between_received_tnx'] = create_Avg_min_between_received_tnx(data=data)
    except: print('Error occurred when creating Avg_min_between_received_tnx')

    try: new_df['Avg_min_between_sent_tnx'] = create_Avg_min_between_sent_tnx(data=data)
    except: print('Error occurred when creating Avg_min_between_sent_tnx')

    try: new_df['Sent_tnx'] = create_Sent_tnx(data=data)
    except: print('Error occurred when creating Sent_tnx')

    try: new_df['Received_Tnx'] = create_Received_Tnx(data=data)
    except: print('Error occurred when creating Received_Tnx')

    try: new_df['Number_of_Created_Contracts'] = create_Number_of_Created_Contracts(data=data)
    except: print('Error occurred when creating Number_of_Created_Contracts')

    try: new_df['Average_of_Unique_Received_From_Addresses'] = create_Average_of_Unique_Received_From_Addresses(data=data)
    except: print('Error occurred when creating Average_of_Unique_Received_From_Addresses')
    
    try: new_df['Average_of_Unique_Sent_To_Addresses'] = create_Average_of_Unique_Sent_To_Addresses(data=data)
    except: print('Error occurred when creating Average_of_Unique_Sent_To_Addresses')

    try: new_df['min_value_received'], new_df['max_value_received'], new_df['avg_value_received'] = create_min_max_avg_value_received(data=data)
    except: print('Error occurred when creating min_max_avg_value_received')

    try: new_df['min_value_sent'], new_df['max_value_sent'], new_df['avg_value_sent'] = create_min_max_avg_value_sent(data=data)
    except: print('Error occurred when creating min_max_avg_value_sent')

    try: new_df['total_transactions_including_tnx_to_create_contract'] = create_total_transactions_including_tnx_to_create_contract(data=data)
    except: print('Error occurred when creating total_transactions_including_tnx_to_create_contract')

    try: new_df['total_Ether_sent'] = create_total_Ether_sent(data=data)
    except: print('Error occurred when creating total_Ether_sent')

    try: new_df['total_ether_received'] = create_total_ether_received(data=data)
    except: print('Error occurred when creating total_ether_received')

    try: new_df['total_ether_balance'] = create_total_ether_balance(data=data)
    except: print('Error occurred when creating total_ether_balance')

    return new_df

In [10]:
#####

# test a few samples

successful_data = []
no_data = []
timeout_data = []
sample_dfs = []

scraped_dfs = pd.DataFrame()
for i, addr in tqdm(enumerate(scraped_addresses), total=len(scraped_addresses)): 
    try: 
        sample = pd.DataFrame(get_raw_data(address=addr))
        if sample.empty:
            # save samples without data
            no_data.append(i) 
            continue
        else:
            # save samples original data
            sample_dfs.append(sample)
            
            # reconstruct data
            addr_lc, addr_uc = addr.lower(), addr.upper()

            sample['sent_or_receive'] = ((sample['from'] == addr_lc) | (sample['from'] == addr_uc)).map({True: 'sent', False: 'received'})
            sample['eth_value'] = sample['value'].astype(float) / (10**18)
            received_cond = sample['sent_or_receive']=='received'
            sent_cond = sample['sent_or_receive']=='sent'
            no_received = sample[sample['sent_or_receive']=='received'].empty
            no_sent = sample[sample['sent_or_receive']=='sent'].empty

            try: sample = reconstruct(addr, sample)
            except: print(f'Error occurred at sample {i}:\n')
            scraped_dfs = pd.concat([scraped_dfs, sample], axis=0)

            # save successful samples
            successful_data.append(i)
            
    except requests.Timeout: 
        # save samples that take too long
        timeout_data.append(i)
        print(f'Fetching data for sample {i} took more than 1 minute.')
        continue
    except Exception as e:
        print(f"Error fetching data for sample {i}: {e}")

print('#'*50)
print('Number of samples successfully transformed: ', len(successful_data))
print('Number of samples without any data: ', len(no_data))
print('Number of samples that were timed out: ', len(timeout_data))
unsuccessful_data = [i for i in range(len(scraped_addresses)) if (i not in successful_data) & (i not in no_data ) & (i not in timeout_data)]
print('Number of samples that were unsuccessful: ', len(unsuccessful_data))
# for i in range(len(sample_df)): print(f'Dimensions of Sample {i}: ', sample_df[i].shape)

# sample_df = pd.concat(sample_df, axis=0)
# display(sample_df.head())
# print(sample_df.shape)

  1%|▏         | 10/679 [04:39<10:15:58, 55.24s/it]

Error fetching data for sample 9: HTTPSConnectionPool(host='api.etherscan.io', port=443): Read timed out.


  3%|▎         | 21/679 [06:01<3:56:35, 21.57s/it] 

Fetching data for sample 20 took more than 1 minute.


  6%|▌         | 41/679 [07:54<3:33:01, 20.03s/it]

Fetching data for sample 40 took more than 1 minute.


  8%|▊         | 54/679 [09:34<3:27:41, 19.94s/it]

Fetching data for sample 53 took more than 1 minute.


 21%|██        | 142/679 [16:00<3:22:51, 22.67s/it]

Fetching data for sample 141 took more than 1 minute.


 21%|██        | 144/679 [17:05<4:30:28, 30.33s/it]

Fetching data for sample 143 took more than 1 minute.


 23%|██▎       | 158/679 [20:05<4:06:53, 28.43s/it]

Fetching data for sample 157 took more than 1 minute.


 25%|██▌       | 170/679 [22:00<3:33:02, 25.11s/it]

Fetching data for sample 169 took more than 1 minute.


 25%|██▌       | 173/679 [23:03<3:48:59, 27.15s/it]

Fetching data for sample 172 took more than 1 minute.


 33%|███▎      | 224/679 [26:52<2:24:21, 19.04s/it]

Fetching data for sample 223 took more than 1 minute.


 37%|███▋      | 250/679 [30:09<2:38:45, 22.20s/it]

Fetching data for sample 249 took more than 1 minute.


 39%|███▊      | 262/679 [32:49<2:20:33, 20.22s/it]

Fetching data for sample 261 took more than 1 minute.


 39%|███▊      | 263/679 [33:50<3:45:34, 32.53s/it]

Fetching data for sample 262 took more than 1 minute.


 41%|████      | 275/679 [35:09<2:11:52, 19.59s/it]

Fetching data for sample 274 took more than 1 minute.


 44%|████▎     | 297/679 [37:51<2:32:29, 23.95s/it]

Fetching data for sample 296 took more than 1 minute.


 49%|████▊     | 330/679 [40:53<1:52:55, 19.41s/it]

Fetching data for sample 329 took more than 1 minute.


 49%|████▉     | 334/679 [41:11<54:52,  9.54s/it]  

Error fetching data for sample 333: DataFrame constructor not properly called!


 52%|█████▏    | 352/679 [43:16<1:59:53, 22.00s/it]

Fetching data for sample 351 took more than 1 minute.


 55%|█████▌    | 374/679 [46:46<2:05:52, 24.76s/it]

Fetching data for sample 373 took more than 1 minute.


 56%|█████▋    | 382/679 [47:14<18:44,  3.79s/it]  

Error fetching data for sample 381: DataFrame constructor not properly called!


 57%|█████▋    | 388/679 [48:52<2:08:56, 26.59s/it]

Fetching data for sample 387 took more than 1 minute.


 58%|█████▊    | 393/679 [50:05<1:58:04, 24.77s/it]

Fetching data for sample 392 took more than 1 minute.


 65%|██████▍   | 440/679 [55:40<2:47:47, 42.12s/it]

Error fetching data for sample 439: HTTPSConnectionPool(host='api.etherscan.io', port=443): Read timed out.


 70%|██████▉   | 473/679 [1:01:59<1:10:02, 20.40s/it]

Fetching data for sample 472 took more than 1 minute.


 73%|███████▎  | 494/679 [1:04:54<1:21:23, 26.40s/it]

Fetching data for sample 493 took more than 1 minute.


 73%|███████▎  | 499/679 [1:06:00<1:09:36, 23.21s/it]

Fetching data for sample 498 took more than 1 minute.


 93%|█████████▎| 629/679 [1:16:02<23:34, 28.29s/it]  

Fetching data for sample 628 took more than 1 minute.


 95%|█████████▍| 642/679 [1:18:14<13:42, 22.24s/it]

Fetching data for sample 641 took more than 1 minute.


 95%|█████████▍| 643/679 [1:19:15<20:23, 33.99s/it]

Fetching data for sample 642 took more than 1 minute.


 99%|█████████▉| 672/679 [1:22:14<02:45, 23.61s/it]

Fetching data for sample 671 took more than 1 minute.


100%|██████████| 679/679 [1:22:32<00:00,  7.29s/it]

##################################################
Number of samples successfully transformed:  523
Number of samples without any data:  126
Number of samples that were timed out:  26
Number of samples that were unsuccessful:  4





In [23]:
sample_dfs = pd.concat(sample_dfs)
print(scraped_dfs.shape)
print(sample_dfs.shape)

(523, 19)
(21057, 22)


In [20]:
scraped_dfs.to_csv('new_samples_aggregated.csv')
sample_dfs.to_csv('new_samples_all.csv')