# TOC

## Load imports

### Non pyspark

In [63]:
import time
import numpy as np
import pandas as pd
from fastparquet import ParquetFile, write
import matplotlib.pyplot as plt
import timeit
import dask.dataframe as dd
import holoviews as hv
import datashader as ds
import gc
from os.path import join, dirname
import pyarrow 
import pyarrow.parquet as pq
# PLOT USING HOLOVIEWS DASK AND DATASHADER
import hvplot.pandas
import hvplot.dask
import hvplot as hv
from bokeh.models import HoverTool
from pdb import set_trace
from datetime import datetime, timedelta
import csv
import dateutil.relativedelta

%matplotlib inline

### Pyspark

In [64]:
#UTILS 
import findspark
findspark.init('/usr/local/spark/spark-2.3.2-bin-hadoop2.7')

import pyspark
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import pyspark.sql.functions as f
from pyspark.sql.types import DateType, StringType, IntegerType
from pyspark.sql import *
from pyspark.sql.window import Window

# Build (or reuse) the Spark session that the cells below read data through.
spark = (
    SparkSession.builder
    .appName("Poolminers")
    .getOrCreate()
)

# SparkContext.getOrCreate() takes an optional SparkConf, not a SparkSession.
# The session already owns its context, so read it from there.
sc = spark.sparkContext

## UTILS

In [65]:
# Wall-clock start of the run; later cells subtract it to report elapsed minutes.
t0 = time.time()
# EXPLODE THE TRANSACTION_LIST COLUMN IN AIONV4.BLOCK
def explode_block(df1,col):
    """Turn the string column `col` into one row per '],['-separated entry.

    Steps, all applied to the column named by `col`:
      1. split the string on the literal `],[` and explode to one row per piece;
      2. strip every `[`, `]` and `"` character;
      3. keep the leading 64 characters (the original notebook treats this
         as the transaction hash).

    Parameters
    ----------
    df1 : Spark DataFrame containing `col`.
    col : str, name of the column to explode. It shadows pyspark's `col()`
        inside this function, so columns are referenced through `f.col`.

    Returns
    -------
    Spark DataFrame with `col` replaced by the extracted value.
    """
    # Raw strings keep the regex escapes from being read as string escapes.
    df1 = df1.withColumn(col, explode(split(f.col(col), r'\],\[')))
    # Operate on `col` rather than a hard-coded column name.
    df1 = df1.withColumn(col, regexp_replace(col, r'(\[|\]|")', ''))
    df1 = df1.withColumn(col, df1[col].substr(0, 64))
    return df1

# munge block dataframe
def hex_to_int(x):
    """Parse the hexadecimal string `x` (optional `0x` prefix) into an int."""
    parsed = int(x, base=16)
    return parsed

def munge_block(df1):
    """Prepare the block frame: explode `transaction_list`, parse `difficulty`.

    `transaction_list` is exploded with `explode_block`; `difficulty` is
    converted from a hex string to an integer through a Python UDF.
    """
    # NOTE(review): IntegerType is 32-bit; confirm difficulty values fit.
    to_int = udf(hex_to_int, IntegerType())
    exploded = explode_block(df1, 'transaction_list')
    return exploded.withColumn('difficulty', to_int('difficulty'))

def value_hex_to_int(x):
    """Best-effort hex-to-int conversion: return 0 when `x` is not valid hex.

    `x` is passed through `str()` first, so None and non-string inputs are
    tolerated: they either parse as hex digits or fall back to 0.
    """
    try:
        return int(str(x), 16)
    except (ValueError, TypeError):
        # Only parse failures are treated as 0; KeyboardInterrupt and
        # SystemExit are no longer swallowed as a bare `except:` would.
        return 0

# MAKE LIST OF TIER 1 MINERS
def make_tier1_list(df1,threshold_tx_paid_out=10,threshold_blocks_mined_per_day=1.5):
    """Return the addresses in `df1` that qualify as tier-1 miners.

    An address qualifies when either
      * it occurs as `miner_address` or `to_addr` in `df1` AND its mean count
        of `to_addr` per (`from_addr`, `block_timestamp`) group is at least
        `threshold_tx_paid_out`; or
      * its mean `percent` across (`miner_address`, `block_timestamp`)
        groups is at least `threshold_blocks_mined_per_day`.

    Both candidate lists are printed before being merged.

    Parameters
    ----------
    df1 : Spark DataFrame with columns `miner_address`, `to_addr`,
        `from_addr`, `block_timestamp` and `block_number`.
    threshold_tx_paid_out : minimum mean payout-transaction count.
    threshold_blocks_mined_per_day : minimum mean block share, in percent.

    Returns
    -------
    list of unique addresses (order not defined).
    """
    # Every address seen as a block miner or as a payout recipient.
    miners = {row.miner_address
              for row in df1.select('miner_address').distinct().collect()}
    miners |= {row.to_addr
               for row in df1.select('to_addr').distinct().collect()}

    # Criterion A: mean number of transactions paid out per group.
    payouts = df1.groupBy('from_addr','block_timestamp').agg({'to_addr':'count'})
    payouts = payouts.dropna()
    payouts = payouts.groupBy('from_addr').agg({'count(to_addr)':'mean'})
    payouts = payouts.filter(payouts['avg(count(to_addr))']>=threshold_tx_paid_out)
    payers = {row.from_addr
              for row in payouts.select('from_addr').distinct().collect()}
    # Keep only payers that are also known miners.
    list_a = list(miners & payers)

    # Criterion B: share of blocks mined. This uses `df1`, the frame passed
    # in, not the notebook-level `df`, so callers that pass a filtered frame
    # get results for that frame only.
    # f.sum / f.col are spelled out so the wildcard pyspark import that
    # shadows the builtin `sum` is not required.
    # NOTE(review): the window has no partition, so the denominator is the
    # total over all (miner, day) groups, not the per-day total. Confirm
    # whether Window.partitionBy('block_timestamp') was intended.
    block_count = f.col('count(block_number)')
    shares = df1.groupBy('miner_address','block_timestamp')\
        .agg({'block_number':'count'})\
        .withColumn('percent',
                    100*(block_count/f.sum(block_count).over(Window.partitionBy())))
    shares = shares.groupBy('miner_address').agg({'percent':'mean'})
    shares = shares.filter(shares['avg(percent)']>=threshold_blocks_mined_per_day)
    list_b = [row.miner_address
              for row in shares.select('miner_address').distinct().collect()]

    print(list_a)
    print(list_b)
    # Merge both criteria and drop duplicates.
    tier1_miner_list = list(set(list_a+list_b))
    gc.collect()
    return tier1_miner_list

# MAKE LIST OF ALL MINERS
def make_miner_list(df1):
    """Return the unique non-null `to_addr` and `miner_address` values in `df1`.

    Parameters
    ----------
    df1 : Spark DataFrame with columns `to_addr`, `from_addr`,
        `miner_address` and `block_number`.

    Returns
    -------
    list of unique addresses (order not defined).
    """
    # Payout recipients; dropna() removes the group whose to_addr is null.
    recipients = df1.groupby('to_addr').agg({'from_addr':'count'}).dropna()
    miner_list_1 = [row.to_addr
                    for row in recipients.select('to_addr').distinct().collect()]

    # Block miners. The original assigned the dropna() result to a misspelt
    # name (`d_temp`), so null miner addresses were never removed.
    block_miners = df1.groupBy('miner_address').agg({'block_number':'count'}).dropna()
    miner_list_0 = [row.miner_address
                    for row in block_miners.select('miner_address').distinct().collect()]

    return list(set(miner_list_1 + miner_list_0))


# dateformat = 'yyyy-mm-dd 00:00:00'
# CHANGE INDIVIDUAL DATE TO TIMESTAMP
def date_to_timestamp(date):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to a POSIX timestamp (float).

    The string is parsed into a naive datetime, so `.timestamp()` interprets
    it in the local timezone of the machine running the notebook.
    """
    parsed = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
    return parsed.timestamp()

# CREATE TIMESTAMP COLUMN IN DATETYPE FORMAT GIVEN A SPARK DATAFRAME
def timestamp_to_date(df,col):
    """Replace the unix-time column `col` with its value cast to DateType.

    Parameters
    ----------
    df : Spark DataFrame containing `col`.
    col : str, name of the column to convert. The original ignored this
        argument and always converted 'block_timestamp'.

    Returns
    -------
    Spark DataFrame with `col` overwritten by the date.
    """
    return df.withColumn(col, f.from_unixtime(col).cast(DateType()))

# TRUNCATE SPARK DATAFRAME GIVEN STRING DATES
# dateformat = 'yyyy-mm-dd 00:00:00'
def truncate_dataframe(df1,startdate,enddate,lookback_days=30):
    """Keep rows whose `block_timestamp` lies in [startdate - lookback, enddate].

    After filtering, `block_timestamp` is converted from unix seconds to a
    date via `timestamp_to_date`.

    Parameters
    ----------
    df1 : Spark DataFrame with a numeric `block_timestamp` column.
    startdate, enddate : str in 'YYYY-MM-DD HH:MM:SS' format.
    lookback_days : days of history to keep before `startdate`
        (default 30, the value previously hard-coded).

    Returns
    -------
    The filtered Spark DataFrame.
    """
    start_ts = date_to_timestamp(startdate)
    end_ts = date_to_timestamp(enddate)
    # Clamp an inverted range BEFORE deriving the look-back start. The
    # original clamped afterwards, so the clamp never affected the filter.
    if start_ts > end_ts:
        start_ts = end_ts
    window_start = start_ts - (lookback_days * 24 * 60 * 60)
    df1 = df1.filter((f.col('block_timestamp') >= window_start) &
                     (f.col('block_timestamp') <= end_ts))
    return timestamp_to_date(df1,'block_timestamp')

# UDF FUNCTIONS TO INCLUDE EXTERNAL 
class MyUDFs:
    """Label miner addresses with pool names loaded from a CSV file.

    Attributes
    ----------
    df_poolinfo : pandas DataFrame read by `populate` (None until then).
    dict_poolinfo : dict mapping address -> pool name.
    pool_keys : list of the known addresses (kept for existing callers).
    """

    def __init__(self):
        # Empty defaults so the label helpers also work before populate():
        # unknown addresses simply fall back to their truncated form.
        self.df_poolinfo = None
        self.dict_poolinfo = {}
        self.pool_keys = []

    #DICTIONARY WHEN MATCHING POOLNAME WITH MINER ADDRESS
    def populate(self, path='../data/poolinfo.csv'):
        """Load the address -> pool name mapping from the CSV at `path`.

        The file must provide `address` and `poolname` columns.
        """
        self.df_poolinfo = pd.read_csv(path)
        self.dict_poolinfo = dict(zip(self.df_poolinfo.address,self.df_poolinfo.poolname))
        self.pool_keys = list(self.dict_poolinfo.keys())

    def get_poolname_label(self):
        """Return a Spark UDF `(miner_address, pool_tier) -> label`.

        Known addresses map to their pool name. Unknown tier-1 addresses map
        to their first 10 characters. Everything else maps to 'tier 2'.
        """
        # Bind the dict locally so the closure shipped to the executors does
        # not drag the whole instance (including the pandas frame) with it.
        # A dict lookup also replaces the per-row scan of a list.
        poolinfo = self.dict_poolinfo

        def ab(miner_address,pool_tier):
            if miner_address in poolinfo:
                return poolinfo[miner_address]
            if pool_tier == 1:
                return miner_address[0:10]
            return 'tier 2'
        return udf(ab,StringType())

    def get_poolname_label_list(self,lst):
        """Return a label per address in `lst`: pool name, else first 10 chars."""
        poolinfo = self.dict_poolinfo
        return [poolinfo[address] if address in poolinfo else address[0:10]
                for address in lst]
                




## Load data

In [66]:
# LOAD FROM DATABASE
# The column lists drive the selects below, so the declared and loaded
# schemas cannot drift apart. Previously they were unused and listed
# different columns from the ones actually selected.
block_columns = ['block_timestamp','difficulty','transaction_list',
                 'miner_address','block_number']
tx_columns = ['transaction_hash','block_timestamp','from_addr','to_addr','value']
df_tx = spark.read.parquet('../data/transaction.parquet').select(*tx_columns)
df_block = spark.read.parquet('../data/block.parquet').select(*block_columns)


## Make data warehouse for period (startdate, enddate)

#### ENTER INPUT DATES

In [67]:
# Bounds of the analysis window, formatted 'YYYY-MM-DD HH:MM:SS'.
startdate = '2018-10-01 00:00:00'
enddate = '2018-10-14 00:00:00'
# Plot-title prefix built from the date part (first 10 characters) of each bound.
analysis_period = '{} to {}: '.format(startdate[:10], enddate[:10])

In [68]:
# Restrict both tables to the analysis window, then join blocks to their
# transactions to form the working frame `df`.
df_block_1 = munge_block(
    truncate_dataframe(df_block, startdate, enddate).drop('__index_level_0__')
)

# The transaction-side block_timestamp is dropped after filtering, so the
# joined frame carries only the block-side timestamp.
df_tx_1 = truncate_dataframe(df_tx, startdate, enddate).drop('block_timestamp')
df_tx_1 = df_tx_1.drop('__index_level_0__')

# Left join: blocks with no matching transaction are kept.
df = df_block_1.join(
    df_tx_1,
    df_block_1.transaction_list == df_tx_1.transaction_hash,
    how='left',
).drop('transaction_list')

# Release the intermediate frames.
df_tx.unpersist()
df_tx_1.unpersist()
df_block.unpersist()
df_block_1.unpersist()
gc.collect()

327

### Identify tier 1 addresses

#### SETUP

In [69]:
# Tier-1 cut-offs passed to make_tier1_list:
# minimum mean number of payout transactions for an address ...
threshold_tx_paid_out = 10
# ... and minimum mean share of blocks mined, in percent.
threshold_blocks_mined_per_day = 1 # Percentage

In [70]:
tier1_miner_list = make_tier1_list(df,threshold_tx_paid_out,threshold_blocks_mined_per_day)
# ADD A COLUMN CALLED POOL_TIER: 1 , 2
# The UDF runs once per row, so test membership against a frozenset (O(1))
# rather than scanning the list. The list itself is kept for later cells.
tier1_miner_set = frozenset(tier1_miner_list)
pool_tier_udf = f.udf(lambda miner_address: 1 if
                      miner_address in tier1_miner_set else 2, IntegerType())
df = df.withColumn('pool_tier',pool_tier_udf(df.miner_address))

['a05b4d9de807fef0851bc51676aa989d79b578d9b83181ef17150574701ce9af', 'a06cac75d9211abff36fe2cf9e7fc1741b076cea40324eb597b3957942be483f', 'a0087f6afb456a5d69855594606594e6bc5309cd0b8ad233ba147ea7fca5b3f3', 'a0d2086132f7076d63730e4fdbffbdfbc4e8f02e1e855187d56f7c8d412838b2', 'a099688bb19051b38c846580600812d095f89cfff7abc17ab6c4af63e408f2f1', 'a022a68ef27e5febe4570edb2ce5586974cb326f24fce2ebb23012c07dac90e0', 'a027f5dda9fc518b5465941d84201739a581d0f30b8744eca21136175785c2aa', 'a0a1e55cbbffc99d9dcaf56e5350847267471cd6d69d4dead14953e5e82d97bf', 'a0fde4c8c5b90d55f81970750c80ac7130c269bbbdce5b4ece3cfd10ff7e5f40', 'a08c1cccf49d7ac14d6e0ac19ace643df3174acfa8755ba3723e588420e644fd', 'a0548cbbc4fc8c6b73fb9a915b0fad7d537e36abc05927d0c614ce14545d8621', 'a075e26d87f852324cf55626072c930dce6129bae7defcbccd1bf45063a46ffd', 'a088082e8a56dfa1b0e903ec194048fa97007831789e42ed277dae2c64e9d95e', 'a084bbc3d8f732950c1af3dc9ea9cc9c4bbee7b1ec2a33a193cc67eab5611799', 'a07a07e8965418ed2355ed5b062dd9ec29a099578d7330

### Label pools in dataframe

In [71]:
# Load the pool-name mapping, then add a `poolname` label column derived
# from each row's miner address and pool tier.
myUDF = MyUDFs()
myUDF.populate()
df = df.withColumn(
    'poolname',
    myUDF.get_poolname_label()(df.miner_address, df.pool_tier),
)
    

## Analysis & plots

### Mining addresses and rewards, tier 2

In [None]:
def plot_awards_tier2(df,startdate,enddate):
    """Bar-plot the summed `value` received by each tier-2 address.

    Only rows with `block_timestamp` before `startdate` are used. Tier-2
    addresses are all miners found by `make_miner_list` minus the tier-1
    addresses found by `make_tier1_list` on those rows.

    Parameters
    ----------
    df : Spark DataFrame with `block_timestamp`, `to_addr`, `value` and the
        columns required by `make_tier1_list` / `make_miner_list`.
    startdate : str in 'YYYY-MM-DD HH:MM:SS' format.
    enddate : unused; kept for call compatibility.
    """
    startdate = datetime.strptime(startdate, "%Y-%m-%d %H:%M:%S")
    df_curr = df.filter(f.col('block_timestamp')<startdate)
    tier1_miner_list = make_tier1_list(df_curr,threshold_tx_paid_out,
                                       threshold_blocks_mined_per_day)
    all_miners = make_miner_list(df_curr)
    # Tier 2 = all miners minus tier 1. The original used a symmetric
    # difference (^), which also lets through any tier-1 address missing
    # from `all_miners`.
    tier2_miner_list = list(set(all_miners) - set(tier1_miner_list))
    # filter dataframe to retain only tier2 miner list
    df_curr = df_curr.filter(df_curr.to_addr.isin(tier2_miner_list))

    # Convert the hex `value` to int, then total it per recipient.
    # NOTE(review): IntegerType is 32-bit; confirm the values fit.
    udf_hex_to_int = udf(value_hex_to_int,IntegerType())
    df_curr = df_curr.withColumn('value',udf_hex_to_int('value'))
    df_curr = df_curr.groupby('to_addr').agg({'value':'sum'})
    df_curr = df_curr.dropna()
    # Truncate the address for display. The column is taken from the
    # aggregated frame itself, not from the unaggregated input `df`.
    df_curr = df_curr.withColumn('to_addr',df_curr['to_addr'].substr(0,10))

    # convert small group to pandas for plotting
    df_curr1 = df_curr.toPandas().sort_values(by=['sum(value)'],ascending=False)
    # Leave out the datashade option to get the tooltip to work
    bar = df_curr1.hvplot.bar('to_addr', ['sum(value)'], rot=90,
                             width=4000,height=600,
                             title=analysis_period +'Tier 2- Miners, value')
    hv.show(bar)
    gc.collect()
    
# Render the tier-2 rewards plot, then report the minutes elapsed since t0.
plot_awards_tier2(df,startdate,enddate)
t1 = time.time()
total = t1 - t0
print('time elapsed up til tier 2 mining rewards= {} mins'.format(total/60))

['a05b4d9de807fef0851bc51676aa989d79b578d9b83181ef17150574701ce9af', 'a06cac75d9211abff36fe2cf9e7fc1741b076cea40324eb597b3957942be483f', 'a0d2086132f7076d63730e4fdbffbdfbc4e8f02e1e855187d56f7c8d412838b2', 'a099688bb19051b38c846580600812d095f89cfff7abc17ab6c4af63e408f2f1', 'a022a68ef27e5febe4570edb2ce5586974cb326f24fce2ebb23012c07dac90e0', 'a027f5dda9fc518b5465941d84201739a581d0f30b8744eca21136175785c2aa', 'a0a1e55cbbffc99d9dcaf56e5350847267471cd6d69d4dead14953e5e82d97bf', 'a0fde4c8c5b90d55f81970750c80ac7130c269bbbdce5b4ece3cfd10ff7e5f40', 'a08c1cccf49d7ac14d6e0ac19ace643df3174acfa8755ba3723e588420e644fd', 'a0548cbbc4fc8c6b73fb9a915b0fad7d537e36abc05927d0c614ce14545d8621', 'a075e26d87f852324cf55626072c930dce6129bae7defcbccd1bf45063a46ffd', 'a088082e8a56dfa1b0e903ec194048fa97007831789e42ed277dae2c64e9d95e', 'a084bbc3d8f732950c1af3dc9ea9cc9c4bbee7b1ec2a33a193cc67eab5611799', 'a07a07e8965418ed2355ed5b062dd9ec29a099578d7330c206666abcb0b9aab9', 'a00983f07c11ee9160a64dd3ba3dc3d1f88332a2869f25

### Bar graphs of blocks mined over the period, by pool name

In [None]:
def plot_miners(df,startdate):
    """Plot per-pool block counts and percentages from `startdate` onwards.

    Rows with `block_timestamp` >= `startdate` are grouped by `poolname`.
    Two bar charts are shown: the raw `count(block_number)` and its share
    (percent) of the total across all pools.

    Parameters
    ----------
    df : Spark DataFrame with `block_timestamp`, `poolname`, `block_number`.
    startdate : str in 'YYYY-MM-DD HH:MM:SS' format.
    """
    # only plot requested period
    startdate = datetime.strptime(startdate, "%Y-%m-%d %H:%M:%S")
    df_temp = df.filter(f.col('block_timestamp')>= startdate)

    # f.sum / f.col are spelled out so the wildcard pyspark import that
    # shadows the builtin `sum` is not required.
    # NOTE(review): if `df` holds one row per transaction rather than per
    # block, this counts rows, not distinct blocks. Confirm.
    block_count = f.col('count(block_number)')
    df_temp = df_temp.groupby('poolname').agg({'block_number':'count'})\
        .withColumn('percent',
                    100*(block_count/f.sum(block_count).over(Window.partitionBy())))
    # convert small group to pandas for plotting
    df_temp = df_temp.toPandas().sort_values(by=['percent'],ascending=False)
    # Leave out the datashade option to get the tooltip to work

    bar = df_temp.hvplot.bar('poolname', ['count(block_number)'], rot=90,
                             subplots=True, shared_axes=False,
                             width=800,height=400,
                             title=analysis_period +'Miners, blockcount')
    bar_perc = df_temp.hvplot.bar('poolname', ['percent'], rot=90,
                                  subplots=True, shared_axes=False,
                                  width=800,height=400,
                                  title=analysis_period +'Miners by %')

    # The unused HoverTool (and its commented-out hookup) was removed: it
    # was never attached to either plot.

    # display plot
    hv.show(bar)
    hv.show(bar_perc)
    gc.collect()


### Difficulty

#### SETUP

In [None]:
# Bounds of the analysis window, formatted 'YYYY-MM-DD HH:MM:SS'.
startdate = '2018-10-01 00:00:00'
enddate = '2018-10-14 00:00:00'
# Plot-title prefix built from the date part (first 10 characters) of each bound.
analysis_period = '{} to {}: '.format(startdate[:10], enddate[:10])

In [None]:
def plot_difficulty(df,startdate):
    """Line-plot `difficulty` against `block_timestamp` from `startdate` on.

    Parameters
    ----------
    df : Spark DataFrame with `block_timestamp` and `difficulty` columns.
    startdate : str in 'YYYY-MM-DD HH:MM:SS' format.
    """
    period_start = datetime.strptime(startdate, "%Y-%m-%d %H:%M:%S")
    in_period = f.col('block_timestamp') >= period_start

    # Pull only the two plotted columns into pandas, ordered by time.
    difficulty_pdf = (
        df.filter(in_period)
        .select('block_timestamp', 'difficulty')
        .toPandas()
        .sort_values(by=['block_timestamp'])
    )
    line = difficulty_pdf.hvplot.line(
        x='block_timestamp', y='difficulty', rot=90,
        width=800, height=400,
        title=analysis_period + 'Difficulty',
    )
    hv.show(line)
    del difficulty_pdf
    gc.collect()
  

### Daily Activity Miners - blocks

In [None]:
def plot_active_miners(df,startdate):
    """Line-plot, per pool name, the block count for each `block_timestamp`.

    Parameters
    ----------
    df : Spark DataFrame with `block_timestamp`, `poolname`, `block_number`.
    startdate : str in 'YYYY-MM-DD HH:MM:SS' format; earlier rows are excluded.
    """
    period_start = datetime.strptime(startdate, "%Y-%m-%d %H:%M:%S")
    in_period = f.col('block_timestamp') >= period_start

    # One count per (pool, timestamp) group, moved to pandas ordered by time.
    counts_pdf = (
        df.filter(in_period)
        .groupby('poolname', 'block_timestamp')
        .agg({'block_number': 'count'})
        .toPandas()
        .sort_values(by=['block_timestamp'])
    )
    lines = counts_pdf.hvplot.line(
        x='block_timestamp', y='count(block_number)', rot=90,
        by='poolname', width=800, height=600,
        title=analysis_period + 'pools blocks mined daily',
    )
    hv.show(lines)
    del counts_pdf
    gc.collect()


### Retention

In [None]:
def pool_retention(df,startdate):
    """Print retained, dropped and new miners around `startdate`.

    Rows before `startdate` form the previous period; rows from `startdate`
    on form the period under observation. Tier-1 results are printed as
    pool labels and all-miner results as counts.

    Parameters
    ----------
    df : Spark DataFrame accepted by `make_tier1_list` / `make_miner_list`.
    startdate : str in 'YYYY-MM-DD HH:MM:SS' format.
    """
    cutoff = datetime.strptime(startdate, "%Y-%m-%d %H:%M:%S")

    # Previous period: tier-1 list and full miner list.
    before = df.filter(f.col('block_timestamp') < cutoff)
    t1_prev = make_tier1_list(before, threshold_tx_paid_out,
                              threshold_blocks_mined_per_day)
    all_prev = make_miner_list(before)
    # NOTE(review): the current-period tier-1 list is the notebook-level
    # `tier1_miner_list`, not one recomputed from the rows at/after
    # `startdate`. Confirm this is intended.
    t1_now = tier1_miner_list
    labeller = MyUDFs()
    labeller.populate()
    before.unpersist()

    # Period under observation: full miner list only.
    during = df.filter(f.col('block_timestamp') >= cutoff)
    all_now = make_miner_list(during)

    # Retained = in both periods; dropped = previous only; new = current only.
    retained = list(set(t1_prev) & set(t1_now))
    retained_all = len(set(all_prev) & set(all_now))
    dropped = np.setdiff1d(t1_prev, t1_now)
    dropped_all = len(np.setdiff1d(all_prev, all_now))
    new = np.setdiff1d(t1_now, t1_prev)
    new_all = len(np.setdiff1d(all_now, all_prev))

    tier1_report = (
        ("T1 POOLS RETAINED:{}", retained),
        ("T1 POOLS DROPPED:{}", dropped),
        ("NEW T1 POOLS:{}", new),
    )
    for template, addresses in tier1_report:
        print(template.format(labeller.get_poolname_label_list(addresses)))
    print("\n---------------------------------------------")
    all_miner_report = (
        ("ALL MINERS RETAINED:{}", retained_all),
        ("ALL MINERS DROPPED:{}", dropped_all),
        ("ALL NEW MINERS:{}", new_all),
    )
    for template, count in all_miner_report:
        print(template.format(count))

    during.unpersist()
    gc.collect()
# Print the retention report for the configured analysis window.
pool_retention(df,startdate)

### DISPLAY PLOTS/DATA

In [None]:
# Render the per-pool, difficulty and daily-activity plots for rows from startdate on.
plot_miners(df,startdate)
plot_difficulty(df,startdate)
plot_active_miners(df,startdate)


In [None]:
# Report the total wall-clock minutes since t0 was recorded.
t1 = time.time()
total = t1 - t0
# Fixed the "elaped" typo in the printed message.
print('time elapsed = {} mins'.format(total/60))