In [1]:
import hashlib
import hmac
import csv
import numpy as np
import pandas as pd
import joblib
from joblib import Parallel, delayed
from multiprocessing import  Pool
import multiprocessing
import itertools
import random

In [2]:
from operator import itemgetter
import logging, sys

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
_DEBUG = False
KAGGLE = False

In [5]:
# logging.basicConfig( #stream=sys.stdout, 
#                     level=logging.DEBUG)

In [6]:
# logging.debug('A debug message!')

In [7]:
pd.options.display.max_rows=500
pd.options.display.max_columns=500
pd.set_option('display.max_colwidth', None)

In [8]:
import time
from multiprocessing import Value
from ctypes import py_object

In [9]:
def init_pool_processes(shared_value):
    global p
    p = shared_value
def parallelize_dataframe(df, func, data_to_share=None,other_args=None, n_cores=4):
    start = time.time()
    df_split = np.array_split(df, n_cores)
    
    if data_to_share is not None:
        p = Value(py_object)
        p.value = data_to_share        
        pool = Pool(processes=n_cores,
                    initializer=init_pool_processes, initargs=(p,))
    else:
        pool = Pool(n_cores)
        
    arg_list=[]
    arg_list.append(df_split)
            
    if other_args is not None:
        for arg in other_args:
            arg_list.append(itertools.repeat(arg))
    
    parallel_args = zip(*arg_list)
    end = time.time()
    print(f'Parallel preparation time: {end-start}')
    df = pd.concat(pool.starmap(func, parallel_args))
    pool.close()
    pool.join()
    return df

In [10]:
def debugprint(msg):
    if _DEBUG:
        print(msg)

In [11]:
def get_bin(bins,val):
    matching_bin=-1
    for i in range(len(bins)-1):
        if bins[i] <= val < bins[i+1]:
            matching_bin= bins[i]
    if matching_bin==-1:
        matching_bin=bins[-1]
    return matching_bin

In [12]:
def calculate_roll(server_seed,client_seed,nonce):
    # Compute the HMAC-SHA512 hash of string1 using string2 as the key
    
    string1 = f"{nonce}:{server_seed}:{nonce}"
    hmac_key =   f"{nonce}:{client_seed}:{nonce}".encode()
    hmac_hash = hmac.new(key=hmac_key,
                         msg=string1.encode(),
                         digestmod=hashlib.sha512).hexdigest()

    # Convert the first 8 characters of the HMAC-SHA512 hash to an integer
    string3 = hmac_hash[:8]
    number = int(string3, 16)

    # Compute the roll value
    roll = round(number / 429496.7295)
    
    return roll

In [13]:
def compute_roll_hash_arrays(server_seed_array,client_seed_array,nonce_array):
    # Vectorize the function
    vectorized_calculate_roll = np.vectorize(calculate_roll)

    # Compute the roll values for the input arrays
    roll_array = vectorized_calculate_roll(server_seed_array,
                                           client_seed_array,
                                           nonce_array)
    
    # Compute roll for hash using the hash which is the next element in array
    # Vectorize the function
    vectorized_calculate_roll_hash = np.vectorize(calculate_roll)

    # Compute the roll values for the input arrays
    roll_array_hash = vectorized_calculate_roll_hash(server_seed_array[1:],
                                           client_seed_array[:chain_length-1],
                                           nonce_array[:chain_length-1])
    return roll_array,roll_array_hash

    

In [14]:
# trans_36 = bytes((x ^ 0x36) for x in range(256))
# trans_36

In [15]:
chain_length = 5110101 #1000001

filename = "sha256_hashchain_b9556671f785fe935bee087665b4047e421ea4491a5e2021a8152cab0b74c953.npy"

if KAGGLE:
    filepath_client ="/kaggle/input/hash-file-generation-client-seed/"
else:
    filepath_client="data/"
    
# filename_client=f'{filepath_client}sha256_hashchain_client_5M.npy'
filename_client=f'{filepath_client}sha256_hashchain_client1.npy'


# Define the number of CPU cores to use
num_cores = 8

In [16]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [17]:
# server_seed_array=np.load(filename)
# print(len(server_seed_array))
# server_seed_array[0:10]

In [18]:
client_seed_array=np.load(filename_client,
                         allow_pickle=True,fix_imports=True,encoding='latin1')
client_seed_array= client_seed_array[:chain_length]
print(len(client_seed_array))
client_seed_array[0:10]

1010102


array(['3455dda4b3aecaa36d4687277766a079feebbb4ab01dc038bc8fb8a36ddad6aa',
       '57eea50e4484ba102d776506baf4b1dc0dcb3fc74fe2ab56b10b0f427cb6409e',
       '3e49b34f2d6c7116d92ac8818c94f87d5616d7f2aea5024c2f38413605dc321c',
       'a0731d74c9501894d16ba144bc10dd49e8439c7f88304bfdbe4a755870c46da3',
       '665122519e5835c8e7ebfa0ab5b1949d75ac2c9e3dc1a6965a4b6e26bfc4ed76',
       '87fa02b24749184f0ce9183f21715ce556db72ecd1ad4f32b02227b40f3b24ab',
       'edee47aed4d9b598fa647145a17e44f9d155dc53ad231c913ff5c93fc04e56ee',
       'baf9d66be4136fe8acd30c042704b1798380fe0f3ee45d5800ae01db88e03ed3',
       '8eac9ebdcbfcf1720f6b0317fa66569c3129d278c0e2605069e7c5a0a2179c8d',
       'aeab33d31e0fb10105ce7ad75e3a40736db70a824c3ed6489957d6736243184b'],
      dtype='<U64')

In [19]:
# str_cat ='pppppp'
# # str_cat ='ppppppppppppppp'
# print(len(str_cat))
# client_seed_array_1=(str_cat + pd.Series(client_seed_array)).values
# client_seed_array_1[:2]

In [20]:
# str_cat ='ppppppppppppppp'
str_cat ='woxpwoxpwoxpwoxpwoxp'
print(len(str_cat))
client_seed_array_2=(str_cat + pd.Series(client_seed_array)).values
client_seed_array_2[:2]

20


array(['woxpwoxpwoxpwoxpwoxp3455dda4b3aecaa36d4687277766a079feebbb4ab01dc038bc8fb8a36ddad6aa',
       'woxpwoxpwoxpwoxpwoxp57eea50e4484ba102d776506baf4b1dc0dcb3fc74fe2ab56b10b0f427cb6409e'],
      dtype=object)

In [21]:
np.random.seed(5000)
low = 1500
# nonce_array = np.random.randint(low, high=low+chain_length, 
#                                 size=chain_length)

nonce_array = np.arange(low,low+chain_length,1)
np.random.shuffle(nonce_array)
print(len(nonce_array))
print(nonce_array[:10])
pd.Series(nonce_array).nunique()

5110101
[ 638661 1297273 1057802 1410000 2886373 4139846 2635511 4657935 2336816
  640280]


5110101

In [22]:
# roll_array,roll_array_hash=compute_roll_hash_arrays(server_seed_array,
#                                        client_seed_array,
#                                        nonce_array)
# print(len(roll_array[0:10]))
# print(roll_array[0:100])
# print(len(roll_array_hash[0:10]))
# print(roll_array_hash[0:100])

In [23]:
# roll_array_1,roll_array_hash_1=compute_roll_hash_arrays(server_seed_array,
#                                        client_seed_array_1,
#                                        nonce_array)
# print(len(roll_array_1[0:10]))
# print(roll_array_1[0:100])
# print(len(roll_array_hash_1[0:10]))
# print(roll_array_hash_1[0:100])

In [24]:
# roll_array_2,roll_array_hash_2=compute_roll_hash_arrays(server_seed_array,
#                                        client_seed_array_2,
#                                        nonce_array)
# print(len(roll_array_2[0:10]))
# print(roll_array_2[0:100])
# print(len(roll_array_hash_2[0:10]))
# print(roll_array_hash_2[0:100])

In [25]:
# # Vectorize the function
# vectorized_calculate_roll = np.vectorize(calculate_roll)

# # Compute the roll values for the input arrays
# roll_array = vectorized_calculate_roll(server_seed_array,
#                                        client_seed_array,
#                                        nonce_array)

# # Print the output
# print(len(roll_array[0:10]))
# print(roll_array[0:100])

In [26]:
# # Vectorize the function
# vectorized_calculate_roll_hash = np.vectorize(calculate_roll)

# # Compute the roll values for the input arrays
# roll_array_hash = vectorized_calculate_roll_hash(server_seed_array[1:],
#                                        client_seed_array[:chain_length-1],
#                                        nonce_array[:chain_length-1])

# # Print the output
# print(len(roll_array_hash[0:10]))
# print(roll_array_hash[0:100])

In [46]:
ABOVE="Above"
BELOW="Below"
INTER ="Intermediate"

In [27]:
# Define a function to generate a single hash in the hash chain
def generate_hash(message):
    # Hash the message using SHA-256
    hash_obj = hashlib.sha256(message)

    # Get the hexadecimal representation of the hash
    hash_hex = hash_obj.hexdigest()

    # Return the hash
    return hash_hex

In [28]:
def generate_hash_chain(seed,chain_length):
    hash_list=[seed]
    message = seed.encode()
    for i in range(chain_length):
        hash_hex= generate_hash(message)
        # Write the hash to the CSV file
        hash_list.append(hash_hex)
        # Update the message with the current hash
        message = hash_hex.encode()
    return hash_list

def compute_multirolls(hash_list,client_seed,nonce):
    rolls=[]
    for cur_hash in hash_list:
        roll = calculate_roll(cur_hash,client_seed,nonce)
        rolls.append(roll)
    return rolls
def compute_multirolls_nonce(server_hash,client_seed,hash_list_nonce):
    rolls=[]
    for cur_hash_nonce in hash_list_nonce:
        roll = calculate_roll(server_hash,client_seed,cur_hash_nonce)
        rolls.append(roll)
    return rolls

In [29]:
def predict_digit_pattern(server_hash,roll_hash,nonce,client_seed,
                        match_digit_arr,match_digit_indices,
                        mismatch_digit_arr=None,mismatch_digit_indices=None):

#     rolls_list  = compute_multirolls(hash_list_server,client_seed,nonce)
    match = True
    
#     roll_first = rolls_list[0]
    factors = np.array([10000,1000,100,10])
    factors_filt = factors[match_digit_indices]
    for i,val in enumerate(factors_filt):
        match = match & (int((roll_hash % val) / (val / 10))==match_digit_arr[i])
        if not match: 
            break

    if mismatch_digit_arr is not None:
        factors_filt = factors[mismatch_digit_indices]
        for i,val in enumerate(factors_filt):
            match = match & (int((roll_hash % val) / (val / 10))!=mismatch_digit_arr[i])
            if not match: 
                break            

    return match
 

def predict_digit_output(server_hash,nonce,client_seed,
                         match_digit_arr,match_digit_count_arr,
                        hash_list_server,match_digit_indices=None):

    rolls_list  = compute_multirolls(hash_list_server,client_seed,nonce)
    match = True
    
    #eliminate zero from all roll_hash
#     for roll in rolls_list:
#         str_roll = f'{roll:04d}'
#         cur_count = str_roll.count('0')
#         if cur_count!=0:
#             match=False
            
    for roll,match_digit,match_count,match_indices \
            in zip(rolls_list,match_digit_arr,match_digit_count_arr,match_digit_indices):
        str_roll = f'{roll:04d}'
        
        if match_digit_indices is None:
            str_roll_indexed = str_roll
        else:
            str_roll_indexed = ''
            for index in match_indices:
                str_roll_indexed = str_roll_indexed + str_roll[int(index)]
                
        cur_count = str_roll_indexed.count(match_digit)
        if cur_count!=match_count:
            match=False
            
    return match, rolls_list  

In [30]:
# test_file_random ='/kaggle/input/hash-generate-random-seeds/test_data_random.csv'
# test_seeds_2=pd.read_csv(test_file_random)
# print(test_seeds_2.shape)
# test_seeds_2.head()

In [31]:
# #create special client seed pattern array

# def get_hexval(val):
#     hexdata='abcdef'
#     if val >=10:
#         return hexdata[val-10]
#     else:
#         return str(val)

# str_spl_client = 'woxpwoxpwoxpwoxpwoxp066103c1b2a6ebe01cf30afd49a6b931278793fc457dee84510f03e11779d5be'
# spl_client_list=[]
# for index in range(20,20+64):
#     for val in range(16):
#         spl_client_list.append(str_spl_client[:index]+get_hexval(val)+str_spl_client[index+1:])
# spl_client_array = np.array(spl_client_list)
# print(len(spl_client_array))
# spl_client_array[1020:]

##### Machine Learning

Generate Data

In [32]:
def generate_data(test,client_seed_data,pass_state_reqd,
                    match_digit_arr,match_digit_count_arr,
                    mismatch_digit_arr=None,mismatch_digit_indices=None,
                    feature_chain_length=20,
                    is_data_hash=False,
                    match_digit_indices=None,
                    match_count_exp=1,
                    test_limit = None,
                    last_index=None,result_df=None,
                    print_client_scan=False,
                    hash_list_nonce=None):
    i = len(client_seed_data)-1 if last_index is None else last_index - 1
    last_success_index =i
    client_size = len(client_seed_data)
    client_scan_fresh=True 
    

    if result_df is None:
        result_df = pd.DataFrame(columns=['match','seed','hash','nonce',
                                    'client_seed','client_index',
                                    'passed_state',
                                    'roll_actual','roll_hash_list','roll_hash_list_nonce',
                                         ])
    result_success=True
    dictionary_list=[]
    for server_count,(server_index,row) in enumerate(test.iterrows()):
        
        if (server_count % 5)==0:
            print(f'server_count:{server_count}')
        
        if (test_limit is not None) and (server_count >= test_limit):
            break
        
        #for large volume of data, it is recommended to set i to entire lengthof client seeds
        #for every server seed
#         i = len(client_seed_data)-1 if last_index is None else last_index - 1
        
        cur_seed = row['seed']
        cur_nonce = row['nonce']
        cur_hash= row['hash']
        
       
        if is_data_hash:
            cur_hash_list = list(test[server_count+1:server_count+1+feature_chain_length+1]['seed'])
        else:
            cur_hash_list = generate_hash_chain(cur_hash,feature_chain_length)
        
          
        fullclientscan=False
        match = False
        prev_match=""
        match_count=0
        
        client_start = i
        while match_count < match_count_exp and not fullclientscan:
            
            if print_client_scan  & (i % 1000==0) :
                print(f'current scan : {i}')
            
            client_seed = client_seed_data[i]
            client_index = i

            roll_seed_actual=calculate_roll(cur_seed,client_seed,cur_nonce)
            roll_hash=calculate_roll(cur_hash,client_seed,cur_nonce)
            
            match = predict_digit_pattern(cur_hash,roll_hash,cur_nonce,client_seed,
                        match_digit_arr,match_digit_indices,
                        mismatch_digit_arr=mismatch_digit_arr,
                        mismatch_digit_indices=mismatch_digit_indices)
        
            # match, rolls_list = predict_digit_output(cur_hash,cur_nonce,client_seed,
            #              match_digit_arr,match_digit_count_arr,
            #             cur_hash_list,match_digit_indices=match_digit_indices)
            i-=1
            
            if i==client_start and not match:
                print()
                print(i,client_start)
                print(f'No further match for seed:{cur_seed} nonce:{cur_nonce} match count: {match_count}')
                print()
                fullclientscan=True
            if (i<0):
                i = len(client_seed_data)-1
       
            if match:
                
#                 print(f'Matched for seed:{cur_seed} i:{i} match count: {match_count}')
                rolls_list  = compute_multirolls(cur_hash_list,client_seed,cur_nonce)
                if hash_list_nonce is None:
                    hash_list_nonce = generate_hash_chain(str(cur_nonce),feature_chain_length)

                rolls_list_nonce  = compute_multirolls_nonce(cur_hash,client_seed,hash_list_nonce)

                match_count += 1
                
                result_success = ((pass_state_reqd==ABOVE) & (roll_seed_actual>5250))  \
                                | ((pass_state_reqd==BELOW) & (roll_seed_actual<4750))
                if result_success:
                    last_success_index= i+1
#                 result = [result_success,cur_seed, cur_hash, cur_nonce,
#                                                client_seed,client_index,pass_state_reqd,
#                                                roll_seed_actual,rolls_list,rolls_list_nonce]
                

                dictionary_data = {'match': result_success,'seed':cur_seed, 
                                   'hash': cur_hash, 'nonce':cur_nonce,
                                    'client_seed':client_seed,'client_index':client_index,
                                      'passed_state':pass_state_reqd,
                                     'roll_actual':roll_seed_actual,
                                   'roll_hash_list':rolls_list,'roll_hash_list_nonce':rolls_list_nonce
                                  }
                dictionary_list.append(dictionary_data)
                
#                 result_df.loc[len(result_df.index)] = result
#                 print(f'server_count: {server_count} current result: {result_success, cur_nonce,pass_state_reqd,roll_seed_actual,roll_hash}')
    
    result_df = pd.DataFrame.from_dict(dictionary_list)

    return result_df


In [87]:
hashval = 'a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd'#'3e24f29a3ae70e54aedd86b0c68640050be7dace8ae679f1fb85d1325a56ddae'
seed = hashval #'c77b3e783f094e255b0693f89094bed95ad9738d86f44fa4c09c8b0d58e2c73e'
nonce = 7007 #2390

feature_chain_length=60
train_hash_size = 2158#121
train_client_size = 2000 
test_client_size = 10000

In [50]:
cur_hash_list = generate_hash_chain(hashval,train_hash_size)

train_data = pd.DataFrame(columns=['seed','hash','nonce'])
start=24
train_data['seed']=np.array(cur_hash_list[start:-1])
train_data['hash']=np.array(cur_hash_list)[start+1:]
train_data['nonce'] = nonce

print(len(train_data))
train_data.head()

2134


Unnamed: 0,seed,hash,nonce
0,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,7007
1,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,78392e401b0996ce99afaf8ca4dc08457a11af594810880cee87866d863f7400,7007
2,78392e401b0996ce99afaf8ca4dc08457a11af594810880cee87866d863f7400,27a8f3d3af9f07dae5691046573a7bd39748297676a32915cbf396d964a9472a,7007
3,27a8f3d3af9f07dae5691046573a7bd39748297676a32915cbf396d964a9472a,e86606a181f4989df87783084f7e9e7c51d8a21fd4a3abea3300b4f0c4d0065c,7007
4,e86606a181f4989df87783084f7e9e7c51d8a21fd4a3abea3300b4f0c4d0065c,317a0ca95eb93e9233d8e2933dcc7db7a05a5aaf97fe8928f0f34ec42dddba1e,7007


In [51]:
PATTERN_99=False
HIGH_ANALYSIS=False

cur_digit = 9
cur_zero_digit = 0

In [52]:
READ_FROM_FILE=False

In [53]:
%%time
# match_digit_arr = np.array(['999',])
# match_digit_count_arr=np.array([1])
# match_digit_indices = np.array(['012'])

if PATTERN_99:
    match_digit_arr=np.array([9,9])
    mismatch_digit_arr=np.array([9,9])
else:
    match_digit_arr=np.array([0,0])
    mismatch_digit_arr=np.array([0,0])
    
match_digit_indices = np.array([1,2])
match_digit_count_arr=np.array([1])
mismatch_digit_indices = np.array([0,3])

CPU times: user 19 µs, sys: 2 µs, total: 21 µs
Wall time: 24.6 µs


In [54]:
%%time

if not READ_FROM_FILE:

    cur_client_seed_array = client_seed_array_2[:1000001]

    results_df = None

    last_index =  len(cur_client_seed_array)-1 #1024
    start = 0

    hash_list_nonce = generate_hash_chain(str(nonce),feature_chain_length)

    results_df = generate_data(train_data,
                                cur_client_seed_array,
                                 ABOVE,
                        match_digit_arr,match_digit_count_arr,match_digit_indices=match_digit_indices,
                        mismatch_digit_arr=mismatch_digit_arr,mismatch_digit_indices=mismatch_digit_indices,
                        match_count_exp=train_client_size,
                        feature_chain_length=feature_chain_length,
                        is_data_hash = True,
                        test_limit=38,
                        last_index=last_index,result_df=results_df,
                        hash_list_nonce=hash_list_nonce)
    row = results_df.iloc[len(results_df)-1]
    last_index = row['client_index']

server_count:0
server_count:5
server_count:10
server_count:15
server_count:20
server_count:25
server_count:30
server_count:35
CPU times: user 1.82 s, sys: 3 ms, total: 1.82 s
Wall time: 1.82 s


In [None]:
# last_index
# results_df['client_index'].min()

In [None]:
# results_df.head()

In [None]:
# print(len(results_df))
# results_df['client_seed'].nunique()

In [56]:
def save_cleaned_results_df(results_df,is_test,
                            file_suffix="",cleaned_suffix = True):
    nonce_suffix = "_nonce"
    rolls_df = results_df['roll_hash_list'].apply(pd.Series)
    rolls_df.columns = [f'roll_{i}' for i in range(rolls_df.shape[1])]
    rolls_df_nonce = results_df['roll_hash_list_nonce'].apply(pd.Series)
    rolls_df_nonce.columns = [f'roll_{i}{nonce_suffix}' for i in range(rolls_df.shape[1])]
    results_df_cleaned = pd.concat([results_df,rolls_df,rolls_df_nonce],axis=1)
    results_df_cleaned=results_df_cleaned.drop(['roll_hash_list','roll_hash_list_nonce'],axis=1)
    if cleaned_suffix:
        cleaned_suffix='cleaned_'
    else:
        cleaned_suffix=''
    if is_test:
        filename = f'data/{cleaned_suffix}results_df_test_{nonce}_{file_pattern_str}_pattern{file_suffix}.csv'
    else:
        filename = f'data/{cleaned_suffix}results_df_{nonce}_Large_{file_pattern_str}_pattern{file_suffix}.csv'
    results_df_cleaned.to_csv(filename,index=False)
    return results_df_cleaned

In [57]:
%%time
if PATTERN_99:
    file_pattern_str = 'x99x'
else:
    file_pattern_str = 'x00x'
filename = f'data/results_df_{nonce}_Large_{file_pattern_str}_pattern.csv'
if READ_FROM_FILE:
    if OLD_VERSION_READ:
        results_df = pd.read_csv(filename, converters={'roll_hash_list': pd.eval,
                                                     'roll_hash_list_nonce': pd.eval})
    else:
        results_df = pd.read_csv(filename)

if not(READ_FROM_FILE) or OLD_VERSION_READ:
    results_df=save_cleaned_results_df(results_df,False,
                            file_suffix="",cleaned_suffix = False)
#     results_df.to_csv(filename,index=False)

CPU times: user 136 ms, sys: 12.4 ms, total: 149 ms
Wall time: 139 ms


In [58]:
results_df.head()

Unnamed: 0,match,seed,hash,nonce,client_seed,client_index,passed_state,roll_actual,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,roll_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce
0,False,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,7007,woxpwoxpwoxpwoxpwoxp69d3b01bbbd6361eb8baf80a8bbdf5f0c68edadfa8a18131b575012e5c6a1510,999982,Above,2272,5005,7124,9893,5681,5161,2518,979,3342,2578,9667,1275,3295,6452,6318,5607,6253,6706,7819,2998,7211,5845,7607,2407,1455,4995,3065,2192,3251,9647,7510,1009,8812,9277,8866,904,5611,2684,1055,9905,3875,9253,5881,4045,3520,8149,7486,6098,7092,8321,7958,565,1326,872,9180,516,3219,9773,7657,9996,1143,8360,5005,5121,5206,7751,7065,2827,8745,3999,4888,7905,701,826,3245,9194,9111,8047,3656,2918,6241,6212,6128,9555,2444,6596,1038,3531,4749,7980,4116,7425,9093,2482,4841,4410,1942,463,7933,2103,3531,7252,3151,1055,1492,2867,2122,2677,6268,7663,8770,5248,7368,5717,4353,7674,7134,7933,937,6355,1895,8641,563
1,False,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,7007,woxpwoxpwoxpwoxpwoxp202c0e4fbe87ff2f8c654b6bd55bd52beb3424d1811ef5b7e76bef454a8e1be1,999962,Above,4776,5005,9051,5582,8835,1371,7660,9389,3635,6310,7337,709,8231,1731,4332,8215,7724,2822,6720,8810,8204,6031,5440,3244,4965,1665,1425,5399,8972,1521,5545,6921,1642,2600,3240,4035,1862,7588,9296,9593,3741,2142,8334,4744,1304,2068,8562,1859,603,3757,9477,6951,2643,1194,6933,4387,3611,580,4728,707,3367,6286,5005,4182,4734,3728,3429,9868,3262,8322,4548,6662,9874,5451,3407,529,8262,7974,8269,3413,9745,8972,7474,2863,203,7115,2808,1347,2808,5669,2296,1868,4283,522,3367,8797,892,4549,4193,3912,9626,5650,5642,8738,6967,7196,2446,5146,7719,6286,5753,7937,4542,4607,2252,2820,5825,534,228,7641,3999,8832,1367
2,False,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,7007,woxpwoxpwoxpwoxpwoxp02357e72710abf0d3bc73468cc68e236d83585b36b3adca07347a5b57b54340f,999704,Above,998,6004,8749,3174,1708,5409,6620,9976,2935,5119,721,8276,7633,2648,1072,1154,1067,1182,5913,2685,2635,7095,7385,6939,7074,696,433,1508,9053,8060,2758,6881,4491,4260,4916,1016,8616,2526,1936,2022,8857,3100,7823,680,4212,7989,3477,6714,5690,7515,1215,1000,208,6931,8578,272,349,2048,9267,5741,9846,78,6004,6922,9156,6948,6351,8540,6908,8644,5234,9514,7627,2475,9772,9268,4057,7522,8339,8406,98,3294,990,6377,6294,4464,6094,4483,1851,7267,5798,2073,7201,1611,6802,2219,5244,918,7962,2595,9166,6304,260,651,260,4435,2263,5141,9777,7941,6264,5336,3874,4709,4424,4183,4953,4477,5193,8333,6110,2290,6611
3,False,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,7007,woxpwoxpwoxpwoxpwoxp0f5f99c257ae9505a2b9dc976623fffe6e9f0414a4a14b97a21222cec7a563eb,999637,Above,1967,8009,2006,4186,9824,4831,1209,4958,8699,9865,8538,9846,3220,8681,498,2318,8073,3105,829,4726,6678,3636,4882,2409,3901,7279,9238,5585,5610,7781,1097,7156,338,869,9875,8742,265,1498,1575,9038,7863,1537,3026,904,4134,2647,2971,4939,4348,3511,7576,2125,7145,4680,684,7307,5279,1767,4688,4876,9227,8697,8009,7234,7601,4558,118,4694,5708,7727,1600,9880,409,6247,5201,6973,1175,1412,9406,8850,7544,2901,5183,9206,5883,4686,6046,11,453,6642,5166,8165,8766,2859,2765,9184,4748,7909,3239,83,9534,1491,1069,4311,2419,8772,9013,430,4822,7920,9584,5741,7853,6409,1878,6417,4903,6880,812,7341,6346,7209,3649
4,True,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,7007,woxpwoxpwoxpwoxpwoxpe02014a70eca59940e99cc4bce6d97285b419cda77ca2253c32fa03f72397701,999581,Above,9624,2003,9632,8536,2287,7451,5413,237,7343,6053,4871,3865,1711,3873,2963,1101,7966,3601,9667,5697,894,9125,3063,2169,6330,6376,2157,8131,8403,8212,4285,404,815,3478,2195,5194,5856,8170,3232,6364,4830,1786,942,7882,8599,1895,1016,8378,3600,3995,208,3405,9966,2458,4382,5177,785,5304,7139,7846,7513,3680,2003,9912,2389,2319,7294,2515,3916,972,7694,7393,7336,4451,507,2436,9449,8167,9437,2182,1726,654,906,7393,5568,7005,2341,1982,2216,3209,5900,4452,5790,4412,8276,46,1033,156,2664,2461,2998,8222,5776,9340,8324,4421,3519,3271,5254,7268,9300,2822,5917,7169,1666,8229,6901,3312,6292,6308,7472,3898,3484


In [59]:
HIGH_TARGET= 9000 #8944
LOW_TARGET = 1000 #1056
ZERO_COL_CUTOFF = 4

In [60]:
def create_target_mask(data,bHighCheck=HIGH_ANALYSIS):
    if bHighCheck:
        mask = data['roll_actual'] > HIGH_TARGET
    else:
        mask = data['roll_actual'] < LOW_TARGET   
    return mask

Prediction

In [64]:
def gen_test_data(cur_nonce,cur_hash,cur_seed,file_suffix=""):
    test_data = pd.DataFrame(columns=['seed','hash','nonce'])
    test_data.loc[0,'seed']=cur_seed
    test_data.loc[0,'hash']=cur_hash
    test_data.loc[0,'nonce'] = cur_nonce

    print(len(test_data))
    print(test_data.head())
    
    if PATTERN_99:
        match_digit_arr=np.array([9,9])
        mismatch_digit_arr=np.array([9,9])
    else:
        match_digit_arr=np.array([0,0])
        mismatch_digit_arr=np.array([0,0])

    match_digit_indices = np.array([1,2])
    match_digit_count_arr=np.array([1])
    mismatch_digit_indices = np.array([0,3]) 
    
    if PATTERN_99:
        file_pattern_str = 'x99x'
    else:
        file_pattern_str = 'x00x'
        
    filename = f'data/results_df_test_{nonce}_{file_pattern_str}_pattern{file_suffix}.csv'
    
    if  READ_FROM_FILE_TEST:
        results_df_test = pd.read_csv(filename)
#         results_df_test = pd.read_csv(filename, converters={'roll_hash_list': pd.eval,
#                                                         'roll_hash_list_nonce': pd.eval})
    else:

        cur_client_seed_array = client_seed_array_2[:1000001]
        results_df_test = None

        last_index_test =  len(cur_client_seed_array)-1 #1024
        start = 0

        results_df_test = generate_data(test_data,
                                    cur_client_seed_array,
                                     ABOVE,
                            match_digit_arr,match_digit_count_arr,match_digit_indices=match_digit_indices,
                            mismatch_digit_arr=mismatch_digit_arr,mismatch_digit_indices=mismatch_digit_indices,
                            match_count_exp=test_client_size, #train_client_size,
                            feature_chain_length=feature_chain_length,
                            is_data_hash = False,
                            test_limit=1,
                            last_index=last_index_test,result_df=results_df_test,
                            print_client_scan=False)
        #Remove client seed duplicates from test data
        results_df_test= results_df_test.drop_duplicates(subset='client_seed')
        
    if not(READ_FROM_FILE_TEST) or OLD_VERSION_READ:
        results_df_test=save_cleaned_results_df(results_df_test,True,
                            file_suffix=file_suffix,cleaned_suffix= False)

#         results_df_test.to_csv(filename,index=False)
        
    print('Test Data Generation Completed')    
        
#         row = results_df_test.iloc[len(results_df_test)-1]
#         last_index_test = row['client_index']   

    return results_df_test
 

In [65]:
%%time
READ_FROM_FILE_TEST=False
# seed='dummy'
seed = generate_hash_chain(hashval,26)[25]
print('seed:',seed)
results_df_test=gen_test_data(nonce,hashval,seed,file_suffix="")
print()
print(results_df_test[['seed','hash','nonce']].head(1))

seed: d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be
1
                                                               seed  \
0  d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be   

                                                               hash nonce  
0  a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd  7007  
server_count:0
Test Data Generation Completed

                                                               seed  \
0  d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be   

                                                               hash  nonce  
0  a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd   7007  
CPU times: user 375 ms, sys: 0 ns, total: 375 ms
Wall time: 373 ms


In [66]:
%%time
READ_FROM_FILE_TEST=False
#Generate hash test data
print(f'{hashval=}')
hash_l2=generate_hash(hashval.encode())
print(f'{hash_l2=}')
results_df_test_hash=gen_test_data(nonce,hash_l2,hashval,file_suffix="_hash")
print()
print(results_df_test_hash.head(1))

hashval='a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd'
hash_l2='b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194'
1
                                                               seed  \
0  a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd   

                                                               hash nonce  
0  b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194  7007  
server_count:0
Test Data Generation Completed

   match                                                              seed  \
0   True  a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd   

                                                               hash  nonce  \
0  b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194   7007   

                                                                            client_seed  \
0  woxpwoxpwoxpwoxpwoxp520e38ca4d0cc8342c73ea7c5d4f25f3bc7e9db3ae087cb464b1cd9d049f6d2c   

   client

In [None]:
# %%time
# READ_FROM_FILE_TEST=False
# cur_seed = cur_hash_list[25]
# cur_hash = cur_hash_list[26]
# print(f'{cur_seed=}')
# print(f'{cur_hash=}')

# results_df_test_hash25=gen_test_data(nonce,cur_hash,cur_seed,file_suffix="_hash25")
# print()
# print(results_df_test_hash25.head(1))

In [None]:
# %%time
# READ_FROM_FILE_TEST=False
# cur_seed = cur_hash_list[24]
# cur_hash = cur_hash_list[25]
# print(f'{cur_seed=}')
# print(f'{cur_hash=}')

# results_df_test_hash24=gen_test_data(nonce,cur_hash,cur_seed,file_suffix="_hash24")
# print()
# print(results_df_test_hash24.head(1))


In [None]:
# %%time
# results_df_test=save_cleaned_results_df(results_df_test,True)
# # results_df_test_hash=save_cleaned_results_df(results_df_test_hash,True)
# results_df_test_hash_cleaned=save_cleaned_results_df(results_df_test_hash,True,'_hash')
# # results_df_test_cleaned=save_cleaned_results_df(results_df_test_hash25,True)
# # results_df_test_cleaned=save_cleaned_results_df(results_df_test_hash24,True)

##### Feature Generation

In [67]:
def generate_roll_features_k(rolls_df,roll_start,roll_end,suffix="",stat_suffix=""):
    first_k_roll_cols = [f'roll_{i}{suffix}' for i in range(roll_start,roll_end+1)]
    rolls_df[f'count_gt_9000_k{stat_suffix}'] = (rolls_df[first_k_roll_cols] >= 9000).sum(axis=1)
    rolls_df[f'count_lt_1000_k{stat_suffix}'] = (rolls_df[first_k_roll_cols] < 1000).sum(axis=1)
    
    return rolls_df
    
def generate_roll_features(rolls_df,
                           roll_start=1, roll_end=20,k=4,
                          suffix="",stat_suffix=""):
    cols_roll = [f'roll_{i}{suffix}' for i in range(roll_start,roll_end+1)]
#     print(cols_roll)
    
    if (roll_start==1) & (roll_end==20):
        stat_suffix = suffix
    else:
        stat_suffix = f'_{roll_start}_{roll_end}{suffix}'
    
    rolls_df[f'count_lt_1000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==0).sum(axis=1)
    rolls_df[f'count_gt_9000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==9).sum(axis=1)
    rolls_df[f'count_gt_8000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==8).sum(axis=1)
    rolls_df[f'count_gt_7000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==7).sum(axis=1)
    rolls_df[f'count_gt_6000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==6).sum(axis=1)
    rolls_df[f'count_gt_5000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==5).sum(axis=1)
    rolls_df[f'count_gt_4000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==4).sum(axis=1)
    rolls_df[f'count_gt_3000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==3).sum(axis=1)
    rolls_df[f'count_gt_2000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==2).sum(axis=1)
    rolls_df[f'count_gt_1000{stat_suffix}'] = ( (rolls_df[cols_roll] / 1000).astype('int')==1).sum(axis=1)
    
    rolls_df[f'count_hi{stat_suffix}'] = (rolls_df[cols_roll] > 5250).sum(axis=1)
    rolls_df[f'count_lo{stat_suffix}'] = (rolls_df[cols_roll] < 4750).sum(axis=1)    
    rolls_df[f'count_lo_hi{stat_suffix}'] = rolls_df[f'count_hi{stat_suffix}'] \
                                            + rolls_df[f'count_lo{stat_suffix}']   


    rolls_df=generate_roll_features_k(rolls_df,roll_start,roll_start+k,
                                      suffix=suffix,stat_suffix=stat_suffix)
    
    roll_list_strs = np.full(len(rolls_df),"")
    for col in cols_roll:
        roll_list_strs = rolls_df[col].astype('str') + roll_list_strs
    
    rolls_df[f'total_dig_9{stat_suffix}']= roll_list_strs.str.count('9')
    rolls_df[f'total_dig_0{stat_suffix}']= roll_list_strs.str.count('0')
    
    rolls_df[f'roll_mean{stat_suffix}'] = (rolls_df[cols_roll]).mean(axis=1)
    rolls_df[f'roll_std{stat_suffix}'] = (rolls_df[cols_roll]).std(axis=1)
     
    print(f'Roll Features from {roll_start} to {roll_end} {suffix} completed' )
    
    return rolls_df

#Modification: 
#1. supported both raw roll_hash_list and already generated roll columns if present
#2. added client index
def generate_features_full(initial_df,istrain,feature_chain_length):
    
    nonce_suffix = '_nonce'
    if 'roll_1' in list(initial_df.columns):
        print('roll columns present')
        rolls_cols = [f'roll_{i}' for i in range(feature_chain_length+1)] 
        rolls_df = initial_df[rolls_cols]
        rolls_cols_nonce = [f'roll_{i}{nonce_suffix}' for i in range(feature_chain_length+1)] 
        rolls_df_nonce = initial_df[rolls_cols_nonce]
    else:
        print('roll columns not present')
        rolls_df = initial_df['roll_hash_list'].apply(pd.Series)
        rolls_df.columns = [f'roll_{i}' for i in range(rolls_df.shape[1])]    
        rolls_df_nonce = initial_df['roll_hash_list_nonce'].apply(pd.Series)
        rolls_df_nonce.columns = [f'roll_{i}{nonce_suffix}' for i in range(rolls_df_nonce.shape[1])] 
    
    #generate top 20 hash features
    rolls_df=generate_roll_features(rolls_df,roll_start=1,
                                    roll_end=20, k=4)
    rolls_df_nonce=generate_roll_features(rolls_df_nonce,
                                    suffix=nonce_suffix,roll_start=1,
                                    roll_end=20 , k=4)
    
    #generate 25's hash features
    roll_start =25
    roll_end = 50
    rolls_df=generate_roll_features(rolls_df,
                                    roll_start=roll_start,
                                    roll_end=roll_end, k=5)
    rolls_df_nonce=generate_roll_features(rolls_df_nonce,
                                    suffix=nonce_suffix ,
                                    roll_start=roll_start,
                                    roll_end=roll_end, k=5)  

    #generate 50's hash features
    roll_start =50
    roll_end = 60
    rolls_df=generate_roll_features(rolls_df,
                                    roll_start=roll_start,
                                    roll_end=roll_end, k=5)
    rolls_df_nonce=generate_roll_features(rolls_df_nonce,
                                    suffix=nonce_suffix ,roll_start=roll_start,
                                    roll_end=roll_end, k=5)      
    #generate for full hash features
    roll_start =1
    roll_end = feature_chain_length
    rolls_df=generate_roll_features(rolls_df,
                                    roll_start=roll_start,
                                    roll_end=roll_end, k=5)
    rolls_df_nonce=generate_roll_features(rolls_df_nonce,
                                    suffix=nonce_suffix ,roll_start=roll_start,
                                    roll_end=roll_end, k=5)        
    if istrain:
        features_df = pd.concat([initial_df[['roll_actual','seed',
                                             'hash','client_seed','client_index']],
                                 rolls_df,
                                 rolls_df_nonce],axis=1)
        mask = create_target_mask(features_df)
            
        features_df['target'] =0
        features_df.loc[mask,'target'] = 1
        features_df['roll_actual']=features_df['roll_actual'].astype('int')
        print(features_df['target'].value_counts())
    else:
        features_df = pd.concat([initial_df[['roll_actual','seed',
                                             'hash','client_seed','client_index']],
                                             rolls_df,rolls_df_nonce],axis=1)
    return features_df


In [68]:
%%time
# train = generate_features_full(results_df[train_client_size:],True,feature_chain_length)
train = generate_features_full(results_df,True,feature_chain_length)
print(len(train))
train.head()

roll columns present
Roll Features from 1 to 20  completed
Roll Features from 1 to 20 _nonce completed
Roll Features from 25 to 50  completed
Roll Features from 25 to 50 _nonce completed
Roll Features from 50 to 60  completed
Roll Features from 50 to 60 _nonce completed
Roll Features from 1 to 60  completed
Roll Features from 1 to 60 _nonce completed
0    482
1     50
Name: target, dtype: int64
532
CPU times: user 1.6 s, sys: 56.1 ms, total: 1.66 s
Wall time: 1.65 s


Unnamed: 0,roll_actual,seed,hash,client_seed,client_index,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,roll_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_nonce,count_lt_1000_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce,target
0,2272,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,woxpwoxpwoxpwoxpwoxp69d3b01bbbd6361eb8baf80a8bbdf5f0c68edadfa8a18131b575012e5c6a1510,999982,5005,7124,9893,5681,5161,2518,979,3342,2578,9667,1275,3295,6452,6318,5607,6253,6706,7819,2998,7211,5845,7607,2407,1455,4995,3065,2192,3251,9647,7510,1009,8812,9277,8866,904,5611,2684,1055,9905,3875,9253,5881,4045,3520,8149,7486,6098,7092,8321,7958,565,1326,872,9180,516,3219,9773,7657,9996,1143,8360,1,2,0,3,4,4,0,2,3,1,12,7,19,1,0,9,2,5336.1,2538.568057,2,4,4,4,1,2,1,4,2,2,15,11,26,1,0,12,11,5616.576923,3116.996775,3,3,1,1,0,0,0,1,0,2,5,6,11,1,3,6,2,4782.454545,4140.110611,5,9,5,9,5,6,2,7,6,6,33,25,58,1,1,29,17,5354.316667,3048.447254,5005,5121,5206,7751,7065,2827,8745,3999,4888,7905,701,826,3245,9194,9111,8047,3656,2918,6241,6212,6128,9555,2444,6596,1038,3531,4749,7980,4116,7425,9093,2482,4841,4410,1942,463,7933,2103,3531,7252,3151,1055,1492,2867,2122,2677,6268,7663,8770,5248,7368,5717,4353,7674,7134,7933,937,6355,1895,8641,563,2,2,2,3,3,2,1,3,2,0,10,7,17,0,0,8,5,5489.3,2600.423793,1,1,1,6,1,1,4,3,5,3,9,15,24,1,0,7,6,4635.846154,2607.990011,2,0,1,4,1,1,1,0,0,1,7,4,11,0,0,3,0,5324.545455,2943.078061,5,4,4,12,6,4,6,6,8,5,27,28,55,0,0,20,12,5019.216667,2713.526139,0
1,4776,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,woxpwoxpwoxpwoxpwoxp202c0e4fbe87ff2f8c654b6bd55bd52beb3424d1811ef5b7e76bef454a8e1be1,999962,5005,9051,5582,8835,1371,7660,9389,3635,6310,7337,709,8231,1731,4332,8215,7724,2822,6720,8810,8204,6031,5440,3244,4965,1665,1425,5399,8972,1521,5545,6921,1642,2600,3240,4035,1862,7588,9296,9593,3741,2142,8334,4744,1304,2068,8562,1859,603,3757,9477,6951,2643,1194,6933,4387,3611,580,4728,707,3367,6286,1,2,5,3,3,1,1,1,1,2,14,6,20,1,0,4,8,6134.95,2767.590297,1,3,3,1,2,2,2,3,3,6,11,15,26,0,0,11,7,4737.730769,3037.457062,2,0,0,0,3,0,2,2,1,1,3,8,11,0,0,3,2,3762.454545,2353.785817,4,5,8,4,7,4,6,7,5,10,28,31,59,2,0,18,18,4927.166667,2870.839747,5005,4182,4734,3728,3429,9868,3262,8322,4548,6662,9874,5451,3407,529,8262,7974,8269,3413,9745,8972,7474,2863,203,7115,2808,1347,2808,5669,2296,1868,4283,522,3367,8797,892,4549,4193,3912,9626,5650,5642,8738,6967,7196,2446,5146,7719,6286,5753,7937,4542,4607,2252,2820,5825,534,228,7641,3999,8832,1367,1,3,4,2,1,1,3,5,0,0,11,9,20,1,0,8,1,6105.25,2759.711615,2,1,2,3,2,5,4,2,3,2,12,13,25,0,0,12,2,4928.884615,2545.924827,2,0,1,1,0,1,2,1,2,1,3,8,11,0,1,3,2,3877.0,2793.801317,6,4,7,7,3,7,8,8,7,3,27,32,59,1,0,23,7,5022.5,2791.792253,0
2,998,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,woxpwoxpwoxpwoxpwoxp02357e72710abf0d3bc73468cc68e236d83585b36b3adca07347a5b57b54340f,999704,6004,8749,3174,1708,5409,6620,9976,2935,5119,721,8276,7633,2648,1072,1154,1067,1182,5913,2685,2635,7095,7385,6939,7074,696,433,1508,9053,8060,2758,6881,4491,4260,4916,1016,8616,2526,1936,2022,8857,3100,7823,680,4212,7989,3477,6714,5690,7515,1215,1000,208,6931,8578,272,349,2048,9267,5741,9846,78,1,1,2,2,1,3,0,1,4,5,8,11,19,0,0,8,6,4288.55,2972.524436,2,1,3,3,2,1,4,2,3,5,10,15,25,1,1,7,14,4490.307692,2911.007163,4,2,1,0,1,1,0,0,1,1,5,6,11,0,3,4,5,4028.909091,4053.235435,8,4,6,7,5,5,4,3,8,10,26,32,58,1,0,22,23,4465.516667,3109.315476,6004,6922,9156,6948,6351,8540,6908,8644,5234,9514,7627,2475,9772,9268,4057,7522,8339,8406,98,3294,990,6377,6294,4464,6094,4483,1851,7267,5798,2073,7201,1611,6802,2219,5244,918,7962,2595,9166,6304,260,651,260,4435,2263,5141,9777,7941,6264,5336,3874,4709,4424,4183,4953,4477,5193,8333,6110,2290,6611,2,4,4,2,4,1,1,1,1,0,14,5,19,1,0,12,5,6503.25,2888.790111,4,2,0,4,3,4,2,1,4,2,11,13,24,0,0,8,6,4526.769231,2845.894043,0,0,1,0,2,1,5,1,1,0,3,6,9,0,0,4,3,5014.272727,1579.334612,6,6,5,6,12,6,9,2,6,2,31,24,55,1,0,26,15,5371.216667,2707.543305,1
3,1967,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,woxpwoxpwoxpwoxpwoxp0f5f99c257ae9505a2b9dc976623fffe6e9f0414a4a14b97a21222cec7a563eb,999637,8009,2006,4186,9824,4831,1209,4958,8699,9865,8538,9846,3220,8681,498,2318,8073,3105,829,4726,6678,3636,4882,2409,3901,7279,9238,5585,5610,7781,1097,7156,338,869,9875,8742,265,1498,1575,9038,7863,1537,3026,904,4134,2647,2971,4939,4348,3511,7576,2125,7145,4680,684,7307,5279,1767,4688,4876,9227,8697,2,3,4,0,1,0,4,3,2,1,8,10,18,1,0,9,6,5286.3,3231.216051,4,3,1,4,0,2,3,2,3,4,10,15,25,1,0,10,5,4394.153846,3129.530708,1,1,1,2,0,1,3,0,1,1,5,5,10,0,1,3,2,5134.090909,2808.906529,7,7,6,7,1,3,11,6,6,6,24,31,55,1,0,25,15,4879.916667,3018.309816,8009,7234,7601,4558,118,4694,5708,7727,1600,9880,409,6247,5201,6973,1175,1412,9406,8850,7544,2901,5183,9206,5883,4686,6046,11,453,6642,5166,8165,8766,2859,2765,9184,4748,7909,3239,83,9534,1491,1069,4311,2419,8772,9013,430,4822,7920,9584,5741,7853,6409,1878,6417,4903,6880,812,7341,6346,7209,3649,2,2,1,4,2,3,2,0,1,3,10,8,18,0,1,6,10,5221.05,3055.884427,4,4,3,3,1,2,3,1,3,2,12,12,24,0,2,12,5,5113.423077,3346.517392,1,0,0,3,4,0,1,1,0,1,7,3,10,0,0,4,4,5427.0,2344.228572,7,7,4,9,8,6,7,2,4,6,31,24,55,0,1,23,21,5250.583333,2978.558344,0
4,9624,3a2f8b8d70e8c21a2c7e77f36ac7211cec947e0df3ca69451d1ada23d092b271,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,woxpwoxpwoxpwoxpwoxpe02014a70eca59940e99cc4bce6d97285b419cda77ca2253c32fa03f72397701,999581,2003,9632,8536,2287,7451,5413,237,7343,6053,4871,3865,1711,3873,2963,1101,7966,3601,9667,5697,894,9125,3063,2169,6330,6376,2157,8131,8403,8212,4285,404,815,3478,2195,5194,5856,8170,3232,6364,4830,1786,942,7882,8599,1895,1016,8378,3600,3995,208,3405,9966,2458,4382,5177,785,5304,7139,7846,7513,3680,2,3,1,3,1,2,1,3,2,2,10,9,19,1,0,7,3,5114.3,3044.930197,4,0,6,1,1,2,2,5,2,3,9,15,24,0,1,8,9,4362.769231,2883.404617,1,1,0,3,0,2,1,2,1,0,5,5,10,1,1,3,3,5241.363636,2683.852167,7,4,7,7,4,6,4,10,6,5,26,30,56,1,1,19,16,4798.433333,2846.193892,2003,9912,2389,2319,7294,2515,3916,972,7694,7393,7336,4451,507,2436,9449,8167,9437,2182,1726,654,906,7393,5568,7005,2341,1982,2216,3209,5900,4452,5790,4412,8276,46,1033,156,2664,2461,2998,8222,5776,9340,8324,4421,3519,3271,5254,7268,9300,2822,5917,7169,1666,8229,6901,3312,6292,6308,7472,3898,3484,4,3,1,4,0,0,1,1,5,1,8,12,20,1,0,13,2,4582.75,3346.727496,2,2,3,1,0,5,3,3,5,2,11,15,26,0,0,10,8,4578.038462,2709.7867,0,0,1,2,3,1,0,3,0,1,7,4,11,0,0,6,2,5513.454545,2089.358101,6,5,5,9,3,6,4,7,11,4,28,32,60,1,0,29,14,4795.366667,2806.88068,0


In [69]:
%%time
test = generate_features_full(results_df_test,False,feature_chain_length)
print(len(test))
test.head()

roll columns present
Roll Features from 1 to 20  completed
Roll Features from 1 to 20 _nonce completed
Roll Features from 25 to 50  completed
Roll Features from 25 to 50 _nonce completed
Roll Features from 50 to 60  completed
Roll Features from 50 to 60 _nonce completed
Roll Features from 1 to 60  completed
Roll Features from 1 to 60 _nonce completed
100
CPU times: user 1.48 s, sys: 27.9 ms, total: 1.51 s
Wall time: 1.5 s


Unnamed: 0,roll_actual,seed,hash,client_seed,client_index,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,roll_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_nonce,count_lt_1000_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce
0,5799,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,woxpwoxpwoxpwoxpwoxp2fed7fc81a575dd29d75c9588473d43f7af22fb5faab89700090f3fcec88d1a5,999950,6003,8781,6744,9778,102,5517,2088,7091,9592,3736,3557,7823,8705,2404,5736,9874,4617,3294,3393,1324,8913,1028,6743,3097,6912,5799,3262,5889,3928,1255,2608,3024,3682,9151,9785,2996,264,3658,1129,762,6683,6131,427,9328,1583,9459,839,4719,2161,1108,2096,1326,9464,8645,995,6022,9830,8458,2524,7103,7553,1,3,3,2,1,2,1,4,2,1,11,9,20,1,1,8,5,5653.45,3083.916093,4,4,0,0,2,2,1,5,4,4,8,18,26,0,0,15,4,3912.538462,3006.039617,1,2,2,2,1,0,0,0,2,1,7,4,11,1,1,5,4,5819.636364,3423.406761,6,9,5,4,6,4,2,10,7,7,28,32,60,1,1,29,14,4908.25,3155.932206,6003,7714,8979,9610,5580,2782,5349,8142,8561,2945,3006,5133,1449,939,5843,5268,9169,3338,6587,1278,7835,3337,1801,9140,2611,6946,6425,4513,6242,8156,8353,8815,767,5528,3689,1480,718,9853,8439,881,98,1207,6047,4112,5101,5375,1938,9204,4487,4830,9661,4130,4321,3943,2550,9664,397,5328,6943,9846,7726,1,2,3,2,1,5,0,2,2,2,12,7,19,1,0,10,4,5475.35,2814.56945,4,3,4,0,4,3,4,1,0,3,13,11,24,0,0,8,6,5110.192308,3069.484869,1,3,0,1,1,1,2,1,1,0,6,5,11,2,0,6,2,5864.454545,3155.640073,6,8,7,3,6,9,6,5,4,6,31,26,57,1,0,24,14,5235.15,2938.628441
1,6038,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,woxpwoxpwoxpwoxpwoxp6a6a475115f53e251c53ee08486296d3c663867a9bb5b5fd20e9baf4345d9e45,999908,3004,4605,7883,2627,7438,1848,1286,5080,2747,7669,2333,3329,9457,999,861,1799,7895,2303,6429,1494,446,8212,3511,4096,9063,6038,7743,2707,8231,9438,944,4194,4429,8310,5080,6720,7269,21,1100,8264,2237,7028,4979,3,8648,1391,5686,6526,3823,9915,2997,4918,167,6855,4816,335,9726,4141,9822,5248,2487,3,1,0,4,1,1,1,1,4,4,6,13,19,0,0,11,4,3926.4,2872.81951,3,2,4,3,3,2,3,1,3,2,13,11,24,1,1,12,9,5143.115385,3006.767358,2,2,0,0,1,1,3,0,2,0,3,5,8,0,2,5,0,4682.909091,3228.142328,8,6,5,7,5,4,8,3,8,6,24,30,54,0,0,28,15,4760.766667,3008.851898,3004,3978,395,4741,3204,3911,4220,4050,9588,2617,9208,3716,9472,7440,6741,4051,826,5908,6572,8484,164,8081,1842,6614,8302,7619,1035,2444,7805,433,1568,6532,2613,6828,8117,2908,1431,6759,6267,2505,463,5511,6903,3651,6379,3057,3538,5224,1299,2225,2523,6845,3066,1613,3113,1937,1373,8541,2448,5247,1960,3,3,1,1,2,1,4,4,1,0,8,12,20,0,1,7,8,4964.3,2905.765885,2,0,1,2,6,2,0,3,6,4,10,15,25,0,1,7,6,4062.961538,2510.888631,0,0,1,0,1,1,0,2,2,4,2,8,10,0,0,2,2,3515.090909,2344.121049,5,3,5,3,10,4,4,9,8,9,23,35,58,0,1,16,18,4431.75,2691.99027
2,3192,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,woxpwoxpwoxpwoxpwoxpebe08630a032db46a6b30243d6794366d79f6c08028b4060f239eef38c5f756c,999798,3001,9836,4705,4354,8011,4240,4735,362,4197,3836,811,7329,1332,1787,2218,709,23,6436,1272,6463,6258,3924,2465,8333,3315,3192,9044,4842,7851,4606,8729,1070,7085,8667,5427,1940,2254,1910,9763,9980,9089,9655,9587,4562,7443,7909,341,8616,649,7979,5335,2497,3693,2632,1737,8219,6223,3012,8859,6596,4562,4,1,1,1,3,0,5,1,1,3,6,14,20,1,0,4,4,3945.7,2830.661669,2,6,3,5,0,2,3,1,1,3,16,9,25,1,0,17,10,6058.653846,3205.051113,0,0,2,0,2,1,1,2,2,1,5,6,11,0,0,5,1,4851.363636,2398.704453,6,7,7,6,5,2,9,6,5,7,27,32,59,1,0,27,15,5041.766667,3023.959607,3001,8955,9910,2872,4767,2751,5076,3949,2697,6156,6633,5270,5228,8229,2772,364,2728,2114,8985,4793,1928,8425,9527,3893,6586,3716,2966,7816,19,7863,4764,9386,6698,6313,8357,474,938,8745,5941,8397,7226,2971,280,8007,223,1809,8504,2773,463,7511,3711,3346,8377,1612,9261,7578,4758,9299,7443,3917,8169,1,1,3,0,2,3,2,1,6,1,7,9,16,1,0,10,3,4808.85,2664.940866,6,1,5,4,2,1,1,2,3,1,13,12,25,0,1,9,5,4841.192308,3248.717667,0,2,2,2,0,0,1,3,0,1,6,4,10,1,0,6,0,6133.727273,2714.167758,7,5,11,6,5,4,4,6,9,3,29,25,54,1,0,27,8,5237.316667,2965.642918
3,6510,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,woxpwoxpwoxpwoxpwoxp270812fe08ed514d0f01ea5dc929e1534b852af245c6a3f17fd6a174ef9e0304,999779,7005,4205,7740,1075,2333,1704,4809,8451,7108,1024,4172,4507,5983,7803,4914,5494,534,2719,18,9255,8021,2128,3501,5194,6053,6510,8740,2689,9727,3025,2042,8687,3701,2094,7448,1814,8661,3824,7546,2072,1356,2777,8438,2171,6106,3633,4933,7401,1240,9253,2608,2119,4313,5745,1511,3693,1320,3520,7608,9541,4205,2,1,2,3,0,2,5,0,2,3,8,10,18,0,0,6,10,4593.45,2880.216448,0,2,4,3,2,0,1,4,7,3,11,14,25,1,0,5,11,4942.153846,2905.712163,0,1,0,1,0,1,2,2,2,2,3,8,11,0,0,3,5,4198.454545,2562.228185,2,4,6,7,3,4,8,7,11,8,23,33,56,0,0,15,27,4680.266667,2745.081234,7005,5657,3421,5463,3366,1154,9909,8486,4652,8274,6768,6511,5444,3624,9385,7051,9387,3829,9082,3111,8616,161,7236,4033,2893,7366,5759,5241,2171,7480,1143,6236,2436,1729,2448,7936,4249,180,2425,1059,4959,4656,1398,7309,1607,6810,8159,395,2248,3870,452,1367,9688,2075,8703,6356,5165,3065,8850,5036,4121,0,4,3,1,2,3,1,5,0,1,13,7,20,0,0,7,3,6159.5,2571.714327,3,0,1,4,2,2,3,1,5,5,8,16,24,0,0,11,7,3835.423077,2636.869267,1,1,2,0,1,2,1,1,1,1,4,5,9,1,1,1,5,4988.909091,3151.720814,4,5,6,6,5,7,6,7,7,7,26,30,56,1,0,20,16,4861.0,2831.182241
4,8530,d98d0a6ee9de1d1e39bc3f6c6e65d20f4cc269ddcb3e3702e4b1b1e30140a0be,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,woxpwoxpwoxpwoxpwoxp53067320e764840fdf1ee1f327f3e351cb22361253502a5dfe965d61a638a60b,999646,8003,3987,1717,2077,2467,2662,80,4041,4806,7485,1474,5502,7886,3244,8876,427,4189,2020,9236,4370,3643,1090,6435,6581,3696,8530,8408,1030,8380,2497,3884,5332,3361,829,690,3430,18,2797,6173,6605,3867,8728,6449,5550,1239,8852,3199,7320,1400,7256,1285,2526,3529,2279,1871,1265,4215,1282,9101,4863,9702,2,1,1,2,0,1,4,3,4,2,5,14,19,0,0,3,8,4009.45,2652.692366,3,0,5,2,3,2,0,5,2,4,12,14,26,0,0,8,12,4504.192308,2927.685134,0,2,0,0,0,0,2,1,2,4,2,8,10,0,0,4,2,3810.727273,3017.580259,5,3,6,4,5,3,6,10,8,10,21,37,58,0,1,17,24,4262.216667,2779.160043,8003,564,1670,3328,7359,9439,3227,1416,6559,2410,7966,2449,6876,9488,5312,9409,6577,401,4434,7105,2768,2867,1779,1784,3266,4931,794,6138,897,397,5111,8940,2755,8230,6292,4950,8918,268,96,1660,4111,6566,6072,9567,4146,9888,7593,9716,4349,1157,8428,948,9264,7961,7168,2074,9790,282,1948,376,2949,2,3,0,3,3,1,1,2,3,2,10,10,20,1,1,9,5,4937.85,3041.432597,5,3,4,1,4,1,5,0,1,2,12,11,23,0,3,14,5,5075.769231,3275.791914,3,2,1,2,0,0,0,0,2,1,5,6,11,1,1,8,2,4653.454545,3836.223126,10,8,4,6,7,2,6,3,7,7,26,31,57,1,1,32,12,4719.716667,3201.030317


In [71]:
%%time
test_hash = generate_features_full(results_df_test_hash,False,feature_chain_length)
print(len(test))
test_hash.head()

roll columns present
Roll Features from 1 to 20  completed
Roll Features from 1 to 20 _nonce completed
Roll Features from 25 to 50  completed
Roll Features from 25 to 50 _nonce completed
Roll Features from 50 to 60  completed
Roll Features from 50 to 60 _nonce completed
Roll Features from 1 to 60  completed
Roll Features from 1 to 60 _nonce completed
100
CPU times: user 1.49 s, sys: 15.9 ms, total: 1.51 s
Wall time: 1.5 s


Unnamed: 0,roll_actual,seed,hash,client_seed,client_index,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,roll_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_nonce,count_lt_1000_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce
0,8399,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194,woxpwoxpwoxpwoxpwoxp520e38ca4d0cc8342c73ea7c5d4f25f3bc7e9db3ae087cb464b1cd9d049f6d2c,999913,4003,8144,759,9088,8556,6098,2088,9961,8385,7172,9481,4919,9043,1823,8453,9042,3239,5981,5709,5823,6625,2657,5159,8628,3000,3492,7051,3868,965,2047,1604,3693,1789,7690,8600,5789,121,2190,3203,4455,328,5685,3680,8046,3169,5901,2674,7017,7821,7322,7404,1954,3209,8012,1597,648,3214,8678,374,3108,6425,1,5,4,1,2,3,1,1,1,1,15,4,19,1,1,13,6,6519.45,2754.861721,3,0,2,6,0,3,1,6,3,2,11,15,26,0,1,9,13,4446.307692,2611.351708,2,0,2,1,1,0,0,3,0,2,4,7,11,0,1,3,4,4056.636364,3029.782674,6,5,9,7,3,7,2,11,5,5,30,28,58,1,1,26,25,5044.266667,2873.567746,4003,3244,9023,5031,6278,288,4401,2646,9786,7559,8458,8663,5724,1815,1939,4583,4722,8159,7693,4855,389,142,6420,8191,1059,760,6768,312,1766,2669,5327,2299,3998,9496,8209,5437,9581,1735,1417,4759,4194,6267,1255,1837,3492,298,9467,3831,5799,8225,3794,8498,4788,8392,6338,1449,7570,3481,6170,4832,687,2,2,3,2,1,2,4,1,1,2,9,9,18,1,1,8,3,5262.8,2913.280618,3,3,2,0,2,3,2,4,2,5,10,15,25,0,2,17,2,4345.846154,2939.248145,1,0,2,1,2,0,2,2,0,1,5,4,9,0,0,4,2,5090.818182,2616.515233,7,5,8,3,6,5,8,6,3,9,26,29,55,1,1,30,9,4771.083333,2925.71202
1,5980,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194,woxpwoxpwoxpwoxpwoxp5a47389b1201dbec9589f3e13f3ca989a9ac2e4ed027a776078692d0237cb624,999784,7008,2637,5596,2235,554,2053,9775,2355,246,7484,8934,1207,2960,6892,4171,1392,7965,2157,871,7167,8598,4160,2731,1157,8981,9147,7250,2118,6868,4871,1333,382,2770,4673,3044,8201,6367,8229,507,4545,8891,2297,9976,8895,123,6130,6828,5258,346,8187,499,7007,1577,2544,3646,7210,9443,5051,3484,7824,639,3,1,2,3,1,1,1,0,6,2,8,12,20,0,1,8,3,4262.45,3183.817131,5,2,5,1,4,1,3,1,3,1,13,12,25,1,0,9,6,4912.884615,3245.99765,2,1,0,3,0,1,0,2,1,1,4,6,10,0,1,4,4,4447.636364,3072.972056,9,4,8,7,5,3,5,3,11,5,26,32,58,1,1,20,14,4640.633333,3117.304393,7008,4849,797,5307,52,8319,3680,4282,5861,9216,7375,3661,6153,8505,1456,2084,5644,7918,1253,3613,6902,2472,507,4759,1508,2209,4193,4277,9528,2914,6681,3778,6752,4140,3083,2059,2629,7390,9941,5917,2205,615,5853,7924,8788,4343,7734,1481,1178,5190,3910,4648,1260,8971,1975,6960,6233,7918,4141,8931,5221,2,1,2,2,2,3,2,3,1,2,10,9,19,0,2,6,5,4846.35,2754.940753,1,2,1,3,2,3,4,3,5,2,10,15,25,1,0,13,8,4796.615385,2657.601928,0,0,2,1,2,1,2,1,0,2,5,5,10,0,0,6,3,5469.818182,2605.899953,4,3,5,6,6,7,9,6,7,7,25,31,56,0,2,25,17,4785.716667,2691.937555
2,4884,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194,woxpwoxpwoxpwoxpwoxp15c924137ee005812392ae2e65614764773bad42129fefd4413843b3361c2e3f,999671,4006,4799,523,5350,2893,8412,4530,5090,9915,8868,2244,9083,5500,1192,405,2808,2761,6653,3099,1839,9506,1460,787,9764,8107,6427,6653,7000,2219,931,9313,3353,1739,4698,6926,9125,6529,2737,4053,8872,5754,4218,6347,1087,67,8022,9158,9229,4728,2211,3827,6580,6634,6683,4625,6419,6523,5607,2955,5381,850,2,3,2,0,1,3,2,1,4,2,8,10,18,0,1,12,11,4773.5,3088.183702,2,4,2,1,5,1,4,2,3,2,13,13,26,1,1,11,6,5200.884615,2858.658963,1,0,0,0,5,2,1,1,1,0,7,4,11,0,0,2,3,5098.545455,1888.380278,6,8,5,1,11,6,7,3,8,5,30,28,58,0,1,26,22,5051.133333,2865.452214,4006,1450,4879,2277,5521,7205,3439,2510,9920,4549,4926,8120,239,1311,9326,6006,5033,1159,7663,4511,6071,343,7237,807,3813,9154,1650,8258,575,9481,8941,3195,2844,6226,1169,6256,6032,7643,9852,5027,4267,3004,8931,6171,2130,4950,8973,4696,1461,5936,5897,9505,8786,5182,3358,2872,3824,2291,5940,6659,4943,1,2,1,2,2,2,4,1,2,3,8,9,17,0,0,9,9,4805.75,2776.133507,1,3,4,1,4,3,3,2,2,3,14,10,24,2,1,12,7,5489.192308,2863.465928,0,1,1,0,1,3,1,2,2,0,5,4,9,1,0,6,2,5387.0,2310.939852,4,6,6,4,7,7,8,6,6,6,27,26,53,0,0,26,19,5073.233333,2786.169178
3,1672,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194,woxpwoxpwoxpwoxpwoxp08521ce159252e5293936507f726de9da11a07485161755c879c52114becd34d,999528,4009,1195,2207,4592,4898,7054,9630,7265,3200,3557,9559,7398,1576,4,2938,5587,3084,53,8327,6695,5441,3124,1705,4199,8014,6935,3167,9814,1557,5185,2677,9628,2628,2312,4772,8709,992,1828,1793,721,9398,412,3737,1219,2613,8766,4068,6903,1945,2780,2752,5633,7764,2095,9107,1587,6905,8667,3088,3937,3194,2,2,1,3,1,2,2,3,2,2,9,10,19,0,0,9,6,4713.0,2964.811541,3,3,2,0,2,1,2,2,6,5,7,17,24,1,0,12,4,4127.346154,3036.654408,0,1,1,1,1,1,0,3,2,1,5,6,11,1,0,5,4,4975.363636,2741.633063,5,6,5,4,4,4,5,9,9,9,22,35,57,1,0,28,16,4509.833333,2907.605172,4009,6641,6295,3982,9692,1231,553,9995,3466,3279,2463,5721,6632,8324,5555,9207,2504,7443,8795,7101,9867,1317,7390,7099,3043,8841,164,1032,7291,9386,4527,1767,8118,3054,6886,4878,6695,2903,9287,9837,7050,5036,7314,1692,9491,733,2839,7946,1973,9669,6050,3663,5604,7341,3742,7825,5551,8,8011,2777,7855,1,4,2,2,3,2,0,3,2,1,13,7,20,1,0,11,3,5937.3,2968.911941,2,5,2,4,3,1,2,1,2,4,14,10,24,1,1,14,8,5556.115385,3171.267057,1,0,1,3,1,2,0,2,1,0,7,4,11,0,0,0,4,5311.545455,2541.913231,4,9,5,11,6,5,2,7,5,6,35,23,58,1,1,28,16,5573.85,2959.734286
4,6578,a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd,b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194,woxpwoxpwoxpwoxpwoxp64114df532f1cd746e76cba4f4b97a79bff505484f0b282d117ab722c7a52a5f,999319,9009,7912,7992,3687,1315,3815,6491,3016,3229,6833,6685,3090,8970,4317,5363,3067,3440,6126,71,6777,256,4756,8653,9463,6631,2517,7799,149,8149,7623,490,5685,7661,5289,5539,97,5504,562,5475,9710,7498,8215,7429,1148,1371,5495,3924,4532,6732,6222,7315,5143,3365,7292,7830,1904,784,3907,7641,5152,3922,2,0,1,2,5,1,1,7,0,1,9,11,20,0,0,7,6,4622.6,2556.611993,4,1,2,6,2,6,1,1,1,2,17,9,26,0,2,13,3,5081.923077,2903.618293,1,0,0,4,0,2,0,3,0,1,4,5,9,0,0,4,3,4932.272727,2407.886961,7,2,4,11,8,9,3,11,1,4,32,25,57,0,0,25,12,5017.083333,2689.031445,9009,2828,6671,1068,5730,9549,7047,2614,1647,3938,5857,3931,3980,3728,1608,6013,8001,290,585,2564,3258,4939,5684,4555,4936,6012,514,9156,5213,8935,2675,8882,7329,5599,9666,6390,4121,1092,1423,1744,758,5438,8358,7541,6572,5839,1024,7426,4762,9317,9676,995,117,2515,1660,9833,8039,3529,2671,1401,4158,2,1,1,1,2,2,0,5,3,3,7,13,20,1,0,6,9,4045.35,2564.347548,2,4,3,3,3,4,2,0,1,4,16,8,24,1,1,11,4,5594.692308,3069.237518,2,2,1,0,0,0,1,1,2,2,3,8,11,2,2,6,3,4054.0,3508.257345,6,6,5,4,5,7,6,6,6,9,26,30,56,1,0,25,16,4690.016667,2893.111459


#### Machine Learning

In [72]:
USE_STAT_FEATS = True

In [73]:

if USE_STAT_FEATS:
    exclude_stat_cols=[]
else:
#     exclude_stat_cols = ['roll_mean','roll_std','roll_mean_nonce','roll_std_nonce']
    exclude_stat_col_starts = ['roll_mean','roll_std','roll_mean_nonce','roll_std_nonce']
    exclude_stat_cols=[]
    for stat_col in exclude_stat_col_starts:
        exclude_stat_cols += [col for col in train.columns if col.startswith(stat_col)]
    print(exclude_stat_cols)
exclude_cols = []#[f'roll_{i}' for i in range(11,feature_chain_length)]
exclude_cols_nonce =[]# [f'roll_{i}_nonce' for i in range(11,feature_chain_length+1)]
exclude_count_cols = [f'count_gt_{i}000' for i in [1,2,3,4,6]]
exclude_count_cols_nonce = [f'count_gt_{i}000_nonce' for i in [1,2,3,4,6]]
othercols = ['roll_actual','client_seed','seed','hash','client_index'] + exclude_stat_cols

all_exclude_cols = othercols+exclude_cols+exclude_cols_nonce+\
                    exclude_count_cols+exclude_count_cols_nonce
                  
features = [col for col in train.columns if col not in all_exclude_cols]
print(len(features))
print(features)

265
['roll_0', 'roll_1', 'roll_2', 'roll_3', 'roll_4', 'roll_5', 'roll_6', 'roll_7', 'roll_8', 'roll_9', 'roll_10', 'roll_11', 'roll_12', 'roll_13', 'roll_14', 'roll_15', 'roll_16', 'roll_17', 'roll_18', 'roll_19', 'roll_20', 'roll_21', 'roll_22', 'roll_23', 'roll_24', 'roll_25', 'roll_26', 'roll_27', 'roll_28', 'roll_29', 'roll_30', 'roll_31', 'roll_32', 'roll_33', 'roll_34', 'roll_35', 'roll_36', 'roll_37', 'roll_38', 'roll_39', 'roll_40', 'roll_41', 'roll_42', 'roll_43', 'roll_44', 'roll_45', 'roll_46', 'roll_47', 'roll_48', 'roll_49', 'roll_50', 'roll_51', 'roll_52', 'roll_53', 'roll_54', 'roll_55', 'roll_56', 'roll_57', 'roll_58', 'roll_59', 'roll_60', 'count_lt_1000', 'count_gt_9000', 'count_gt_8000', 'count_gt_7000', 'count_gt_5000', 'count_hi', 'count_lo', 'count_lo_hi', 'count_gt_9000_k', 'count_lt_1000_k', 'total_dig_9', 'total_dig_0', 'roll_mean', 'roll_std', 'count_lt_1000_25_50', 'count_gt_9000_25_50', 'count_gt_8000_25_50', 'count_gt_7000_25_50', 'count_gt_6000_25_50', 'c

In [74]:
X = train[features].drop('target',axis=1)
y = train['target']

In [75]:
X.columns

Index(['roll_0', 'roll_1', 'roll_2', 'roll_3', 'roll_4', 'roll_5', 'roll_6',
       'roll_7', 'roll_8', 'roll_9',
       ...
       'count_gt_1000_1_60_nonce', 'count_hi_1_60_nonce',
       'count_lo_1_60_nonce', 'count_lo_hi_1_60_nonce',
       'count_gt_9000_k_1_60_nonce', 'count_lt_1000_k_1_60_nonce',
       'total_dig_9_1_60_nonce', 'total_dig_0_1_60_nonce',
       'roll_mean_1_60_nonce', 'roll_std_1_60_nonce'],
      dtype='object', length=264)

In [76]:
import xgboost as xgb
import matplotlib.pyplot as plt # for plotting graphs
import seaborn as sns # for plotting graphs
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, f1_score,precision_recall_curve

Train Validation Split

In [77]:
n_folds=10
tr_indices_folds = []
val_indices_folds=[]
tr_indices_seeds=[]
for fold in range(n_folds):
#     tr_indices = train[(fold+2) * train_client_size:].index
#     val_indices = train[(fold) * train_client_size:(fold+1) * train_client_size].index
    tr_offset = 1#25
    tr_length = 25#25
    tr_start = (fold+tr_offset) * train_client_size
    tr_end = (fold+tr_offset+tr_length) * train_client_size
    tr_indices = train[tr_start:tr_end].index #fold + 3
#     tr_indices = list(tr_indices) + list(train[70:90].index) #fold + 3
    
#     tr_indices_seed=[]
#     for i in range(1,tr_offset):
#         tr_start_seed = (fold+i) * train_client_size
#         tr_end_seed = (fold+i+1) * train_client_size
#         tr_indices_seed.append(train[tr_start_seed:tr_end_seed].index) #fold + 3 
        
    # tr_start_seed = (fold+1) * train_client_size
    # tr_end_seed = (fold+tr_offset) * train_client_size
    # tr_indices_seed = train[tr_start_seed:tr_end_seed].index #fold + 3
    
    tr_indices_seed = [train[0:(fold+1) * train_client_size].index]
#     val_indices = train[(fold+1) * train_client_size:(fold+2) * train_client_size].index
    val_indices = train[(fold) * train_client_size:(fold+1) * train_client_size].index
    tr_indices_seeds.append(tr_indices_seed)
    tr_indices_folds.append(tr_indices)
    val_indices_folds.append(val_indices)
    

In [78]:
#This function is useful to convert the predicted probabilities into labels 
#so that F1 score is optimized
#It first determines the probability threshold using precision recall curve at which F1 score is optimized and 
#then generate the optimized labels based on the determined probability threshold
def get_opt_cutoff_prec(labels,preds):
    precision, recall, thresholds  = precision_recall_curve(labels, preds)
    f1_score= 2*((precision*recall)/(precision+recall))
    
    f1_score = f1_score[:len(thresholds)]
    f1_score[np.isnan(f1_score)]=0
#     optimal_idx = np.nanargmax(precision[:len(precision)-1])
#     print(f'precision borders:{precision[0],precision[len(precision)-2]}')
#     print(f'recall borders:{recall[0],recall[len(recall)-2]}')
#     print(f'f1_score size:{len(f1_score)} recall size:{len(recall)}')
#     print(f'{precision=}')
#     print(f'{recall=}')
#     print(f'{f1_score=}')
#     print(f'{thresholds=}')
#     print(f'optimal_idx:{optimal_idx} precision size:{len(precision)} thresholds size:{len(thresholds)}')
    optimal_idx = np.nanargmax(f1_score)
    optimal_threshold = thresholds[optimal_idx]
#     return optimal_threshold, precision[optimal_idx]
    return optimal_threshold, f1_score[optimal_idx]

def convert_probtolabels(preds,cutoff=0.5):
    y_bin= preds.copy()
    y_bin[preds>cutoff] = 1
    y_bin[preds<=cutoff] = 0
    y_bin=y_bin.astype(int)

    return y_bin

In [79]:
# # roll_actual_df = pd.DataFrame(columns=['roll_actual'])
# # roll_actual_df['roll_actual']=train['roll_actual'].iloc[tr_index]
# # print(len(X_tr))
# # print(len(roll_actual_df))

# oversample = RandomOverSampler(sampling_strategy='minority',random_state=123)
# # fit and apply the transform
# roll_actual_df, y_temp = oversample.fit_resample(train[features+['roll_actual']].iloc[tr_index], 
#                                                  train['target'].iloc[tr_index])
# print(len(roll_actual_df))

# # tr_df = pd.concat([X_tr,roll_actual_df],axis=1)
# # print(len(tr_df))
# train_status(roll_actual_df,True)

In [80]:
%%time
READ_MODEL_FILE = False

if READ_MODEL_FILE:
    xgb_models = joblib.load(f'data/models/models_{nonce}_{file_pattern_str}_pattern.dump')
else:

    params = { 'n_estimators':100,
              'max_leaves':25,
                'subsample':0.8,
              'random_state':145,
              # 'scale_pos_weight': 5,
    #           'max_depth':6,
            'learning_rate':0.05,
             'colsample_bytree':0.6,#0.85,
             'lambda':0.05,
             'alpha':0.1}

    # xgb_model = xgb.XGBClassifier(**params)
    xgb_models = []
    scores  = []
    ratios =[]
    tr_cutoffs=[]
    val_cutoffs=[]
    tr_last_cutoffs=[]

    for fold,(tr_index, val_index, tr_index_seed) in enumerate(zip(tr_indices_folds,val_indices_folds,tr_indices_seeds)):
        X_tr,y_tr = X.iloc[tr_index],y.iloc[tr_index]
        X_val,y_val = X.iloc[val_index],y.iloc[val_index]

        oversample = RandomOverSampler(sampling_strategy='minority',random_state=123)
        # fit and apply the transform
        X_tr, y_tr = oversample.fit_resample(X_tr, y_tr)
        print(pd.Series(y_tr).value_counts())


        print(f'\n ******************* fold: {fold} ********')
        print(f'tr size: { len(X_tr)}  val size:  {len(X_val)}')
#         print(f'{tr_index_seed}')
#         print(f'{val_index}')
#         print(f'{tr_index}')

        xgb_model = xgb.XGBClassifier(**params)
        xgb_model.fit(X_tr,y_tr)
        xgb_models.append(xgb_model)

        mean_opt_cutoff_tr = 0
        for tr_index_seed_cur in tr_index_seed:
            X_tr_seed,y_tr_seed = X.iloc[tr_index_seed_cur],y.iloc[tr_index_seed_cur]

            tr_probs = xgb_model.predict_proba(X_tr_seed)[:,1]
            opt_cutoff_tr, f1score = get_opt_cutoff_prec(y_tr_seed,tr_probs)
            # print('tr cutoff:',opt_cutoff_tr)
            tr_labels = convert_probtolabels(tr_probs,cutoff=opt_cutoff_tr) 
            mask1 = (tr_labels==1)
            mask2 = (y_tr_seed==1)
            total = len(tr_labels[mask1])
            success = len(tr_labels[mask1 & mask2])
            ratio = success/total if total!=0 else 0
            # print(f'tr ratio: {ratio} total: {total} success: {success}')
            mean_opt_cutoff_tr += opt_cutoff_tr / len(tr_index_seed)

        print(f'last cutoff :{opt_cutoff_tr}')
        print(f'mean tr cutoff :{mean_opt_cutoff_tr}')
        tr_last_cutoffs.append(opt_cutoff_tr)
        tr_cutoffs.append(mean_opt_cutoff_tr)
        val_probs = xgb_model.predict_proba(X_val)[:,1]
        print(val_probs[0:5])
        labels = convert_probtolabels(val_probs)
        score_init = accuracy_score(y_val,labels)

        opt_cutoff, f1score = get_opt_cutoff_prec(y_val,val_probs)
        print('valid cutoff:',opt_cutoff)
        val_cutoffs.append(opt_cutoff)
        # print(pd.Series(val_probs).describe(percentiles=[0.6,0.7,0.75,0.8]))
        val_labels = convert_probtolabels(val_probs,cutoff=opt_cutoff_tr) 
        mask1 = (val_labels==1)
        mask2 = (y_val==1)
        total = len(val_labels[mask1])
        success = len(val_labels[mask1 & mask2])
        ratio = success/total if total!=0 else 0
        print(f'ratio: {ratio} total: {total} success: {success}')
        ratios.append(ratio)
        cur_f1 = f1_score(y_val, val_labels,average='macro')
        cur_acc = accuracy_score(y_val,val_labels)
        print(f'val accuracy score:{cur_acc} f1 score:{cur_f1:.4f} initial accuracy score:{score_init}')
        scores.append(cur_acc)

        val_labels = convert_probtolabels(val_probs,cutoff=opt_cutoff) 
        mask1 = (val_labels==1)
        mask2 = (y_val==1)
        total = len(val_labels[mask1])
        success = len(val_labels[mask1 & mask2])
        ratio = success/total if total!=0 else 0
        print(f'ratio @val_cutoff: {ratio} total: {total} success: {success}')

    #     train_status(X_tr,True)

    #     top_prob = np.sort(val_probs)[::-1][:1]
    #     top_label = y_val[val_probs==top_prob ]
    #     print(f'top_prob:{top_prob} top_label:{top_label}')

    print(f'mean score:{np.mean(scores)}')    
    print(f'mean ratio:{np.mean(ratios)}')   
    
    print(np.mean(np.array(val_cutoffs)))
    print(np.mean(np.array(tr_cutoffs)))
    print(np.mean(np.array(tr_last_cutoffs)))
    print(ratios)
    
    joblib.dump(xgb_models,f'data/models/models_{nonce}_{file_pattern_str}_pattern.dump')
    print('Models Save completed')   

0    317
1    317
Name: target, dtype: int64

 ******************* fold: 0 ********
tr size: 634  val size:  14
last cutoff :0.17379635572433472
mean tr cutoff :0.17379635572433472
[0.0480644  0.02016523 0.03113678 0.14613938 0.02476498]
valid cutoff: 0.17379636
ratio: 0 total: 0 success: 0
val accuracy score:0.8571428571428571 f1 score:0.4615 initial accuracy score:0.8571428571428571
ratio @val_cutoff: 0 total: 0 success: 0
0    318
1    318
Name: target, dtype: int64

 ******************* fold: 1 ********
tr size: 636  val size:  14
last cutoff :0.10100797563791275
mean tr cutoff :0.10100797563791275
[0.05962554 0.02988621 0.07025079 0.04900866 0.031049  ]
valid cutoff: 0.029886207
ratio: 0.0 total: 2 success: 0
val accuracy score:0.7857142857142857 f1 score:0.4400 initial accuracy score:0.9285714285714286
ratio @val_cutoff: 0.0 total: 11 success: 0
0    316
1    316
Name: target, dtype: int64

 ******************* fold: 2 ********
tr size: 632  val size:  14
last cutoff :0.135677605

In [81]:
def generate_probs(multi_models,model,data,features):
    if multi_models:
        probs= np.zeros(len(data))
        for i,model_ind in enumerate(model):
            cur_prob= model_ind.predict_proba(data[features])[:,1] 
            probs+= cur_prob / len(model)
    else: 
        probs= model.predict_proba(test_filt_df[features_test])[:,1]
    data['probs']=probs
    return data

##### Prediction of Test 

In [82]:
CONT_ANALYSIS=False
IS_PROD=False

In [83]:
if not IS_PROD:
    
    actual_seed="e658f73f9831527c16614b72cf87f0a9718961a6099024b6301b10b54dc166d0"
#     actual_seed = cur_hash_list[25]
    print(actual_seed)
    # Vectorize the function
    vectorized_calculate_roll = np.vectorize(calculate_roll)

    # Compute the roll values for the input arrays
    roll_array = vectorized_calculate_roll(actual_seed,
                                           results_df_test['client_seed'],
                                           nonce)

    results_df_test['roll_actual']=roll_array
#     test_manual = generate_test_features(results_df_test)
    test = generate_features_full(results_df_test,False,feature_chain_length)

e658f73f9831527c16614b72cf87f0a9718961a6099024b6301b10b54dc166d0
roll columns present
Roll Features from 1 to 20  completed
Roll Features from 1 to 20 _nonce completed
Roll Features from 25 to 50  completed
Roll Features from 25 to 50 _nonce completed
Roll Features from 50 to 60  completed
Roll Features from 50 to 60 _nonce completed
Roll Features from 1 to 60  completed
Roll Features from 1 to 60 _nonce completed


In [None]:
# l1=list(range(1,261))
# # l2=list(range(50,80))
# for hash_idx in [-1]+l1:
    
#     if hash_idx==-1:
#         actual_seed="d0e068a90b3c836bda9220c2d0135028f66e80d325a75935baee007c85b73005"
#         suffix = 'orig'
#     else:    
#         actual_seed = cur_hash_list[hash_idx]
#         suffix = hash_idx
#     print(actual_seed)
#     # Vectorize the function
#     vectorized_calculate_roll = np.vectorize(calculate_roll)

#     # Compute the roll values for the input arrays
#     test[f'roll_actual_{suffix}']=vectorized_calculate_roll(actual_seed,
#                                            test['client_seed'],
#                                            nonce)

In [None]:
# analysis_df=pd.read_csv(f'data/analysis_df_{nonce}.csv')

In [84]:
mean_imp = np.zeros(len(features)-1)
imp_df = pd.DataFrame()
imp_df['feature'] = [col for col in features if col!='target']
for i,model in enumerate(xgb_models):
    cur_imp = model.feature_importances_
    imp_df[f'importance_m{i}'] = cur_imp
#     print(cur_imp)
    mean_imp += cur_imp / len(xgb_models)

# print(mean_imp)
imp_df['importance'] = mean_imp
imp_df = imp_df.sort_values('importance',ascending=False).reset_index(drop=True)
imp_df

Unnamed: 0,feature,importance_m0,importance_m1,importance_m2,importance_m3,importance_m4,importance_m5,importance_m6,importance_m7,importance_m8,importance_m9,importance
0,roll_mean_50_60_nonce,0.009918,0.008259,0.016906,0.032622,0.008562,0.002447,0.004664,0.020548,0.005219,0.00797,0.011711
1,count_gt_1000_1_60,0.012775,0.006034,0.00463,0.008085,0.000227,0.005271,0.010281,0.017997,0.011868,0.02909,0.010626
2,count_gt_4000_1_60,0.007774,0.010531,0.007476,0.007542,0.007406,0.006712,0.012631,0.015376,0.01084,0.008383,0.009467
3,roll_52_nonce,0.014671,0.010858,0.013241,0.007096,0.006887,0.006974,0.005133,0.010572,0.006984,0.01221,0.009463
4,roll_mean_1_60_nonce,0.000914,0.012982,0.003752,0.008917,0.030407,0.001734,0.011398,0.012818,0.002953,0.001986,0.008786
5,count_gt_5000,0.005084,0.005234,0.00169,0.011881,0.007409,0.003002,0.00534,0.017676,0.013809,0.013464,0.008459
6,roll_44_nonce,0.005041,0.005582,0.002589,0.008157,0.00749,0.007727,0.022222,0.005473,0.001491,0.016838,0.008261
7,roll_6_nonce,0.005771,0.007678,0.005128,0.016131,0.018672,0.006258,0.008008,0.003629,0.004886,0.006296,0.008246
8,roll_19_nonce,0.00984,0.009895,0.010751,0.010212,0.015854,0.002267,0.003921,0.006985,0.004415,0.001477,0.007562
9,roll_50_nonce,0.009676,0.011389,0.003399,0.008269,0.00665,0.010749,0.012084,0.008944,0.002596,0.001518,0.007527


In [None]:
# %%time
# test_hash25 = generate_features_full(results_df_test_hash25,False,feature_chain_length)
# test_hash25.head()

In [None]:
# %%time
# test_hash24 = generate_features_full(results_df_test_hash24,False,feature_chain_length)
# test_hash24.head()

In [None]:
# hash_imp_name='roll_54'
# nonce_suffix = '_nonce'
# if hash_imp_name.find(nonce_suffix)!=-1:
#     is_nonce=True
#     hash_imp_name=hash_imp_name.replace(nonce_suffix,'')

# hash_no = int(hash_imp_name.replace('roll_',''))
# hash_no

In [None]:
# hash_imp_count=5
# roll_imp_feats = [col for col in imp_df['feature'].values if (col.startswith('roll_'))
#                                 and ('std' not in col ) and ('mean' not in col )][:hash_imp_count] 
# roll_imp_feats


In [86]:
%%time
READ_FROM_FILE_TEST=False

hash_list_nonce = generate_hash_chain(str(nonce),feature_chain_length)
hash_imp_count=3
test_hash_imp = np.zeros((hash_imp_count))
hash_imp_names = [col for col in imp_df['feature'].values if (col.startswith('roll_'))
                                and ('std' not in col ) and ('mean' not in col )][:hash_imp_count] 
nonce_suffix = '_nonce'
test_hash_imp = []
for i in range(hash_imp_count):

    hash_imp_name = hash_imp_names[i]
    
    print(f'********* Processing Test Hash {hash_imp_name} ******************')

    if hash_imp_name.find(nonce_suffix)!=-1:
        is_nonce=True
        hash_imp_name=hash_imp_name.replace(nonce_suffix,'')
        nonce_no = int(hash_imp_name.replace('roll_',''))
        cur_nonce = hash_list_nonce[nonce_no]
        hash_no = 0
        file_suffix =f"_nonce{nonce_no}"
        print(f'{cur_nonce=}')
    else:
        hash_no = int(hash_imp_name.replace('roll_',''))
        cur_nonce= nonce
        file_suffix =f"_hash{hash_no}"

    cur_seed = cur_hash_list[hash_no]
    cur_hash = cur_hash_list[hash_no+1]
    print(f'{file_suffix=}')
    print(f'{hash_no=}')
    print(f'{cur_seed=}')
    print(f'{cur_hash=}')

    results_df_test_hash_imp=gen_test_data(cur_nonce,cur_hash,cur_seed,
                                        file_suffix=file_suffix)
#     results_df_test_hash_imp=save_cleaned_results_df(results_df_test_hash_imp,True,
#                                                 file_suffix=file_suffix)
    print()
    print(results_df_test_hash.head(1))
    cur_test_hash_imp = generate_features_full(results_df_test_hash_imp,False,feature_chain_length)
    test_hash_imp.append(cur_test_hash_imp)

********* Processing Test Hash roll_52_nonce ******************
cur_nonce='0632862afa69a470522819d912de8142f8d235d7048dc16c75a16c36bb606321'
file_suffix='_nonce52'
hash_no=0
cur_seed='a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd'
cur_hash='b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194'
1
                                                               seed  \
0  a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd   

                                                               hash  \
0  b465a3bc3240974c62115114d220a77452036cf55edb5bc079668fa89ff9d194   

                                                              nonce  
0  0632862afa69a470522819d912de8142f8d235d7048dc16c75a16c36bb606321  
server_count:0
Test Data Generation Completed

   match                                                              seed  \
0   True  a08fa060e49d0755163928e2445a6d5eb715a9827239fff6aa75a07f7dec36cd   

                                          

In [None]:
test_hash1 = test_hash.copy()
# test_hash = test_hash24.copy()
# test_hash = test_hash1.copy()

In [None]:
test_filt=test.copy()
test_filt_hash=test_hash.copy()
features_test = [col for col in features if col not in ['target','probs']]

In [None]:
def gen_hash_cutoff_df(models,feature_test,test_hash,k_bin_size=600,quant=0.95):
    test_hash_probs = generate_probs(True,models,test_hash,features_test)
    cutoffs = []
    ratios =[]
    totals =[]
    print(test_hash_probs['probs'].describe())
    cutoffs = np.linspace(test_hash_probs['probs'].min(),
                          test_hash_probs['probs'].quantile(quant),k_bin_size)
    for i,cutoff in enumerate(cutoffs[:len(cutoffs)-1]):

        mask = (test_hash_probs['probs']>=cutoff) & (test_hash_probs['probs']<=cutoffs[i+1])
        mask2=create_target_mask(test_hash_probs)

        total = len(test_hash_probs[mask])
        if total==0:
            ratio=0
            success=0
        else:
            success = len(test_hash_probs[mask & mask2])
            ratio = success / total
        ratios.append(ratio)
        totals.append(total)
    #     print(ratio,success,total)

    print(len(ratios),len(cutoffs))
    df=pd.DataFrame()
    df['cutoff']=cutoffs[:len(cutoffs)-1]
    df['cutoff_2']=cutoffs[1:len(cutoffs)]
    df['ratio'] =ratios
    df['total'] =totals

    df = df.sort_values(['ratio','cutoff'],ascending=[False,False]).reset_index(drop=True)
#     print(df[:5].mean())
#     print(df[:10].mean())
    return df

In [None]:
# test_hash_probs = generate_probs(True,xgb_models,test_hash,features_test)
# cutoffs = []
# ratios =[]
# totals =[]
# print(test_hash_probs['probs'].describe())
# cutoffs = np.linspace(test_hash_probs['probs'].min(),
#                       test_hash_probs['probs'].quantile(0.95),600)
# for i,cutoff in enumerate(cutoffs[:len(cutoffs)-1]):
# #     cutoff=0.4
# #     diff = 0.01
# #     print(cutoff,cutoffs[i+1])
#     mask = (test_hash_probs['probs']>=cutoff) & (test_hash_probs['probs']<=cutoffs[i+1])
#     mask2=create_target_mask(test_hash_probs)
    
# #     mask2 = test_hash_probs['roll_actual']>=9000
#     total = len(test_hash_probs[mask])
#     if total==0:
#         ratio=0
#         success=0
#     else:
#         success = len(test_hash_probs[mask & mask2])
#         ratio = success / total
#     ratios.append(ratio)
#     totals.append(total)
# #     print(ratio,success,total)

# print(len(ratios),len(cutoffs))
# df=pd.DataFrame()
# df['cutoff']=cutoffs[:len(cutoffs)-1]
# df['cutoff_2']=cutoffs[1:len(cutoffs)]
# df['ratio'] =ratios
# df['total'] =totals

# df = df.sort_values('ratio',ascending=False).reset_index(drop=True)
# print(df[:5].mean())
# print(df[:10].mean())
# df.head(10)

In [None]:
# df

In [None]:
# df.describe()

In [None]:
# print(len(df[df['total']==1]))
# print(len(df[(df['total']==1) & (df['ratio']>0)]))
# df[df['total']==1]

In [None]:
# df[df['ratio']<0.12].index[0]

In [None]:
# test['roll_actual']=test[f'roll_actual_orig'].copy()

In [None]:
def gen_all_k(models,features_test,data,test_hash_cutoff_df,target_total=1,is_k_data_req=True):
    test_probs = generate_probs(True,models,data,features_test)
    top_k=[]
    all_k=[]
    test_filt_all=None
    for k in range(len(test_hash_cutoff_df)):
        row = test_hash_cutoff_df.iloc[k]
    #     print(row['ratio'],row['cutoff'],row['cutoff_2'])
        mask = (test_probs['probs']>=row['cutoff']) & (test_probs['probs']<=row['cutoff_2'])
        mask2 = create_target_mask(test_probs)    
        test_filt = test_probs[mask]
        

        total = len(test_filt)
        if total==0:
            success=0
            ratio=0
        else:
            success = len(test_probs[mask & mask2])
            ratio = success / total
        if total==target_total:
            all_k.append(k) 
            if is_k_data_req:
#                 cur_test = test_filt[['roll_actual','client_seed','probs']]
                cur_test = test_filt
                cur_test['k']=k
                if test_filt_all is None:
                    test_filt_all = cur_test
                else:
                    test_filt_all = pd.concat([test_filt_all,cur_test],axis=0)
        if ratio>=0.12:
            if total==target_total:
                top_k.append((k,ratio,total))
    #     print(ratio,success,total,row['cutoff'],k)
    print(f'{len(all_k)=}')
    print(all_k)
    print(f'{len(top_k)=}')
    print(top_k)
#     mask = (test_hash_probs['probs']>=row['cutoff']) & (test_hash_probs['probs']<=row['cutoff_2'])
#     test_filt_hash  = test_hash_probs[mask]   
    
    return all_k,top_k,test_filt_all

In [None]:
%%time

k_bin_size=600

print(f'***************** Bin {k_bin_size} ***************** ')
df= gen_hash_cutoff_df(xgb_models,features_test,test_hash,
                            k_bin_size=k_bin_size,quant=0.95)
print('Test Hash Cutoff Df Stats')
print('Total size:',len(df[df['total']==1]))
print('Success size:',len(df[(df['total']==1) & (df['ratio']>0)]))

all_k,top_k,test_all_k = gen_all_k(xgb_models,features_test,
                                test,df,target_total=1)

top_k_ele = [x[0] for x in top_k]
print(top_k_ele)
print('Test Size:',len(test_all_k))
test_all_k.head()

In [None]:
# test_sel=test_all_k[:5]
test_sel=test_all_k.sort_values('probs',ascending=False)[:5]
test_sel

In [None]:
top_k_ele = [x[0] for x in top_k]
print(top_k_ele)
matches = set(list(test_sel.k)).intersection(set(top_k_ele))
print(f'{len(matches)}')
print(matches)

Important Feats Top K Elements

In [None]:
%%time

k_bin_size=600

all_k_imp =[-1]*hash_imp_count
top_k_imp =[-1]*hash_imp_count
test_all_k_imp =[-1]*hash_imp_count
top_k_ele_imp =[-1]*hash_imp_count

df= gen_hash_cutoff_df(xgb_models,features_test,test_hash,
                            k_bin_size=k_bin_size,quant=0.95)
print('Test Hash Cutoff Df Stats')
print('Total size:',len(df[df['total']==1]))
print('Success size:',len(df[(df['total']==1) & (df['ratio']>0)]))

for hash_idx in range(hash_imp_count):

    print(f'***************** Test Hash {hash_imp_names[hash_idx]} ***************** ')

    all_k_imp[hash_idx],top_k_imp[hash_idx],test_all_k_imp[hash_idx] = gen_all_k(xgb_models,features_test,
                                                                        test_hash_imp[hash_idx],
                                                                        df,target_total=1)

    top_k_ele_imp[hash_idx] = [x[0] for x in top_k_imp[hash_idx]]


In [None]:
test_all_k_imp[0].head()

END

In [None]:
%%time
test_sel_all = None
# bin_list = list(range(6054,7800,100))
bin_list = list(range(600,1001,100))
# bin_list = list(range(1000,2001,100))
# bin_list = list(range(2100,3000,100))
# bin_list = list(range(1057,2001,100))
# bin_list = list(range(6000,6500,100))
total_matches=0
start =0
for i,k_bin_size in enumerate(bin_list):
    print(f'***************** Bin {k_bin_size} ***************** ')
    df_temp= gen_hash_cutoff_df(xgb_models,features_test,test_hash,
                                k_bin_size=k_bin_size,quant=0.95)
    end = int(len(test) * (i+1) / len(bin_list) )
    print(f'{end=}')
    all_k_temp,top_k_temp,test_sel_temp = gen_all_k(xgb_models,features_test,
                                                    test,
                                                    df_temp,
                                                    target_total=1)
    start = end
    
    cur_test_sel = test_sel_temp[:5]
    print(test_sel_temp[:10][['k','probs','roll_actual']])
    
#     cur_test_sel=test_sel_temp.sort_values('probs',ascending=False)[:5]
#     cur_test_sel=cur_test_sel[cur_test_sel['k'].isin(all_k_temp[:10])]
    
#     cur_test_sel=test_sel_temp[:100]
# #     print(cur_test_sel[['k','probs','roll_actual']])
#     cur_test_sel=cur_test_sel.sort_values('probs',ascending=False)[:10]
#     print(cur_test_sel[['k','probs','roll_actual']])

    top_k_ele = [x[0] for x in top_k_temp]
    print(top_k_ele)
    matches = set(list(cur_test_sel.k)).intersection(set(top_k_ele))
    print(f'{len(matches)}')
    print(matches)
    
    cur_test_sel['bin']=k_bin_size
    
    
    
    if test_sel_all is None:
        test_sel_all = cur_test_sel
        total_matches +=len(matches)
    else:
        mask = cur_test_sel.index.isin(test_sel_all.index)
        if len(cur_test_sel[mask])==0:
            total_matches +=len(matches)
            test_sel_all = pd.concat([test_sel_all,cur_test_sel],axis=0)
            print(f'total matches: {total_matches} size:{len(test_sel_all)}')
print('Test Size:',len(test_sel_all))
print(f'{total_matches=}')
# print(all_k_temp)
# print(top_k_temp)
test_sel_all.head()

In [None]:
temp = df[df.index.isin(all_k)].sort_values('cutoff',ascending=False)
temp_filt = temp[:10]
print(temp_filt.index)
top_k_ele = [x[0] for x in top_k]
print(top_k_ele)
set(list(temp_filt.index)).intersection(set(top_k_ele))

In [None]:
# test_probs = generate_probs(True,xgb_models,test,features_test)
# top_k=[]
# all_k=[]
# for k in range(len(df)):
#     row = df.iloc[k]
# #     print(row['ratio'],row['cutoff'],row['cutoff_2'])
#     mask = (test_probs['probs']>=row['cutoff']) & (test_probs['probs']<=row['cutoff_2'])
#     mask2 = create_target_mask(test_probs)    
#     test_filt = test_probs[mask]

#     total = len(test_filt)
#     if total==0:
#         success=0
#         ratio=0
#     else:
#         success = len(test_probs[mask & mask2])
#         ratio = success / total
#     if total==1:
#         all_k.append(k) 
#     if ratio>=0.12:
#         if total==1:
#             top_k.append((k,ratio,total))
# #     print(ratio,success,total,row['cutoff'],k)
# print(len(all_k))
# print(all_k)
# print(len(top_k))
# print(top_k)
# mask = (test_hash_probs['probs']>=row['cutoff']) & (test_hash_probs['probs']<=row['cutoff_2'])
# test_filt_hash  = test_hash_probs[mask]

In [None]:
# all_k_arr = np.array(all_k)
# diff = np.diff(all_k_arr)
# print(diff)
# ind = np.where(diff>=20)
# print(ind[0]+1)
# trans_k = np.sort(np.concatenate((all_k_arr[ind[0]],all_k_arr[ind[0]+1])))
# print(len(trans_k))
# trans_k

In [None]:
# def get_sim_cutoff_data(test,xgb_models,features_test,test_hash_cutoff_df,
#                        is_imp_feats,imp_feats_count,gen_probs=True):
#     if gen_probs:
#         test_probs = generate_probs(True,xgb_models,test,features_test)
#     else:
#         test_probs = test
#     print('Generate probs completed')
#     test_probs['roll_actual_init']=test_probs['roll_actual'].copy()

#     if is_imp_feats:
#         hash_groups =[ [col for col in imp_df['feature'].values if (col.startswith('roll_'))][:imp_feats_count] ]
# #         hash_groups = [['roll_mean_nonce']]
#     else:
#     #     hash_groups = [['roll_21_nonce', 'roll_37_nonce', 'roll_mean_nonce','roll_36_nonce','roll_48',
#     #                    'roll_53_nonce','roll_32_nonce','roll_44_nonce','roll_38_nonce','roll_17']]
#         hash_groups = [list(range(1,11)),list(range(25,36)),list(range(50,60)),list(range(100,111)) ]
# #         hash_groups = [list(range(25,36)) ]
#     hash_groups_flat = [item for sublist in hash_groups for item in sublist]
#     print(hash_groups)
#     print('list creation complete')
    

#     top_k=[]
#     sim_cutoff_df = pd.DataFrame(columns=['k','ratio','total','hash_no','hash_group','roll_mean'])
#     df_size = len(test_hash_cutoff_df)
#     for i,row in enumerate(test_hash_cutoff_df.itertuples()):
#         k=row.Index
# #         if (k%10)==0:
# #             print(f'loop for k = {k}')
# #         row = test_hash_cutoff_df.iloc[k]
#     #     print(row['ratio'],row['cutoff'],row['cutoff_2'])
#         mask = (test_probs['probs']>=row.cutoff) & (test_probs['probs']<=row.cutoff_2)
#         test_filt = test_probs[mask]
#         total = len(test_filt)
#         ratios=[]
        
#         # test_filt[hash_groups_flat] <
# #     if HIGH_ANALYSIS:
# #         success = (test_filt[hash_groups_flat] > HIGH_TARGET).sum(axis=1)
# #     else:
# #         success = (test_filt[hash_groups_flat] < LOW_TARGET).sum(axis=1)
    
# #     ratio_mean = success / len(hash_groups_flat)
# #     roll_mean_mean = test_filt[hash_groups_flat].mean(axis=1)

#         for idx in hash_groups_flat:
# #             print(f'k:{k} idx:{idx}')
#             if is_imp_feats:
#                 test_filt['roll_actual']=test_filt[idx].copy()    
#             else:
#                 test_filt['roll_actual']=test_filt[f'roll_actual_{idx}'].copy()
#             mask2 = create_target_mask(test_filt)
#             if total==0:
#                 success=0
#                 ratio=0
#             else:
#                 success = len(test_filt[mask2])
#                 ratio = success / total
# #             print(ratio,success,total,row['cutoff'],k)
#             roll_mean=test_filt['roll_actual'].mean()
#             sim_cutoff_df.loc[len(sim_cutoff_df.index)] = [k,ratio, total, idx, np.nan,roll_mean]

#     for group_no,group in enumerate(hash_groups):
#         start =group[0]
#         end = group[-1]
#         mask = (sim_cutoff_df['hash_no']>=start) & (sim_cutoff_df['hash_no']<=end) 
#         sim_cutoff_df.loc[mask,'hash_group']=group_no
        
#     test_probs['roll_actual']=test_probs['roll_actual_init'].copy()
    
#     return sim_cutoff_df,test_probs
        



In [None]:
# def func(data):
#     return data['x']**2


# temp_df = pd.DataFrame(columns=['x','y','z'])
# temp_df['x']=np.random.randint(5,12,size=10)
# temp_df['y']=np.random.randint(65,82,size=10)
# temp_df['z']=np.random.randint(1000,2000,size=10)
# res=temp_df.apply(func)
# res

In [None]:
def get_sim_summary(test_filt_all_k,
                       test_hash_cutoff_df,
                       is_imp_feats,imp_feats_count):

    if is_imp_feats:
        print('imp_feats_count:',imp_feats_count)
        hash_groups =[ [col for col in imp_df['feature'].values if (col.startswith('roll_'))][:imp_feats_count] ]
    else:
        hash_groups = [list(range(1,11)),list(range(25,36)),list(range(50,60)),list(range(100,111)) ]
    hash_groups_flat = [item for sublist in hash_groups for item in sublist]
    print(hash_groups)
    
    
    if HIGH_ANALYSIS:
        success = (test_filt_all_k[hash_groups_flat] > HIGH_TARGET).sum(axis=1)
    else:
        success = (test_filt_all_k[hash_groups_flat] < LOW_TARGET).sum(axis=1)
    
    ratio_mean = success / len(hash_groups_flat)
    roll_mean = test_filt_all_k[hash_groups_flat].mean(axis=1)
    
    sim_summary = pd.DataFrame()
    sim_summary['k']=test_filt_all_k['k']
    sim_summary['ratio_mean']=ratio_mean
    sim_summary['roll_mean']=roll_mean
    
    sim_summary=sim_summary.reset_index(drop=True)
    
    return sim_summary
    

In [None]:
# def get_hash_prob_top_data(test,xgb_models,features_test,
#                            test_hash_cutoff_df,is_imp_feats,imp_feats_count,
#                            test_filt_all_k,
#                            sim_cutoff_df = None,is_min_total=True,
#                            total_cri=12,top_n=10,
#                           ratio_mean_asc=False,roll_mean_asc=True):

# #     if sim_cutoff_df is None:
# #         sim_cutoff_df,test_probs= get_sim_cutoff_data(test,xgb_models,features_test,
# #                                                       test_hash_cutoff_df,
# #                                                      is_imp_feats,imp_feats_count)
# #     else:
# #         test_probs = generate_probs(True,xgb_models,test,features_test)
        
# #     print(len(sim_cutoff_df))
# # #     print(sim_cutoff_df.head())
# #     #filter only high total records
# #     if is_min_total:
# #         sim_cutoff_df_filt = sim_cutoff_df[sim_cutoff_df['total']>=total_cri]
# #     else:
# #         sim_cutoff_df_filt = sim_cutoff_df[sim_cutoff_df['total']<=total_cri]
# #         if total_cri!=1:
# #             sim_cutoff_df_filt = sim_cutoff_df_filt[sim_cutoff_df_filt['total']>1]
# #     print(len(sim_cutoff_df_filt))
# # #     print(sim_cutoff_df_filt.dtypes)
# #     sim_cutoff_df_filt['total']=  sim_cutoff_df_filt['total'].astype('float')
# #     sim_summary  = sim_cutoff_df_filt.groupby(['k']).agg( total =('total','mean'),
# #                                                         roll_mean=('roll_mean','mean'),
# #                                                          ratio_mean=('ratio', 'mean'), 
# #                                                           ratio_std=('ratio', 'std')).reset_index()
    
#     sim_summary= get_sim_summary(test_filt_all_k,test_hash_cutoff_df,is_imp_feats,imp_feats_count)
# #     print(sim_summary.head(20))
# #     print(sim_summary.sort_values(['ratio_mean','roll_mean'],ascending=False))
    
#     top_mean_data = sim_summary.sort_values(['ratio_mean','roll_mean'],ascending=[ratio_mean_asc,roll_mean_asc])[:top_n].reset_index(drop=True)
#     print('--- Top 5 data ---')
#     print(top_mean_data[:5])
# #     print('--- Top 6 to 10 data ---')
# #     print(top_mean_data[5:20])
# #     top_std_row = top_mean_data.sort_values('ratio_std',ascending=True).iloc[0]
# #     print('--- Selected K ---')
# #     print(top_std_row)
# #     k_sel = int(top_std_row['k'])

# #     k_sel = int(top_mean_data[0:1]['k'])
# #     row = test_hash_cutoff_df.iloc[k_sel]
# #     #     print(row['ratio'],row['cutoff'],row['cutoff_2'])
# #     mask = (test_probs['probs']>=row['cutoff']) & (test_probs['probs']<=row['cutoff_2'])
# #     test_sel = test_probs[mask]
# #     print('Test Size:',len(test_sel))
    
#     return top_mean_data


In [None]:
# k_sel,test_sel = get_test_sel_hash_prob(test,xgb_models,features_test,df,
#                           min_total=12, top_n=10)
# print(k_sel)

In [None]:
def create_cutoff_pos_df(test,models,features_test,test_hash_cutoff_df,
                         imp_feats_count_list,test_filt_all_k,
                         gen_probs=True,is_compute_matches=True,
                   test_ratio_limit=0.12,sim_cutoff_dfs=None,
                         low_high_record_condns=[(True,12,10),(False,10,10),(False,2,10)],
                        ratio_mean_asc=False,roll_mean_asc=True,
                        top_k_ele=None):

    cutoff_pos_df =pd.DataFrame(columns = ['imp_feats_count','condn_max','condn_val',
                                           'k_top_1','k_top_2','k_top_3','k_top_4','k_top_5',
                                          'mean_top_1', 'mean_top_2','mean_top_3','mean_top_4',
                                          'mean_top_5','match_pos_1',
                                          'match_total_top_5','match_total_top_10','match_ks_top_5'])

    
    max_good_k= test_hash_cutoff_df[test_hash_cutoff_df['ratio']<test_ratio_limit].index[0]-1
    print(f'{max_good_k=}')
    is_imp_feats = True 
    gen_sim_cutoff=False
    if sim_cutoff_dfs is None:
        gen_sim_cutoff=True
        sim_cutoff_dfs=[]
    elif gen_probs:
        test_probs = generate_probs(True,xgb_models,test,features_test) 
    else:
        test_probs = test
    top_mean_data_list=[]
    for i,imp_feats_count in enumerate(imp_feats_count_list):
        print()
        print(f'********************** Feature count {imp_feats_count} *****************')
#         if gen_sim_cutoff:
#             sim_cutoff_df,test_probs= get_sim_cutoff_data(test,models,features_test,test_hash_cutoff_df,
#                                                      is_imp_feats,imp_feats_count,gen_probs)
#             sim_cutoff_dfs.append(sim_cutoff_df)
#         else:
#             sim_cutoff_df=sim_cutoff_dfs[i]
        
        for condn in low_high_record_condns:
            print(f'******* Condition {condn} for feature count {imp_feats_count} *****************')
#             top_mean_data = get_hash_prob_top_data(test,models,features_test,
#                                        test_hash_cutoff_df,is_imp_feats,imp_feats_count,
#                                         test_filt_all_k,
#                                        sim_cutoff_df = None,
#                                         is_min_total=condn[0], total_cri=condn[1],
#                                         top_n=condn[2],
#                                         ratio_mean_asc=ratio_mean_asc,roll_mean_asc=roll_mean_asc)
            
            sim_summary= get_sim_summary(test_filt_all_k,test_hash_cutoff_df,
                                         is_imp_feats,imp_feats_count)
            top_mean_data = sim_summary.sort_values(['ratio_mean','roll_mean'],ascending=[ratio_mean_asc,roll_mean_asc])[:top_n].reset_index(drop=True)
            print('--- Top 5 data ---')
            print(top_mean_data[:5])

            top_mean_data_list.append(top_mean_data)
            
            if top_k_ele is not None:
                print(f'\n******success data *****')
                mask= top_mean_data['k'].isin(top_k_ele)
                print(top_mean_data[mask])
                print(f'\n******success positions *****')
                cur_positions = list(top_mean_data[mask].index+1)
                print(cur_positions)
            
#             print(top_mean_data)
            top_5_mean_data = top_mean_data[:5]
            k_tops=list(top_5_mean_data['k'].values)
            mean_tops=list(top_5_mean_data['ratio_mean'].values)
            
            match_pos_1=np.nan
            match_total_top_5=np.nan
            match_total_top_10=np.nan
            match_ks_top_5=np.nan
            
            if is_compute_matches:
                
                top_mean_data['rank']=top_mean_data['ratio_mean'].rank(method='average',
                                                                  ascending=False)
                top_5_mean_data = top_mean_data[:5]
               
                matches_top_5=top_5_mean_data[top_5_mean_data['k']<=max_good_k]
                matches_top_5=matches_top_5[matches_top_5['ratio_mean']>0]
                match_total_top_5=len(matches_top_5)

                matches_top=top_mean_data[top_mean_data['k']<=max_good_k]
                matches_top=matches_top[matches_top['ratio_mean']>0]
                match_total_top_10 = len(matches_top)

                if match_total_top_5==0:
                    match_ks_top_5=[]
                    if match_total_top_10==0:
                        match_pos_1=11
                    else:
                        match_pos_1=matches_top.iloc[0]['rank']
                else:
    #                 match_pos_1=matches_top_5.index[0]+1
                    match_pos_1=matches_top_5.iloc[0]['rank']
                    match_ks_top_5 = list(matches_top_5['k'].values)
 
            
            result = [imp_feats_count,condn[0],condn[1]]+k_tops+mean_tops \
                    +[match_pos_1,match_total_top_5,match_total_top_10,match_ks_top_5]
        
#             print(cutoff_pos_df.columns)
#             print(f'{len(cutoff_pos_df.columns),len(result),result}')
            next_idx = len(cutoff_pos_df.index)
            cutoff_pos_df.loc[next_idx] = result
        
            if is_compute_matches:
                match_cols = [col for col in cutoff_pos_df if col.startswith('match')]
                print(cutoff_pos_df.loc[next_idx][match_cols])
        
    return cutoff_pos_df,sim_cutoff_dfs,top_mean_data_list


In [None]:
import math

def get_test_sel_cutoff_pos(test_probs,models,features_test,test_hash_cutoff_df,
                            cutoff_pos_df_hash,
                            is_min_total=False,total_cri=2,top_n=10,
                           sim_cutoff_dfs=None,imp_feats_count_list=None):
    cutoff_pos_stat = cutoff_pos_df_hash.groupby(['imp_feats_count'])['match_pos_1',
                                               'match_total_top_5',
                                               'match_total_top_10'].mean().reset_index()
    print(' ........... Overall Positions of Features .........')
    print(cutoff_pos_stat)
    cutoff_pos_df_hash_filt=cutoff_pos_df_hash[cutoff_pos_df_hash['condn_max']==False]
    cutoff_pos_stat = cutoff_pos_df_hash_filt.groupby(['imp_feats_count'])['match_pos_1',
                                               'match_total_top_5',
                                               'match_total_top_10'].mean().reset_index()
    print(' ........... Low Positions of Features .........')
    print(cutoff_pos_stat)
    cutoff_pos_best= cutoff_pos_stat.sort_values(['match_pos_1','match_total_top_5'],
                                                ascending=[True,False])\
                                    .reset_index().iloc[0]
    print('*** Best cutoff position in hash **** ')
    print(cutoff_pos_best)
    predicted_raw_pos=cutoff_pos_best['match_pos_1']
    predicted_max_pos = math.ceil(predicted_raw_pos)
    
    if predicted_raw_pos==predicted_max_pos:
        predicted_max_pos+=1
    
    is_imp_feats=True
    imp_feats_count=cutoff_pos_best['imp_feats_count']
    is_min_total=False
    
    if sim_cutoff_dfs is None:
        sim_cutoff_df,test_probs= get_sim_cutoff_data(test_probs,models,features_test,test_hash_cutoff_df,
                                                 is_imp_feats,imp_feats_count,gen_probs=False)
    else:
        #to use existing buffer, read the matching sim cutoff df from the list of buffer
        idx = imp_feats_count_list.index(imp_feats_count)
        sim_cutoff_df=sim_cutoff_dfs[idx]

    top_mean_data = get_hash_prob_top_data(test_probs,models,features_test,
                           test_hash_cutoff_df,is_imp_feats,imp_feats_count,
                           sim_cutoff_df = sim_cutoff_df,
                            is_min_total=is_min_total, total_cri=total_cri,
                            top_n=top_n)
    
    top_mean_data['rank']=top_mean_data['ratio_mean'].rank(method='min',ascending=False)
    mask = (top_mean_data['rank']<=predicted_max_pos) & (top_mean_data['ratio_mean']>0)
    top_mean_sel = top_mean_data[mask]
    k_sel_list = list(top_mean_sel['k'].values)
    print(top_mean_data)
    print(top_mean_sel)
    print(f'{k_sel_list=}')
    

#     mask = cutoff_pos_df_actual['imp_feats_count']==cutoff_pos_best['cutoff_pos_best']
#     mask2 = cutoff_pos_df_actual['match_pos_1']<=predicted_max_pos
#     k_sel = cutoff_pos_df_actual.loc[mask & mask2, 'k' ]
    
    test_sel  = None
    for k_sel in k_sel_list:
        row = test_hash_cutoff_df.loc[k_sel]
    #     print(row)
        mask = (test_probs['probs']>=row['cutoff']) & (test_probs['probs']<=row['cutoff_2'])
        cur_test_sel = test_probs[mask]
        cur_test_sel['k']=k_sel
        if test_sel is None:
            test_sel = cur_test_sel
        else:
            test_sel = pd.concat([test_sel,cur_test_sel],axis=0)
        print('Test Size:',len(test_sel))

    return k_sel_list,test_sel,top_mean_sel



In [None]:
# k_sel_list=[218,198,201,190,184]
# print(df[df.index.isin(k_sel_list)])
# test_temp=None
# for k_sel in k_sel_list:
#     row = df.loc[k_sel]
# #     print(row)
#     mask = (test['probs']>=row['cutoff']) & (test['probs']<=row['cutoff_2'])
#     cur_test_sel = test[mask]
#     cur_test_sel['k']=k_sel
#     if test_temp is None:
#         test_temp = cur_test_sel
#     else:
#         test_temp = pd.concat([test_temp,cur_test_sel],axis=0)

# cols = [col for col in imp_df['feature'].values if (col.startswith('roll_'))][:20]        
# # test_temp
# test_temp['feats_mean']=test_temp[cols].mean(axis=1)
# test_temp['feats_std']=test_temp[cols].std(axis=1)
# test_temp['feats_count']=(test_temp[cols]<1000).sum(axis=1)
# test_temp[['k','roll_actual','feats_mean','feats_std','feats_count']+cols]

In [None]:
# test_orig= test.copy()
# test_hash_orig = test_hash.copy()

In [None]:
count_all_k = len(all_k)
if count_all_k<10:
    print(f'PROJECT WARNING: number of target records is {count_all_k} which is less than 10 ')
count_top_k = len(top_k)    
if count_top_k<2:
    print(f'PROJECT WARNING: number of probable success is {count_top_k} which is less than 2')  
hash_success = len(df[(df['total']==2) & (df['ratio']>0)])    
if hash_success<2:
    print(f'PROJECT WARNING: number of hash success is {hash_success} which is less than 2')

In [None]:
print(len(test))
mask = test.duplicated(subset='client_seed')
print(len(test[mask]))
print(test[mask]['client_seed'].nunique())

print(len(test_hash))
mask = test_hash.duplicated(subset='client_seed')
print(len(test_hash[mask]))
print(test_hash[mask]['client_seed'].nunique())

In [None]:
# test=test.drop_duplicates(subset='client_seed')
# test_hash=test_hash.drop_duplicates(subset='client_seed')

In [None]:
# %%time
# sim_cutoff_dfs_temp=None
# imp_feats_count_list = [2,5,10,15,20,25,30,40]
# cutoff_pos_df_temp,sim_cutoff_dfs_temp,test_probs=create_cutoff_pos_df(test,xgb_models,
#                     features_test,df[df.index.isin([2,22,70,100,178,200,486,
#                                                     488,525,604,672,681,747,
#                                                    812,1212,1392,1536])],
#                     imp_feats_count_list,is_compute_matches=False,
#                    test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_temp,
#                     low_high_record_condns=[(False,2,20)])
# cutoff_pos_df_temp

In [None]:
# %%time
# imp_feats_count_list = [30,40]
# sim_cutoff_dfs=None
# cutoff_pos_df_hash,sim_cutoff_dfs,test_probs_hash=create_cutoff_pos_df(test_hash,xgb_models,
#                     features_test,df[df['total']<=10],
#                     imp_feats_count_list,
#                    test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs,
#                     low_high_record_condns=[(False,10,20),(False,2,20)])
# cutoff_pos_df_hash

In [None]:
# cutoff_pos_df_hash[cutoff_pos_df_hash['condn_val']==2]

In [None]:
# print(len(df))
# df_filt = df[(df['total']>=50)]
# print(len(df_filt))
# # df_filt[df_filt['ratio']==0.12]

In [None]:
# %%time
# imp_feats_count_list = [2,5,10,15,20,25,30,40]
# sim_cutoff_dfs=None
# min_normal_count=50
# max_low_count = 5
# cutoff_pos_df_hash,sim_cutoff_dfs,test_probs_hash=create_cutoff_pos_df(test_hash,xgb_models,
#                     features_test,df[(df['total']<=max_low_count) | (df['total']>=min_normal_count)],
#                     imp_feats_count_list,
#                    test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs,
#                     low_high_record_condns=[(True,min_normal_count,10),
#                                             (False,max_low_count,20),(False,2,20)])
# cutoff_pos_df_hash

In [None]:
# cutoff_pos_df_hash[cutoff_pos_df_hash['condn_val']==2]

In [None]:
print(all_k)
# print(trans_k)

In [None]:
# %%time
# imp_feats_count_list = [2,5,10,15,20,25,30,35,40,45,50,55,60]
# sim_cutoff_dfs=None
# cutoff_pos_df_hash,sim_cutoff_dfs,test_probs_hash=create_cutoff_pos_df(test_hash,xgb_models,
#                     features_test,df[(df['total']==1)],
#                     imp_feats_count_list,
#                    test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs,
#                     low_high_record_condns=[(False,1,20)])
# cutoff_pos_df_hash

In [None]:
# temp=df.loc[[243,181,220,226,245,10,4]]
# temp['diff']=(df.loc[214,'cutoff']-temp['cutoff']).abs()
# temp.sort_values('diff')

In [None]:
# # set(top_mean_data_actual[0]['k']).intersection(set(top_k_ele))

# for i in range(13):
#     cur_top_data = top_mean_actual[i]
#     print(f'******{imp_feats_count_list[i]=}*****')
#     print(cur_top_data[cur_top_data['k'].isin(top_k_ele)])
#     print(cur_top_data[cur_top_data['k'].isin(top_k_ele)].index+1)

Check top important hash test data for ascending or descending ratio mean performance

In [None]:
%%time
imp_feats_count_list = [2,5,10,15,20,25,30,35,40,45,50,55,60]
# imp_feats_count_list = [2,5]
top_n = 62
imp_feats_size = len(imp_feats_count_list)
first_match_pos_arr= np.zeros((hash_imp_count,2,imp_feats_size))
dictionary_list=[]
for hash_idx in range(hash_imp_count):
    top_n = len(all_k_imp[hash_idx])
    print(f'\ntop_n:{top_n}')
    for ratio_mean_order in [False,True]:
        print(f'********************** Test Hash {hash_imp_names[hash_idx]} ratio_mean order {ratio_mean_order}***********************')
        sim_cutoff_dfs_actual=None
        cutoff_pos_df_actual,sim_cutoff_dfs_actual,top_mean_actual=create_cutoff_pos_df(test_hash_imp[hash_idx],xgb_models,
                            features_test,df[df.index.isin(all_k_imp[hash_idx])],
                            imp_feats_count_list,test_all_k_imp[hash_idx],is_compute_matches=False,
                           test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
                            low_high_record_condns=[(False,1,top_n)],
                            ratio_mean_asc=ratio_mean_order,roll_mean_asc=not(HIGH_ANALYSIS))

        first_match_pos_list=[]
        match_pos_list=[]
        for feats_idx in range(imp_feats_size):
            cur_top_data = top_mean_actual[feats_idx]
            print(f'\n******{imp_feats_count_list[feats_idx]} feats success data *****')
            mask= cur_top_data['k'].isin(top_k_ele_imp[hash_idx])
            print(cur_top_data[mask])
            cur_positions = list(cur_top_data[mask].index+1)
            
            dictionary_data = {'imp_feats_count': imp_feats_count_list[feats_idx],
                               'hash_index':hash_idx, 
                               'hash_feat_name': hash_imp_names[hash_idx], 
                                'ratio_mean_asc':ratio_mean_order,
                                'first_match_pos':cur_positions[0],
                                'other_match_pos':cur_positions[1:]
                  }
            dictionary_list.append(dictionary_data)
            print(f'\n {imp_feats_count_list[feats_idx]} feats success positions')
            print(cur_positions)
        
        
match_pos_df = pd.DataFrame.from_dict(dictionary_list)
match_pos_df        

In [None]:
        
match_summary =match_pos_df.groupby(['hash_index','ratio_mean_asc'])\
                            .agg(first_match_pos_mean =('first_match_pos','mean'))
                                                                        
print(match_summary)
match_top_pos = match_summary['first_match_pos_mean'].groupby('hash_index', group_keys=False).nsmallest(1)
match_top_pos = match_top_pos.reset_index()
print(match_top_pos)
match_value_counts = match_top_pos['ratio_mean_asc'].value_counts()
top_ratio_mean_asc = match_value_counts.index[0]  
if len(match_value_counts)==2:
    ratio_mean_order_equal = match_top_pos['ratio_mean_asc'].value_counts().iloc[0]==match_top_pos['ratio_mean_asc'].value_counts().iloc[1]
    if ratio_mean_order_equal:
        print('PROJECT WARNING: Both Ratio mean order ascending and descending are equal')
top_ratio_mean_asc

In [None]:
len(match_value_counts)

In [None]:
# match_pos_df.groupby('hash_index')
mask = (match_pos_df['ratio_mean_asc']==top_ratio_mean_asc)
# mask = (match_pos_df['ratio_mean_asc']==False)
pos_top_df= match_pos_df[mask].sort_values(['hash_index','first_match_pos'],ascending=True)\
            .groupby(['hash_index'])\
            .head(5)
print(pos_top_df['imp_feats_count'].value_counts())
pos_top_df
# match_pos_df.sort_values(['first_match_pos'],ascending=True)\
#             .groupby(['hash_index','ratio_mean_asc'])\
#             .head(5)\
#             .sort_values(['hash_index','ratio_mean_asc'],ascending=True)
    

In [None]:
%%time
imp_feats_count_list = [2,5,10,15,20,25,30,35,40,45,50,55,60]
sim_cutoff_dfs_actual=None
top_n = len(all_k)
cutoff_pos_df_actual,sim_cutoff_dfs_actual,top_mean_actual=create_cutoff_pos_df(test,xgb_models,
                    features_test,df[df.index.isin(all_k)],
                    imp_feats_count_list,test_all_k,is_compute_matches=False,
                   test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
                    low_high_record_condns=[(False,1,top_n)],
                    ratio_mean_asc=top_ratio_mean_asc,roll_mean_asc=not(HIGH_ANALYSIS),
                    top_k_ele=top_k_ele)
cutoff_pos_df_actual

In [None]:
# %%time
# # imp_feats_count_list = [2,5,10,15,20,25,30,40]
# sim_cutoff_dfs_actual=None
# cutoff_pos_df_actual,sim_cutoff_dfs_actual,test_probs=create_cutoff_pos_df(test,xgb_models,
#                     features_test,df[df.index.isin(trans_k)],
#                     imp_feats_count_list,is_compute_matches=False,
#                    test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
#                     low_high_record_condns=[(False,2,20)])
# cutoff_pos_df_actual

In [None]:
# # k_sel,test_sel,cutoff_pos_best =get_test_sel_cutoff_pos(test_probs,cutoff_pos_df_hash,cutoff_pos_df_actual)
# # imp_feats_count_list = [2,5,10,15,20,25]

# k_sel,test_sel,top_mean_best =get_test_sel_cutoff_pos(test_probs,xgb_models,features_test,
#                                                       df,
#                             cutoff_pos_df_hash,
#                             is_min_total=False,total_cri=2,top_n=10,
#                             sim_cutoff_dfs=sim_cutoff_dfs_actual,
#                             imp_feats_count_list=imp_feats_count_list)
# print(f'{k_sel=}')
# print('cutoff best')
# print(top_mean_best)
# print(' ** selected test **')
# test_sel[['k','client_seed','probs']]

In [None]:
test_sel=None
#array([ 10, 536, 371, 333, 518])
k_list = top_mean_actual[8][:5]['k'].values
print(k_list)
for k_sel in k_list:
    row = df.iloc[k_sel]
#     print(row)
    mask = (test['probs']>=row['cutoff']) & (test['probs']<=row['cutoff_2'])
    cur_test_sel = test[mask]
    cur_test_sel['k']=k_sel
    if test_sel is None:
        test_sel = cur_test_sel
    else:
        test_sel = pd.concat([test_sel,cur_test_sel],axis=0)
    print('Test Size:',len(test_sel))
test_sel[['k','roll_actual','client_seed']]

In [None]:
imp_feats = [col for col in imp_df['feature'].values if (col.startswith('roll_'))][:10] 
test_sel[['k']+imp_feats]

In [None]:
sim_hash_group[sim_hash_group['k']==24]

In [None]:
sim_cutoff_df[sim_cutoff_df['k']==24]

In [None]:
def compute_total(data,cutoffs,cols):
    mask = None
    for cutoff,col in zip(cutoffs,cols):
        cur_mask =(data[col]>=cutoff) 
        if mask is None:
            mask = cur_mask
        else:
            mask = mask & cur_mask
    data_filt = data[mask]
    total = len(data_filt)
    return total,data_filt

In [None]:
cols=['roll_20_nonce','roll_14_nonce','roll_3_nonce','roll_16_nonce','roll_11_nonce']
cols=['roll_43_nonce','roll_56_nonce','roll_31_nonce','roll_20_nonce','roll_45_nonce']
cols=['roll_31_nonce','roll_52_nonce','roll_15_nonce','roll_45_nonce','roll_2_nonce']
cols=['roll_31_nonce','roll_52_nonce']
cols=['roll_52_nonce','roll_31_nonce','roll_24_nonce','roll_49_nonce','roll_41_nonce']
cols=['roll_mean_25_50_nonce','roll_52_nonce','roll_31_nonce']
cols=['roll_52_nonce','roll_31_nonce']
cols=['roll_46_nonce','roll_25_nonce']
cols=['roll_42_nonce','roll_37_nonce']

# cols=['roll_20_nonce','roll_15_nonce','roll_13_nonce','roll_12_nonce','roll_18_nonce']
# cols=['roll_20_nonce','roll_15_nonce','roll_13_nonce']
# cols=['roll_6_nonce','roll_11_nonce','roll_12_nonce','roll_10_nonce',]




In [None]:
def get_cutoff(data,data_hash,col,quantile):
    mask=create_target_mask(data_hash)
#     cutoff = data[col].mean()-data_hash[col].mean()+data_hash[mask][col].mean()
#     cutoff = data_hash[mask][col].quantile(0.5)
    cutoff = data[col].mean()-data_hash[col].mean()+data_hash[mask][col].quantile(quantile)
    return cutoff



best_ratio =0
best_success = 0
best_total = 0
best_cutoffs = []
best_quantile = 0
best_actual = test_filt
# for quantile in np.linspace(0.01,0.95,50):
for quantile in np.linspace(0.01,0.95,50):
    cutoffs=[]
    for col in cols:
        #use entire hash to determine cutoff
        cutoff = get_cutoff(test,test_hash,col,quantile)
        cutoffs.append(cutoff)
    
#     print(cutoffs)
    total,filt_inter = compute_total(test_filt_hash,cutoffs,cols)
    mask2=create_target_mask(filt_inter)
#     mask2 = (filt_inter['roll_actual']>=9000)
    success = len(filt_inter[mask2])
    
    total_actual,filt_inter_actual = compute_total(test_filt,cutoffs,cols)
    mask2=create_target_mask(filt_inter_actual)

#     mask2 = (filt_inter_actual['roll_actual']>=9000)
    
    if total_actual==0:
        success_actual=0
        ratio_actual=0
    else:
        success_actual = len(filt_inter_actual[mask2])
        ratio_actual = success_actual / total_actual
    print('\nquantile:',quantile)
    print('Actuals:',ratio_actual,success_actual,total_actual)
    
    if total==0:
        ratio=0
    else:
        ratio = success / total
    print('Hash:',ratio,success,total)
#     print(quantile,cutoffs)
    print(total,total_actual)
#     print(total,success)
    if (total==0) or (success==0) : #or (total_actual<1):
        break

#     print(ratio,success,total)
    if ratio >= best_ratio:
        best_ratio = ratio
        best_success = success
        best_total = total
        best_cutoffs = cutoffs
        best_quantile = quantile
        best_actual = filt_inter_actual
print(best_quantile)        
print(best_ratio,best_success,best_total)        
print(best_cutoffs)
# print(test_filt[mask]['roll_actual'].describe())

In [None]:
mask2=create_target_mask(best_actual)
success = len(best_actual[mask2])
total = len(best_actual)
ratio = success / total
print(ratio,success,total)

In [None]:
test_sel = best_actual.copy()

In [None]:


def get_actual_based_data(data,data_hash,data_filt,cols,total_cutoff=3):

    best_filt_inter = data
    best_total= 0
    best_quantile = 0
    for quantile in np.linspace(0.01,0.95,50):
        cutoffs=[]
        for col in cols:
            cutoff = get_cutoff(data,data_hash,col,quantile)
        #     cutoff = get_cutoff(test,test_hash,col)
            cutoffs.append(cutoff)
#         print(quantile,cutoffs)

        total,filt_inter=compute_total(data_filt,cutoffs,cols)
        if total<total_cutoff:
            break
        best_filt_inter = filt_inter
        best_total = total
        best_quantile = quantile
        
    return best_filt_inter,best_total,best_quantile

In [None]:
for cutoff in range(0,11):

    best_filt_inter,best_total,best_quantile=get_actual_based_data(test,test_hash,test_filt,cols,
                                                                        total_cutoff=cutoff)
#     print('quantile:',best_quantile)
    mask2=create_target_mask(best_filt_inter)
    # mask2 = (best_filt_inter['roll_actual']>=9000)
    success = len(best_filt_inter[mask2])
    if best_total==0:
        ratio=0
        success=0
    else:
        success = len(best_filt_inter[mask2])
        ratio = success / best_total
    print(cutoff, ratio,success,best_total)
    # print(test_filt[mask]['roll_actual'].describe())

In [None]:
test_sel =best_filt_inter.copy()

In [None]:
best_filt_inter,best_total,best_quantile=get_actual_based_data(test_filt,test_filt_hash,test_filt,cols,
                                                              total_cutoff=4)
print('quantile:',best_quantile)
mask2=create_target_mask(best_filt_inter)
# mask2 = (best_filt_inter['roll_actual']>=9000)
success = len(best_filt_inter[mask2])
if best_total==0:
    ratio=0
    success=0
else:
    success = len(best_filt_inter[mask2])
    ratio = success / best_total
print(ratio,success,best_total)
# print(test_filt[mask]['roll_actual'].describe())

In [None]:
# mask_actual = (test_probs['probs']>=row['cutoff']) & (test_probs['probs']<=row['cutoff_2'])
# idx= test_probs[mask_actual].index
# mask_hash = test_hash_probs.index.isin(idx)
# test_filt_hash_pos = test_hash_probs.loc[mask_hash]

mask_hash = (test_hash_probs['probs']>=row['cutoff']) & (test_hash_probs['probs']<=row['cutoff_2'])
idx= test_hash_probs[mask_hash].index
mask_actual = test_probs.index.isin(idx)
test_filt_pos = test_probs.loc[mask_actual]

best_filt_inter,best_total,best_quantile=get_actual_based_data(test,test_hash,
                                                               test_filt_pos,cols,
                                                              total_cutoff=37)
print('quantile:',best_quantile)
mask2=create_target_mask(best_filt_inter)
# mask2 = (best_filt_inter['roll_actual']>=9000)
if best_total==0:
    ratio=0
    success=0
else:
    success = len(best_filt_inter[mask2])
    ratio = success / best_total
print(ratio,success,best_total)
# print(test_filt[mask]['roll_actual'].describe())

In [None]:
best_filt_inter

In [None]:
test_sel=best_filt_inter.copy()

In [None]:
# analysis_df_copy  = analysis_df.copy()

In [None]:
def report_exceptional_teratio(analysis_df,digitlist,
                     ratio_cutoff,count_cutoff_min,count_cutoff_max,
                     pattern,bHighCheck,
                    zerocolcutoffs,is_ratio_cri_lessthan=False,
                               is_aftval_opp=False,is_aftval_sum=False):
    mask1 = analysis_df['digit'].isin(digitlist)
    if is_ratio_cri_lessthan:
        mask2= (analysis_df['tr_ratio']<ratio_cutoff) 
        if is_aftval_opp:
            mask2= mask2 & ((analysis_df['aft_all']>=ratio_cutoff) | (analysis_df['aft_1']>=ratio_cutoff) | (analysis_df['aft_2']>=ratio_cutoff)) 
    else:
        mask2=  (analysis_df['tr_ratio']>=ratio_cutoff) 
        if is_aftval_opp:
            mask2= mask2 & ((analysis_df['aft_all']<ratio_cutoff) | (analysis_df['aft_1']<ratio_cutoff) | (analysis_df['aft_2']<ratio_cutoff)) \
    
        
    mask= mask1 & mask2 \
             & ((analysis_df['aft_all']!=0) & (analysis_df['aft_1']!=0) & (analysis_df['aft_2']!=0)) \
                & (analysis_df['te_total']>=count_cutoff_min) \
                & (analysis_df['te_total']<=count_cutoff_max) \
                & (analysis_df['pattern_99']==pattern) \
                & (analysis_df['target_high']==bHighCheck) \
                & (analysis_df['zero_col_cutoff'].isin(zerocolcutoffs))
        
    if is_aftval_sum:
        mask= mask & ((analysis_df['aft_all']+analysis_df['aft_1']+analysis_df['aft_2'])>=36) \

    filtered = analysis_df[mask]
    
    records  = len(filtered)
    cols = ['tr_ratio','tr_total','te_total']
    
    return filtered

##### Production Code

In [None]:
# def get_random_client_seed(test):
#     size = len(test)
#     test = test.reset_index(drop=True).sample(n=size).reset_index(drop=True)
#     rand_pos = random.randint(0,size-1)
#     print(f'rand_pos:{rand_pos}')
#     return test.iloc[rand_pos]['client_seed'],test

def get_random_client_seed(test):
    size = len(test)
    final_test= test.sample(n=size)
    rand_pos=random.randint(0,size-1)
    sample=final_test.reset_index(drop=True).loc[rand_pos]
#     rand_pos = sample.index[0]
    print(f'rand_pos:{rand_pos}')
    return sample['client_seed'],final_test

In [None]:
client_seed,final_test = get_random_client_seed(test_sel)
print('Client Seed Selected')
print(client_seed)
# final_test.to_csv(f'data/analysis/final_test_{nonce}.csv')
# final_test.head()

In [None]:
final_test

In [None]:
final_test.reset_index()[final_test.reset_index()['index']==6826]

After Actual SEED is known

In [None]:
actual_seed="e658f73f9831527c16614b72cf87f0a9718961a6099024b6301b10b54dc166d0"
# actual_seed=cur_hash_list[4]
# print(f'{actual_seed=}')

In [None]:
# Vectorize the function
vectorized_calculate_roll = np.vectorize(calculate_roll)

# Compute the roll values for the input arrays
roll_array = vectorized_calculate_roll(actual_seed,
                                       final_test['client_seed'],
                                       nonce)
final_test['roll_actual_dummy']=final_test['roll_actual'].copy()
final_test['roll_actual']=roll_array
# final_test.to_csv(f'data/analysis/final_test_{nonce}.csv')

In [None]:
final_test

In [None]:
final_test.to_csv(f'data/final_test_{nonce}_{file_pattern_str}_pattern.csv')

In [None]:
LOW_TARGET

In [None]:
mask = create_target_mask(final_test)
success_pos = final_test[mask].index

if HIGH_ANALYSIS:
    success_pos = final_test[final_test['roll_actual']>=HIGH_TARGET].index
else:
    success_pos = final_test[final_test['roll_actual']<LOW_TARGET].index
print(len(success_pos))
print(f'success_pos:{success_pos}')

# target = set([17, 19, 34, 35, 44, 47])
# matched  = set(list(success_pos)).intersection(target)
# print(matched)

# print(len(matched)/len(success_pos))

In [None]:
test_sel.head()

In [None]:
final_test

In [None]:
# Vectorize the function
vectorized_calculate_roll = np.vectorize(calculate_roll)

# Compute the roll values for the input arrays
roll_array = vectorized_calculate_roll(actual_seed,
                                       results_df_test['client_seed'],
                                       nonce)

results_df_test['roll_actual']=roll_array
test_manual = generate_test_features(results_df_test)
test = generate_features_full(results_df_test,False,feature_chain_length)


analysis_df,test = gen_analysis(nonce,train_manual,test_manual,test,xgb_models,tr_last_cutoffs,val_cutoffs,
                multi_models=True,file_suffix="")
analysis_df.head()

#### END

Finalize Training

In [None]:
tr_index

In [None]:
train.index

In [None]:
# tr_index = train[30 * train_client_size:(30 + 5) * train_client_size].index
tr_index = train[train_client_size:].index
train_ignored = train[0:30*train_client_size]
# tr_index = list(tr_index) + list(train_ignored[train_ignored['target']==1].index)
print(len(tr_index))
xgb_model_final = xgb.XGBClassifier(**params)

X_tr,y_tr = X.iloc[tr_index],y.iloc[tr_index]
oversample = RandomOverSampler(sampling_strategy='minority',random_state=123)
# fit and apply the transform
X_tr, y_tr = oversample.fit_resample(X_tr, y_tr)
print(pd.Series(y_tr).value_counts())
xgb_model_final.fit(X_tr,y_tr)

In [None]:
len(X_tr)

In [None]:
test_probs = xgb_model_final.predict_proba(test)[:,1]

test_labels = convert_probtolabels(test_probs)
subm=pd.DataFrame()
subm['client_seed']= results_df_test['client_seed']
subm['preds'] = test_labels
subm['prob'] = test_probs
score = accuracy_score(y_test,test_labels)
print(f'test score:{score}')

In [None]:
mask = (subm['prob']<0.52) & (subm['preds']==1)
print(len(subm[mask]))
score = accuracy_score(y_test[mask],test_labels[mask])
print(f'test score:{score}')