In [126]:
import hashlib
import hmac
import csv
import numpy as np
import pandas as pd
import joblib
from joblib import Parallel, delayed
from multiprocessing import  Pool
import multiprocessing
import itertools
import random

In [127]:
import math

In [128]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, precision_recall_curve
from scipy import stats
from collections import Counter

In [129]:
from operator import itemgetter
import logging, sys

In [130]:
import warnings
warnings.filterwarnings("ignore")

In [131]:
# Notebook-wide switches, both off by default:
#   _DEBUG - when truthy, debugprint() actually prints its message.
#   KAGGLE - when truthy, client seed files are read from the Kaggle input
#            directory instead of the local data/ folder.
_DEBUG = KAGGLE = False

In [132]:
# logging.basicConfig( #stream=sys.stdout, 
#                     level=logging.DEBUG)

In [133]:
# logging.debug('A debug message!')

In [134]:
# Raise pandas' display limits so wide frames and long hash strings are shown
# without truncation.
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

In [135]:
import time
from multiprocessing import Value
from ctypes import py_object

In [136]:
def init_pool_processes(shared_value):
    """Pool initializer: publish ``shared_value`` as the module-level name ``p``."""
    globals()["p"] = shared_value
def parallelize_dataframe(df, func, data_to_share=None,other_args=None, n_cores=4):
    """Apply ``func`` to row-wise chunks of ``df`` in a process pool and concatenate the results.

    Parameters
    ----------
    df : object supporting ``len()`` and ``.iloc`` slicing (DataFrame or Series)
        Data to split into ``n_cores`` consecutive chunks. Chunk sizes follow
        the ``np.array_split`` convention: the first ``len(df) % n_cores``
        chunks hold one extra row.
    func : callable
        Called in a worker as ``func(chunk, *other_args)``; each return value
        must be accepted by ``pd.concat``.
    data_to_share : object, optional
        When given, it is wrapped in a ``Value(py_object)`` and handed to every
        worker through ``init_pool_processes`` (which stores it as global ``p``).
        NOTE(review): a ``py_object`` value holds an object pointer, so this
        presumably only works with the ``fork`` start method - confirm.
    other_args : iterable, optional
        Extra positional arguments repeated for every chunk.
    n_cores : int
        Number of worker processes and number of chunks.

    Returns
    -------
    The ``pd.concat`` of the per-chunk results, in chunk order.
    """
    start = time.time()

    # Slice with .iloc instead of np.array_split(df, n_cores): the latter goes
    # through DataFrame.swapaxes, which pandas has deprecated.
    base, extra = divmod(len(df), n_cores)
    bounds = [0]
    for chunk_no in range(n_cores):
        bounds.append(bounds[-1] + base + (1 if chunk_no < extra else 0))
    df_split = [df.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

    if data_to_share is not None:
        p = Value(py_object)
        p.value = data_to_share
        pool = Pool(processes=n_cores,
                    initializer=init_pool_processes, initargs=(p,))
    else:
        pool = Pool(n_cores)

    # try/finally so the worker processes are released even when func raises.
    try:
        arg_list = [df_split]
        if other_args is not None:
            arg_list.extend(itertools.repeat(arg) for arg in other_args)

        parallel_args = zip(*arg_list)
        end = time.time()
        print(f'Parallel preparation time: {end-start}')
        return pd.concat(pool.starmap(func, parallel_args))
    finally:
        pool.close()
        pool.join()

In [137]:
def debugprint(msg):
    """Print ``msg`` only while the notebook-level ``_DEBUG`` flag is truthy."""
    if not _DEBUG:
        return
    print(msg)

In [138]:
def get_bin(bins,val):
    """Return the left edge of the half-open bin ``[bins[i], bins[i+1])`` that holds ``val``.

    Parameters
    ----------
    bins : sequence of bin edges (at least one element)
    val : value to locate

    Returns
    -------
    The left edge of the matching bin. If several intervals match (edges not
    ascending) the last one wins. When no interval contains ``val`` - including
    values below the first edge - the last edge ``bins[-1]`` is returned.
    """
    found = False
    matching_bin = None
    for left, right in zip(bins[:-1], bins[1:]):
        if left <= val < right:
            matching_bin = left
            # A separate flag is used instead of a -1 sentinel so that a bin
            # whose left edge is -1 is not mistaken for "no match".
            found = True
    if not found:
        matching_bin = bins[-1]
    return matching_bin

In [139]:
def calculate_roll(server_seed,client_seed,nonce):
    """Derive a roll number from a server seed, a client seed and a nonce.

    The message ``"{nonce}:{server_seed}:{nonce}"`` is signed with HMAC-SHA512
    using ``"{nonce}:{client_seed}:{nonce}"`` as the key. The leading 32 bits of
    the digest are divided by 429496.7295 and rounded, which maps them onto
    the integers 0..10000.
    """
    message = f"{nonce}:{server_seed}:{nonce}".encode()
    key = f"{nonce}:{client_seed}:{nonce}".encode()
    digest = hmac.new(key, message, hashlib.sha512).digest()

    # The first four digest bytes read big-endian are the same number as the
    # first eight hexadecimal characters of the hex digest.
    leading_bits = int.from_bytes(digest[:4], "big")

    return round(leading_bits / 429496.7295)

In [140]:
def compute_roll_hash_arrays(server_seed_array,client_seed_array,nonce_array):
    """Compute rolls for aligned seed arrays, plus rolls for each entry's successor.

    Parameters
    ----------
    server_seed_array, client_seed_array, nonce_array : array-like
        Parallel sequences; element ``k`` of each forms one ``calculate_roll`` call.

    Returns
    -------
    (roll_array, roll_array_hash)
        ``roll_array[k]`` uses ``server_seed_array[k]``.
        ``roll_array_hash[k]`` uses ``server_seed_array[k + 1]`` (the next
        element of the server array) with client seed and nonce ``k``, so it
        is one element shorter.
    """
    # otypes is fixed so that empty inputs do not make np.vectorize fail while
    # trying to infer the output type from a first call.
    roll_for_each = np.vectorize(calculate_roll, otypes=[int])

    roll_array = roll_for_each(server_seed_array,
                               client_seed_array,
                               nonce_array)

    # Trim by the length of the arrays actually passed in rather than by the
    # notebook-level `chain_length`, which need not match the inputs.
    pair_count = len(server_seed_array) - 1
    roll_array_hash = roll_for_each(server_seed_array[1:],
                                    client_seed_array[:pair_count],
                                    nonce_array[:pair_count])
    return roll_array,roll_array_hash

    

In [141]:
# trans_36 = bytes((x ^ 0x36) for x in range(256))
# trans_36

In [142]:
# Number of chain entries to work with (alternative value kept for reference: 1000001).
chain_length = 5110101

filename = "sha256_hashchain_b9556671f785fe935bee087665b4047e421ea4491a5e2021a8152cab0b74c953.npy"

# Client seed chains come from the Kaggle dataset when KAGGLE is set,
# otherwise from the local data/ folder.
filepath_client = "/kaggle/input/hash-file-generation-client-seed/" if KAGGLE else "data/"

# filename_client=f'{filepath_client}sha256_hashchain_client_5M.npy'
filename_client = filepath_client + 'sha256_hashchain_client1.npy'


# Define the number of CPU cores to use
num_cores = 8

In [143]:
import os
# Print the path of every file below the Kaggle input directory.
# The inner loop variable is deliberately NOT called `filename`: that name
# already holds the server hash-chain file chosen in the configuration cell,
# and reusing it here would silently overwrite it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_name in filenames:
        print(os.path.join(dirname, input_name))

In [144]:
# server_seed_array=np.load(filename)
# print(len(server_seed_array))
# server_seed_array[0:10]

In [145]:
# Load the client seed chain and keep at most `chain_length` entries.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects -
# only use it on files from a trusted source.
client_seed_array = np.load(
    filename_client,
    allow_pickle=True,
    fix_imports=True,
    encoding='latin1',
)[:chain_length]
print(len(client_seed_array))
client_seed_array[:10]

1010102


array(['3455dda4b3aecaa36d4687277766a079feebbb4ab01dc038bc8fb8a36ddad6aa',
       '57eea50e4484ba102d776506baf4b1dc0dcb3fc74fe2ab56b10b0f427cb6409e',
       '3e49b34f2d6c7116d92ac8818c94f87d5616d7f2aea5024c2f38413605dc321c',
       'a0731d74c9501894d16ba144bc10dd49e8439c7f88304bfdbe4a755870c46da3',
       '665122519e5835c8e7ebfa0ab5b1949d75ac2c9e3dc1a6965a4b6e26bfc4ed76',
       '87fa02b24749184f0ce9183f21715ce556db72ecd1ad4f32b02227b40f3b24ab',
       'edee47aed4d9b598fa647145a17e44f9d155dc53ad231c913ff5c93fc04e56ee',
       'baf9d66be4136fe8acd30c042704b1798380fe0f3ee45d5800ae01db88e03ed3',
       '8eac9ebdcbfcf1720f6b0317fa66569c3129d278c0e2605069e7c5a0a2179c8d',
       'aeab33d31e0fb10105ce7ad75e3a40736db70a824c3ed6489957d6736243184b'],
      dtype='<U64')

In [146]:
# str_cat ='pppppp'
# # str_cat ='ppppppppppppppp'
# print(len(str_cat))
# client_seed_array_1=(str_cat + pd.Series(client_seed_array)).values
# client_seed_array_1[:2]

In [147]:
# str_cat ='ppppppppppppppp'
# Fixed text prepended to every client seed.
str_cat = 'woxpwoxpwoxpwoxpwoxp'
print(len(str_cat))
# radd(prefix) evaluates prefix + element for every seed.
client_seed_array_2 = pd.Series(client_seed_array).radd(str_cat).values
client_seed_array_2[:2]

20


array(['woxpwoxpwoxpwoxpwoxp3455dda4b3aecaa36d4687277766a079feebbb4ab01dc038bc8fb8a36ddad6aa',
       'woxpwoxpwoxpwoxpwoxp57eea50e4484ba102d776506baf4b1dc0dcb3fc74fe2ab56b10b0f427cb6409e'],
      dtype=object)

In [148]:
np.random.seed(5000)
low = 1500
# nonce_array = np.random.randint(low, high=low+chain_length,
#                                 size=chain_length)

# Every integer in [low, low + chain_length) exactly once, in a seeded random order.
nonce_array = np.arange(low, low + chain_length)
np.random.shuffle(nonce_array)
print(nonce_array.size)
print(nonce_array[:10])
# Displayed as a sanity check that no nonce repeats.
pd.Series(nonce_array).nunique()

5110101
[ 638661 1297273 1057802 1410000 2886373 4139846 2635511 4657935 2336816
  640280]


5110101

In [149]:
# roll_array,roll_array_hash=compute_roll_hash_arrays(server_seed_array,
#                                        client_seed_array,
#                                        nonce_array)
# print(len(roll_array[0:10]))
# print(roll_array[0:100])
# print(len(roll_array_hash[0:10]))
# print(roll_array_hash[0:100])

In [150]:
# roll_array_1,roll_array_hash_1=compute_roll_hash_arrays(server_seed_array,
#                                        client_seed_array_1,
#                                        nonce_array)
# print(len(roll_array_1[0:10]))
# print(roll_array_1[0:100])
# print(len(roll_array_hash_1[0:10]))
# print(roll_array_hash_1[0:100])

In [151]:
# roll_array_2,roll_array_hash_2=compute_roll_hash_arrays(server_seed_array,
#                                        client_seed_array_2,
#                                        nonce_array)
# print(len(roll_array_2[0:10]))
# print(roll_array_2[0:100])
# print(len(roll_array_hash_2[0:10]))
# print(roll_array_hash_2[0:100])

In [152]:
# # Vectorize the function
# vectorized_calculate_roll = np.vectorize(calculate_roll)

# # Compute the roll values for the input arrays
# roll_array = vectorized_calculate_roll(server_seed_array,
#                                        client_seed_array,
#                                        nonce_array)

# # Print the output
# print(len(roll_array[0:10]))
# print(roll_array[0:100])

In [153]:
# # Vectorize the function
# vectorized_calculate_roll_hash = np.vectorize(calculate_roll)

# # Compute the roll values for the input arrays
# roll_array_hash = vectorized_calculate_roll_hash(server_seed_array[1:],
#                                        client_seed_array[:chain_length-1],
#                                        nonce_array[:chain_length-1])

# # Print the output
# print(len(roll_array_hash[0:10]))
# print(roll_array_hash[0:100])

In [154]:
# Labels for the required pass state of a roll.
ABOVE, BELOW, INTER = "Above", "Below", "Intermediate"

In [155]:
# Define a function to generate a single hash in the hash chain
def generate_hash(message):
    """Return the SHA-256 digest of ``message`` (bytes) as a hexadecimal string."""
    return hashlib.sha256(message).hexdigest()

In [156]:
def generate_hash_chain(seed,chain_length):
    """Build a hash chain that starts at ``seed``.

    Returns a list of ``chain_length + 1`` strings: ``seed`` itself followed by
    ``chain_length`` links, each link being ``generate_hash`` applied to the
    encoded text of the previous entry.
    """
    chain = [seed]
    for _ in range(chain_length):
        previous_entry = chain[-1]
        chain.append(generate_hash(previous_entry.encode()))
    return chain

def compute_multirolls(hash_list,client_seed,nonce):
    """Return one ``calculate_roll`` result per server hash in ``hash_list``.

    The client seed and nonce are the same for every call; results keep the
    order of ``hash_list``.
    """
    return [calculate_roll(server_hash, client_seed, nonce)
            for server_hash in hash_list]
def compute_multirolls_nonce(server_hash,client_seed,hash_list_nonce):
    """Return one ``calculate_roll`` result per entry of ``hash_list_nonce``.

    Each entry is used as the nonce while the server hash and client seed stay
    fixed; results keep the order of ``hash_list_nonce``.
    """
    return [calculate_roll(server_hash, client_seed, nonce_entry)
            for nonce_entry in hash_list_nonce]

In [157]:
def predict_digit_pattern(server_hash,roll_hash,nonce,client_seed,
                        match_digit_arr,match_digit_indices,
                        mismatch_digit_arr=None,mismatch_digit_indices=None):
    """Check the decimal digits of ``roll_hash`` against required and forbidden digits.

    Digit positions index the place values ``[10000, 1000, 100, 10]``:
    position 0 is the thousands digit, 1 the hundreds, 2 the tens and 3 the
    units digit.

    Parameters
    ----------
    roll_hash : int
        Roll whose digits are inspected.
    match_digit_arr, match_digit_indices :
        ``match_digit_arr[k]`` must equal the digit at position
        ``match_digit_indices[k]``.
    mismatch_digit_arr, mismatch_digit_indices : optional
        When ``mismatch_digit_arr`` is given, ``mismatch_digit_arr[k]`` must
        differ from the digit at position ``mismatch_digit_indices[k]``.
    server_hash, nonce, client_seed :
        Accepted but not used by this function.

    Returns
    -------
    Truthy when every requirement holds, falsy otherwise.
    """
    place_values = np.array([10000,1000,100,10])

    def digit_for(place):
        # `roll_hash % place` removes the digits above this place; dividing by
        # place/10 and truncating leaves the single leading digit.
        return int((roll_hash % place) / (place / 10))

    is_match = True

    for position, place in enumerate(place_values[match_digit_indices]):
        is_match = is_match & (digit_for(place) == match_digit_arr[position])
        if not is_match:
            break

    if mismatch_digit_arr is not None:
        for position, place in enumerate(place_values[mismatch_digit_indices]):
            is_match = is_match & (digit_for(place) != mismatch_digit_arr[position])
            if not is_match:
                break

    return is_match
 

def predict_digit_output(server_hash,nonce,client_seed,
                         match_digit_arr,match_digit_count_arr,
                        hash_list_server,match_digit_indices=None):
    """Check how often a digit string occurs in each roll of a server hash list.

    The rolls are computed with ``compute_multirolls(hash_list_server,
    client_seed, nonce)`` and formatted as zero-padded four-character strings.

    Parameters
    ----------
    match_digit_arr : sequence
        Per roll, the text to count (converted with ``str``).
    match_digit_count_arr : sequence of int
        Per roll, the exact count required.
    hash_list_server : sequence
        Server hashes to roll.
    match_digit_indices : sequence, optional
        Per roll, an iterable of character positions (0-3) to keep before
        counting. ``None`` (the default), or a ``None`` entry, means the whole
        four-character roll is searched.
    server_hash :
        Accepted but not used by this function.

    Returns
    -------
    (match, rolls_list)
        ``match`` is False as soon as any checked roll has a different count.
        Only as many rolls are checked as the shortest of the per-roll
        sequences provides.
    """
    rolls_list = compute_multirolls(hash_list_server,client_seed,nonce)

    # The default None must be expanded BEFORE zipping: zip() cannot iterate
    # over None, so the default argument used to raise TypeError.
    if match_digit_indices is None:
        per_roll_indices = [None] * len(rolls_list)
    else:
        per_roll_indices = match_digit_indices

    match = True
    for roll, match_digit, match_count, match_indices \
            in zip(rolls_list, match_digit_arr, match_digit_count_arr, per_roll_indices):
        str_roll = f'{roll:04d}'

        if match_indices is None:
            str_roll_indexed = str_roll
        else:
            str_roll_indexed = ''.join(str_roll[int(index)] for index in match_indices)

        if str_roll_indexed.count(str(match_digit)) != match_count:
            match = False

    return match, rolls_list

In [158]:
# test_file_random ='/kaggle/input/hash-generate-random-seeds/test_data_random.csv'
# test_seeds_2=pd.read_csv(test_file_random)
# print(test_seeds_2.shape)
# test_seeds_2.head()

In [159]:
# #create special client seed pattern array

# def get_hexval(val):
#     hexdata='abcdef'
#     if val >=10:
#         return hexdata[val-10]
#     else:
#         return str(val)

# str_spl_client = 'woxpwoxpwoxpwoxpwoxp066103c1b2a6ebe01cf30afd49a6b931278793fc457dee84510f03e11779d5be'
# spl_client_list=[]
# for index in range(20,20+64):
#     for val in range(16):
#         spl_client_list.append(str_spl_client[:index]+get_hexval(val)+str_spl_client[index+1:])
# spl_client_array = np.array(spl_client_list)
# print(len(spl_client_array))
# spl_client_array[1020:]

##### Machine Learning

Generate Data

In [160]:
def generate_data(test,client_seed_data,pass_state_reqd,
                    match_digit_arr,match_digit_count_arr,
                    mismatch_digit_arr=None,mismatch_digit_indices=None,
                    feature_chain_length=20,
                    is_data_hash=False,
                    match_digit_indices=None,
                    match_count_exp=1,
                    test_limit = None,
                    last_index=None,result_df=None,
                    print_client_scan=False,
                    hash_list_nonce=None):
    """Scan client seeds for each server row and record those whose hash roll fits a digit pattern.

    For every row of ``test`` (columns ``seed``, ``hash``, ``nonce``) the client
    seeds are visited in descending index order, wrapping from index 0 back to
    the end. The scan position is carried over from one row to the next. A
    client seed is recorded when ``predict_digit_pattern`` accepts the roll of
    the row's ``hash``. Scanning for a row stops after ``match_count_exp``
    matches or after every client seed has been visited once.

    Parameters
    ----------
    test : DataFrame
        Server rows to process.
    client_seed_data : sequence
        Candidate client seeds.
    pass_state_reqd : str
        ``ABOVE`` marks a record successful when the seed roll is > 5250,
        ``BELOW`` when it is < 4750; any other value never succeeds.
    match_digit_arr, match_digit_indices, mismatch_digit_arr, mismatch_digit_indices :
        Forwarded to ``predict_digit_pattern``.
    match_digit_count_arr, result_df :
        Accepted for backward compatibility; not used.
    feature_chain_length : int
        Length of the hash chains used for the roll features.
    is_data_hash : bool
        True: the feature hashes are the ``seed`` values of the rows following
        the current one in ``test``. False: they are generated from the row's
        ``hash`` with ``generate_hash_chain``.
    match_count_exp : int
        Matches wanted per server row.
    test_limit : int, optional
        Stop once this many server rows have been processed.
    last_index : int, optional
        Scanning starts at ``last_index - 1`` (wrapped into range); by default
        at the last client seed.
    print_client_scan : bool
        Print the scan position every 1000 client seeds.
    hash_list_nonce : list, optional
        Chain passed to ``compute_multirolls_nonce`` for every row. When
        omitted, a chain is generated from each row's own nonce.

    Returns
    -------
    DataFrame with one row per recorded client seed and the columns ``match``,
    ``seed``, ``hash``, ``nonce``, ``client_seed``, ``client_index``,
    ``passed_state``, ``roll_actual``, ``roll_hash_list`` and
    ``roll_hash_list_nonce``. The columns are present even when nothing matched.
    """
    result_columns = ['match','seed','hash','nonce',
                      'client_seed','client_index',
                      'passed_state',
                      'roll_actual','roll_hash_list','roll_hash_list_nonce']

    client_size = len(client_seed_data)
    if client_size == 0:
        # Nothing to scan; indexing an empty sequence would raise.
        return pd.DataFrame(columns=result_columns)

    # Wrap the start position into range so the "back at the start" test below
    # can always be reached.
    i = (client_size - 1 if last_index is None else last_index - 1) % client_size

    dictionary_list = []
    for server_count, (_, row) in enumerate(test.iterrows()):

        if (server_count % 5)==0:
            print(f'server_count:{server_count}')

        if (test_limit is not None) and (server_count >= test_limit):
            break

        cur_seed = row['seed']
        cur_nonce = row['nonce']
        cur_hash = row['hash']

        if is_data_hash:
            cur_hash_list = list(test[server_count+1:server_count+1+feature_chain_length+1]['seed'])
        else:
            cur_hash_list = generate_hash_chain(cur_hash,feature_chain_length)

        # A caller-supplied nonce chain is shared by all rows; otherwise it is
        # built lazily per row so that rows with different nonces do not reuse
        # a chain derived from an earlier row's nonce.
        row_hash_list_nonce = hash_list_nonce

        fullclientscan = False
        match_count = 0
        client_start = i

        while match_count < match_count_exp and not fullclientscan:

            if print_client_scan and (i % 1000==0):
                print(f'current scan : {i}')

            client_seed = client_seed_data[i]
            client_index = i

            roll_hash = calculate_roll(cur_hash,client_seed,cur_nonce)

            match = predict_digit_pattern(cur_hash,roll_hash,cur_nonce,client_seed,
                        match_digit_arr,match_digit_indices,
                        mismatch_digit_arr=mismatch_digit_arr,
                        mismatch_digit_indices=mismatch_digit_indices)

            if match:
                # The seed roll is only needed for recorded candidates, so it
                # is computed here instead of for every scanned client seed.
                roll_seed_actual = calculate_roll(cur_seed,client_seed,cur_nonce)
                rolls_list = compute_multirolls(cur_hash_list,client_seed,cur_nonce)
                if row_hash_list_nonce is None:
                    row_hash_list_nonce = generate_hash_chain(str(cur_nonce),feature_chain_length)
                rolls_list_nonce = compute_multirolls_nonce(cur_hash,client_seed,row_hash_list_nonce)

                match_count += 1

                result_success = bool(
                    ((pass_state_reqd==ABOVE) and (roll_seed_actual>5250))
                    or ((pass_state_reqd==BELOW) and (roll_seed_actual<4750))
                )

                dictionary_list.append({
                    'match': result_success,'seed':cur_seed,
                    'hash': cur_hash, 'nonce':cur_nonce,
                    'client_seed':client_seed,'client_index':client_index,
                    'passed_state':pass_state_reqd,
                    'roll_actual':roll_seed_actual,
                    'roll_hash_list':rolls_list,'roll_hash_list_nonce':rolls_list_nonce,
                })

            # Step to the next candidate, wrapping BEFORE the full-scan test.
            # Testing before the wrap never fires when the scan started at the
            # last index, and skipping the test after a match would re-scan
            # (and re-record) the same client seeds.
            i -= 1
            if i < 0:
                i = client_size - 1

            if i == client_start:
                fullclientscan = True
                if match_count < match_count_exp:
                    print()
                    print(i,client_start)
                    print(f'No further match for seed:{cur_seed} nonce:{cur_nonce} match count: {match_count}')
                    print()

    return pd.DataFrame(dictionary_list, columns=result_columns)


In [161]:
# Hash under analysis (alternative value kept for reference:
# '3e24f29a3ae70e54aedd86b0c68640050be7dace8ae679f1fb85d1325a56ddae').
hashval = 'fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6'
actual_seed = 'f9e1702bb3393f834aea8907e7e346bf9641e4c39131de555a19d56cf1b66989'
# `seed` starts out as the hash itself (alternative value kept for reference:
# 'c77b3e783f094e255b0693f89094bed95ad9738d86f44fa4c09c8b0d58e2c73e').
seed = hashval
nonce = 2722  # alternative: 2390

feature_chain_length = 60
train_hash_size = 2158  # alternative: 121
train_client_size = 2_000
test_client_size = 10_000

In [162]:
cur_hash_list = generate_hash_chain(hashval,train_hash_size)

# Pair every chain entry from position `start` onward with its successor:
# the entry becomes `seed`, the next entry becomes `hash`, and every row
# shares the same nonce.
start = 24
train_data = pd.DataFrame({
    'seed': np.array(cur_hash_list[start:-1]),
    'hash': np.array(cur_hash_list)[start+1:],
    'nonce': nonce,
})

print(len(train_data))
train_data.head()

2134


Unnamed: 0,seed,hash,nonce
0,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,2722
1,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,bcd5106e72ba4ce2d781913f6850b3432bb02538fdd503f5db1df4501f2a5046,2722
2,bcd5106e72ba4ce2d781913f6850b3432bb02538fdd503f5db1df4501f2a5046,174a5d9f4c99ec933ac8dd77ce521bf4b115839011b722ef4adda8b5d3cf3c23,2722
3,174a5d9f4c99ec933ac8dd77ce521bf4b115839011b722ef4adda8b5d3cf3c23,a7bf8e0f5dfa7bf69c0c76053f4199c7a6470f1a7693f749751015f18dd8f628,2722
4,a7bf8e0f5dfa7bf69c0c76053f4199c7a6470f1a7693f749751015f18dd8f628,3ee95f14c132d4f3461ebc7c03559334fd5ec4d4214dfb1d1f75af0ad359811d,2722


In [163]:
# PATTERN_99 selects the digit pattern 'x99x' instead of 'x00x';
# HIGH_ANALYSIS is the default for create_target_mask's bHighCheck.
PATTERN_99 = HIGH_ANALYSIS = False

cur_digit, cur_zero_digit = 9, 0

In [164]:
# READ_FROM_FILE: load previously saved training results instead of generating them.
# OLD_VERSION_READ: the saved file still holds the roll lists as single columns.
OLD_VERSION_READ = READ_FROM_FILE = False

In [165]:
%%time
# match_digit_arr = np.array(['999',])
# match_digit_count_arr=np.array([1])
# match_digit_indices = np.array(['012'])

# Digit required at the match positions and forbidden at the mismatch
# positions: 9 when PATTERN_99 is set, otherwise 0.
match_digit_arr = np.array([9,9] if PATTERN_99 else [0,0])
mismatch_digit_arr = np.array([9,9] if PATTERN_99 else [0,0])

match_digit_indices = np.array([1,2])
match_digit_count_arr = np.array([1])
mismatch_digit_indices = np.array([0,3])

CPU times: user 16 µs, sys: 0 ns, total: 16 µs
Wall time: 20.3 µs


In [166]:
%%time

if not READ_FROM_FILE:

    # Candidate client seeds: the first 1000001 prefixed seeds.
    cur_client_seed_array = client_seed_array_2[:1000001]

    results_df = None

    last_index = len(cur_client_seed_array) - 1 #1024
    start = 0

    # One nonce chain shared by all training rows (they all use `nonce`).
    hash_list_nonce = generate_hash_chain(str(nonce),feature_chain_length)

    results_df = generate_data(
        train_data,
        cur_client_seed_array,
        ABOVE,
        match_digit_arr,
        match_digit_count_arr,
        mismatch_digit_arr=mismatch_digit_arr,
        mismatch_digit_indices=mismatch_digit_indices,
        feature_chain_length=feature_chain_length,
        is_data_hash=True,
        match_digit_indices=match_digit_indices,
        match_count_exp=train_client_size,
        test_limit=38,
        last_index=last_index,
        result_df=results_df,
        hash_list_nonce=hash_list_nonce,
    )
    # Remember the client index of the last recorded row.
    row = results_df.iloc[-1]
    last_index = row['client_index']

server_count:0
server_count:5
server_count:10
server_count:15
server_count:20
server_count:25
server_count:30
server_count:35
CPU times: user 4min 21s, sys: 253 ms, total: 4min 22s
Wall time: 4min 22s


In [167]:
# last_index
# results_df['client_index'].min()

In [168]:
# results_df.head()

In [169]:
# print(len(results_df))
# results_df['client_seed'].nunique()

In [170]:
def save_cleaned_results_df(results_df,is_test,
                            file_suffix="",cleaned_suffix = True):
    """Expand the roll-list columns into one column per roll and save the frame as CSV.

    ``roll_hash_list`` becomes ``roll_0 .. roll_N`` and ``roll_hash_list_nonce``
    becomes ``roll_0_nonce .. roll_M_nonce``; the two list columns are dropped.

    Parameters
    ----------
    results_df : DataFrame
        Must contain the columns ``roll_hash_list`` and ``roll_hash_list_nonce``.
    is_test : bool
        Selects the test or the training ("Large") file name.
    file_suffix : str
        Text inserted before ``.csv``.
    cleaned_suffix : bool
        When truthy the file name starts with ``cleaned_``.

    The file name also uses the notebook-level names ``nonce`` and
    ``file_pattern_str``, which must be defined before this function is called.

    Returns
    -------
    The expanded DataFrame that was written.
    """
    import os

    def expand(list_column, name_suffix):
        # Build the wide frame in one step; `.apply(pd.Series)` creates a
        # Series per row and is far slower for the same result.
        expanded = pd.DataFrame(results_df[list_column].tolist(),
                                index=results_df.index)
        # Name the columns from this frame's own width so the two list columns
        # may have different lengths.
        expanded.columns = [f'roll_{i}{name_suffix}' for i in range(expanded.shape[1])]
        return expanded

    rolls_df = expand('roll_hash_list', '')
    rolls_df_nonce = expand('roll_hash_list_nonce', '_nonce')

    results_df_cleaned = pd.concat([results_df,rolls_df,rolls_df_nonce],axis=1)
    results_df_cleaned = results_df_cleaned.drop(['roll_hash_list','roll_hash_list_nonce'],axis=1)

    name_prefix = 'cleaned_' if cleaned_suffix else ''
    if is_test:
        filename = f'data/{name_prefix}results_df_test_{nonce}_{file_pattern_str}_pattern{file_suffix}.csv'
    else:
        filename = f'data/{name_prefix}results_df_{nonce}_Large_{file_pattern_str}_pattern{file_suffix}.csv'

    # to_csv does not create missing directories.
    os.makedirs('data', exist_ok=True)
    results_df_cleaned.to_csv(filename,index=False)
    return results_df_cleaned

In [171]:
%%time
file_pattern_str = 'x99x' if PATTERN_99 else 'x00x'
filename = f'data/results_df_{nonce}_Large_{file_pattern_str}_pattern.csv'

if READ_FROM_FILE and OLD_VERSION_READ:
    # NOTE(review): presumably old files store the roll lists as text, hence
    # the pd.eval converters - confirm against an old file.
    results_df = pd.read_csv(filename, converters={'roll_hash_list': pd.eval,
                                                 'roll_hash_list_nonce': pd.eval})
elif READ_FROM_FILE:
    results_df = pd.read_csv(filename)

# Freshly generated results and old-format files still carry the list columns,
# so expand them and (re)write the CSV.
if OLD_VERSION_READ or not READ_FROM_FILE:
    results_df = save_cleaned_results_df(results_df, False,
                                         file_suffix="", cleaned_suffix=False)
#     results_df.to_csv(filename,index=False)

CPU times: user 23.4 s, sys: 1.21 s, total: 24.6 s
Wall time: 23.6 s


In [172]:
# Preview the first five expanded result rows.
results_df.head(n=5)

Unnamed: 0,match,seed,hash,nonce,client_seed,client_index,passed_state,roll_actual,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,roll_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce
0,False,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,2722,woxpwoxpwoxpwoxpwoxp3a964c1e4af510dff04f6bc911e553b4dd7150a51a1f5ee6e833ceeeaca7633d,999986,Above,4868,2003,6381,2581,9297,971,2849,6068,2304,1888,8738,4477,1958,2854,5012,7686,7507,217,1991,5259,4979,3586,5869,2919,5861,6888,8476,8207,2737,8688,4573,2347,6867,3725,5883,3873,991,2222,7632,9157,1076,9662,805,6553,7776,1208,5737,3135,4446,7610,2091,4173,4277,1212,5633,7419,6693,8907,4951,6226,4614,4459,2003,4415,6175,6542,5853,6051,8137,5054,331,6629,5911,3093,3695,1019,8077,4646,7916,3682,3384,2534,5518,1504,4230,2321,2373,4679,9710,4160,1987,6854,9510,920,5950,3807,9289,2383,9629,3476,181,5271,1376,2822,8760,5934,6050,3430,4458,4734,3002,3297,5962,8946,1581,9499,2170,7560,4854,6608,3089,6573,9249
1,False,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,2722,woxpwoxpwoxpwoxpwoxp37effccb1c2c5aa1165724b7d96e5545a8d0359d22c34fa331decb1aad77ae7f,999902,Above,676,8006,936,4206,9371,6470,8355,4447,2148,3355,4461,5187,7028,9027,291,1353,6626,5271,6866,973,2360,4258,1252,4774,8396,5399,7474,6183,7826,4591,6294,9525,1591,8970,6943,7032,6136,8353,5343,2236,3687,7199,1451,1907,4188,6240,3725,8326,2644,4350,3267,5933,8404,7458,1118,3364,8249,1193,4287,8837,6390,5843,8006,3780,2442,4483,9487,5921,151,8744,788,1043,8078,2389,467,4303,7328,4038,8628,9101,8635,9730,4971,7949,6122,3998,3402,6707,6105,5697,7880,3178,142,8988,4720,2604,6281,6370,6867,1522,1316,1373,7371,4900,8276,3500,2691,3191,1448,1375,8028,2622,8223,9382,7305,9178,3571,3117,7646,7509,7102,2951,8390
2,True,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,2722,woxpwoxpwoxpwoxpwoxp31c534399d36e2942a285e07710cfa3facf15b94edc81af38826f9d6aef1679f,999860,Above,9335,8004,1023,4578,7091,7460,8266,1128,4415,7159,7520,2391,3461,7004,3763,3100,7618,6627,2252,5495,1607,1017,5308,4919,8508,3276,8834,1119,4273,2782,2971,7334,4442,5691,6967,1941,9878,3745,2207,6698,2736,3182,6555,6476,7996,7575,1232,3346,8703,4751,9827,9330,8518,7831,8404,1309,3285,1236,4517,3270,7156,3350,8004,7875,1467,3127,928,1487,6,2048,113,143,6683,9947,4183,9870,7083,8229,1168,3935,6136,9573,3502,659,9038,2843,1117,5757,8804,1441,293,1643,4234,9389,5933,1664,3255,7791,3212,7556,1218,4169,5862,4921,4726,2306,3256,4373,4868,8106,9875,9598,7697,8400,8178,6161,4982,3738,966,6310,4619,8026,1455
3,True,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,2722,woxpwoxpwoxpwoxpwoxpd509e0f9b503df969fb4948484ba53d6247889e44b3002e81a5151fdac6d4ee2,999842,Above,6972,6008,2455,8014,3355,6519,5705,4307,2693,1198,9149,427,134,7554,9686,1803,1855,8177,8281,4823,7949,7327,8273,7662,2485,9301,5733,2516,4489,99,5748,1933,9246,1294,913,113,4328,2917,5906,3308,5681,1063,1181,1902,5010,3107,5885,2483,9100,2476,3306,1509,6768,5771,2191,4357,802,461,4068,3677,7299,6939,6008,7507,6533,7029,1704,9179,1726,9228,2288,8330,7588,5330,5584,7370,8169,5358,8660,9888,8465,6128,1969,7965,2786,4442,5953,9163,3976,7641,6137,7065,5915,707,9538,6367,2888,5419,1255,8208,5732,8763,2945,5614,8562,1198,4469,5721,3371,8437,6291,9740,541,3597,5215,4911,2918,757,3941,2660,7259,2678,6785
4,False,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,2722,woxpwoxpwoxpwoxpwoxpcfb3e36aee8d1e161bdb2d655e1e10e5ded26579ff0685f136252fc2aea7a953,999830,Above,2877,9008,1681,4312,8509,4140,65,4407,4523,2107,3134,9115,5502,4565,7069,1103,8261,891,9257,7918,702,4398,5079,9245,8582,9700,1540,262,6994,4674,1244,8394,8746,9617,7537,9338,1887,6977,3874,6576,2539,4440,5089,798,4639,2985,5357,9548,1185,1314,8857,854,5904,4452,2253,4030,7235,6875,1947,720,1757,3445,9008,908,134,2736,6017,2066,9983,3285,3999,135,5545,4263,9602,5398,1203,2984,7119,2965,5544,7135,8078,4901,762,840,5530,2119,6359,6175,2726,8230,3915,8114,9004,5964,3805,5130,2245,6073,4976,8934,8465,7148,9229,4223,6685,3370,4941,8290,223,8877,8810,4632,7444,2131,4388,4214,3227,229,1644,5124,1617


In [173]:
# Thresholds on `roll_actual` used by create_target_mask.
HIGH_TARGET = 9_000  #8944
LOW_TARGET = 1_000  #1056
ZERO_COL_CUTOFF = 4

In [174]:
def create_target_mask(data,bHighCheck=HIGH_ANALYSIS):
    """Return a boolean mask over ``data['roll_actual']``.

    With ``bHighCheck`` truthy the mask selects rolls above ``HIGH_TARGET``,
    otherwise rolls below ``LOW_TARGET``. The default for ``bHighCheck`` is the
    value ``HIGH_ANALYSIS`` had when this function was defined.
    """
    rolls = data['roll_actual']
    return rolls > HIGH_TARGET if bHighCheck else rolls < LOW_TARGET

Prediction

In [175]:
def gen_test_data(cur_nonce,cur_hash,cur_seed,file_suffix=""):
    """Build (or load) the pattern-matching results for a single server row.

    A one-row frame is made from ``cur_seed``, ``cur_hash`` and ``cur_nonce``.
    Unless ``READ_FROM_FILE_TEST`` is set, ``generate_data`` scans the first
    1000001 entries of ``client_seed_array_2`` for up to ``test_client_size``
    matches, duplicate client seeds are dropped, and the result is expanded
    and written with ``save_cleaned_results_df``.

    NOTE(review): the file name is built from the notebook-level ``nonce``,
    not from ``cur_nonce`` - confirm this is intended.

    Returns the resulting DataFrame.
    """
    test_data = pd.DataFrame(columns=['seed','hash','nonce'])
    for column, value in (('seed', cur_seed), ('hash', cur_hash), ('nonce', cur_nonce)):
        test_data.loc[0, column] = value

    print(len(test_data))
    print(test_data.head())

    # Same digit is required at the match positions and forbidden at the
    # mismatch positions.
    pattern_digit = 9 if PATTERN_99 else 0
    match_digit_arr = np.array([pattern_digit, pattern_digit])
    mismatch_digit_arr = np.array([pattern_digit, pattern_digit])

    match_digit_indices = np.array([1,2])
    match_digit_count_arr = np.array([1])
    mismatch_digit_indices = np.array([0,3])

    file_pattern_str = 'x99x' if PATTERN_99 else 'x00x'
    filename = f'data/results_df_test_{nonce}_{file_pattern_str}_pattern{file_suffix}.csv'

    if READ_FROM_FILE_TEST:
        results_df_test = pd.read_csv(filename)
#         results_df_test = pd.read_csv(filename, converters={'roll_hash_list': pd.eval,
#                                                         'roll_hash_list_nonce': pd.eval})
    else:
        cur_client_seed_array = client_seed_array_2[:1000001]

        results_df_test = generate_data(
            test_data,
            cur_client_seed_array,
            ABOVE,
            match_digit_arr,
            match_digit_count_arr,
            mismatch_digit_arr=mismatch_digit_arr,
            mismatch_digit_indices=mismatch_digit_indices,
            feature_chain_length=feature_chain_length,
            is_data_hash=False,
            match_digit_indices=match_digit_indices,
            match_count_exp=test_client_size, #train_client_size,
            test_limit=1,
            last_index=len(cur_client_seed_array) - 1, #1024
            result_df=None,
            print_client_scan=False,
        )
        #Remove client seed duplicates from test data
        results_df_test = results_df_test.drop_duplicates(subset='client_seed')

    if OLD_VERSION_READ or not READ_FROM_FILE_TEST:
        results_df_test = save_cleaned_results_df(results_df_test, True,
                                                  file_suffix=file_suffix,
                                                  cleaned_suffix=False)

    print('Test Data Generation Completed')

    return results_df_test
 

In [176]:
%%time
READ_FROM_FILE_TEST = False
# seed='dummy'
# Use the 25th link of the chain grown from `hashval` as the seed.
seed = generate_hash_chain(hashval, 25)[-1]
print(f'seed: {seed}')
results_df_test = gen_test_data(nonce, hashval, seed, file_suffix="")
print()
print(results_df_test[['seed','hash','nonce']].head(1))

seed: 1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956
1
                                                               seed  \
0  1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956   

                                                               hash nonce  
0  fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6  2722  
server_count:0

999999 999999
No further match for seed:1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956 nonce:2722 match count: 8163

Test Data Generation Completed

                                                               seed  \
0  1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956   

                                                               hash  nonce  
0  fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6   2722  
CPU times: user 30 s, sys: 23.5 ms, total: 30 s
Wall time: 30 s


In [177]:
%%time
READ_FROM_FILE_TEST = False
# Generate hash test data: `hashval` acts as the seed and its SHA-256 digest
# as the hash.
print(f'hashval={hashval!r}')
hash_l2 = generate_hash(hashval.encode())
print(f'hash_l2={hash_l2!r}')
results_df_test_hash = gen_test_data(nonce, hash_l2, hashval, file_suffix="_hash")
print()
print(results_df_test_hash.head(1))

hashval='fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6'
hash_l2='ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f'
1
                                                               seed  \
0  fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6   

                                                               hash nonce  
0  ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f  2722  
server_count:0

999999 999999
No further match for seed:fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6 nonce:2722 match count: 8098

Test Data Generation Completed

   match                                                              seed  \
0  False  fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6   

                                                               hash  nonce  \
0  ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f   2722   

                                                        

In [178]:
# %%time
# READ_FROM_FILE_TEST=False
# cur_seed = cur_hash_list[25]
# cur_hash = cur_hash_list[26]
# print(f'{cur_seed=}')
# print(f'{cur_hash=}')

# results_df_test_hash25=gen_test_data(nonce,cur_hash,cur_seed,file_suffix="_hash25")
# print()
# print(results_df_test_hash25.head(1))

In [179]:
# %%time
# READ_FROM_FILE_TEST=False
# cur_seed = cur_hash_list[24]
# cur_hash = cur_hash_list[25]
# print(f'{cur_seed=}')
# print(f'{cur_hash=}')

# results_df_test_hash24=gen_test_data(nonce,cur_hash,cur_seed,file_suffix="_hash24")
# print()
# print(results_df_test_hash24.head(1))


In [180]:
# %%time
# results_df_test=save_cleaned_results_df(results_df_test,True)
# # results_df_test_hash=save_cleaned_results_df(results_df_test_hash,True)
# results_df_test_hash_cleaned=save_cleaned_results_df(results_df_test_hash,True,'_hash')
# # results_df_test_cleaned=save_cleaned_results_df(results_df_test_hash25,True)
# # results_df_test_cleaned=save_cleaned_results_df(results_df_test_hash24,True)

##### Feature Generation

In [181]:
def generate_roll_features_k(rolls_df,roll_start,roll_end,suffix="",stat_suffix=""):
    """Count extreme rolls inside the window roll_<roll_start>..roll_<roll_end>.

    Two columns are written onto ``rolls_df`` in place:

    * ``count_gt_9000_k<stat_suffix>`` - rolls in the window that are >= 9000
    * ``count_lt_1000_k<stat_suffix>`` - rolls in the window that are < 1000

    Args:
        rolls_df: frame holding the columns ``roll_<i><suffix>`` for every
            ``i`` in ``roll_start..roll_end`` (both ends inclusive).
        roll_start: first roll index of the window.
        roll_end: last roll index of the window (inclusive).
        suffix: text appended to each roll column name (e.g. ``"_nonce"``).
        stat_suffix: text appended to the two generated column names.

    Returns:
        The same ``rolls_df`` object, now carrying the two extra columns.
    """
    window = rolls_df[[f'roll_{idx}{suffix}' for idx in range(roll_start, roll_end + 1)]]

    high_count = window.ge(9000).sum(axis=1)
    low_count = window.lt(1000).sum(axis=1)

    rolls_df[f'count_gt_9000_k{stat_suffix}'] = high_count
    rolls_df[f'count_lt_1000_k{stat_suffix}'] = low_count
    return rolls_df
    
def generate_roll_features(rolls_df,
                           roll_start=1, roll_end=20,k=4,
                          suffix="",stat_suffix=""):
    """Add summary statistics for the rolls roll_<roll_start>..roll_<roll_end>.

    All new columns are written onto ``rolls_df`` in place and the same object
    is returned. Every generated name ends with a statistics suffix that is
    derived here: ``suffix`` alone for the default 1..20 window, otherwise
    ``_<roll_start>_<roll_end><suffix>``.

    Generated columns (statistics suffix omitted), in insertion order:

    * ``count_lt_1000`` and ``count_gt_9000`` .. ``count_gt_1000``: number of
      rolls whose value divided by 1000 truncates to 0, 9, 8, ..., 1. Despite
      the ``gt`` in the name these are per-thousand buckets, not cumulative
      "greater than" counts.
    * ``count_hi`` (> 5250), ``count_lo`` (< 4750) and their sum ``count_lo_hi``.
    * ``count_gt_9000_k`` / ``count_lt_1000_k``: extreme-roll counts over
      ``roll_start..roll_start + k`` (both ends inclusive, i.e. k + 1 rolls),
      produced by ``generate_roll_features_k``.
    * ``total_dig_9`` / ``total_dig_0``: how many '9' / '0' characters occur in
      the string form of the window's rolls.
    * ``roll_mean`` / ``roll_std``: row-wise mean and standard deviation.

    Args:
        rolls_df: frame containing ``roll_<i><suffix>`` for the whole window
            and for ``roll_start..roll_start + k``.
        roll_start: first roll index of the window.
        roll_end: last roll index of the window (inclusive).
        k: width offset of the leading sub-window passed to
            ``generate_roll_features_k``.
        suffix: text appended to each roll column name (e.g. ``"_nonce"``).
        stat_suffix: accepted for backward compatibility only; the value is
            always replaced by the derived suffix described above.

    Returns:
        ``rolls_df`` with the statistic columns added.
    """
    cols_roll = [f'roll_{i}{suffix}' for i in range(roll_start,roll_end+1)]

    # The caller-supplied stat_suffix has never been honoured; keep that
    # behaviour so existing column names do not change.
    if roll_start == 1 and roll_end == 20:
        stat_suffix = suffix
    else:
        stat_suffix = f'_{roll_start}_{roll_end}{suffix}'

    # Snapshot of the window. Only new columns are added below, so the roll
    # values seen here stay valid for every statistic.
    rolls = rolls_df[cols_roll]

    # Thousands bucket of every roll, computed once instead of once per bucket.
    thousands = (rolls / 1000).astype('int')

    rolls_df[f'count_lt_1000{stat_suffix}'] = (thousands == 0).sum(axis=1)
    # Descending order keeps the original column order: 9000, 8000, ..., 1000.
    for bucket in range(9, 0, -1):
        rolls_df[f'count_gt_{bucket}000{stat_suffix}'] = (thousands == bucket).sum(axis=1)

    rolls_df[f'count_hi{stat_suffix}'] = (rolls > 5250).sum(axis=1)
    rolls_df[f'count_lo{stat_suffix}'] = (rolls < 4750).sum(axis=1)
    rolls_df[f'count_lo_hi{stat_suffix}'] = rolls_df[f'count_hi{stat_suffix}'] \
                                            + rolls_df[f'count_lo{stat_suffix}']

    rolls_df=generate_roll_features_k(rolls_df,roll_start,roll_start+k,
                                      suffix=suffix,stat_suffix=stat_suffix)

    # Counting a single character per column and summing gives the same total
    # as counting it in the concatenation of all rolls of a row, without
    # building one long string per row.
    roll_text = rolls.astype('str')
    rolls_df[f'total_dig_9{stat_suffix}'] = sum(roll_text[col].str.count('9') for col in cols_roll)
    rolls_df[f'total_dig_0{stat_suffix}'] = sum(roll_text[col].str.count('0') for col in cols_roll)

    rolls_df[f'roll_mean{stat_suffix}'] = rolls.mean(axis=1)
    rolls_df[f'roll_std{stat_suffix}'] = rolls.std(axis=1)

    print(f'Roll Features from {roll_start} to {roll_end} {suffix} completed' )

    return rolls_df

#Modification: 
#1. supported both raw roll_hash_list and already generated roll columns if present
#2. added client index
def generate_features_full(initial_df,istrain,feature_chain_length):
    """Assemble the model feature table from a generated results frame.

    Roll values are taken from ``initial_df`` in one of two forms:

    * if a ``roll_1`` column exists, the columns ``roll_0`` ..
      ``roll_<feature_chain_length>`` and their ``_nonce`` counterparts are
      copied out;
    * otherwise the list-valued columns ``roll_hash_list`` and
      ``roll_hash_list_nonce`` are expanded into one column per element.

    ``generate_roll_features`` is then applied to both roll frames for the
    windows 1..20, 25..50, 50..60 and 1..feature_chain_length. The fixed
    windows reference rolls up to index 60, so those columns have to exist.

    Args:
        initial_df: results frame with ``roll_actual``, ``seed``, ``hash``,
            ``client_seed``, ``client_index`` and the roll data described above.
        istrain: when true, a binary ``target`` column is added (1 where
            ``create_target_mask`` selects the row, else 0), ``roll_actual`` is
            cast to int, and the target distribution is printed.
        feature_chain_length: highest roll index to use.

    Returns:
        A new frame: identifier columns, roll columns with their statistics,
        the ``_nonce`` equivalents, and ``target`` when ``istrain`` is true.
    """
    nonce_suffix = '_nonce'
    if 'roll_1' in initial_df.columns:
        print('roll columns present')
        rolls_cols = [f'roll_{i}' for i in range(feature_chain_length+1)]
        rolls_cols_nonce = [f'roll_{i}{nonce_suffix}' for i in range(feature_chain_length+1)]
        # Explicit copies: feature columns are assigned onto these frames, and
        # writing to a bare slice of initial_df raises SettingWithCopyWarning
        # (hidden by the notebook's global warnings filter).
        rolls_df = initial_df[rolls_cols].copy()
        rolls_df_nonce = initial_df[rolls_cols_nonce].copy()
    else:
        print('roll columns not present')
        rolls_df = initial_df['roll_hash_list'].apply(pd.Series)
        rolls_df.columns = [f'roll_{i}' for i in range(rolls_df.shape[1])]
        rolls_df_nonce = initial_df['roll_hash_list_nonce'].apply(pd.Series)
        rolls_df_nonce.columns = [f'roll_{i}{nonce_suffix}' for i in range(rolls_df_nonce.shape[1])]

    # (roll_start, roll_end, k) for each statistics window, in the order the
    # columns are expected to appear in the output.
    feature_windows = (
        (1, 20, 4),                    # top 20 hash features
        (25, 50, 5),                   # 25's hash features
        (50, 60, 5),                   # 50's hash features
        (1, feature_chain_length, 5),  # full hash features
    )
    for roll_start, roll_end, k in feature_windows:
        rolls_df=generate_roll_features(rolls_df,
                                        roll_start=roll_start,
                                        roll_end=roll_end, k=k)
        rolls_df_nonce=generate_roll_features(rolls_df_nonce,
                                        suffix=nonce_suffix,
                                        roll_start=roll_start,
                                        roll_end=roll_end, k=k)

    # Identifier columns plus both feature frames; identical for train and test.
    id_cols = ['roll_actual','seed','hash','client_seed','client_index']
    features_df = pd.concat([initial_df[id_cols],
                             rolls_df,
                             rolls_df_nonce],axis=1)

    if istrain:
        mask = create_target_mask(features_df)

        features_df['target'] =0
        features_df.loc[mask,'target'] = 1
        features_df['roll_actual']=features_df['roll_actual'].astype('int')
        print(features_df['target'].value_counts())

    return features_df


In [182]:
%%time
# train = generate_features_full(results_df[train_client_size:],True,feature_chain_length)
train = generate_features_full(results_df,True,feature_chain_length)
print(len(train))
train.head()

roll columns present
Roll Features from 1 to 20  completed
Roll Features from 1 to 20 _nonce completed
Roll Features from 25 to 50  completed
Roll Features from 25 to 50 _nonce completed
Roll Features from 50 to 60  completed
Roll Features from 50 to 60 _nonce completed
Roll Features from 1 to 60  completed
Roll Features from 1 to 60 _nonce completed
0    68311
1     7689
Name: target, dtype: int64
76000
CPU times: user 14.3 s, sys: 3.26 s, total: 17.5 s
Wall time: 17.5 s


Unnamed: 0,roll_actual,seed,hash,client_seed,client_index,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,rol
l_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_nonce,count_lt_100
0_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce,target
0,4868,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,woxpwoxpwoxpwoxpwoxp3a964c1e4af510dff04f6bc911e553b4dd7150a51a1f5ee6e833ceeeaca7633d,999986,2003,6381,2581,9297,971,2849,6068,2304,1888,8738,4477,1958,2854,5012,7686,7507,217,1991,5259,4979,3586,5869,2919,5861,6888,8476,8207,2737,8688,4573,2347,6867,3725,5883,3873,991,2222,7632,9157,1076,9662,805,6553,7776,1208,5737,3135,4446,7610,2091,4173,4277,1212,5633,7419,6693,8907,4951,6226,4614,4459,2,1,1,2,2,2,2,1,4,3,7,11,18,1,1,10,4,4330.15,2633.841566,2,2,3,3,2,2,3,3,4,2,12,14,26,0,0,5,6,4986.538462,2849.410104,0,0,1,1,2,1,5,0,0,1,5,5,10,0,0,5,1,5324.0,2018.196423,4,3,5,6,7,7,9,4,9,6,27,30,57,1,1,23,11,4869.683333,2582.001139,2003,4415,6175,6542,5853,6051,8137,5054,331,6629,5911,3093,3695,1019,8077,4646,7916,3682,3384,2534,5518,1504,4230,2321,2373,4679,9710,4160,1987,6854,9510,920,5950,3807,9289,2383,9629,3476,181,5271,1376,2822,8760,5934,6050,3430,4458,4734,3002,3297,5962,8946,1581,9499,2170,7560,4854,6608,3089,6573,9249,1,0,2,1,4,4,2,4,1,1,10,9,19,0,0,6,5,4933.1,2187.915564,2,4,1,0,2,4,4,5,2,2,11,15,26,2,0,13,12,4908.884615,2767.90061,0,2,1,1,2,1,1,1,1,1,7,3,10,1,0,8,4,6008.272727,2803.460365,3,6,4,2,8,8,8,10,6,5,27,31,58,0,0,26,23,4947.5,2598.463187,0
1,676,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,woxpwoxpwoxpwoxpwoxp37effccb1c2c5aa1165724b7d96e5545a8d0359d22c34fa331decb1aad77ae7f,999902,8006,936,4206,9371,6470,8355,4447,2148,3355,4461,5187,7028,9027,291,1353,6626,5271,6866,973,2360,4258,1252,4774,8396,5399,7474,6183,7826,4591,6294,9525,1591,8970,6943,7032,6136,8353,5343,2236,3687,7199,1451,1907,4188,6240,3725,8326,2644,4350,3267,5933,8404,7458,1118,3364,8249,1193,4287,8837,6390,5843,3,2,1,1,3,2,4,1,2,1,8,11,19,1,1,5,5,4649.45,2757.034915,0,1,3,4,5,2,3,3,2,3,15,11,26,1,0,10,5,5439.0,2381.960705,0,0,3,1,1,2,1,1,0,2,7,4,11,0,0,4,2,5552.363636,2753.409968,3,3,8,6,9,6,9,5,4,7,31,27,58,1,1,21,12,5156.116667,2591.910656,8006,3780,2442,4483,9487,5921,151,8744,788,1043,8078,2389,467,4303,7328,4038,8628,9101,8635,9730,4971,7949,6122,3998,3402,6707,6105,5697,7880,3178,142,8988,4720,2604,6281,6370,6867,1522,1316,1373,7371,4900,8276,3500,2691,3191,1448,1375,8028,2622,8223,9382,7305,9178,3571,3117,7646,7509,7102,2951,8390,3,3,4,1,0,1,4,1,2,1,9,10,19,1,0,6,7,5225.35,3312.113513,1,0,4,2,5,1,2,3,3,5,12,13,25,0,1,5,11,4668.269231,2703.409973,0,2,2,4,0,0,0,2,1,0,8,3,11,2,0,5,4,6761.272727,2392.247817,4,5,9,8,6,2,6,8,6,6,30,28,58,1,1,20,23,5225.066667,2871.415961,1
2,9335,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,woxpwoxpwoxpwoxpwoxp31c534399d36e2942a285e07710cfa3facf15b94edc81af38826f9d6aef1679f,999860,8004,1023,4578,7091,7460,8266,1128,4415,7159,7520,2391,3461,7004,3763,3100,7618,6627,2252,5495,1607,1017,5308,4919,8508,3276,8834,1119,4273,2782,2971,7334,4442,5691,6967,1941,9878,3745,2207,6698,2736,3182,6555,6476,7996,7575,1232,3346,8703,4751,9827,9330,8518,7831,8404,1309,3285,1236,4517,3270,7156,3350,0,0,1,6,1,1,2,3,2,4,9,11,20,0,0,4,10,4648.75,2554.355576,0,3,2,3,4,1,3,3,4,3,13,12,25,0,0,11,3,5407.346154,2770.703751,0,1,2,2,0,0,1,3,0,2,5,6,11,1,0,2,5,5291.454545,3018.642488,0,3,6,11,5,3,7,10,6,9,28,30,58,0,0,18,19,5074.216667,2639.428348,8004,7875,1467,3127,928,1487,6,2048,113,143,6683,9947,4183,9870,7083,8229,1168,3935,6136,9573,3502,659,9038,2843,1117,5757,8804,1441,293,1643,4234,9389,5933,1664,3255,7791,3212,7556,1218,4169,5862,4921,4726,2306,3256,4373,4868,8106,9875,9598,7697,8400,8178,6161,4982,3738,966,6310,4619,8026,1455,4,3,1,2,2,0,1,3,1,3,8,12,20,0,1,7,4,4375.15,3503.511761,1,3,2,3,0,3,6,3,1,4,11,13,24,0,1,11,3,5074.884615,2840.845907,1,0,3,1,2,0,2,1,0,1,6,4,10,0,0,4,4,5502.909091,2625.840949,7,7,6,5,4,3,9,7,3,9,25,32,57,0,2,23,12,4765.7,3078.016505,0
3,6972,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,woxpwoxpwoxpwoxpwoxpd509e0f9b503df969fb4948484ba53d6247889e44b3002e81a5151fdac6d4ee2,999842,6008,2455,8014,3355,6519,5705,4307,2693,1198,9149,427,134,7554,9686,1803,1855,8177,8281,4823,7949,7327,8273,7662,2485,9301,5733,2516,4489,99,5748,1933,9246,1294,913,113,4328,2917,5906,3308,5681,1063,1181,1902,5010,3107,5885,2483,9100,2476,3306,1509,6768,5771,2191,4357,802,461,4068,3677,7299,6939,2,2,3,3,1,1,2,1,2,3,10,9,19,0,0,8,4,5070.55,3134.097444,3,2,0,0,0,6,2,3,4,6,7,18,25,0,1,12,11,3509.461538,2474.663989,2,0,0,1,2,1,2,1,1,1,4,7,11,0,1,6,3,3985.636364,2505.649747,7,5,4,5,3,8,6,5,8,9,24,34,58,0,0,26,18,4378.516667,2849.186,6008,7507,6533,7029,1704,9179,1726,9228,2288,8330,7588,5330,5584,7370,8169,5358,8660,9888,8465,6128,1969,7965,2786,4442,5953,9163,3976,7641,6137,7065,5915,707,9538,6367,2888,5419,1255,8208,5732,8763,2945,5614,8562,1198,4469,5721,3371,8437,6291,9740,541,3597,5215,4911,2918,757,3941,2660,7259,2678,6785,0,3,4,4,2,3,0,0,1,3,16,4,20,1,0,8,7,6401.65,2630.940642,2,3,4,2,3,5,1,2,2,2,17,9,26,1,0,10,4,5602.423077,2813.405796,2,0,0,1,1,1,1,2,3,0,2,7,9,0,2,5,1,3751.090909,2176.317231,3,6,8,8,6,10,3,4,7,5,37,21,58,1,0,25,12,5592.716667,2657.247353,0
4,2877,e264cb9168ffd0be654594e8fc545d71c3aa1ba0e6f7e543b000f0c997cbaa3f,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,woxpwoxpwoxpwoxpwoxpcfb3e36aee8d1e161bdb2d655e1e10e5ded26579ff0685f136252fc2aea7a953,999830,9008,1681,4312,8509,4140,65,4407,4523,2107,3134,9115,5502,4565,7069,1103,8261,891,9257,7918,702,4398,5079,9245,8582,9700,1540,262,6994,4674,1244,8394,8746,9617,7537,9338,1887,6977,3874,6576,2539,4440,5089,798,4639,2985,5357,9548,1185,1314,8857,854,5904,4452,2253,4030,7235,6875,1947,720,1757,3445,3,2,2,2,0,1,6,1,1,2,7,13,20,0,1,7,8,4582.95,2970.73256,3,3,3,1,3,2,3,1,2,5,11,14,25,0,1,12,3,4817.884615,3165.220439,2,0,0,1,1,1,2,1,1,2,3,8,11,0,1,2,4,3588.363636,2322.907414,7,7,6,4,4,5,11,3,4,9,24,34,58,0,1,24,18,4802.466667,3002.20714,9008,908,134,2736,6017,2066,9983,3285,3999,135,5545,4263,9602,5398,1203,2984,7119,2965,5544,7135,8078,4901,762,840,5530,2119,6359,6175,2726,8230,3915,8114,9004,5964,3805,5130,2245,6073,4976,8934,8465,7148,9229,4223,6685,3370,4941,8290,223,8877,8810,4632,7444,2131,4388,4214,3227,229,1644,5124,1617,3,2,1,2,1,3,1,2,4,1,9,11,20,0,2,11,6,4454.95,2949.656617,1,2,7,1,4,2,3,3,3,0,15,8,23,0,0,11,9,5924.230769,2547.312973,1,0,1,1,0,1,3,1,1,2,2,8,10,0,0,1,1,3950.909091,2576.245775,7,4,8,4,5,7,8,6,8,3,26,29,55,1,2,24,18,4830.2,2772.344671,0


In [183]:
%%time
test = generate_features_full(results_df_test,False,feature_chain_length)
print(len(test))
test.head()

roll columns present
Roll Features from 1 to 20  completed
Roll Features from 1 to 20 _nonce completed
Roll Features from 25 to 50  completed
Roll Features from 25 to 50 _nonce completed
Roll Features from 50 to 60  completed
Roll Features from 50 to 60 _nonce completed
Roll Features from 1 to 60  completed
Roll Features from 1 to 60 _nonce completed
8163
CPU times: user 3.77 s, sys: 23.9 ms, total: 3.79 s
Wall time: 3.79 s


Unnamed: 0,roll_actual,seed,hash,client_seed,client_index,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,rol
l_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_nonce,count_lt_100
0_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce
0,6102,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxpc01a7c318c22b443a0d82efb69ac2305887691b1e625d0ed3e64da925ab3029e,999816,4007,6747,8644,4018,9390,4924,9396,3012,8147,8160,8153,4788,5043,8984,706,6018,5040,9894,2765,4905,5349,1689,7923,6160,8618,6102,8750,9079,3771,4771,2206,7486,6790,7383,4936,4523,7828,8095,7437,9692,8681,2494,9221,6843,8888,9899,5886,8669,5230,4946,4936,5249,119,2932,9518,4082,2932,8193,4649,8823,9208,1,3,5,0,2,3,4,1,1,0,11,4,15,1,0,10,10,6204.15,2566.522064,0,4,5,4,3,2,5,1,2,0,17,4,21,1,0,15,7,6713.153846,2193.341901,1,2,2,0,0,1,3,0,2,0,4,5,9,1,1,9,2,5512.818182,3056.906764,2,9,13,5,6,6,11,2,5,1,35,14,49,2,0,35,20,6312.0,2530.245081,4007,7842,2306,5052,589,4976,4393,711,6812,5517,9366,9242,6528,7217,683,6251,2386,4329,5622,2064,8236,8332,4981,8726,7761,4089,7808,7919,1275,4773,1787,9557,7761,3633,3686,7578,6502,1799,4225,5925,8197,6990,2869,5873,2807,3769,8395,2541,6666,2713,1725,8941,2883,8757,9050,3517,4553,5935,7769,3459,2463,3,2,1,2,3,3,3,0,3,0,10,8,18,0,1,6,3,5006.1,2794.529014,0,1,2,4,3,2,3,3,4,4,12,13,25,0,0,13,5,5033.153846,2499.277891,0,1,2,1,0,1,1,2,2,1,5,6,11,1,0,5,2,5368.363636,2819.848729,3,4,7,8,6,6,8,5,9,4,30,26,56,0,1,25,10,5301.85,2610.253438
1,761,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxpc643fb8d9cf759ad44c04b2ac05d7dc4e03515c93a79e18e729543b64fb9f395,999793,4009,4736,8641,4343,5765,1315,1537,7954,7322,1230,5375,2935,1408,8081,6053,5526,2438,1419,4133,2001,7864,8123,9808,6555,3473,761,9244,6172,6327,6707,2699,6795,10,8526,7663,6309,5022,4403,7050,1018,5895,9063,4504,7427,6259,4730,4379,8639,2019,6497,4631,8192,6203,4819,8664,7980,1437,204,1902,7286,8040,0,0,2,3,1,3,3,0,3,5,9,11,20,0,0,3,6,4503.8,2604.069762,2,2,2,3,7,2,5,0,2,1,15,10,25,1,1,12,12,5490.346154,2536.474545,1,0,3,2,1,0,2,0,0,2,6,4,10,0,0,4,6,5396.181818,3035.54469,3,3,8,8,10,5,9,1,5,8,33,25,58,0,0,20,25,5258.516667,2694.018192,4009,5024,7949,2456,189,3046,7861,1010,5463,7777,2539,6933,5074,1978,9351,1637,4809,5478,8015,1258,9890,9098,3653,7849,4026,4872,4572,521,6893,6477,2035,4022,851,4104,4200,7567,7210,2033,4315,2846,3214,9677,1206,8582,3312,402,9389,8449,2130,7134,3979,610,7839,2639,2258,9355,1239,3442,9361,4798,9480,1,2,1,3,1,4,1,1,2,4,9,8,17,0,1,10,8,4886.85,2994.294615,3,2,2,3,2,0,6,3,4,1,9,16,25,0,1,7,10,4615.076923,2808.818071,1,3,0,1,0,0,1,2,2,1,4,6,10,1,1,9,2,5000.0,3407.86417,5,8,3,8,3,4,9,6,8,6,24,31,55,0,1,27,22,4889.6,2950.811978
2,1128,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxp3fa2aa9a9eed038cb3bf831783565fa78305ee823d5d4cff963f2759db6657e5,999750,7007,6368,9428,4940,1549,2210,6604,8790,8638,1634,3050,6680,8820,7322,8355,7164,7792,1765,1354,1743,8461,4388,7626,6421,334,1128,3662,7923,4917,7727,5379,2928,6351,4242,5331,961,2242,1559,2865,384,4600,4306,7892,7170,4174,8765,9066,9425,5144,9933,7870,3934,565,5979,7444,1738,3196,5026,9511,239,3033,0,1,5,3,3,0,1,1,1,5,12,7,19,1,0,5,8,5633.35,3003.009548,2,3,1,5,1,3,5,1,3,2,12,12,24,0,0,11,6,5228.615385,2808.987648,2,1,0,2,0,2,0,3,0,1,4,6,10,0,1,6,3,4412.272727,3048.859921,5,5,6,10,5,5,7,5,4,8,29,27,56,1,0,22,16,5134.083333,2900.864356,7007,1497,3358,6567,6782,1586,1314,8213,3851,7592,8673,7795,5400,3701,9613,3447,1589,6014,5490,3464,285,6410,3591,6930,6110,7848,5137,9792,7167,1746,1991,4775,622,9833,9647,2943,6211,3401,2024,7172,7720,2110,3251,4177,7718,815,6472,3457,9715,5497,4195,2440,7384,7201,5489,8908,6207,2118,3256,5793,1641,1,1,2,2,3,2,0,5,0,4,10,10,20,0,0,6,5,4811.55,2789.33698,2,4,0,5,2,2,3,3,3,2,12,12,24,1,0,10,4,5209.076923,2914.489937,0,0,1,2,1,2,1,1,2,1,6,5,11,0,0,4,4,4966.545455,2407.172506,3,5,3,9,9,6,3,10,5,7,31,27,58,0,0,21,16,5085.75,2692.831014
3,8408,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxpa703b62c3bd792205a0accce50473561da5ddbfb91d7d666dc7ecf0783fef1ed,999748,6008,848,7499,5035,776,63,3032,5905,1864,3329,9125,8087,1502,7030,4892,3664,7222,1163,6502,510,4512,7432,7144,165,4236,8408,5894,6020,5689,1159,3523,3016,3999,4455,1927,2660,8169,3909,6701,5019,6632,9064,534,6180,820,1500,2112,8073,8932,375,3301,5122,6132,6764,5642,4278,5133,1954,7682,9836,2219,4,1,1,3,1,2,2,3,0,3,7,11,18,0,3,6,9,4128.0,2854.593159,3,1,4,0,4,3,1,5,2,3,11,14,25,0,0,13,15,4541.192308,2738.119998,0,1,0,1,2,3,1,1,1,1,5,4,9,0,0,3,1,5278.454545,2344.886623,8,3,5,6,7,8,5,8,3,7,25,30,55,0,3,22,24,4572.833333,2729.046218,6008,796,6110,4952,3479,9714,5838,5162,2444,8460,5415,3668,1230,6138,9427,2585,1774,6626,3592,7789,9277,71,955,1528,3232,1099,2880,5559,8167,789,303,3295,3366,5746,5454,1444,7877,8190,7004,8744,7697,7890,9728,8152,9439,3228,2522,356,6068,968,4675,6175,488,9049,740,3917,6328,2114,4985,2719,3540,1,3,1,1,3,3,1,3,2,2,10,8,18,1,1,8,3,5223.8,2768.852747,4,2,4,4,1,3,1,3,2,2,14,12,26,0,2,12,8,5024.615385,3106.605042,2,1,0,0,2,0,2,2,2,0,3,7,10,1,2,5,3,4066.363636,2557.307266,9,6,5,5,6,6,3,9,6,5,27,30,57,1,1,26,14,4682.616667,2947.65488
4,1387,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxp4bedab77996765f6a92c17da1290edf98dd9d327debbc98d300a187e7d366784,999515,7004,3124,2690,7805,9539,9956,2141,2972,1003,8971,9253,452,393,5883,2987,844,3825,6886,8136,6185,91,3863,3630,6500,8787,1387,5758,8540,7921,133,1496,3606,9715,2603,7106,420,9923,5832,4874,1140,3779,5981,6417,9734,4996,8061,8130,8630,9654,3732,5067,4183,3590,868,4782,5253,737,7477,4839,6416,2506,4,3,2,1,2,1,0,2,4,1,9,11,20,2,0,11,4,4656.8,3448.411741,2,4,4,2,1,4,2,3,1,3,14,9,23,0,1,11,10,5562.884615,3098.921171,2,0,0,1,1,2,3,1,1,0,3,5,8,0,1,2,3,4156.181818,2111.218265,8,7,7,4,5,6,5,8,6,4,28,27,55,2,0,24,19,5020.033333,3049.737121,7004,9230,1370,2126,7016,7107,9579,389,3360,6347,1303,4845,8093,955,4068,6761,813,2815,3539,2659,2487,5412,1557,9561,1155,5106,1284,8501,177,6044,6166,6779,8697,1252,9518,1271,8834,4384,4139,7093,2983,7281,9580,1845,4921,6303,1632,7632,7917,9391,136,6749,8960,4629,3360,9922,7993,2007,1600,5092,4232,3,2,1,2,2,0,2,2,4,2,7,12,19,1,0,8,8,4243.1,2928.38574,2,3,3,4,4,1,3,0,1,5,14,10,24,0,1,11,6,5341.0,3135.526418,1,1,1,1,1,1,2,1,1,1,4,6,10,1,1,8,7,4970.909091,3154.639423,5,7,5,7,7,3,7,3,6,10,27,29,56,2,0,28,21,4932.616667,3043.218201


In [184]:
%%time
test_hash = generate_features_full(results_df_test_hash,False,feature_chain_length)
print(len(test))
test_hash.head()

roll columns present
Roll Features from 1 to 20  completed
Roll Features from 1 to 20 _nonce completed
Roll Features from 25 to 50  completed
Roll Features from 25 to 50 _nonce completed
Roll Features from 50 to 60  completed
Roll Features from 50 to 60 _nonce completed
Roll Features from 1 to 60  completed
Roll Features from 1 to 60 _nonce completed
8163
CPU times: user 3.71 s, sys: 27.7 ms, total: 3.74 s
Wall time: 3.73 s


Unnamed: 0,roll_actual,seed,hash,client_seed,client_index,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,rol
l_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_nonce,count_lt_100
0_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce
0,4817,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f,woxpwoxpwoxpwoxpwoxp8553058847fd4d959ceb3ba4e564a3db27e6ebd5e01f2289aaffb27c74dc42d7,999778,6009,2569,4121,5411,1848,7208,1549,1141,887,8901,8006,2448,7333,8761,6351,4439,7684,7899,5281,3312,6349,6586,514,1130,5041,3312,3803,736,2336,2182,2966,8349,243,8778,777,5441,560,9692,7667,7519,8087,5146,492,5914,8919,107,814,9073,7687,353,8114,4404,4050,9852,851,1266,5778,719,3510,5188,8897,1,0,3,4,2,2,2,1,2,3,11,9,20,0,0,7,4,5074.9,2696.178366,8,2,5,3,0,3,0,2,3,0,12,13,25,0,1,10,5,4579.5,3469.908417,2,1,2,0,0,2,2,1,0,1,4,6,10,1,1,3,4,4784.454545,3186.170314,12,3,9,7,3,8,4,4,5,5,27,30,57,0,0,20,15,4639.183333,3097.215926,6009,7463,335,3071,3190,4228,9505,3031,7807,8206,9236,1843,8546,9783,6829,8535,7586,3104,626,8820,1762,6289,9024,7527,2939,7231,8160,5283,328,5397,831,7954,916,847,1220,2178,7166,4122,433,5604,9306,9919,7166,9782,2739,925,4907,6984,2646,8705,9547,4636,1123,6563,9106,2837,8204,4865,2748,2745,4575,2,3,4,3,1,0,1,4,0,2,11,9,20,0,1,5,8,5675.3,3263.231658,6,4,2,4,1,3,2,0,3,1,14,11,25,0,2,13,6,5011.384615,3357.965587,0,2,1,0,1,0,3,0,3,1,4,6,10,2,0,2,2,5177.181818,2830.110875,8,9,7,8,4,3,6,4,7,4,31,27,58,1,1,23,17,5283.05,3126.684313
1,259,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f,woxpwoxpwoxpwoxpwoxp1fc4134d1ca63faf7e16313ed06c36ee782c7f4d16b3dfabd965069c453c3b52,999735,8004,3027,6965,5860,7857,3080,622,9668,3065,5231,1769,8063,6512,8612,4833,7454,9925,6913,1640,1408,2137,6047,2139,4311,869,4271,3824,3918,8017,1113,4017,6436,5685,9448,3178,5295,5944,9076,8333,5327,4236,1868,6766,2634,5477,6163,3105,9213,4628,3635,1114,9201,3151,3742,962,9904,4114,3413,9717,6286,6847,1,2,2,2,3,2,1,3,1,3,10,8,18,0,0,6,8,5232.05,2951.61154,0,3,2,0,3,5,4,5,1,3,13,13,26,0,0,6,4,5104.653846,2379.530356,1,3,0,0,2,0,1,3,0,1,5,6,11,2,1,5,2,5313.727273,3280.878056,3,8,4,2,9,7,7,11,3,6,29,29,58,0,1,19,15,5134.416667,2692.754417,8004,5246,1381,9884,1782,8587,74,2106,9077,9528,3723,9872,9214,9244,4994,9118,7336,5829,1279,5452,43,1634,3940,9775,2635,7881,5919,6925,7514,1441,2447,8043,8262,721,6517,8736,8090,805,9368,1845,2482,807,6425,8536,3329,4927,1389,2236,1673,1660,7813,9107,7905,3247,5097,8030,2089,5378,2189,2149,2046,2,7,1,1,0,3,1,1,1,3,11,7,18,1,0,11,2,5688.45,3579.031911,3,1,5,3,3,1,1,1,3,5,13,12,25,0,0,8,6,4838.115385,3100.915559,0,1,1,2,0,2,0,1,4,0,5,5,10,1,0,6,7,5004.545455,2808.119953,5,10,7,5,3,6,2,4,9,9,29,27,56,1,1,27,16,5079.683333,3203.127621
2,2515,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f,woxpwoxpwoxpwoxpwoxp49c858164782bf5378b4d899606c33c675f660f57d6a1d2c3694b3c61061363b,999504,4006,3091,7229,1414,3055,9185,7310,2654,6310,5481,3519,5074,2816,7173,7532,6590,823,1812,821,4809,491,5855,3672,6466,6928,1790,8713,5075,2866,1005,5417,4902,3587,4513,2291,5797,3788,2909,7900,4964,6210,2788,1283,7694,7587,9923,3855,2074,8748,2648,580,8054,7717,4180,5910,4248,2004,7986,9818,2818,4276,3,1,0,4,2,2,1,3,2,2,8,10,18,1,0,7,7,4359.45,2649.951151,1,1,2,3,1,3,3,3,6,3,9,14,23,0,0,11,11,4573.346154,2621.971944,1,1,1,2,0,1,3,0,2,0,5,6,11,0,1,3,6,5235.545455,2899.857699,4,3,3,9,5,7,7,7,10,5,25,30,55,1,0,22,23,4767.133333,2566.085659,4006,7362,7299,8990,9283,9751,6419,1524,2456,2689,8509,9113,6069,9716,5648,8131,8681,8235,6061,4164,2961,4256,5874,818,1929,8450,8119,5627,1129,8780,931,48,7786,2032,4110,3060,6712,5593,194,6259,6605,3251,979,4743,7368,5646,9556,430,5836,8702,4813,7796,7340,9778,8905,3050,35,9860,9475,589,445,0,4,5,2,3,1,1,0,3,1,15,5,20,2,0,13,4,6653.05,2633.702606,5,1,4,2,3,4,3,2,1,1,14,11,25,0,1,9,9,4875.346154,3000.257881,3,3,1,2,0,0,1,1,0,0,6,4,10,1,0,6,5,5644.181818,3989.307178,9,8,10,6,6,6,5,3,4,3,36,23,59,2,0,30,18,5499.5,3159.136749
3,9976,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f,woxpwoxpwoxpwoxpwoxpf2050f784f61293221873db1bcae54f64ca8e685a5d365cfef9e425d700205bc,999502,5006,9276,4684,9021,6151,5733,8842,621,9974,197,6633,8940,8514,3470,5210,9716,5054,6854,5755,3313,1898,4702,5283,1058,2747,4097,4168,5140,5796,5370,2277,3260,2113,3950,1006,5474,405,3290,4515,1031,9360,841,7961,4601,5720,392,8747,6033,8117,2466,3112,933,3925,951,4458,812,1928,3185,8859,5438,3434,2,4,3,0,3,4,1,2,0,1,12,6,18,2,0,8,5,5992.8,2992.010301,3,1,2,1,1,5,4,4,3,2,9,16,25,0,0,7,14,4201.615385,2558.137261,3,0,1,0,0,1,1,4,0,1,2,9,11,0,3,5,0,3366.818182,2375.19569,8,5,6,1,4,11,7,9,4,5,24,33,57,2,0,20,21,4613.516667,2804.328268,5006,2599,3076,6642,5141,3179,296,3231,4573,3408,3355,9645,6412,2345,1838,3873,5210,395,4907,616,7645,9998,1970,3097,3993,2209,5963,5319,6911,1014,4999,7797,3275,7764,2317,6876,7686,3335,1713,9392,2906,7770,2041,8751,6266,7703,6755,8023,4383,619,2075,4732,9048,7864,5528,8166,1697,606,3779,2246,863,3,1,0,1,2,2,2,6,2,1,4,13,17,0,0,7,4,3919.3,2429.875784,1,1,2,5,4,2,2,2,5,2,14,11,25,0,0,12,8,5148.538462,2691.935619,2,1,1,1,0,1,1,1,2,1,4,7,11,1,0,3,3,4236.727273,3058.060205,6,4,3,7,6,5,5,11,8,5,23,33,56,0,1,29,16,4597.25,2723.751505
4,4950,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f,woxpwoxpwoxpwoxpwoxpcb375f433859bdd7c0d89c262aefbf109a938679ba90b9026f849e57c867b435,999201,6005,4706,4676,1845,6817,975,3409,6623,4928,9205,8046,7011,8295,9998,921,8117,7719,1037,6367,9806,6441,4339,8193,9943,4468,5676,8592,2062,6821,8789,5581,4889,2736,3697,1100,2812,7292,8916,8759,8726,730,3520,8251,4945,3517,6557,8278,3889,6772,9999,9476,510,1309,9203,7098,4111,6005,7541,4881,7893,3743,2,3,3,2,4,0,3,1,0,2,12,7,19,0,1,11,7,5847.1,2936.189204,1,2,7,1,3,2,2,4,3,1,15,9,24,0,0,14,5,5860.846154,2770.787797,1,2,0,3,1,0,2,1,0,1,6,4,10,2,1,5,6,5615.454545,2998.98364,4,7,11,6,8,2,9,6,3,4,34,22,56,0,1,33,18,5809.35,2790.632829,6005,9467,3156,5130,1397,7756,8035,3981,719,2224,6892,8380,5099,2013,931,1850,858,5140,2385,6463,4231,9270,4701,973,5819,4234,6101,5071,2633,8330,6061,6034,6422,1969,4479,1568,1446,4644,1466,9945,5138,994,1770,63,7786,8198,1511,4728,6295,49,1319,3257,8944,5873,8301,1432,2327,3652,9015,7594,8460,3,1,2,1,2,3,1,2,3,2,6,11,17,1,0,8,7,4305.35,2788.523799,3,1,2,1,5,2,4,0,1,7,9,15,24,0,0,11,6,4163.615385,2802.217894,0,1,3,1,0,1,0,2,1,2,6,5,11,0,0,4,3,5470.363636,3128.599248,7,4,7,3,7,7,6,4,5,10,23,32,55,1,0,25,18,4566.316667,2859.954382


#### Machine Learning

In [185]:
# When True, no statistic columns are excluded from the feature list; when
# False, every column starting with roll_mean / roll_std is excluded.
USE_STAT_FEATS = True

In [186]:

# Assemble the list of columns that must NOT be offered to the model, then keep
# every remaining column of `train` as a feature.

# Optional group: statistic columns (prefix match on the column name).
exclude_stat_cols = []
if not USE_STAT_FEATS:
    exclude_stat_col_starts = ['roll_mean','roll_std','roll_mean_nonce','roll_std_nonce']
    for stat_col in exclude_stat_col_starts:
        exclude_stat_cols.extend(col for col in train.columns if col.startswith(stat_col))
    print(exclude_stat_cols)

# Individual roll columns: nothing is excluded at the moment.
exclude_cols = []
exclude_cols_nonce = []

# Threshold counters that are dropped (only the un-suffixed ones and their
# plain `_nonce` twins; the windowed variants such as *_25_50 stay in).
exclude_count_cols = [f'count_gt_{i}000' for i in (1, 2, 3, 4, 6)]
exclude_count_cols_nonce = [f'count_gt_{i}000_nonce' for i in (1, 2, 3, 4, 6)]

# Identifier / raw-outcome columns, plus the optional statistic columns.
othercols = ['roll_actual','client_seed','seed','hash','client_index'] + exclude_stat_cols

all_exclude_cols = [
    *othercols,
    *exclude_cols,
    *exclude_cols_nonce,
    *exclude_count_cols,
    *exclude_count_cols_nonce,
]

# Note: 'target' is not in the exclusion list, so it is still part of
# `features` here; it is dropped when X is built.
features = [col for col in train.columns if col not in all_exclude_cols]
print(len(features))
print(features)

265
['roll_0', 'roll_1', 'roll_2', 'roll_3', 'roll_4', 'roll_5', 'roll_6', 'roll_7', 'roll_8', 'roll_9', 'roll_10', 'roll_11', 'roll_12', 'roll_13', 'roll_14', 'roll_15', 'roll_16', 'roll_17', 'roll_18', 'roll_19', 'roll_20', 'roll_21', 'roll_22', 'roll_23', 'roll_24', 'roll_25', 'roll_26', 'roll_27', 'roll_28', 'roll_29', 'roll_30', 'roll_31', 'roll_32', 'roll_33', 'roll_34', 'roll_35', 'roll_36', 'roll_37', 'roll_38', 'roll_39', 'roll_40', 'roll_41', 'roll_42', 'roll_43', 'roll_44', 'roll_45', 'roll_46', 'roll_47', 'roll_48', 'roll_49', 'roll_50', 'roll_51', 'roll_52', 'roll_53', 'roll_54', 'roll_55', 'roll_56', 'roll_57', 'roll_58', 'roll_59', 'roll_60', 'count_lt_1000', 'count_gt_9000', 'count_gt_8000', 'count_gt_7000', 'count_gt_5000', 'count_hi', 'count_lo', 'count_lo_hi', 'count_gt_9000_k', 'count_lt_1000_k', 'total_dig_9', 'total_dig_0', 'roll_mean', 'roll_std', 'count_lt_1000_25_50', 'count_gt_9000_25_50', 'count_gt_8000_25_50', 'count_gt_7000_25_50', 'count_gt_6000_25_50', 'c

In [187]:
# Label column, and the model inputs (the selected features without the label).
y = train['target']
X = train[features].drop(columns='target')

In [188]:
# Display the model-input columns (X was built with 'target' dropped).
X.columns

Index(['roll_0', 'roll_1', 'roll_2', 'roll_3', 'roll_4', 'roll_5', 'roll_6',
       'roll_7', 'roll_8', 'roll_9',
       ...
       'count_gt_1000_1_60_nonce', 'count_hi_1_60_nonce',
       'count_lo_1_60_nonce', 'count_lo_hi_1_60_nonce',
       'count_gt_9000_k_1_60_nonce', 'count_lt_1000_k_1_60_nonce',
       'total_dig_9_1_60_nonce', 'total_dig_0_1_60_nonce',
       'roll_mean_1_60_nonce', 'roll_std_1_60_nonce'],
      dtype='object', length=264)

In [189]:
import xgboost as xgb
import matplotlib.pyplot as plt # for plotting graphs
import seaborn as sns # for plotting graphs
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, f1_score,precision_recall_curve

Train Validation Split

In [190]:
# Build the per-fold row selections from consecutive blocks of
# `train_client_size` rows of `train`.  For fold f (blocks are numbered from 0):
#   validation    -> block f
#   training      -> blocks f+tr_offset .. f+tr_offset+tr_length-1
#   cutoff tuning -> blocks 0 .. f   (stored as a one-element list per fold)
#
# NOTE(review): the cutoff-tuning slice always contains the validation block
# itself (for fold 0 it is exactly the validation block), so a cutoff tuned on
# it is not independent of the validation rows — confirm this is intended.
# NOTE(review): the objects collected here are index *labels*, but the training
# cell uses them with `.iloc`; this presumably relies on `train` having a
# default 0..n-1 index — verify.
n_folds=10
tr_offset = 1    # blocks between the validation block start and the training window start
tr_length = 25   # length of the training window, in blocks

tr_indices_folds, val_indices_folds, tr_indices_seeds = [], [], []
for fold in range(n_folds):
    tr_start = (fold+tr_offset) * train_client_size
    tr_end = tr_start + tr_length * train_client_size

    tr_indices = train[tr_start:tr_end].index
    val_indices = train[fold * train_client_size:(fold+1) * train_client_size].index
    tr_indices_seed = [train[:(fold+1) * train_client_size].index]

    tr_indices_folds.append(tr_indices)
    val_indices_folds.append(val_indices)
    tr_indices_seeds.append(tr_indices_seed)
    

In [191]:
#This function is useful to convert the predicted probabilities into labels 
#so that F1 score is optimized
#It first determines the probability threshold using precision recall curve at which F1 score is optimized and 
#then generate the optimized labels based on the determined probability threshold
def get_opt_cutoff_prec(labels,preds):
    """Find the probability threshold with the highest F1 score.

    Parameters
    ----------
    labels : true binary labels, passed straight to ``precision_recall_curve``.
    preds  : predicted scores/probabilities for the positive class.

    Returns
    -------
    (threshold, f1) : the threshold from the precision-recall curve at which
        F1 is largest, and that F1 value.
    """
    prec, rec, cutoffs = precision_recall_curve(labels, preds)

    # Only the first len(cutoffs) curve points are paired with a threshold, so
    # the remaining point(s) are discarded before searching.
    f1_by_cutoff = (2*((prec*rec)/(prec+rec)))[:len(cutoffs)]

    # precision == recall == 0 gives 0/0 = NaN; score such points as 0.
    f1_by_cutoff = np.where(np.isnan(f1_by_cutoff), 0, f1_by_cutoff)

    best = np.nanargmax(f1_by_cutoff)
    return cutoffs[best], f1_by_cutoff[best]

def convert_probtolabels(preds,cutoff=0.5):
    """Turn probabilities into 0/1 integer labels.

    Parameters
    ----------
    preds  : numpy array or pandas Series of probabilities (not modified).
    cutoff : values strictly greater than ``cutoff`` become 1, everything else 0.

    Returns
    -------
    Integer labels in the same container type as ``preds`` (ndarray stays
    ndarray, Series stays Series with its index).

    Notes
    -----
    A NaN probability is not greater than any cutoff and therefore maps to 0.
    The previous copy-and-mask version left NaN untouched and then cast it to
    int, which yields an undefined integer value.
    """
    return (preds > cutoff).astype(int)

In [192]:
# # roll_actual_df = pd.DataFrame(columns=['roll_actual'])
# # roll_actual_df['roll_actual']=train['roll_actual'].iloc[tr_index]
# # print(len(X_tr))
# # print(len(roll_actual_df))

# oversample = RandomOverSampler(sampling_strategy='minority',random_state=123)
# # fit and apply the transform
# roll_actual_df, y_temp = oversample.fit_resample(train[features+['roll_actual']].iloc[tr_index], 
#                                                  train['target'].iloc[tr_index])
# print(len(roll_actual_df))

# # tr_df = pd.concat([X_tr,roll_actual_df],axis=1)
# # print(len(tr_df))
# train_status(roll_actual_df,True)

In [193]:
%%time
# Train one XGBoost classifier per fold (or load previously saved models) and
# report, for every fold, the decision cutoffs and the hit ratio / accuracy on
# the validation block.
READ_MODEL_FILE = False

if READ_MODEL_FILE:
    # joblib dumps are pickle-based: only load files produced by this notebook.
    xgb_models = joblib.load(f'data/models/models_{nonce}_{file_pattern_str}_pattern.dump')
else:

    params = { 'n_estimators':100,
              'max_leaves':25,
                'subsample':0.8,
              'random_state':145,
              # 'scale_pos_weight': 5,
    #           'max_depth':6,
            'learning_rate':0.05,
             'colsample_bytree':0.6,#0.85,
             'lambda':0.05,
             'alpha':0.1}

    # xgb_model = xgb.XGBClassifier(**params)
    xgb_models = []        # one fitted model per fold
    scores  = []           # validation accuracy at the tuning-slice cutoff
    ratios =[]             # share of predicted positives that are real positives, same cutoff
    tr_cutoffs=[]          # mean F1-optimal cutoff over the fold's tuning slices
    val_cutoffs=[]         # F1-optimal cutoff computed on the validation block itself
    tr_last_cutoffs=[]     # cutoff of the last tuning slice of each fold

    for fold,(tr_index, val_index, tr_index_seed) in enumerate(zip(tr_indices_folds,val_indices_folds,tr_indices_seeds)):
        X_tr,y_tr = X.iloc[tr_index],y.iloc[tr_index]
        X_val,y_val = X.iloc[val_index],y.iloc[val_index]

        # Balance the classes of the training window only; validation and
        # tuning rows are scored as they are.
        oversample = RandomOverSampler(sampling_strategy='minority',random_state=123)
        # fit and apply the transform
        X_tr, y_tr = oversample.fit_resample(X_tr, y_tr)
        print(pd.Series(y_tr).value_counts())


        print(f'\n ******************* fold: {fold} ********')
        print(f'tr size: { len(X_tr)}  val size:  {len(X_val)}')
#         print(f'{tr_index_seed}')
#         print(f'{val_index}')
#         print(f'{tr_index}')

        xgb_model = xgb.XGBClassifier(**params)
        xgb_model.fit(X_tr,y_tr)
        xgb_models.append(xgb_model)

        # Derive a cutoff from each tuning slice of this fold and average them.
        # NOTE(review): `ratio`, `total`, `success` and `f1score` computed in
        # this inner loop are only consumed by the commented-out print below.
        mean_opt_cutoff_tr = 0
        for tr_index_seed_cur in tr_index_seed:
            X_tr_seed,y_tr_seed = X.iloc[tr_index_seed_cur],y.iloc[tr_index_seed_cur]

            tr_probs = xgb_model.predict_proba(X_tr_seed)[:,1]
            opt_cutoff_tr, f1score = get_opt_cutoff_prec(y_tr_seed,tr_probs)
            # print('tr cutoff:',opt_cutoff_tr)
            tr_labels = convert_probtolabels(tr_probs,cutoff=opt_cutoff_tr) 
            mask1 = (tr_labels==1)
            mask2 = (y_tr_seed==1)
            total = len(tr_labels[mask1])
            success = len(tr_labels[mask1 & mask2])
            ratio = success/total if total!=0 else 0
            # print(f'tr ratio: {ratio} total: {total} success: {success}')
            mean_opt_cutoff_tr += opt_cutoff_tr / len(tr_index_seed)

        print(f'last cutoff :{opt_cutoff_tr}')
        print(f'mean tr cutoff :{mean_opt_cutoff_tr}')
        tr_last_cutoffs.append(opt_cutoff_tr)
        tr_cutoffs.append(mean_opt_cutoff_tr)
        val_probs = xgb_model.predict_proba(X_val)[:,1]
        print(val_probs[0:5])
        # Baseline: accuracy with the default 0.5 cutoff.
        labels = convert_probtolabels(val_probs)
        score_init = accuracy_score(y_val,labels)

        opt_cutoff, f1score = get_opt_cutoff_prec(y_val,val_probs)
        print('valid cutoff:',opt_cutoff)
        val_cutoffs.append(opt_cutoff)
        # print(pd.Series(val_probs).describe(percentiles=[0.6,0.7,0.75,0.8]))
        # NOTE(review): the validation labels use `opt_cutoff_tr` (cutoff of the
        # LAST tuning slice), not `mean_opt_cutoff_tr`; the two coincide only
        # while each fold has a single tuning slice — confirm which is intended.
        val_labels = convert_probtolabels(val_probs,cutoff=opt_cutoff_tr) 
        mask1 = (val_labels==1)
        mask2 = (y_val==1)
        total = len(val_labels[mask1])
        success = len(val_labels[mask1 & mask2])
        ratio = success/total if total!=0 else 0
        print(f'ratio: {ratio} total: {total} success: {success}')
        ratios.append(ratio)
        cur_f1 = f1_score(y_val, val_labels,average='macro')
        cur_acc = accuracy_score(y_val,val_labels)
        print(f'val accuracy score:{cur_acc} f1 score:{cur_f1:.4f} initial accuracy score:{score_init}')
        scores.append(cur_acc)

        # Same hit ratio, but with the cutoff tuned on the validation labels
        # themselves (printed only, not stored).
        val_labels = convert_probtolabels(val_probs,cutoff=opt_cutoff) 
        mask1 = (val_labels==1)
        mask2 = (y_val==1)
        total = len(val_labels[mask1])
        success = len(val_labels[mask1 & mask2])
        ratio = success/total if total!=0 else 0
        print(f'ratio @val_cutoff: {ratio} total: {total} success: {success}')

    #     train_status(X_tr,True)

    #     top_prob = np.sort(val_probs)[::-1][:1]
    #     top_label = y_val[val_probs==top_prob ]
    #     print(f'top_prob:{top_prob} top_label:{top_label}')

    # Cross-fold summary: mean accuracy, mean hit ratio, then the mean of the
    # validation, mean-tuning and last-tuning cutoffs, then the per-fold ratios.
    print(f'mean score:{np.mean(scores)}')    
    print(f'mean ratio:{np.mean(ratios)}')   
    
    print(np.mean(np.array(val_cutoffs)))
    print(np.mean(np.array(tr_cutoffs)))
    print(np.mean(np.array(tr_last_cutoffs)))
    print(ratios)
    
    # Persist all fold models to the same path the load branch reads.
    joblib.dump(xgb_models,f'data/models/models_{nonce}_{file_pattern_str}_pattern.dump')
    print('Models Save completed')   

0    44955
1    44955
Name: target, dtype: int64

 ******************* fold: 0 ********
tr size: 89910  val size:  2000
last cutoff :0.41150394082069397
mean tr cutoff :0.41150394082069397
[0.46479392 0.43766192 0.45017812 0.47067347 0.46175423]
valid cutoff: 0.41150394
ratio: 0.10292326431181487 total: 1642 success: 169
val accuracy score:0.247 f1 score:0.2424 initial accuracy score:0.7135
ratio @val_cutoff: 0.10292326431181487 total: 1642 success: 169
1    44971
0    44971
Name: target, dtype: int64

 ******************* fold: 1 ********
tr size: 89942  val size:  2000
last cutoff :0.37956753373146057
mean tr cutoff :0.37956753373146057
[0.45780504 0.502741   0.49036217 0.4828706  0.43922085]
valid cutoff: 0.39390275
ratio: 0.09962100703844072 total: 1847 success: 184
val accuracy score:0.162 f1 score:0.1616 initial accuracy score:0.7185
ratio @val_cutoff: 0.10050533408197641 total: 1781 success: 179
0    44964
1    44964
Name: target, dtype: int64

 ******************* fold: 2 *****

In [194]:
def generate_probs(multi_models,model,data,features):
    """Score ``data`` and store the result in a ``'probs'`` column.

    Parameters
    ----------
    multi_models : if true, ``model`` is a sequence of fitted models and the
        stored value is the average of their predictions; otherwise ``model``
        is a single fitted model.
    model    : a fitted model (or sequence of them) exposing ``predict_proba``.
    data     : DataFrame to score.  It is modified in place: the ``'probs'``
        column is added or overwritten.
    features : column names of ``data`` that are passed to ``predict_proba``.

    Returns
    -------
    The same ``data`` object, now carrying the ``'probs'`` column (column 1 of
    ``predict_proba``).
    """
    model_input = data[features]
    if multi_models:
        probs = np.zeros(len(data))
        for model_ind in model:
            probs += model_ind.predict_proba(model_input)[:,1] / len(model)
    else:
        # Fixed: this branch used the notebook globals `test_filt_df` and
        # `features_test` instead of the `data` / `features` arguments.
        probs = model.predict_proba(model_input)[:,1]
    data['probs']=probs
    return data

##### Prediction on the test set

In [195]:
# NOTE(review): neither flag is read in the cells immediately around this one —
# confirm where they are consumed before changing or removing them.
CONT_ANALYSIS=False
IS_PROD=False

In [196]:

#     actual_seed = cur_hash_list[25]
# Show the seed the actual rolls are computed from.
print(actual_seed)
# Vectorize the function
vectorized_calculate_roll = np.vectorize(calculate_roll)

# Compute the roll values for the input arrays
# (one call of calculate_roll per client_seed; actual_seed and nonce are
# presumably scalars that np.vectorize repeats for every row — confirm)
roll_array = vectorized_calculate_roll(actual_seed,
                                       results_df_test['client_seed'],
                                       nonce)

# Adds or overwrites the 'roll_actual' column of results_df_test in place.
results_df_test['roll_actual']=roll_array
#     test_manual = generate_test_features(results_df_test)
# Feature frame for the test rows, built with the same call as for test_hash.
test = generate_features_full(results_df_test,False,feature_chain_length)

f9e1702bb3393f834aea8907e7e346bf9641e4c39131de555a19d56cf1b66989
roll columns present
Roll Features from 1 to 20  completed
Roll Features from 1 to 20 _nonce completed
Roll Features from 25 to 50  completed
Roll Features from 25 to 50 _nonce completed
Roll Features from 50 to 60  completed
Roll Features from 50 to 60 _nonce completed
Roll Features from 1 to 60  completed
Roll Features from 1 to 60 _nonce completed


In [197]:
# l1=list(range(1,261))
# # l2=list(range(50,80))
# for hash_idx in [-1]+l1:
    
#     if hash_idx==-1:
#         actual_seed="d0e068a90b3c836bda9220c2d0135028f66e80d325a75935baee007c85b73005"
#         suffix = 'orig'
#     else:    
#         actual_seed = cur_hash_list[hash_idx]
#         suffix = hash_idx
#     print(actual_seed)
#     # Vectorize the function
#     vectorized_calculate_roll = np.vectorize(calculate_roll)

#     # Compute the roll values for the input arrays
#     test[f'roll_actual_{suffix}']=vectorized_calculate_roll(actual_seed,
#                                            test['client_seed'],
#                                            nonce)

In [198]:
# analysis_df=pd.read_csv(f'data/analysis_df_{nonce}.csv')

In [199]:
# Table of feature importances: one column per fold model plus their mean,
# sorted so the most important feature comes first.
imp_df = pd.DataFrame()
imp_df['feature'] = [col for col in features if col!='target']

# Size the accumulator from the filtered feature list.  The previous
# `len(features)-1` silently assumed that 'target' is always in `features`.
mean_imp = np.zeros(len(imp_df))
for i,model in enumerate(xgb_models):
    cur_imp = model.feature_importances_
    imp_df[f'importance_m{i}'] = cur_imp
    # Running average over all fold models.
    mean_imp += cur_imp / len(xgb_models)

imp_df['importance'] = mean_imp
imp_df = imp_df.sort_values('importance',ascending=False).reset_index(drop=True)
imp_df

Unnamed: 0,feature,importance_m0,importance_m1,importance_m2,importance_m3,importance_m4,importance_m5,importance_m6,importance_m7,importance_m8,importance_m9,importance
0,roll_mean_25_50_nonce,0.004377,0.004669,0.0043,0.004313,0.003791,0.004848,0.004437,0.004821,0.004219,0.00535,0.004513
1,roll_50_nonce,0.004755,0.004796,0.004755,0.004206,0.004563,0.004663,0.004154,0.003766,0.004786,0.004349,0.004479
2,roll_14_nonce,0.004601,0.004295,0.004816,0.004243,0.004699,0.004235,0.004897,0.004385,0.004235,0.004232,0.004464
3,count_lt_1000_25_50_nonce,0.004604,0.003527,0.005055,0.004061,0.005149,0.004883,0.004109,0.004418,0.004911,0.003797,0.004451
4,roll_6_nonce,0.004744,0.004653,0.004105,0.004371,0.003938,0.004961,0.004193,0.00446,0.004331,0.004758,0.004451
5,roll_32_nonce,0.004658,0.004436,0.003897,0.004083,0.004924,0.004482,0.004974,0.004872,0.004319,0.003627,0.004427
6,count_gt_7000_1_60_nonce,0.004804,0.00413,0.004497,0.004233,0.004512,0.004636,0.004168,0.004044,0.004112,0.005038,0.004417
7,roll_mean,0.003905,0.003809,0.003829,0.004843,0.004269,0.00464,0.004758,0.004673,0.004779,0.004607,0.004411
8,roll_40,0.005001,0.004652,0.004873,0.005051,0.004169,0.003793,0.003956,0.004312,0.003775,0.004525,0.004411
9,roll_mean_1_60,0.00389,0.005215,0.004895,0.003962,0.004753,0.004878,0.004041,0.004311,0.003542,0.00458,0.004407


In [200]:
# %%time
# test_hash25 = generate_features_full(results_df_test_hash25,False,feature_chain_length)
# test_hash25.head()

In [201]:
# %%time
# test_hash24 = generate_features_full(results_df_test_hash24,False,feature_chain_length)
# test_hash24.head()

In [202]:
# hash_imp_name='roll_54'
# nonce_suffix = '_nonce'
# if hash_imp_name.find(nonce_suffix)!=-1:
#     is_nonce=True
#     hash_imp_name=hash_imp_name.replace(nonce_suffix,'')

# hash_no = int(hash_imp_name.replace('roll_',''))
# hash_no

In [203]:
# hash_imp_count=5
# roll_imp_feats = [col for col in imp_df['feature'].values if (col.startswith('roll_'))
#                                 and ('std' not in col ) and ('mean' not in col )][:hash_imp_count] 
# roll_imp_feats


In [204]:
%%time
READ_FROM_FILE_TEST=False

hash_list_nonce = generate_hash_chain(str(nonce),feature_chain_length)
hash_imp_count=5
test_hash_imp = np.zeros((hash_imp_count))
hash_imp_names = [col for col in imp_df['feature'].values if (col.startswith('roll_'))
                                and ('std' not in col ) and ('mean' not in col )][:hash_imp_count] 
print(f'Top Hash Important Features: {hash_imp_names}')
nonce_suffix = '_nonce'
test_hash_imp = []
for i in range(hash_imp_count):

    hash_imp_name = hash_imp_names[i]
    
    print(f'\n********* Processing Test Hash {hash_imp_name} ******************')

    if hash_imp_name.find(nonce_suffix)!=-1:
        is_nonce=True
        hash_imp_name=hash_imp_name.replace(nonce_suffix,'')
        nonce_no = int(hash_imp_name.replace('roll_',''))
        cur_nonce = hash_list_nonce[nonce_no]
        hash_no = 0
        file_suffix =f"_nonce{nonce_no}"
        print(f'{cur_nonce=}')
    else:
        hash_no = int(hash_imp_name.replace('roll_',''))
        cur_nonce= nonce
        file_suffix =f"_hash{hash_no}"

    cur_seed = cur_hash_list[hash_no]
    cur_hash = cur_hash_list[hash_no+1]
    print(f'{file_suffix=}')
    print(f'{hash_no=}')
    print(f'{cur_seed=}')
    print(f'{cur_hash=}')

    results_df_test_hash_imp=gen_test_data(cur_nonce,cur_hash,cur_seed,
                                        file_suffix=file_suffix)
#     results_df_test_hash_imp=save_cleaned_results_df(results_df_test_hash_imp,True,
#                                                 file_suffix=file_suffix)
    print()
    print(results_df_test_hash.head(1))
    cur_test_hash_imp = generate_features_full(results_df_test_hash_imp,False,feature_chain_length)
    test_hash_imp.append(cur_test_hash_imp)

Top Hash Important Features: ['roll_50_nonce', 'roll_14_nonce', 'roll_6_nonce', 'roll_32_nonce', 'roll_40']

********* Processing Test Hash roll_50_nonce ******************
cur_nonce='ff37b005a1f0e2f42b3207e52d77d2646463eb136e1b702307e04bab261eafe8'
file_suffix='_nonce50'
hash_no=0
cur_seed='fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6'
cur_hash='ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f'
1
                                                               seed  \
0  fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6   

                                                               hash  \
0  ce112aa083d921c2b2045e24952723a1c8789e6a4596f439c594900a5bddab5f   

                                                              nonce  
0  ff37b005a1f0e2f42b3207e52d77d2646463eb136e1b702307e04bab261eafe8  
server_count:0

999999 999999
No further match for seed:fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6 nonce:ff37b005a1

In [205]:
# Keep a copy of the current test_hash frame under a second name.
test_hash1 = test_hash.copy()
# test_hash = test_hash24.copy()
# test_hash = test_hash1.copy()

In [206]:
# Independent copies of the two test feature frames.
test_filt=test.copy()
test_filt_hash=test_hash.copy()
# Model-input columns: `features` without the label and without the 'probs'
# column that generate_probs writes.
features_test = [col for col in features if col not in ['target','probs']]

In [207]:
def gen_hash_cutoff_df(models,feature_test,test_hash,k_bin_size=600,quant=0.95):
    """Build a table of probability bins and the target hit ratio inside each bin.

    ``test_hash`` is scored with ``generate_probs(True, models, test_hash,
    feature_test)`` and the resulting ``'probs'`` column is cut into
    ``k_bin_size - 1`` equal-width bins running from the minimum probability
    up to the ``quant`` quantile.

    Parameters
    ----------
    models : passed straight through to ``generate_probs``.
    feature_test : feature column list passed to ``generate_probs``.
    test_hash : frame to score.
    k_bin_size : number of bin edges handed to ``np.linspace``.
    quant : quantile of ``'probs'`` used as the upper edge of the last bin.

    Returns
    -------
    pandas.DataFrame
        Columns ``cutoff`` (lower edge), ``cutoff_2`` (upper edge), ``ratio``
        (rows matching ``create_target_mask`` divided by rows in the bin, 0 for
        an empty bin) and ``total`` (rows in the bin), sorted by ``ratio`` then
        ``cutoff``, both descending, with a fresh index.

    Notes
    -----
    Both bin edges are inclusive, so a probability that falls exactly on an
    edge is counted in the two neighbouring bins. The function also prints
    ``describe()`` of the probabilities and the bin/edge counts.
    """
    # Use the feature list that was passed in; the body used to read the
    # module-level ``features_test`` and silently ignored this argument.
    test_hash_probs = generate_probs(True,models,test_hash,feature_test)
    probs = test_hash_probs['probs']
    print(probs.describe())

    cutoffs = np.linspace(probs.min(), probs.quantile(quant), k_bin_size)
    lower_edges = cutoffs[:-1]
    upper_edges = cutoffs[1:]

    # The target mask only depends on the scored frame, so compute it once
    # instead of once per bin.
    target_mask = create_target_mask(test_hash_probs)

    ratios = []
    totals = []
    for lower, upper in zip(lower_edges, upper_edges):
        in_bin = (probs>=lower) & (probs<=upper)

        total = len(test_hash_probs[in_bin])
        if total==0:
            ratio = 0
        else:
            success = len(test_hash_probs[in_bin & target_mask])
            ratio = success / total
        ratios.append(ratio)
        totals.append(total)

    print(len(ratios),len(cutoffs))
    df = pd.DataFrame({'cutoff': lower_edges,
                       'cutoff_2': upper_edges,
                       'ratio': ratios,
                       'total': totals})

    df = df.sort_values(['ratio','cutoff'],ascending=[False,False]).reset_index(drop=True)
    return df

In [208]:
# test_hash_probs = generate_probs(True,xgb_models,test_hash,features_test)
# cutoffs = []
# ratios =[]
# totals =[]
# print(test_hash_probs['probs'].describe())
# cutoffs = np.linspace(test_hash_probs['probs'].min(),
#                       test_hash_probs['probs'].quantile(0.95),600)
# for i,cutoff in enumerate(cutoffs[:len(cutoffs)-1]):
# #     cutoff=0.4
# #     diff = 0.01
# #     print(cutoff,cutoffs[i+1])
#     mask = (test_hash_probs['probs']>=cutoff) & (test_hash_probs['probs']<=cutoffs[i+1])
#     mask2=create_target_mask(test_hash_probs)
    
# #     mask2 = test_hash_probs['roll_actual']>=9000
#     total = len(test_hash_probs[mask])
#     if total==0:
#         ratio=0
#         success=0
#     else:
#         success = len(test_hash_probs[mask & mask2])
#         ratio = success / total
#     ratios.append(ratio)
#     totals.append(total)
# #     print(ratio,success,total)

# print(len(ratios),len(cutoffs))
# df=pd.DataFrame()
# df['cutoff']=cutoffs[:len(cutoffs)-1]
# df['cutoff_2']=cutoffs[1:len(cutoffs)]
# df['ratio'] =ratios
# df['total'] =totals

# df = df.sort_values('ratio',ascending=False).reset_index(drop=True)
# print(df[:5].mean())
# print(df[:10].mean())
# df.head(10)

In [209]:
# df

In [210]:
# df.describe()

In [211]:
# print(len(df[df['total']==1]))
# print(len(df[(df['total']==1) & (df['ratio']>0)]))
# df[df['total']==1]

In [212]:
# df[df['ratio']<0.12].index[0]

In [213]:
# test['roll_actual']=test[f'roll_actual_orig'].copy()

In [214]:
def gen_all_k(models,features_test,data,test_hash_cutoff_df,target_total=1,is_k_data_req=True,
              min_ratio=0.12):
    """Apply the cutoff bins to ``data`` and collect the bins holding ``target_total`` rows.

    ``data`` is scored with ``generate_probs(True, models, data, features_test)``.
    Each row of ``test_hash_cutoff_df`` supplies an inclusive probability range
    (``cutoff`` .. ``cutoff_2``); its position ``k`` in that frame identifies the bin.

    Parameters
    ----------
    models, features_test, data : forwarded to ``generate_probs``.
    test_hash_cutoff_df : frame with ``cutoff`` and ``cutoff_2`` columns.
    target_total : a bin is kept only when exactly this many rows fall in it.
    is_k_data_req : when true, the rows of every kept bin are returned, tagged
        with a ``k`` column.
    min_ratio : minimum hit ratio for a kept bin to be listed in ``top_k``
        (defaults to the previously hard-coded 0.12).

    Returns
    -------
    tuple
        ``(all_k, top_k, test_filt_all)`` where ``all_k`` is the list of kept
        bin positions, ``top_k`` is a list of ``(k, ratio, total)`` for kept
        bins whose ratio reaches ``min_ratio``, and ``test_filt_all`` is the
        concatenation of the kept rows, or ``None`` when nothing was collected.
    """
    test_probs = generate_probs(True,models,data,features_test)
    probs = test_probs['probs']
    # Loop-invariant: the target mask depends only on the scored frame.
    target_mask = create_target_mask(test_probs)

    top_k = []
    all_k = []
    kept_frames = []
    for k in range(len(test_hash_cutoff_df)):
        row = test_hash_cutoff_df.iloc[k]
        in_bin = (probs>=row['cutoff']) & (probs<=row['cutoff_2'])
        test_filt = test_probs[in_bin]

        total = len(test_filt)
        # Bins with any other row count contribute to none of the outputs.
        if total!=target_total:
            continue

        all_k.append(k)
        if is_k_data_req:
            # assign() returns a new frame, so the tag is never written into a
            # slice of test_probs (the old code triggered SettingWithCopyWarning).
            kept_frames.append(test_filt.assign(k=k))

        if total==0:
            ratio = 0
        else:
            ratio = len(test_probs[in_bin & target_mask]) / total
        if ratio>=min_ratio:
            top_k.append((k,ratio,total))

    print(f'{len(all_k)=}')
    print(f'{len(top_k)=}')
    print(top_k)

    # One concat at the end instead of re-copying the accumulated frame per bin.
    test_filt_all = pd.concat(kept_frames,axis=0) if kept_frames else None

    return all_k,top_k,test_filt_all

###### Normal Test Hash based Match Pos df generation

In [215]:
# Restore test_hash from the test_hash1 snapshot before building the cutoff table.
test_hash = test_hash1.copy()

In [216]:
%%time

# Number of bin edges handed to gen_hash_cutoff_df.
k_bin_size = 10000

print(f'***************** Bin {k_bin_size} ***************** ')
# Per-bin hit-ratio table computed on test_hash.
df = gen_hash_cutoff_df(
    xgb_models, features_test, test_hash,
    k_bin_size=k_bin_size, quant=0.95,
)
print('Test Hash Cutoff Df Stats')
# Bins holding exactly one row, and how many of those have a non-zero ratio.
print('Total size:', len(df[df['total'].eq(1)]))
print('Success size:', len(df[df['total'].eq(1) & df['ratio'].gt(0)]))

# Apply the same bins to `test`, keeping the bins that capture exactly one row.
all_k, top_k, test_all_k = gen_all_k(
    xgb_models, features_test, test, df, target_total=1,
)

# Bin positions (first element of every top_k tuple).
top_k_ele = [entry[0] for entry in top_k]
print(top_k_ele)
print('Test Size:', len(test_all_k))
test_all_k.head()

***************** Bin 10000 ***************** 
count    8098.000000
mean        0.461265
std         0.036103
min         0.256901
25%         0.441942
50%         0.467256
75%         0.486260
max         0.571553
Name: probs, dtype: float64
9999 10000
Test Hash Cutoff Df Stats
Total size: 1645
Success size: 166
len(all_k)=1643
len(top_k)=167
[(4, 1.0, 1), (9, 1.0, 1), (65, 1.0, 1), (105, 1.0, 1), (115, 1.0, 1), (248, 1.0, 1), (320, 1.0, 1), (412, 1.0, 1), (431, 1.0, 1), (500, 1.0, 1), (509, 1.0, 1), (533, 1.0, 1), (554, 1.0, 1), (560, 1.0, 1), (567, 1.0, 1), (570, 1.0, 1), (585, 1.0, 1), (648, 1.0, 1), (685, 1.0, 1), (749, 1.0, 1), (759, 1.0, 1), (760, 1.0, 1), (803, 1.0, 1), (811, 1.0, 1), (836, 1.0, 1), (925, 1.0, 1), (933, 1.0, 1), (960, 1.0, 1), (961, 1.0, 1), (1019, 1.0, 1), (1025, 1.0, 1), (1052, 1.0, 1), (1198, 1.0, 1), (1283, 1.0, 1), (1318, 1.0, 1), (1331, 1.0, 1), (1368, 1.0, 1), (1385, 1.0, 1), (1457, 1.0, 1), (1522, 1.0, 1), (1559, 1.0, 1), (1568, 1.0, 1), (1734, 1.0, 1),

Unnamed: 0,roll_actual,seed,hash,client_seed,client_index,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,rol
l_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_nonce,count_lt_100
0_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce,probs,k
1341,3263,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxp167b7d22b1037628c8c0437041e6c41cb56f095855c1dd6fb5317bcca138569e,835250,1008,2279,6100,396,4406,9080,7015,3885,1205,8859,6179,9265,908,9174,7317,1357,5127,6530,2756,3722,750,7617,8348,1110,1106,4150,8942,3669,9203,3226,758,1113,7178,4024,1121,2027,2456,6212,8935,4035,2449,2294,6523,3595,135,2006,4233,9059,7275,8826,3705,1335,2397,9030,1065,9385,5349,2028,5532,2025,4680,3,3,1,2,3,1,1,2,2,2,9,10,19,1,1,8,10,4815.5,3079.659388,2,2,3,2,2,0,4,4,5,2,9,17,26,1,1,9,9,4505.730769,2890.298927,0,2,0,0,0,2,1,1,3,2,4,7,11,2,0,4,7,4230.090909,2907.034278,5,7,5,5,5,3,6,6,10,8,24,35,59,1,1,21,27,4574.433333,2977.083719,1008,1218,8716,9043,4337,3092,6314,8578,2874,9743,5099,5350,7514,249,2557,1834,1718,2739,9234,4385,4193,3390,7935,582,5932,8252,2520,2906,1495,7378,3414,292,884,8567,8569,8154,9444,5788,6517,9491,3273,6138,3270,6986,8986,9629,7879,7621,7839,1063,9581,9770,9239,6028,676,2964,7636,4890,3475,8263,7886,1,3,2,1,1,2,3,1,3,3,8,11,19,1,0,9,4,4939.35,2990.029718,2,4,5,4,3,1,0,3,2,2,17,9,26,0,0,14,4,5997.538462,3106.665495,1,3,1,2,1,0,1,1,1,0,7,3,10,3,1,6,3,6400.727273,3037.871264,5,9,8,8,5,4,4,6,6,5,33,25,58,1,0,31,12,5556.483333,3033.686517,0.506594,2
3613,704,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxp161b133b20e4961ae2991915c13702a41267ecbbd93e9713fdd926f51ed06f33,553545,5009,7311,6708,2270,9230,781,7515,2201,2294,5860,9311,9722,611,9807,3251,390,4839,7379,7551,5136,3824,4293,5264,3411,1159,775,1066,1529,9964,8399,8298,8260,1128,7491,1334,9111,2264,5865,5684,5363,5534,4091,6283,801,7275,5974,2159,8742,5744,504,277,7215,6014,3628,783,6586,5233,295,1468,7716,3323,3,4,0,4,1,2,1,2,3,0,10,8,18,1,1,8,7,5299.55,3187.055356,4,2,4,2,1,6,1,0,2,4,15,11,26,1,1,11,5,4765.961538,3185.479813,3,0,0,2,2,1,0,2,0,1,4,6,10,0,2,1,1,3867.090909,2848.842377,9,6,4,8,4,10,3,5,5,6,30,27,57,1,1,22,13,4771.566667,3018.522865,5009,7636,7081,5224,187,2459,9385,9763,7178,4057,1621,8127,9136,8320,1819,5854,3219,4526,9653,3239,2889,7702,1037,8900,4934,7215,3520,9305,3931,8611,895,5171,8173,1219,8092,4383,2026,4409,1319,1722,1247,5690,434,1468,7918,17,6976,2444,9193,4644,5106,3852,3870,5215,1245,8135,4193,694,5973,6875,2120,1,4,2,3,0,2,2,2,2,2,10,9,19,0,1,9,3,5568.65,3033.221186,3,2,3,2,1,3,3,2,2,5,9,15,24,1,1,12,7,4428.0,3002.43697,1,0,1,0,1,3,1,2,1,1,3,6,9,0,0,3,3,4298.0,2304.545725,5,6,7,6,2,7,7,6,5,9,24,31,55,1,1,26,16,4854.1,2941.837767,0.503676,4
992,9857,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxpaeba405a165a828235be95581ed2b9d55ff9fb1aa2d4a559e0863a03a94f3107,879684,5002,6537,6361,653,1033,2573,9506,1309,7804,1161,1863,2799,7433,8749,7878,3508,5202,7703,7288,6541,8357,7136,2691,1311,6140,4046,5299,3902,1041,2416,6436,1339,5703,3701,4653,5552,1079,3396,9555,958,7446,9388,757,7508,586,2015,2975,2198,7888,5891,7061,512,4030,912,6658,2183,7871,3859,5508,8552,2898,1,1,2,5,3,1,0,1,2,4,11,8,19,0,1,5,7,5212.9,3006.611467,3,2,0,4,1,4,2,3,4,3,11,15,26,0,0,12,9,4338.038462,2753.365889,2,0,1,2,1,1,1,1,2,0,5,6,11,0,2,3,4,4549.454545,2780.858729,6,3,3,11,6,6,3,5,9,8,28,31,59,1,1,21,20,4621.8,2807.727827,5002,6077,5215,9964,1408,6227,4522,7151,1941,4615,31,9446,8017,7986,2881,6298,4423,5561,9391,1846,8340,9605,1046,7862,3290,9843,5055,9058,1534,9681,545,7168,661,8870,7676,7100,7743,2763,9390,9764,9586,5778,2591,2425,2937,7215,3125,2071,6794,4353,2611,6997,2471,8146,9195,3540,1019,8470,4539,211,3405,1,3,2,2,3,2,3,0,1,3,11,8,19,1,0,8,4,5567.0,2882.522908,2,6,1,5,1,2,1,1,6,1,14,11,25,3,1,10,7,5628.346154,3179.14791,1,1,2,0,1,0,1,2,2,1,4,7,11,1,0,6,4,4600.363636,3123.165678,4,11,5,8,5,4,5,4,8,6,31,27,58,1,0,26,18,5457.883333,3070.379014,0.502321,5
868,1374,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxp7da94031ea707506b281700fae24ca0db5ff08bd50cada3625482d88d88de24a,893092,8007,4524,3032,5236,3939,8972,8554,8418,4196,2198,9627,9050,5258,8272,3142,5476,5454,1568,3515,5268,6621,703,7781,5038,6058,9324,9487,3675,3277,2351,9777,1257,3300,7312,9859,311,7139,9879,7991,1819,9044,76,3073,705,6426,5448,4859,117,5625,7618,1210,2833,2691,5277,1218,7615,352,2308,3257,4460,2532,0,2,4,0,1,5,2,4,1,1,11,8,19,0,0,7,3,5616.0,2464.82932,4,6,0,4,1,2,1,4,1,3,13,12,25,3,0,13,6,5036.884615,3458.955453,1,0,0,1,0,1,1,1,4,2,2,9,11,0,0,1,3,3068.454545,2069.827112,6,8,4,6,3,9,4,9,6,5,28,29,57,0,0,21,14,4923.366667,2964.907773,8007,4019,3388,481,888,452,5248,3930,3000,1293,6176,7156,3383,4690,4295,8274,9995,3895,8726,5596,6862,2398,6641,297,5651,6608,1808,7025,1693,2310,449,4592,3186,4874,5645,4915,3783,7655,9431,1434,3496,8037,5435,331,149,915,9522,19,6206,4751,6568,3619,6124,2839,842,7077,2623,4023,7550,6422,3620,3,1,2,1,2,2,3,5,0,1,7,12,19,0,3,10,6,4587.35,2723.700603,5,2,1,2,3,2,4,3,1,3,10,13,23,0,1,10,6,4262.961538,2893.711492,1,0,0,2,3,0,1,2,2,0,5,6,11,0,1,2,4,4664.272727,2183.936267,10,3,3,5,8,5,8,10,4,4,23,33,56,0,3,24,16,4371.833333,2676.411917,0.502166,7
2016,3816,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,woxpwoxpwoxpwoxpwoxp1166120f0f0fd6bc7b801646d123b7344b209d52f8adc4097a98248a8cb99ffa,749020,9009,5276,8928,4492,468,5219,3128,2096,2653,4955,5689,7918,7899,3324,928,9913,7261,9181,9105,1907,5505,1748,9705,4302,1456,9557,3059,6396,2034,8712,3286,5245,8076,4344,1241,5629,7402,2701,738,6992,9960,6796,846,3352,1680,9187,2883,229,5470,7873,6350,2202,7258,2971,5423,487,4273,3104,7789,3479,3369,2,3,1,3,0,4,2,2,2,1,10,8,18,0,1,15,4,5292.25,2922.482702,3,3,2,2,4,3,1,3,3,2,13,12,25,1,0,11,9,5001.461538,2990.356651,1,0,0,2,1,1,1,3,2,0,4,7,11,0,1,4,3,4245.909091,2236.80779,6,7,3,7,4,8,5,8,7,5,27,30,57,0,1,31,17,4890.816667,2873.170637,9009,8431,6063,6081,4475,2451,4688,3176,8614,8329,5504,3936,1879,1556,8226,2216,6779,7980,9129,2420,8628,7900,4423,9290,6540,766,3250,8528,7752,2501,7393,2687,8894,8261,3912,186,1056,2406,8127,4214,7934,7941,7681,8932,7959,217,4159,8561,5291,5413,8696,7068,3476,8479,8581,5521,5881,195,7927,8345,5092,0,1,5,1,3,1,2,2,3,2,11,9,20,0,0,7,5,5528.05,2644.681107,3,0,7,6,0,2,2,2,3,1,15,11,26,0,1,11,4,5489.115385,3058.032941,1,0,4,2,0,3,0,1,0,0,8,2,10,0,0,5,2,6296.454545,2657.034526,4,2,15,10,4,6,5,5,6,3,36,23,59,0,0,25,15,5699.933333,2774.060396,0.502084,8


In [217]:
# Show the cutoff bins whose 'total' is exactly 1.
df[df['total']==1]

Unnamed: 0,cutoff,cutoff_2,ratio,total
0,0.508289,0.508314,1.0,1
1,0.507535,0.50756,1.0,1
2,0.506579,0.506604,1.0,1
3,0.504593,0.504618,1.0,1
5,0.502304,0.502329,1.0,1
6,0.502279,0.502304,1.0,1
7,0.502153,0.502179,1.0,1
8,0.502078,0.502103,1.0,1
9,0.501902,0.501927,1.0,1
10,0.501022,0.501047,1.0,1


In [240]:
# key_cols = ['k','roll_actual','probs','roll_0','roll_1','client_seed']
# other_cols = [col for col in test_all_k if col not in key_cols]
# test_all_k[key_cols+other_cols]

In [None]:
# Columns to show first. This definition had been left commented out, so the cell
# only ran when a key_cols value was still alive in the kernel from an earlier run.
key_cols = ['k','roll_actual','client_seed','probs','roll_0','roll_1']
other_cols = [col for col in test_all_k if col not in key_cols]
# Rows whose bin position k is one of the top-k bins, key columns first.
test_all_k[test_all_k['k'].isin(top_k_ele)][key_cols+other_cols]

In [None]:
# # test_sel=test_all_k[:5]
# test_sel=test_all_k.sort_values('probs',ascending=False)[:5]
# test_sel

In [None]:
# top_k_ele = [x[0] for x in top_k]
# print(top_k_ele)
# matches = set(list(test_sel.k)).intersection(set(top_k_ele))
# print(f'{len(matches)}')
# print(matches)

Important Feats Top K Elements

In [222]:
%%time

# k_bin_size is taken from the kernel (the local override stays disabled).
# k_bin_size=600

# One result slot per important-feature hash frame; -1 marks "not filled yet".
all_k_imp, top_k_imp, test_all_k_imp, top_k_ele_imp = (
    [-1] * hash_imp_count for _ in range(4)
)

# Per-bin hit-ratio table computed on test_hash.
df = gen_hash_cutoff_df(
    xgb_models, features_test, test_hash,
    k_bin_size=k_bin_size, quant=0.95,
)
print('Test Hash Cutoff Df Stats')
print('Total size:', len(df[df['total'].eq(1)]))
print('Success size:', len(df[df['total'].eq(1) & df['ratio'].gt(0)]))

for hash_idx in range(hash_imp_count):

    print(f'***************** Test Hash {hash_imp_names[hash_idx]} ***************** ')

    # Apply the shared bins to this frame and keep the single-row bins.
    (all_k_imp[hash_idx],
     top_k_imp[hash_idx],
     test_all_k_imp[hash_idx]) = gen_all_k(
        xgb_models, features_test, test_hash_imp[hash_idx], df, target_total=1,
    )

    # Bin positions (first element of every top_k tuple) for this frame.
    top_k_ele_imp[hash_idx] = [entry[0] for entry in top_k_imp[hash_idx]]


len(all_k)=1753
len(top_k)=117
[(15, 1.0, 1), (24, 1.0, 1), (200, 1.0, 1), (209, 1.0, 1), (293, 1.0, 1), (449, 1.0, 1), (504, 1.0, 1), (508, 1.0, 1), (533, 1.0, 1), (568, 1.0, 1), (645, 1.0, 1), (744, 1.0, 1), (746, 1.0, 1), (801, 1.0, 1), (844, 1.0, 1), (857, 1.0, 1), (883, 1.0, 1), (885, 1.0, 1), (901, 1.0, 1), (907, 1.0, 1), (935, 1.0, 1), (942, 1.0, 1), (964, 1.0, 1), (971, 1.0, 1), (997, 1.0, 1), (1001, 1.0, 1), (1072, 1.0, 1), (1088, 1.0, 1), (1125, 1.0, 1), (1128, 1.0, 1), (1132, 1.0, 1), (1178, 1.0, 1), (1234, 1.0, 1), (1262, 1.0, 1), (1309, 1.0, 1), (1402, 1.0, 1), (1407, 1.0, 1), (1422, 1.0, 1), (1636, 1.0, 1), (1640, 1.0, 1), (1746, 1.0, 1), (1876, 1.0, 1), (1899, 1.0, 1), (1939, 1.0, 1), (2004, 1.0, 1), (2060, 1.0, 1), (2083, 1.0, 1), (2097, 1.0, 1), (2223, 1.0, 1), (2251, 1.0, 1), (2434, 1.0, 1), (2450, 1.0, 1), (2483, 1.0, 1), (2509, 1.0, 1), (2631, 1.0, 1), (2647, 1.0, 1), (2706, 1.0, 1), (2708, 1.0, 1), (2804, 1.0, 1), (2815, 1.0, 1), (2838, 1.0, 1), (2904, 1.0, 1), (30

In [223]:
# top_k_ele_imp

In [224]:
# test_all_k_imp[0].head()

END

In [225]:
def get_sim_summary(test_filt_all_k,
                    test_hash_cutoff_df,
                    is_imp_feats, imp_feats_count,
                    is_rank_pct=False):
    """Add summary ratio/mean columns and their ranks to ``test_filt_all_k`` in place.

    A set of columns is chosen (the first ``imp_feats_count`` entries of
    ``imp_df['feature']`` that start with ``'roll_'`` and are not summary
    columns when ``is_imp_feats`` is true, otherwise four fixed integer
    ranges). For every row the function stores the fraction of those columns
    beyond the threshold (``> HIGH_TARGET`` when ``HIGH_ANALYSIS`` is true,
    else ``< LOW_TARGET``) and their mean, repeats both over a revised column
    set, and then stores overall and per-group ranks of those values and of
    ``'probs'``.

    Reads the module-level names ``imp_df``, ``HIGH_ANALYSIS``, ``HIGH_TARGET``
    and ``LOW_TARGET``. ``test_hash_cutoff_df`` is accepted but not used.
    All ranks use ``method='min'`` and ``pct=is_rank_pct``.

    Returns the same frame object that was passed in.
    """
    frame = test_filt_all_k

    summary_cols = ['roll_mean_rank', 'roll_mean_summary_init', 'roll_mean_rank_init',
                    'roll_mean_summary', 'ratio_group_roll_rank', 'ratio_group_roll_rank_desc']
    if is_imp_feats:
        roll_feats = []
        for name in imp_df['feature'].values:
            if name.startswith('roll_') and name not in summary_cols:
                roll_feats.append(name)
        hash_groups = [roll_feats[:imp_feats_count]]
    else:
        # NOTE(review): these are integer column labels - confirm the frame
        # really has columns labelled this way.
        hash_groups = [list(range(1, 11)), list(range(25, 36)),
                       list(range(50, 60)), list(range(100, 111))]
    base_cols = [label for group in hash_groups for label in group]

    def _hit_count(cols):
        # Per-row count of columns on the wanted side of the threshold.
        if HIGH_ANALYSIS:
            flags = frame[cols] > HIGH_TARGET
        else:
            flags = frame[cols] < LOW_TARGET
        return flags.sum(axis=1)

    def _rank(col, ascending=True):
        # Rank of one column over the whole frame.
        return frame[col].rank(method='min', pct=is_rank_pct, ascending=ascending)

    def _rank_within(group_col, value_col, ascending=True):
        # Rank of value_col among rows sharing the same group_col value.
        return frame.groupby(group_col)[value_col].rank(method='min',
                                                        ascending=ascending,
                                                        pct=is_rank_pct)

    # Initial summaries over the selected columns.
    init_ratio = _hit_count(base_cols) / len(base_cols)
    init_mean = frame[base_cols].mean(axis=1)
    frame['ratio_mean_summary_init'] = init_ratio
    frame['roll_mean_summary_init'] = init_mean
    frame['ratio_mean_rank_init'] = _rank('ratio_mean_summary_init')
    frame['roll_mean_rank_init'] = _rank('roll_mean_summary_init')

    # Revised set: drop 'roll_mean' (overwritten just below) and add the
    # initial mean as one more column.
    rev_cols = [label for label in base_cols if label != 'roll_mean'] + \
               ['roll_mean_summary_init']
    frame['roll_mean'] = frame[rev_cols].mean(axis=1)
    frame['ratio_mean_summary'] = _hit_count(rev_cols) / len(rev_cols)

    # Self-assignment: values are unchanged, but a missing 'k' column raises here.
    frame['k'] = frame['k']

    frame['ratio_group_roll_rank_init'] = _rank_within('ratio_mean_summary_init',
                                                       'roll_mean_summary_init')
    frame['ratio_group_roll_rank_desc_init'] = _rank_within('ratio_mean_summary_init',
                                                            'roll_mean_summary_init',
                                                            ascending=False)

    frame['ratio_mean_rank'] = _rank('ratio_mean_summary')
    frame['roll_mean_rank'] = _rank('roll_mean')
    frame['ratio_group_roll_rank'] = _rank_within('ratio_mean_summary', 'roll_mean')
    frame['ratio_group_roll_rank_desc'] = _rank_within('ratio_mean_summary', 'roll_mean',
                                                       ascending=False)
    frame['probs_rank'] = _rank('probs', ascending=False)
    frame['ratio_group_probs_rank'] = _rank_within('ratio_mean_summary', 'probs',
                                                   ascending=False)

    return test_filt_all_k
    

In [226]:
# test_orig= test.copy()
# test_hash_orig = test_hash.copy()

In [227]:
# Sanity warnings on the sizes of the results produced by the cutoff cells.

# Bins of `test` that held the target number of rows.
count_all_k = len(all_k)
if count_all_k < 10:
    print(f'PROJECT WARNING: number of target records is {count_all_k} which is less than 10 ')

# Of those, the bins whose hit ratio cleared the threshold.
count_top_k = len(top_k)
if count_top_k < 2:
    print(f'PROJECT WARNING: number of probable success is {count_top_k} which is less than 2')

# NOTE(review): this counts bins with total == 2, while the stats printed by the
# cutoff cells and the gen_all_k calls use a total of 1 - confirm which is intended.
hash_success = len(df[df['total'].eq(2) & df['ratio'].gt(0)])
if hash_success < 2:
    print(f'PROJECT WARNING: number of hash success is {hash_success} which is less than 2')

In [228]:
# For test and then test_hash, print: the row count, the number of rows whose
# client_seed repeats an earlier row, and the number of distinct seeds among
# those repeated rows.
for _frame in (test, test_hash):
    print(len(_frame))
    mask = _frame.duplicated(subset='client_seed')
    _repeated = _frame[mask]
    print(len(_repeated))
    print(_repeated['client_seed'].nunique())

8163
0
0
8098
0
0


In [241]:
# print(all_k)
# Print the bin positions currently held in top_k_ele.
print(top_k_ele)

[56, 91, 119, 122, 125, 202, 229, 273, 277, 289, 311, 320, 325, 363, 404, 567, 622, 648, 753, 836, 851, 885, 895, 922, 932, 948, 960, 1162, 1229, 1259, 1328, 1354, 1438, 1457, 1502, 1536, 1578, 1587, 1611, 1621, 1648, 1654, 1694, 1721, 1739, 1751, 1838, 1910, 2002, 2065, 2102, 2216, 2236, 2246, 2250, 2280, 2282, 2325, 2344, 2472, 2476, 2496, 2497, 2580, 2587, 2597, 2617, 2678, 2694, 2785, 2786, 2836, 2974, 3004, 3007, 3068, 3072, 3076, 3086, 3088, 3097, 3131, 3165, 3236, 3250, 3275, 3311, 3324, 3325, 3369, 3384, 3390, 3392, 3404, 3446, 3450, 3506, 3541, 3544, 3558, 3565, 3599, 3601, 3608, 3645, 3647, 3699, 3738, 3769, 3787, 3801, 3809, 3835, 3917, 4017, 4043, 4059, 4094, 4108, 4151, 4177, 4185, 4204, 4216, 4244, 4294, 4313, 4380, 4450, 4554, 4555, 4611, 4640, 4643, 4702, 4716, 4761, 4772, 4773, 4787, 4809, 4838, 4914, 4948, 5086, 5181, 5207, 5209, 5270, 5443, 5566, 5601, 5618, 5847, 5907, 5928, 6021, 6040, 6210, 6251, 6360, 6401, 6542, 6605, 6672, 6787, 6864, 6897, 7487, 7852, 8448]


In [230]:
def get_nlargest_ind(arr,n=10,asc=False):
    """Return the indices of the ``n`` most extreme values of ``arr``, best first.

    Parameters
    ----------
    arr : one-dimensional array-like of comparable values.
    n : how many indices to return; values larger than ``len(arr)`` are
        clamped to ``len(arr)`` and values ``<= 0`` give an empty result.
    asc : when false (default) pick the ``n`` largest values and order the
        indices by descending value; when true pick the ``n`` smallest values
        and order the indices by ascending value.

    Returns
    -------
    numpy.ndarray
        Integer indices into ``arr``.
    """
    values = np.asarray(arr)
    n = min(int(n), values.size)
    if n <= 0:
        return np.empty(0, dtype=np.intp)

    if asc:
        # Partition around the n-th smallest value so the first n slots hold
        # the n smallest. The previous code partitioned around the n-th
        # largest and took the first n slots, which are not guaranteed to be
        # the smallest ones.
        ind = np.argpartition(values, n - 1)[:n]
    else:
        ind = np.argpartition(values, -n)[-n:]

    # argpartition leaves the selected slots unordered; sort just those n.
    order = np.argsort(values[ind])
    if not asc:
        order = order[::-1]

    return ind[order]



def get_top_success_count(probs,y,top_n=10,asc=False):
    """Count how many of the ``top_n`` selected entries carry label 1.

    The entries are chosen by ``get_nlargest_ind(probs, n=top_n, asc=asc)``.

    Returns
    -------
    tuple
        ``(success, ratio, top_prob_ind, match_pos_arr)``: the number of
        selected labels equal to 1, that number divided by ``top_n``, the
        selected indices, and the 1-based positions within the selection at
        which a label of 1 occurs.
    """
    top_prob_ind = get_nlargest_ind(probs,n=top_n,asc=asc)
    picked_labels = y[top_prob_ind]

    is_hit = picked_labels==1
    # +1 turns 0-based offsets in the selection into 1-based positions.
    match_pos_arr = np.nonzero(is_hit)[0]+1
    success = len(picked_labels[is_hit])
    ratio = success / top_n

    return success, ratio,top_prob_ind,match_pos_arr

def get_top_auc_score(probs,y,top_n=10,asc=False):
    """Return ``roc_auc_score`` computed only on the ``top_n`` selected entries.

    The entries are chosen by ``get_nlargest_ind(probs, n=top_n, asc=asc)``;
    their labels from ``y`` and scores from ``probs`` are passed to
    ``roc_auc_score``.
    """
    # NOTE(review): roc_auc_score must already be imported in the notebook;
    # the import is not visible near this cell - confirm.
    selected = get_nlargest_ind(probs,n=top_n,asc=asc)
    return roc_auc_score(y[selected], probs[selected])

def cust_top_n_matches(y_pred, dtrain):
    """Custom evaluation metric: misses among the 100 highest predictions.

    Labels are read with ``dtrain.get_label()``. The returned error is 100
    minus the number of label-1 entries found by ``get_top_success_count``
    among the top 100 predictions.

    Returns ``('top_n_matches', error)``.
    """
    top_preds = 100
    labels = dtrain.get_label()
    # Only the match positions are needed from the four returned values.
    _, _, _, match_pos_arr = get_top_success_count(y_pred, labels,
                                                   top_n=top_preds, asc=False)
    return 'top_n_matches', top_preds - len(match_pos_arr)

def cust_top_n_auc(y_pred, dtrain):
    """Custom evaluation metric: ``1 - AUC`` over the 100 highest predictions.

    Labels are read with ``dtrain.get_label()`` and the AUC comes from
    ``get_top_auc_score``.

    Returns ``('top_n_matches', error)``.
    """
    top_preds = 100
    labels = dtrain.get_label()
    top_auc = get_top_auc_score(y_pred, labels, top_n=top_preds, asc=False)
    # NOTE(review): the label is the same string cust_top_n_matches returns;
    # confirm whether a distinct metric name was intended before renaming,
    # since callers may look results up by this key.
    return 'top_n_matches', 1 - top_auc

def gen_top_mean_data_ml(test_combined_imp,features_test,
                        n_estimators =100,asc=False,top_n=10,
                        random_over_sampler=False,use_best_iteration=False,
                        n_splits=5,is_boruta_sel=False,max_iter_boruta=100):
    """Cross-validate an XGBoost classifier and report top-``top_n`` hit statistics.

    For each stratified fold a classifier is fitted with early stopping on the
    validation split (metric ``aucpr``, 50 rounds), the validation
    probabilities are stored as out-of-fold (OOF) predictions, and the number
    of label-1 rows among the ``top_n`` ranked validation rows is recorded.
    Summary statistics are printed at the end.

    Parameters
    ----------
    test_combined_imp : pandas.DataFrame
        Data holding the ``features_test`` columns and a ``'target'`` column.
    features_test : list of str
        Feature column names.
    n_estimators : int, default 100
        ``n_estimators`` for the classifier.
    asc : bool, default False
        Ranking direction forwarded to ``get_top_success_count``.
    top_n : int, default 10
        Length of the ranked list that is inspected per fold and for the OOF set.
    random_over_sampler : bool, default False
        If true, the training split of each fold is resampled with
        ``RandomOverSampler`` before fitting.
    use_best_iteration : bool, default False
        If true, validation predictions use ``iteration_range=[0, best_ntree_limit]``.
    n_splits : int, default 5
        Number of stratified folds.
    is_boruta_sel : bool, default False
        If true, ``get_boruta_selected_data`` picks the feature subset per fold.
    max_iter_boruta : int, default 100
        ``max_iter`` forwarded to ``get_boruta_selected_data``.

    Returns
    -------
    tuple
        ``(xgb_models, top_ratios, mean_iterations, mean_cutoff,
        mean_cutoff_perc, sel_features_folds, oof_match_pos_arr)``.
    """
    params = { 'n_estimators':n_estimators,
              'max_leaves':25,
                'subsample':0.8,
              'random_state':145,
            'learning_rate':0.05,
             'colsample_bytree':0.6,
             'lambda':0.05,
             'alpha':0.1}

    X = test_combined_imp[features_test].reset_index(drop=True)
    y = test_combined_imp['target'].reset_index(drop=True)

    # Per-fold accumulators.
    xgb_models = []
    sel_features_folds = []
    top_ratios = []
    first_match_positions = []
    ratios = []
    val_cutoffs = []
    val_cutoff_percs = []
    iterations = []

    kfold = StratifiedKFold(n_splits=n_splits, random_state=756, shuffle=True)
    oof_probs = np.zeros(len(test_combined_imp))
    # Only build the sampler when it is actually going to be used.
    oversample = (RandomOverSampler(sampling_strategy='minority',random_state=756)
                  if random_over_sampler else None)

    for fold,(tr_index, val_index) in enumerate(kfold.split(X, y)):
        X_tr,y_tr = X.iloc[tr_index],y.iloc[tr_index]
        X_val,y_val = X.iloc[val_index],y.iloc[val_index]

        # Oversampling is applied to the training split only.
        if oversample is not None:
            X_tr, y_tr = oversample.fit_resample(X_tr, y_tr)

        print(f'\n ******************* fold: {fold} ********')
        xgb_model = xgb.XGBClassifier(**params)

        sel_features = features_test.copy()
        if is_boruta_sel:
            # Feature selection runs on a 100-estimator model.
            # NOTE(review): this same 100-estimator instance is the one fitted
            # below, so ``n_estimators`` is not applied to the final fold model
            # on this path - confirm that is intended.
            xgb_model = xgb.XGBClassifier(**dict(params, n_estimators=100))

            sel_features=get_boruta_selected_data(X_tr,y_tr,xgb_model,max_iter=max_iter_boruta)
            X_tr = X_tr[sel_features]
            X_val = X_val[sel_features]

        # Fit (again, after any feature selection) with early stopping.
        # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
        # deprecated in newer XGBoost releases - confirm against the installed version.
        xgb_model.fit(X_tr,y_tr,
                     verbose=0,
                     eval_set = [(X_val,y_val)],
                     eval_metric='aucpr',
                      early_stopping_rounds=50)

        sel_features_folds.append(sel_features)
        xgb_models.append(xgb_model)

        if use_best_iteration:
            best_n_limit = xgb_model.get_booster().best_ntree_limit
            val_probs = xgb_model.predict_proba(X_val,
                                               iteration_range=[0,best_n_limit]
                                               )[:,1]
        else:
            val_probs = xgb_model.predict_proba(X_val)[:,1]

        oof_probs[val_index] = val_probs

        _, val_ratio, _, match_pos_arr = get_top_success_count(val_probs,y_val.to_numpy(),
                                                               top_n=top_n,asc=asc)
        # No hit in the ranked list: use a rank one past its end so the fold
        # never looks better than a fold that did have a hit (the former
        # constant 11 was only correct for top_n=10).
        if len(match_pos_arr)==0:
            first_match_pos = top_n + 1
        else:
            first_match_pos = match_pos_arr[0]
        first_match_positions.append(first_match_pos)
        top_ratios.append(val_ratio)

        opt_cutoff, f1score = get_opt_cutoff_prec(y_val,val_probs)
        val_cutoffs.append(opt_cutoff)
        # Express the cutoff as the fraction of validation scores <= cutoff.
        opt_cutoff_perc = stats.percentileofscore(val_probs, opt_cutoff, 'weak') / 100
        val_cutoff_percs.append(opt_cutoff_perc)

        # Share of rows labelled 1 at the cutoff that are truly 1.
        val_labels = convert_probtolabels(val_probs,cutoff=opt_cutoff)
        mask1 = (val_labels==1)
        mask2 = (y_val==1)
        total = len(val_labels[mask1])
        success = len(val_labels[mask1 & mask2])
        ratio = success/total if total!=0 else 0
        ratios.append(ratio)

        # Number of trees actually present in the fitted booster.
        iterations.append(len(xgb_model.get_booster().get_dump()))

    print()
    print('top ratio mean:',np.mean(top_ratios))
    print('first match pos mean:',np.mean(first_match_positions))
    print('ratio mean:',np.mean(ratios))
    mean_cutoff = np.mean(val_cutoffs)
    mean_cutoff_perc= np.mean(val_cutoff_percs)

    print('iterations:',iterations)
    mean_iterations = np.mean(iterations)

    # Out-of-fold performance over the whole data set.
    _, oof_ratio, _, oof_match_pos_arr = get_top_success_count(oof_probs,y.to_numpy(),
                                                               top_n=top_n,asc=asc)

    print('oof match positions:',oof_match_pos_arr)
    print('oof top ratio:',oof_ratio)

    oof_average_precision = average_precision_score(y.to_numpy(),oof_probs)
    print('oof AUC PR Score:',oof_average_precision)

    return xgb_models,top_ratios,mean_iterations,mean_cutoff,mean_cutoff_perc,sel_features_folds,oof_match_pos_arr

def gen_top_mean_data_ml_final(test_combined_imp,features_test,
                        n_estimators =100,random_over_sampler=False,
                        is_boruta_sel=False,max_iter_boruta=100):
    """Fit the finalized XGBoost classifier on all rows (no validation split).

    Parameters
    ----------
    test_combined_imp : pandas.DataFrame
        Data holding the ``features_test`` columns and a ``'target'`` column.
    features_test : list of str
        Feature column names.
    n_estimators : int, default 100
        ``n_estimators`` for the classifier.
    random_over_sampler : bool, default False
        If true, the data is resampled with ``RandomOverSampler`` first.
    is_boruta_sel : bool, default False
        If true, ``get_boruta_selected_data`` picks a feature subset and the
        model is fitted a second time on that subset.
    max_iter_boruta : int, default 100
        ``max_iter`` forwarded to ``get_boruta_selected_data``.

    Returns
    -------
    tuple
        ``(xgb_model, sel_features)``.
    """
    hyper_params = {
        'n_estimators': n_estimators,
        'max_leaves': 25,
        'subsample': 0.8,
        'random_state': 145,
        'learning_rate': 0.05,
        'colsample_bytree': 0.6,
        'lambda': 0.05,
        'alpha': 0.1,
    }

    train_X = test_combined_imp[features_test].reset_index(drop=True)
    train_y = test_combined_imp['target'].reset_index(drop=True)

    sampler = RandomOverSampler(sampling_strategy='minority',random_state=756)
    if random_over_sampler:
        train_X, train_y = sampler.fit_resample(train_X, train_y)

    print(f'\n ******************* Finalized Model ********')
    final_model = xgb.XGBClassifier(**hyper_params)
    final_model.fit(train_X, train_y)

    chosen_features = features_test.copy()
    if not is_boruta_sel:
        return final_model, chosen_features

    # Select features with the already-fitted model, then refit on the subset.
    chosen_features = get_boruta_selected_data(train_X, train_y, final_model,
                                               max_iter=max_iter_boruta)
    train_X = train_X[chosen_features]
    final_model.fit(train_X, train_y)
    return final_model, chosen_features

In [231]:
def filter_pos(oof_first_pos):
    """Map the first out-of-fold match position to a list of accepted positions.

    Positions 1, 2, 4 and 5 each map to a narrower window; every other value
    (3, anything above 5, or anything not equal to one of those) maps to the
    full window ``[1, 2, 3, 4, 5]``.
    """
    full_window = [1,2,3,4,5]
    if oof_first_pos > 5:
        return full_window

    # (position, accepted window) pairs; compared with == in order.
    narrowed = (
        (1, [1,2,3]),
        (2, [1,2,3,4]),
        (4, [2,3,4,5]),
        (5, [3,4,5]),
    )
    for position, window in narrowed:
        if oof_first_pos == position:
            return window
    return full_window
            
def filt_good_rolls(top_test,good_rolls):
    """Return the ``k`` values of rows that hold no column maximum in ``good_rolls``.

    Parameters
    ----------
    top_test : pandas.DataFrame
        Frame with a ``'k'`` column and every column named in ``good_rolls``.
    good_rolls : iterable of str
        Column names to check. A row is dropped when its value equals the
        column maximum (within ``top_test``) for at least one of them.
        An empty iterable drops nothing.

    Returns
    -------
    numpy.ndarray
        ``k`` values of the remaining rows, in ``top_test`` order.
    """
    # Start from "no row is a maximum" so an empty column list keeps every row
    # (the previous ``None`` start value crashed on ``~None``).
    mask_good = pd.Series(False, index=top_test.index)
    for col in good_rolls:
        mask_good = mask_good | (top_test[col]==top_test[col].max())

    return top_test.loc[~mask_good, 'k'].values
def filter_k(test_all_k,test_probs,all_k_imp,top_k_ele_imp,oof_first_pos,
             imp_df,imp_feats_count=1,top_n=5):
    """Narrow the ``top_n`` highest-probability rows down to candidate ``k`` values.

    The rows are ranked by ``test_probs`` and then passed through a chain of
    filters; each later filter only runs while more than one candidate is left.
    Progress is printed after every step.

    Side effect: writes a ``'probs_new'`` column into ``test_all_k``.

    Parameters
    ----------
    test_all_k : pandas.DataFrame
        Candidate rows; must contain ``'k'``, ``'client_seed'``,
        ``'roll_actual'`` and the roll feature columns used below.
    test_probs : array-like
        One score per row of ``test_all_k``.
    all_k_imp, top_k_ele_imp : iterable of iterables
        Nested collections of ``k`` values; flattened before use.
    oof_first_pos : int
        1-based rank of the first out-of-fold match (callers pass
        ``oof_match_pos_arr[0]``); forwarded to ``filter_pos``.
    imp_df : pandas.DataFrame
        Must contain a ``'feature'`` column; names starting with ``'roll_'``
        are used, in the order given.
    imp_feats_count : int, default 1
        How many ``roll_`` features from ``imp_df`` to use.
    top_n : int, default 5
        Number of top-ranked rows to consider.

    Returns
    -------
    tuple
        ``(match_success, matching_ks, top_test)`` - whether any surviving
        ``k`` is flagged by ``create_target_mask``, the set of such ``k``
        values, and the ranked ``top_n`` frame.
    """
    test_all_k['probs_new']=test_probs
    # top_n was previously accepted but ignored (the slice was hard-coded to 5).
    top_test = test_all_k.sort_values('probs_new',ascending=False)[:top_n].reset_index(drop=True)

    roll_imp_feats = [col for col in imp_df['feature'].values if (col.startswith('roll_'))][:imp_feats_count]
    # Roll columns where holding the top-n maximum disqualifies a row.
    good_rolls =   [
                    'roll_25',
                  'roll_25_nonce',
                 ] + roll_imp_feats

    # The first filter uses only the 4th entry (the 2nd important roll feature),
    # which does not exist when fewer than two important features are available.
    initial_rolls = good_rolls[3:4]

    if initial_rolls:
        cur_filt_ks= filt_good_rolls(top_test,initial_rolls)
    else:
        # Nothing to filter on: keep every candidate instead of crashing.
        cur_filt_ks = top_test['k'].values
    print(f'k after good rolls count {len(cur_filt_ks)} values: {cur_filt_ks} feats: {initial_rolls}')

    # Remove k values that appear in top_k_ele_imp.
    if len(cur_filt_ks)>1:
        top_k_ele_flat = [x for sublist in top_k_ele_imp for x in sublist]
        cur_filt_ks = set(cur_filt_ks).difference(set(top_k_ele_flat))
        print(f'k after top k- count {len(cur_filt_ks)} values: {cur_filt_ks}')

    # Keep only the rank positions suggested by the out-of-fold first match.
    if len(cur_filt_ks)>1:
        good_pos_list= filter_pos(oof_first_pos)
        if len(good_pos_list)!=5:
            # filter_pos speaks in 1-based ranks while top_test is indexed from
            # 0, so shift the index before comparing.
            in_window = (top_test.index + 1).isin(good_pos_list)
            good_pos_ks= top_test.loc[in_window, 'k']
            cur_filt_ks = set(cur_filt_ks).intersection(set(good_pos_ks))
            print(f'k after oof pos- count {len(cur_filt_ks)} values: {cur_filt_ks}')

    # Remove the largest k among the top rows.
    if len(cur_filt_ks)>1:
        cur_filt_ks = set(cur_filt_ks).difference({top_test['k'].max()})
        print(f'k after high k- count {len(cur_filt_ks)} values: {cur_filt_ks}')

    # Keep only k values that appear in all_k_imp.
    if len(cur_filt_ks)>1:
        all_k_ele_flat = [x for sublist in all_k_imp for x in sublist]
        cur_filt_ks = set(cur_filt_ks).intersection(set(all_k_ele_flat))
        print(f'k after all k- count {len(cur_filt_ks)} values: {cur_filt_ks}')
        if len(cur_filt_ks)==0:
            print('WARNING: ALL K Filter removed all elements')

    # Apply the remaining roll filters one at a time until at most one k is left.
    for good_roll in [ col for col in good_rolls if col not in initial_rolls]:
        if len(cur_filt_ks)<=1:
            break
        good_filt_ks= filt_good_rolls(top_test,[good_roll])
        cur_filt_ks = set(cur_filt_ks).intersection(set(good_filt_ks))
        print(f'k after good rolls count {len(cur_filt_ks)} values: {cur_filt_ks} feats: {good_roll}')

    # Compare the surviving candidates with the rows create_target_mask flags.
    mask = create_target_mask(top_test)
    success_ks = top_test[mask]['k'].values
    matching_ks = set(cur_filt_ks).intersection(set(success_ks))
    print(f'\nManual Matched Ks: {matching_ks} Actual Success Ks: {success_ks} Filtered List: {cur_filt_ks}')
    match_success = len(matching_ks)!=0

    print(f'Success client seeds: {top_test[mask]["client_seed"].values}')

    print(f'Selected Client Seed(s):{top_test[top_test["k"].isin(cur_filt_ks)]["client_seed"].values}')
    print(f'Selected Roll Actual:{top_test[top_test["k"].isin(cur_filt_ks)]["roll_actual"].values}')

    return match_success,matching_ks,top_test

def filter_k_from_top(test_all_k,test_probs,all_k_imp,top_k_ele_imp,top_n=5):
    """Print diagnostics about the ``k`` values of the ``top_n`` highest-scored rows.

    Reports the ``k`` values in score order, where the rows flagged by
    ``create_target_mask`` fall once the rows are sorted by ``k``, and which of
    the top ``k`` values also occur in ``all_k_imp`` / ``top_k_ele_imp``.

    Side effect: writes a ``'probs_new'`` column into ``test_all_k``.
    Returns ``None``.

    Parameters
    ----------
    test_all_k : pandas.DataFrame
        Candidate rows with a ``'k'`` column.
    test_probs : array-like
        One score per row of ``test_all_k``.
    all_k_imp, top_k_ele_imp : iterable of iterables
        Nested collections of ``k`` values; flattened before use.
    top_n : int, default 5
        Number of top-ranked rows to inspect.
    """
    test_all_k['probs_new']=test_probs
    # top_n was previously accepted but ignored (the slice was hard-coded to 5).
    top_test = test_all_k.sort_values('probs_new',ascending=False)[:top_n]
    k_top_pos = top_test["k"].values

    # Re-rank by k so the index of a row is its rank among the top k values.
    top_test = top_test.sort_values('k',ascending=True).reset_index(drop=True)
    mask = create_target_mask(top_test)
    k_indices = top_test[mask]['k'].index
    k_matches = top_test[mask]["k"].values
    k_top = top_test["k"].values
    print(f'top k and its positions:{k_top_pos} ')
    print(f'k ranks: {k_indices} matching k: {k_matches}  top k:{k_top} ')

    all_k_ele_flat = [x for sublist in all_k_imp for x in sublist]
    top_k_ele_flat = [x for sublist in top_k_ele_imp for x in sublist]

    k_filt_hash = set(k_top).intersection(set(all_k_ele_flat))
    print(f'k existence in all hash imp- count {len(k_filt_hash)} values: {k_filt_hash}')
    k_filt_hash_top = set(k_top).intersection(set(top_k_ele_flat))
    print(f'k existence in top hash imp- count {len(k_filt_hash_top)} values: {k_filt_hash_top}')
    
def check_roll_exist_top(test_all_k,test_probs,imp_df,imp_feats_count=1,top_n=5):
    """Report which roll features peak on a matching row within the top ``top_n``.

    A "collision" is recorded for a feature when at least one row flagged by
    ``create_target_mask`` holds that feature's maximum among the ``top_n``
    highest-scored rows.

    Side effect: writes a ``'probs_new'`` column into ``test_all_k``.

    Parameters
    ----------
    test_all_k : pandas.DataFrame
        Candidate rows with a ``'k'`` column and the roll/count columns below.
    test_probs : array-like
        One score per row of ``test_all_k``.
    imp_df : pandas.DataFrame
        Must contain a ``'feature'`` column; names starting with ``'roll_'``
        are used, in the order given.
    imp_feats_count : int, default 1
        How many ``roll_`` features from ``imp_df`` to check.
    top_n : int, default 5
        Number of top-ranked rows to inspect.

    Returns
    -------
    tuple
        ``(high_collision, high_collision_imp)`` - colliding fixed feature
        names, and ``'imp_feat_<i>'`` labels for colliding important features.
    """
    test_all_k['probs_new']=test_probs
    # top_n was previously accepted but ignored (the slice was hard-coded to 5).
    top_test = test_all_k.sort_values('probs_new',ascending=False)[:top_n]
    # 'roll_mean_25_50_nonce' used to be listed twice, which counted a single
    # collision on that column twice.
    roll_feats = ['roll_0','roll_1','roll_25','roll_50',
                  'roll_0_nonce','roll_1_nonce','roll_25_nonce','roll_50_nonce',
                  'roll_mean_25_50','roll_mean_25_50_nonce',
                  'roll_mean_nonce','roll_mean',
                 'count_hi', 'count_hi_1_60','count_hi_nonce', 'count_hi_1_60_nonce']

    roll_imp_feats = [col for col in imp_df['feature'].values if (col.startswith('roll_'))][:imp_feats_count]

    mask = create_target_mask(top_test)
    top_match = top_test[mask]
    high_collision=[]
    high_collision_imp=[]

    if len(top_match)!=0:
        for col in roll_feats:
            if (top_match[col]==top_test[col].max()).any():
                high_collision.append(col)
        for i,col in enumerate(roll_imp_feats):
            if (top_match[col]==top_test[col].max()).any():
                high_collision_imp.append(f'imp_feat_{i+1}')

    # k values of rows that hold no maximum on the "good" roll columns.
    # Slicing (instead of [0]) tolerates an empty important-feature list.
    good_rolls = ['roll_25','roll_25_nonce'] + roll_imp_feats[:1]
    high_roll_ks = filt_good_rolls(top_test,good_rolls)
    print(f'high_roll_ks:{high_roll_ks}')

    print(f'High Collisions Count: {len(high_collision)} Values :{high_collision}')
    print(f'High Collisions Imp Count: {len(high_collision_imp)} Values :{high_collision_imp} ')
    print(f'All Imp Feats: {roll_imp_feats} ')

    return high_collision,high_collision_imp

In [242]:
%%time
# Run configuration read by the training / evaluation cell that follows.

# Prediction and feature-construction switches.
use_best_iteration=False
is_rank=True
is_rank_pct=False
is_individual_feats = True

# Model persistence: dump the fitted models and/or load a previous dump.
save_models=True
load_models=False
        
# NOTE: cur_feats_count is reassigned to 140 at the top of the next cell.
cur_feats_count = 2
top_order_asc = False
# Length of the ranked list inspected for matches.
top_n=100
random_over_sampler=False
n_splits=5

# Boruta feature selection (off by default).
is_boruta_sel=False
max_iter_boruta=25

# Only referenced by the commented-out random_experiment(...) calls.
rand_count_fetch=10
rand_iterations=1000

# Tag embedded in the model dump file name (k_models_<nonce>_<option>_...).
if is_individual_feats:
    option = 'indl'
elif is_rank:
    option = 'rank'
else:
    option = 'wo_rank'
    
if is_rank_pct:
    option += '_pct'

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.34 µs


In [243]:
# Train (or load) the k-fold models for the current nonce, score every
# candidate row with the fold ensemble, then fit (or load) the finalized model
# and score again.
#
# Relies on notebook state from earlier cells: nonce, cur_nonce,
# file_pattern_str, test_all_k_imp, test_all_k, df, features_test, all_k,
# top_k_ele, all_k_imp, top_k_ele_imp, imp_df and the run-configuration flags.
# (Earlier drafts unpacked these from a per-nonce tuple:
#  test_all_k_imp,test_all_k,df,features_test,all_k,top_k_ele,all_k_imp,top_k_ele_imp,imp_df = k_obj)
print(f'\n****************** NONCE {nonce} initial success {len(top_k_ele)} out of {len(all_k)} ****************** ')

cur_feats_count=140

if is_individual_feats:
    # Keep what get_sim_summary returns: rebinding the loop variable, as the
    # earlier loop did, silently dropped the returned frames.
    test_all_k_imp = [get_sim_summary(test_hash_imp,df,
                                      True,cur_feats_count,is_rank_pct=is_rank_pct)
                      for test_hash_imp in test_all_k_imp]

# Label the evaluation rows.
mask = create_target_mask(test_all_k)

test_all_k['target']=0
test_all_k.loc[mask,'target']=1

y_test = test_all_k['target']

# Label the combined training rows.
test_combined_imp=pd.concat(test_all_k_imp,axis=0)
mask = create_target_mask(test_combined_imp)
test_combined_imp['target']=0
test_combined_imp.loc[mask,'target']=1

print(len(test_all_k))
print(len(test_combined_imp))

if not is_individual_feats:
    test_combined_imp= get_sim_summary(test_combined_imp,df,
                                         True,cur_feats_count,is_rank_pct=is_rank_pct)

cur_features = features_test.copy()
if is_rank:
    cur_features += ['ratio_mean_rank','roll_mean_rank','ratio_group_roll_rank']


if load_models:
    # Dump layout: [final model, fold models, [oof match positions, mean iterations]].
    model_set = joblib.load(f'data/models/k_models_{cur_nonce}_{option}_{file_pattern_str}_pattern.dump')
    cur_models = model_set[1]
    oof_match_pos_arr,mean_iterations=model_set[2]
    print(f'OOF Match Positions:{oof_match_pos_arr}')
else:
    cur_models,top_ratios,mean_iterations,mean_cutoff,mean_cutoff_perc,sel_features_folds,oof_match_pos_arr = \
                                                gen_top_mean_data_ml(test_combined_imp,
                                                                       cur_features,
                                                                        n_estimators=1000,
                                                                         asc=top_order_asc,top_n=top_n,
                                                                        random_over_sampler=random_over_sampler,
                                                                    use_best_iteration=use_best_iteration,
                                                                    n_splits=n_splits,
                                                                    is_boruta_sel=is_boruta_sel,
                                                                     max_iter_boruta=max_iter_boruta)


# ---- Fold-ensemble evaluation -------------------------------------------
test_all_k= get_sim_summary(test_all_k,df,
                            True,cur_feats_count,is_rank_pct=is_rank_pct)

# Average the positive-class probability over the fold models.
test_probs= np.zeros(len(test_all_k))
for model in cur_models:
    test_probs += (model.predict_proba(test_all_k[cur_features])[:,1])
test_probs = test_probs / len(cur_models)

average_precision = average_precision_score(test_all_k['target'], test_probs)
print(f'AUC PR Score:{average_precision}')

#         random_experiment(test_all_k,test_probs,count_fetch=rand_count_fetch, iterations=rand_iterations)

test_success, test_ratio,test_top_prob_ind,match_pos_arr=get_top_success_count(test_probs,y_test.to_numpy(),
                                                                  top_n=top_n,asc=top_order_asc)
print(f'total:{top_n} success:{test_success} top ratio:{test_ratio} match positions: {match_pos_arr}')

# With no out-of-fold hit there is no first position; infinity makes
# filter_pos fall back to its full default window instead of raising here.
oof_first_pos = oof_match_pos_arr[0] if len(oof_match_pos_arr) else float('inf')

filter_success,filter_ks,top_test = filter_k(test_all_k,test_probs,all_k_imp,top_k_ele_imp,oof_first_pos,
         imp_df,imp_feats_count=10,top_n=5)

filter_k_from_top(test_all_k,test_probs,all_k_imp,top_k_ele_imp,top_n=5)
high_collision_result = check_roll_exist_top(test_all_k,test_probs,imp_df,imp_feats_count=5,top_n=5)

# ---- Finalized model ----------------------------------------------------

print(f'mean iterations:{mean_iterations}')
if use_best_iteration:
    n_estimators = 100 #int(np.mean(best_iterations))
else:
    # Size the final model by the mean tree count of the fold models.
    n_estimators = int(mean_iterations)

print('\n------------------ Finalized Model ---------------------')
if load_models:
    model_final = model_set[0]
else:
    print(f'*************** First Run of final model with kfold mean iterations {mean_iterations} ***************')
    model_final,sel_features_final = gen_top_mean_data_ml_final(test_combined_imp,cur_features,
                            n_estimators = n_estimators,random_over_sampler=random_over_sampler,
                            is_boruta_sel=is_boruta_sel,
                             max_iter_boruta=max_iter_boruta)
if save_models:
    joblib.dump([model_final,cur_models,[oof_match_pos_arr,mean_iterations]],f'data/models/k_models_{cur_nonce}_{option}_{file_pattern_str}_pattern.dump')
test_probs = (model_final.predict_proba(test_all_k[cur_features])[:,1])

average_precision = average_precision_score(test_all_k['target'], test_probs)
print(f'AUC PR Score:{average_precision}')
#         random_experiment(test_all_k,test_probs,count_fetch=rand_count_fetch, iterations=rand_iterations)

test_success, test_ratio,test_top_prob_ind,match_pos_arr=get_top_success_count(test_probs,y_test.to_numpy(),
                                                                  top_n=top_n,asc=top_order_asc)
print(f'First Run Final total:{top_n} success:{test_success} top ratio:{test_ratio} match positions: {match_pos_arr}')

filter_success,filter_ks,top_test_final = filter_k(test_all_k,test_probs,all_k_imp,top_k_ele_imp,oof_first_pos,
         imp_df,imp_feats_count=10,top_n=5)

filter_k_from_top(test_all_k,test_probs,all_k_imp,top_k_ele_imp,top_n=5)
high_collision_result = check_roll_exist_top(test_all_k,test_probs,imp_df,imp_feats_count=5,top_n=5)



****************** NONCE 2722 initial success 171 out of 1643 ****************** 
1643
8341

 ******************* fold: 0 ********

 ******************* fold: 1 ********

 ******************* fold: 2 ********

 ******************* fold: 3 ********

 ******************* fold: 4 ********

top ratio mean: 0.14400000000000002
first match pos mean: 5.2
ratio mean: 0.10038558965467774
iterations: [87, 56, 99, 52, 83]
oof match positions: [14 25 27 28 33 50 55 60 69 87 88 92 97]
oof top ratio: 0.13
oof AUC PR Score: 0.09953118611988376
AUC PR Score:0.10092911487577305
total:100 success:11 top ratio:0.11 match positions: [15 18 34 39 43 45 57 66 69 84 91]
k after good rolls count 4 values: [5017 2435 4686 3949] feats: ['roll_50_nonce']
k after top k- count 3 values: {5017, 2435, 3949}
k after high k- count 2 values: {2435, 3949}
k after all k- count 2 values: {2435, 3949}
k after good rolls count 2 values: {2435, 3949} feats: roll_25
k after good rolls count 2 values: {2435, 3949} feats: roll

In [244]:
# Display the finalized model's top rows with the identifying columns first.
key_cols = ['roll_actual','k','roll_0','roll_1','probs_new','probs','client_seed']
# Take the remaining columns from the frame that is displayed (the column list
# used to come from top_test while top_test_final was shown).
other_cols = [col for col in top_test_final.columns if col not in key_cols]
top_test_final[key_cols+other_cols]

Unnamed: 0,roll_actual,k,roll_0,roll_1,probs_new,probs,client_seed,seed,hash,client_index,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce
,roll_11_nonce,roll_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_
nonce,count_lt_1000_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce,target,ratio_mean_summary_init,roll_mean_summary_init,ratio_mean_rank_init,roll_mean_rank_init,ratio_mean_summary,ratio_group_roll_rank_init,ratio_group_roll_rank_desc_init,ratio_mean_rank,roll_mean_rank,ratio_group_roll_rank,ratio_group_roll_rank_desc,probs_rank,ratio_group_probs_rank
0,1679,5633,6002,5247,0.339596,0.367264,woxpwoxpwoxpwoxpwoxpc7c274ba940b8668ce25adfbf658f9ebb8b889248136f9b9151eeb627f41e987,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,601332,2999,6649,1853,3328,3734,9583,5849,2710,9512,2604,9608,4377,415,7476,1976,219,4049,3406,1941,6149,9635,3700,6242,8342,3623,2221,1743,9915,4712,57,3368,2645,8,9340,8621,9651,2805,1105,1417,2203,197,9548,8478,4514,5880,7715,2977,7945,1648,4530,6001,194,8582,1584,8172,482,6190,4140,8412,2,3,0,1,1,2,2,3,3,3,6,13,19,0,0,12,5,4710.002485,2901.587113,3,4,3,2,0,1,2,2,5,4,10,16,26,1,0,8,5,4641.461538,3419.465949,2,0,3,0,2,0,2,0,0,2,5,6,11,0,1,2,5,4539.545455,3193.67141,7,8,6,3,5,3,6,6,8,8,24,35,59,0,0,24,17,4703.766667,3127.941725,6002,2390,1297,2054,5235,2166,1722,255,4034,4755,1424,209,5586,6713,3151,6821,1987,3687,1633,6559,4653,5746,3469,4507,8334,2447,5229,9483,7649,47,1354,8746,1592,3925,5846,8304,9389,9778,8209,3135,7930,9321,8523,6645,9307,27,339,6273,8452,2017,5527,9800,9277,946,3744,2476,9661,3742,4437,931,9977,2,0,0,0,3,2,3,2,3,5,4,14,18,0,0,5,4,3316.55,2118.927223,3,5,5,2,2,3,0,2,2,2,16,9,25,1,1,13,5,5749.769231,3354.339021,2,4,0,0,0,1,1,2,1,0,5,6,11,2,1,7,2,5501.636364,3580.18436,7,9,6,2,5,6,5,7,6,7,26,31,57,0,0,26,12,4881.2,3140.691955,0,0.101449,4707.604986,1142.0,376.0,0.101449,44.0,100.0,1142.0,387.0,45.0,99.0,1490.0,125.0
1,7149,1191,1003,3363,0.244787,0.495171,woxpwoxpwoxpwoxpwoxpe0dd60b9e2b776ea7b97ffaa081b64231cd7bad9a601c0a22badb843477486b4,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,1595,1694,9997,7638,6963,158,937,4465,2493,7828,6602,4744,1803,8735,6653,1416,5966,9542,3681,5730,2513,945,8550,981,46,4170,6907,6599,2601,693,7039,9207,4011,7271,571,6094,3674,3079,8680,7237,399,3960,3782,6866,3737,3892,6885,7233,6475,8218,1667,7656,1126,246,8742,5016,2879,6646,9304,5550,2,2,1,2,3,2,2,2,1,3,10,10,20,1,0,9,3,4797.654817,2992.282308,4,1,2,4,6,0,2,6,1,0,13,13,26,0,2,12,10,4974.076923,2680.463511,1,1,2,1,1,2,0,0,1,2,6,4,10,0,1,2,3,5186.363636,3252.42661,9,4,5,7,10,4,4,8,4,5,29,30,59,1,1,25,17,4859.25,2898.781033,1003,2072,9757,2998,8276,7451,6634,1343,13,8347,8528,6730,9724,5276,9050,2762,6865,9683,1170,6293,9259,5072,4295,9080,6899,4119,3462,5534,4503,3590,4361,2560,1815,1549,3228,513,9893,9856,241,7354,913,772,961,2466,6804,9552,9249,1293,9854,3611,1471,6953,4561,7015,6856,5810,1684,3326,1228,3872,6630,1,5,3,1,4,1,0,0,3,2,14,6,20,1,0,9,5,6111.55,3232.131877,5,5,0,1,1,1,3,4,2,4,8,18,26,0,0,13,4,4212.461538,3273.93459,0,0,0,1,3,1,1,2,0,3,5,6,11,0,0,1,3,4491.454545,2318.025641,6,11,3,3,9,4,5,6,5,8,29,30,59,1,0,27,15,5083.933333,3112.97194,0,0.108696,4799.257301,1285.0,608.0,0.108696,75.0,39.0,1285.0,604.0,76.0,38.0,157.0,20.0
2,1,6401,9006,4184,0.241391,0.347595,woxpwoxpwoxpwoxpwoxp0bc2d4eef1060b61bfac4d8bc41aa407f998089abe5ac8c626934499b3020eef,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,112272,8287,7555,6596,5490,9574,2137,1475,786,3264,1177,7338,3202,6991,9530,9552,3703,9018,4933,8087,1938,6272,7346,9949,1044,2852,7780,9262,5876,324,675,3195,5049,2750,2261,955,5291,9841,6823,3129,762,12,9645,6145,203,9256,4397,5008,2806,5591,5323,8894,5119,7855,3967,9007,9732,6924,4044,9214,1,4,2,2,2,1,2,3,1,2,11,8,19,0,0,9,6,5277.128204,2997.263216,6,4,0,1,2,5,1,2,4,1,10,14,24,1,1,11,8,4266.615385,3152.274481,0,3,1,1,1,3,1,1,0,0,8,2,10,0,0,8,3,6879.090909,2164.016934,7,12,3,5,6,8,4,6,5,4,31,25,56,1,0,31,17,5323.25,3084.759011,9006,5192,8250,3366,2412,9998,9866,5183,6448,8270,9255,9139,3851,5528,2839,5154,7225,7682,703,6006,6454,6262,6379,7463,6228,5507,9549,7916,4095,5971,6955,9083,1584,99,7706,5894,4368,8903,789,5454,8946,8597,4628,7787,3311,9726,5851,5387,8874,554,1086,2453,526,4824,441,2034,1181,279,4376,8562,4111,1,4,2,2,3,4,0,2,2,0,12,5,17,1,0,9,5,6141.05,2622.486696,3,3,4,3,1,6,3,1,0,2,17,9,26,1,0,15,6,5716.153846,3022.095521,3,0,1,0,0,0,3,0,2,2,1,9,10,0,2,1,2,2715.727273,2545.174182,7,7,7,6,7,10,6,3,4,3,34,22,56,2,0,26,12,5442.666667,2924.300772,1,0.101449,5279.767209,1142.0,1531.0,0.101449,140.0,4.0,1142.0,1528.0,140.0,4.0,1566.0,136.0
3,9012,3262,6001,1301,0.236895,0.429942,woxpwoxpwoxpwoxpwoxpf9b6f9e688ad802b4ffa6f7e7b14b9add59eebdcb6517194366d1cd9e6c1b35e,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,830798,1261,1221,8966,3796,6435,2803,5464,5661,9897,7528,5990,3182,1474,7705,8986,536,7249,1325,1400,4101,4921,7883,6888,8861,8079,1273,621,900,5665,2891,1764,9416,5803,6343,9493,4816,3043,8370,9255,2235,5222,1293,4523,8838,9395,9047,9650,3303,8413,9966,2948,1000,7547,3929,7856,9713,3259,9311,9900,1,1,2,3,1,3,0,2,1,6,10,10,20,0,0,8,6,4900.054843,3123.328201,2,6,5,0,1,3,2,2,2,3,14,10,24,0,2,12,9,5712.0,3225.811067,0,4,1,2,0,0,0,2,1,1,7,4,11,1,0,10,5,6712.909091,3284.432263,3,11,7,6,3,6,4,6,4,10,32,25,57,0,0,31,21,5498.566667,3156.941667,6001,9631,3229,6502,525,3623,8046,7375,3997,5237,7243,8705,3822,4776,4409,2397,4193,2567,330,5455,5653,53,531,9106,3096,472,2115,928,4941,9486,1996,2679,6840,3896,3918,2602,4194,5255,2148,3857,5206,3124,767,259,7074,2458,8978,453,4392,3252,1247,8108,3956,6670,2618,6302,8490,6844,9073,8203,6004,2,1,2,2,1,3,3,4,2,0,8,10,18,1,1,7,5,4885.75,2521.633908,5,1,1,1,1,2,3,5,5,2,5,19,24,1,2,12,4,3559.115385,2501.015127,0,1,3,0,4,0,0,1,1,1,8,3,11,0,0,3,8,6137.727273,2533.593696,9,4,6,3,6,5,6,11,8,2,22,34,56,1,1,24,19,4488.433333,2746.349703,0,0.086957,4897.960923,760.0,847.0,0.086957,103.0,97.0,760.0,857.0,104.0,96.0,811.0,103.0
4,4113,2373,2003,2742,0.228301,0.455841,woxpwoxpwoxpwoxpwoxpde121cc6063a327f018318acf2381b6c7600e345105993d050070e64a3100723,1c5ea7091200ef097e44fa32587a8d6119374dce2cf9763ff9ed568d7ac7c956,fef10d00fcd97a523a39169b7c5f6fadaf15f063acdbc31b4ae0ff7126a973d6,574519,12,7472,7312,8663,2309,2384,6371,846,5954,6236,4055,8007,5999,7994,9768,2281,1515,7054,2012,9151,7536,3898,6246,6781,8501,3353,3256,4029,8463,8449,3908,5837,8587,7834,774,864,3490,7613,11,5836,6898,8106,7921,1813,2101,7049,136,3491,6777,3517,9059,9738,4297,315,3245,8497,2682,90,5352,2,1,2,4,2,2,1,0,5,1,11,9,20,0,1,8,6,4758.791329,2953.813612,4,0,5,4,3,2,1,5,1,1,14,12,26,0,0,8,7,5072.230769,2911.814847,2,2,1,0,1,1,1,2,1,0,5,6,11,2,1,6,2,4869.909091,3338.451421,8,4,8,9,6,5,3,8,7,2,32,28,60,0,1,24,15,5074.783333,2947.286768,2003,6529,3108,8624,200,619,9894,9034,8838,9835,3943,3310,7594,3232,680,9479,6652,5985,3984,9604,5561,5311,4435,2791,9716,3305,595,1302,1813,8867,3836,2789,9797,2293,3910,8460,9631,2525,9193,421,1683,4901,7595,2120,1058,9373,8706,4288,2279,3385,4859,265,4592,932,4706,182,5390,2545,4872,1658,3349,3,5,2,1,2,2,0,5,0,0,12,8,20,0,2,13,7,5835.25,3280.388486,2,4,3,1,0,0,3,4,5,4,8,16,24,0,1,14,8,4576.307692,3207.706368,3,0,0,0,0,1,4,1,1,1,1,8,9,0,3,5,2,3031.818182,1998.346607,8,10,5,2,2,4,7,10,7,5,23,34,57,1,2,33,17,4773.883333,3155.051044,0,0.115942,4760.161895,1398.0,487.0,0.115942,49.0,45.0,1398.0,492.0,49.0,45.0,469.0,27.0


END

In [107]:
def create_cutoff_pos_df(test,models,features_test,test_hash_cutoff_df,
                         imp_feats_count_list,test_filt_all_k,
                         gen_probs=True,is_compute_matches=True,
                         test_ratio_limit=0.12,sim_cutoff_dfs=None,
                         low_high_record_condns=[(True,12,10),(False,10,10),(False,2,10)],
                         ratio_mean_asc=False,roll_mean_asc=True,
                         top_k_ele=None,is_print=False):
    """Summarise the best-ranked k values for every feature count and condition.

    For each entry of ``imp_feats_count_list`` and each condition tuple the
    similarity summary returned by ``get_sim_summary`` is sorted by
    ``ratio_mean`` then ``roll_mean`` and truncated to the condition's
    ``top_n``. One row describing the first five entries is appended to the
    returned frame.

    Parameters
    ----------
    test, models, features_test
        Only forwarded to ``generate_probs`` when ``sim_cutoff_dfs`` is given
        and ``gen_probs`` is true.
    test_hash_cutoff_df
        Frame with a ``ratio`` column; forwarded to ``get_sim_summary`` and
        used to derive ``max_good_k``.
    imp_feats_count_list
        Feature counts to evaluate.
    test_filt_all_k
        Forwarded unchanged to ``get_sim_summary``.
    gen_probs
        See ``test``/``models`` above.
    is_compute_matches
        When true, fill the ``match_*`` columns; otherwise they stay NaN.
    test_ratio_limit
        Threshold on ``ratio`` used to derive ``max_good_k``.
    sim_cutoff_dfs
        Returned unchanged when given; an empty list is returned when None.
    low_high_record_condns
        Iterable of ``(is_min_total, total_cri, top_n)`` tuples. The first two
        values are recorded as ``condn_max``/``condn_val``; the third is the
        number of sorted rows kept.
    ratio_mean_asc, roll_mean_asc
        Sort directions for ``ratio_mean`` and ``roll_mean``.
    top_k_ele
        Optional collection of k values; when given, the matching rows and
        their 1-based positions are printed.
    is_print
        Enables progress printing and is forwarded to ``get_sim_summary``.

    Returns
    -------
    tuple
        ``(cutoff_pos_df, sim_cutoff_dfs, top_mean_data_list)`` where
        ``top_mean_data_list`` holds one sorted/truncated frame per
        (feature count, condition) pair, in iteration order.

    Raises
    ------
    IndexError
        If no row of ``test_hash_cutoff_df`` has ``ratio < test_ratio_limit``.
    """
    top_rows = 5  # number of leading rows reported as k_top_*/mean_top_*

    cutoff_pos_df = pd.DataFrame(columns=['imp_feats_count','condn_max','condn_val',
                                          'k_top_1','k_top_2','k_top_3','k_top_4','k_top_5',
                                          'mean_top_1','mean_top_2','mean_top_3','mean_top_4',
                                          'mean_top_5','match_pos_1',
                                          'match_total_top_5','match_total_top_10','match_ks_top_5'])
    match_cols = [col for col in cutoff_pos_df if col.startswith('match')]

    # Index label just before the first row whose ratio drops below the limit.
    max_good_k = test_hash_cutoff_df[test_hash_cutoff_df['ratio']<test_ratio_limit].index[0]-1
    print(f'{max_good_k=}')
    is_imp_feats = True

    if sim_cutoff_dfs is None:
        sim_cutoff_dfs = []
    elif gen_probs:
        # NOTE(review): the generated probabilities are not consumed below; the
        # call is kept only in case generate_probs has side effects. It now uses
        # the `models` argument instead of the notebook-global `xgb_models`.
        generate_probs(True,models,test,features_test)

    top_mean_data_list = []
    for imp_feats_count in imp_feats_count_list:
        if is_print:
            print()
            print(f'********************** Feature count {imp_feats_count} *****************')

        for condn in low_high_record_condns:
            sim_summary = get_sim_summary(test_filt_all_k,test_hash_cutoff_df,
                                          is_imp_feats,imp_feats_count,is_print=is_print)
            # Truncate to this condition's own top_n (condn[2]) rather than
            # relying on a global variable named `top_n`.
            top_mean_data = sim_summary.sort_values(
                ['ratio_mean','roll_mean'],
                ascending=[ratio_mean_asc,roll_mean_asc])[:condn[2]].reset_index(drop=True)
            if is_print:
                print('--- Top 5 data ---')
                print(top_mean_data[:5])
            top_mean_data_list.append(top_mean_data)

            if top_k_ele is not None:
                print(f'\n******success data *****')
                mask = top_mean_data['k'].isin(top_k_ele)
                print(top_mean_data[mask])
                print(f'\n******success positions *****')
                cur_positions = list(top_mean_data[mask].index+1)
                print(cur_positions)

            top_5_mean_data = top_mean_data[:top_rows]
            k_tops = list(top_5_mean_data['k'].values)
            mean_tops = list(top_5_mean_data['ratio_mean'].values)
            # Pad so the row always matches the fixed column layout, even when
            # fewer than five rows survived the truncation.
            k_tops += [np.nan]*(top_rows-len(k_tops))
            mean_tops += [np.nan]*(top_rows-len(mean_tops))

            match_pos_1 = np.nan
            match_total_top_5 = np.nan
            match_total_top_10 = np.nan
            match_ks_top_5 = np.nan

            if is_compute_matches:
                # Adds the column in place, so the frame already stored in
                # top_mean_data_list carries it too.
                top_mean_data['rank'] = top_mean_data['ratio_mean'].rank(method='average',
                                                                         ascending=False)
                top_5_mean_data = top_mean_data[:top_rows]

                # A "match" is a row with k <= max_good_k and a positive ratio_mean.
                matches_top_5 = top_5_mean_data[top_5_mean_data['k']<=max_good_k]
                matches_top_5 = matches_top_5[matches_top_5['ratio_mean']>0]
                match_total_top_5 = len(matches_top_5)

                matches_top = top_mean_data[top_mean_data['k']<=max_good_k]
                matches_top = matches_top[matches_top['ratio_mean']>0]
                match_total_top_10 = len(matches_top)

                if match_total_top_5==0:
                    match_ks_top_5 = []
                    if match_total_top_10==0:
                        # Sentinel used when nothing matched at all
                        # (presumably "one past a top-10" - TODO confirm).
                        match_pos_1 = 11
                    else:
                        match_pos_1 = matches_top.iloc[0]['rank']
                else:
                    match_pos_1 = matches_top_5.iloc[0]['rank']
                    match_ks_top_5 = list(matches_top_5['k'].values)

            result = [imp_feats_count,condn[0],condn[1]]+k_tops+mean_tops \
                    +[match_pos_1,match_total_top_5,match_total_top_10,match_ks_top_5]

            next_idx = len(cutoff_pos_df.index)
            cutoff_pos_df.loc[next_idx] = result

            if is_compute_matches:
                print(cutoff_pos_df.loc[next_idx][match_cols])

    return cutoff_pos_df,sim_cutoff_dfs,top_mean_data_list


In [108]:

def get_test_sel_cutoff_pos(test_probs,models,features_test,test_hash_cutoff_df,
                            cutoff_pos_df_hash,
                            is_min_total=False,total_cri=2,top_n=10,
                            sim_cutoff_dfs=None,imp_feats_count_list=None):
    """Pick k values from the best feature count and select the matching test rows.

    The feature count with the lowest mean ``match_pos_1`` (ties broken by the
    highest mean ``match_total_top_5``) among rows with ``condn_max == False``
    is chosen. The top data for that feature count is then ranked by
    ``ratio_mean`` and every k whose rank is within the predicted position and
    whose ``ratio_mean`` is positive is selected.

    Parameters
    ----------
    test_probs
        Frame with a ``probs`` column; rows are selected from it per k.
    models, features_test
        Forwarded to ``get_sim_cutoff_data`` / ``get_hash_prob_top_data``.
    test_hash_cutoff_df
        Frame indexed by k with ``cutoff`` and ``cutoff_2`` columns.
    cutoff_pos_df_hash
        Frame with ``imp_feats_count``, ``condn_max`` and ``match_*`` columns.
    is_min_total
        NOTE(review): currently ignored - the body forces it to False, as the
        original did. Confirm whether the argument should be honoured.
    total_cri, top_n
        Forwarded to ``get_hash_prob_top_data``.
    sim_cutoff_dfs, imp_feats_count_list
        Optional precomputed frames and the feature counts they correspond to
        (same order). When ``sim_cutoff_dfs`` is None the frame is computed
        with ``get_sim_cutoff_data``.

    Returns
    -------
    tuple
        ``(k_sel_list, test_sel, top_mean_sel)``; ``test_sel`` is None when no
        k was selected.
    """
    match_cols = ['match_pos_1','match_total_top_5','match_total_top_10']

    # Select the columns with a list: tuple-style selection on a groupby is
    # deprecated and no longer accepted by pandas 2.
    cutoff_pos_stat = cutoff_pos_df_hash.groupby(['imp_feats_count'])[match_cols]\
                                        .mean().reset_index()
    print(' ........... Overall Positions of Features .........')
    print(cutoff_pos_stat)

    cutoff_pos_df_hash_filt = cutoff_pos_df_hash[cutoff_pos_df_hash['condn_max']==False]
    cutoff_pos_stat = cutoff_pos_df_hash_filt.groupby(['imp_feats_count'])[match_cols]\
                                             .mean().reset_index()
    print(' ........... Low Positions of Features .........')
    print(cutoff_pos_stat)

    cutoff_pos_best = cutoff_pos_stat.sort_values(['match_pos_1','match_total_top_5'],
                                                  ascending=[True,False])\
                                     .reset_index().iloc[0]
    print('*** Best cutoff position in hash **** ')
    print(cutoff_pos_best)

    predicted_raw_pos = cutoff_pos_best['match_pos_1']
    predicted_max_pos = math.ceil(predicted_raw_pos)
    # An exactly integral mean position gets one extra position of slack.
    if predicted_raw_pos==predicted_max_pos:
        predicted_max_pos += 1

    is_imp_feats = True
    imp_feats_count = cutoff_pos_best['imp_feats_count']
    is_min_total = False  # forced, see docstring note

    if sim_cutoff_dfs is None:
        sim_cutoff_df,test_probs = get_sim_cutoff_data(test_probs,models,features_test,
                                                       test_hash_cutoff_df,
                                                       is_imp_feats,imp_feats_count,
                                                       gen_probs=False)
    else:
        # Reuse the buffered frame that belongs to the chosen feature count.
        idx = imp_feats_count_list.index(imp_feats_count)
        sim_cutoff_df = sim_cutoff_dfs[idx]

    top_mean_data = get_hash_prob_top_data(test_probs,models,features_test,
                                           test_hash_cutoff_df,is_imp_feats,imp_feats_count,
                                           sim_cutoff_df=sim_cutoff_df,
                                           is_min_total=is_min_total,total_cri=total_cri,
                                           top_n=top_n)

    top_mean_data['rank'] = top_mean_data['ratio_mean'].rank(method='min',ascending=False)
    mask = (top_mean_data['rank']<=predicted_max_pos) & (top_mean_data['ratio_mean']>0)
    top_mean_sel = top_mean_data[mask]
    k_sel_list = list(top_mean_sel['k'].values)
    print(top_mean_data)
    print(top_mean_sel)
    print(f'{k_sel_list=}')

    # Collect the per-k selections and concatenate once at the end.
    selections = []
    selected_rows = 0
    for k_sel in k_sel_list:
        row = test_hash_cutoff_df.loc[k_sel]
        in_band = (test_probs['probs']>=row['cutoff']) & (test_probs['probs']<=row['cutoff_2'])
        # assign() returns a new frame, so no column is written into a slice.
        cur_test_sel = test_probs[in_band].assign(k=k_sel)
        selections.append(cur_test_sel)
        selected_rows += len(cur_test_sel)
        print('Test Size:',selected_rows)

    test_sel = pd.concat(selections,axis=0) if selections else None

    return k_sel_list,test_sel,top_mean_sel



In [124]:
# Number of column slabs in the k-order arrays: one per
# (ratio_mean order, roll_mean order) combination.
opt_count = 4

# Row layout of the k-order arrays. Rows from IMP_FEATS_IDX onward hold one
# row per selected feature count.
(HASH_IDX,
 RATIO_MEAN_ASC_IDX,
 ROLL_MEAN_ASC_IDX,
 K_IDX,
 IMP_FEATS_IDX) = range(5)

In [180]:
def create_k_order(end,top_n,hash_idx,ratio_mean_order,roll_mean_order,
                  all_ks,imp_feats_count_list_norm_idx,imp_feats_count_list_norm_arr,
                   top_mean_actual,k_order_arr):
    """Fill the next ``top_n`` columns of ``k_order_arr`` and return the new end.

    The columns ``[end, end + top_n)`` receive the hash index, both sort-order
    flags and the sorted k values. For every selected feature count one further
    row receives, per k (in ascending k order), the 1-based index label that k
    has in the corresponding frame of ``top_mean_actual``.

    ``k_order_arr`` is modified in place and also returned, together with the
    first column that is still unused. ``imp_feats_count_list_norm_arr`` is
    accepted but not read.
    """
    first_col = end
    next_free_col = first_col + top_n
    cols = slice(first_col, next_free_col)

    # Rows that are constant across the slab, plus the sorted k values.
    k_order_arr[HASH_IDX, cols] = hash_idx
    k_order_arr[RATIO_MEAN_ASC_IDX, cols] = ratio_mean_order
    k_order_arr[ROLL_MEAN_ASC_IDX, cols] = roll_mean_order
    k_order_arr[K_IDX, cols] = np.sort(all_ks)

    # One row per selected feature count: positions of the k values in that
    # feature count's frame, aligned with the sorted k row above.
    for row_offset, frame_pos in enumerate(imp_feats_count_list_norm_idx):
        frame_by_k = top_mean_actual[frame_pos].sort_values('k')
        positions = list(frame_by_k.index + 1)
        k_order_arr[IMP_FEATS_IDX + row_offset, cols] = positions

    return k_order_arr, next_free_col

In [181]:
# Feature counts to evaluate: every value from 2 through 150.
# Earlier runs used sparser grids, e.g.
#   [2,5,10,15,20,25,30,35,40,45,50,55,60]
#   [2]+list(range(5,201,5))
#   [2,5]
imp_feats_count_list = [*range(2, 151)]

# Feature counts that get their own column in the k-order frames; currently all
# of them (a sparser choice was np.array([2]+list(range(5,61,5)))).
imp_feats_count_list_norm_arr = np.asarray(imp_feats_count_list)

# Positions inside imp_feats_count_list of the counts selected above.
imp_feats_count_list_norm_idx = np.flatnonzero(
    np.isin(imp_feats_count_list, imp_feats_count_list_norm_arr)
)

imp_feats_size = len(imp_feats_count_list)
imp_feats_size_norm = len(imp_feats_count_list_norm_idx)

###### Match-position DataFrame generation from the normal test-hash data

For the top important test hashes, check whether sorting by ratio mean in ascending or descending order performs better.

In [127]:
%%time
# For every important test hash and each of the four (ratio_mean, roll_mean)
# sort-order combinations, record where the known-good k values
# (top_k_ele_imp) land in the ranking produced for every feature count.
# NOTE(review): first_match_pos_arr is never filled in this cell; it is kept
# only in case a later cell reads it.
first_match_pos_arr = np.zeros((hash_imp_count,2,imp_feats_size))
dictionary_list = []

total_k_all_hash = sum(len(all_k_imp[hash_idx]) for hash_idx in range(hash_imp_count))

# IMP_FEATS_IDX fixed rows (hash, two order flags, k) plus one row per selected
# feature count; opt_count column slabs per hash.
k_order_arr = np.zeros((IMP_FEATS_IDX+imp_feats_size_norm,
                        opt_count*total_k_all_hash))

end = 0

for hash_idx in range(hash_imp_count):
    k_count = len(all_k_imp[hash_idx])
    # Kept as a notebook-level name because the ranking is truncated to it.
    top_n = k_count
    # The cutoff rows of this hash do not depend on the sort orders, so filter once.
    hash_cutoff_df = df[df.index.isin(all_k_imp[hash_idx])]

    print(f'\ntop_n:{top_n}')
    for ratio_mean_order in [False,True]:
        for roll_mean_order in [False,True]:
            print(f'********************** Test Hash idx:{hash_idx} {hash_imp_names[hash_idx]} (ratio_mean order) {ratio_mean_order} (roll_mean order) {roll_mean_order} ***********************')
            sim_cutoff_dfs_actual = None
            cutoff_pos_df_actual,sim_cutoff_dfs_actual,top_mean_actual = create_cutoff_pos_df(
                                test_hash_imp[hash_idx],xgb_models,
                                features_test,hash_cutoff_df,
                                imp_feats_count_list,test_all_k_imp[hash_idx],
                                is_compute_matches=False,
                                test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
                                low_high_record_condns=[(False,1,top_n)],
                                ratio_mean_asc=ratio_mean_order,
                                roll_mean_asc=roll_mean_order,
                                is_print=False,
                                )

            k_order_arr,end = create_k_order(end,top_n,hash_idx,ratio_mean_order,roll_mean_order,
                  all_k_imp[hash_idx],imp_feats_count_list_norm_idx,imp_feats_count_list_norm_arr,
                   top_mean_actual,k_order_arr)

            for feats_idx,imp_feats_count in enumerate(imp_feats_count_list):
                cur_top_data = top_mean_actual[feats_idx]
                mask = cur_top_data['k'].isin(top_k_ele_imp[hash_idx])
                # 1-based positions of the known-good k values in this ranking.
                cur_positions = list(cur_top_data[mask].index+1)

                dictionary_data = {'imp_feats_count': imp_feats_count,
                                   'hash_index':hash_idx,
                                   'hash_feat_name': hash_imp_names[hash_idx],
                                   'ratio_mean_asc':ratio_mean_order,
                                   'roll_mean_asc':roll_mean_order,
                                   # NaN instead of an IndexError when nothing matched.
                                   'first_match_pos':cur_positions[0] if cur_positions else np.nan,
                                   'other_match_pos':cur_positions[1:]
                      }
                dictionary_list.append(dictionary_data)


match_pos_df = pd.DataFrame(dictionary_list)

imp_feats_count_cols  = [f'imp_feats_{val}' for val in imp_feats_count_list_norm_arr]

k_order_df = pd.DataFrame(k_order_arr.T,columns=['hash_idx','ratio_mean_asc','roll_mean_asc',
                                              'k']+imp_feats_count_cols).astype('int')
k_order_df.head()
# match_pos_df


top_n:80
********************** Test Hash idx:0 roll_0_nonce (ratio_mean order) False (roll_mean order) False ***********************
max_good_k=394
********************** Test Hash idx:0 roll_0_nonce (ratio_mean order) False (roll_mean order) True ***********************
max_good_k=394
********************** Test Hash idx:0 roll_0_nonce (ratio_mean order) True (roll_mean order) False ***********************
max_good_k=394
********************** Test Hash idx:0 roll_0_nonce (ratio_mean order) True (roll_mean order) True ***********************
max_good_k=394

top_n:81
********************** Test Hash idx:1 roll_56_nonce (ratio_mean order) False (roll_mean order) False ***********************
max_good_k=380
********************** Test Hash idx:1 roll_56_nonce (ratio_mean order) False (roll_mean order) True ***********************
max_good_k=380
********************** Test Hash idx:1 roll_56_nonce (ratio_mean order) True (roll_mean order) False ***********************
max_good_k=380
***

Unnamed: 0,hash_idx,ratio_mean_asc,roll_mean_asc,k,imp_feats_2,imp_feats_3,imp_feats_4,imp_feats_5,imp_feats_6,imp_feats_7,imp_feats_8,imp_feats_9,imp_feats_10,imp_feats_11,imp_feats_12,imp_feats_13,imp_feats_14,imp_feats_15,imp_feats_16,imp_feats_17,imp_feats_18,imp_feats_19,imp_feats_20,imp_feats_21,imp_feats_22,imp_feats_23,imp_feats_24,imp_feats_25,imp_feats_26,imp_feats_27,imp_feats_28,imp_feats_29,imp_feats_30,imp_feats_31,imp_feats_32,imp_feats_33,imp_feats_34,imp_feats_35,imp_feats_36,imp_feats_37,imp_feats_38,imp_feats_39,imp_feats_40,imp_feats_41,imp_feats_42,imp_feats_43,imp_feats_44,imp_feats_45,imp_feats_46,imp_feats_47,imp_feats_48,imp_feats_49,imp_feats_50,imp_feats_51,imp_feats_52,imp_feats_53,imp_feats_54,imp_feats_55,imp_feats_56,imp_feats_57,imp_feats_58,imp_feats_59,imp_feats_60,imp_feats_61,imp_feats_62,imp_feats_63,imp_feats_64,imp_feats_65,imp_feats_66,imp_feats_67,imp_feats_68,imp_feats_69,imp_feats_70,imp_feats_71,imp_feats_72,imp_feats_73,imp_feats_74,imp_feats_75,imp_feats_76,imp_feats_77,imp_feats_78,imp_feats_79,imp_feats_80,imp_feats_81,imp_feats_82,imp_feats_83,imp_feats_84,imp_feats_85,imp_feats_86,imp_feats_87,imp_feats_88,imp_feats_89,imp_feats_90,imp_feats_91,imp_feats_92,imp_feats_93,imp_feats_94,imp_feats_95,imp_feats_96,imp_feats_97,imp_feats_98,imp_feats_99,imp_feats_100,imp_feats_101,imp_feats_102,imp_feats_103,imp_feats_104,imp_feats_105,imp_feats_106,imp_feats_107,imp_feats_108,imp_feats_109,imp_feats_110,imp_feats_111,imp_feats_112,imp_feats_113,imp_feats_114,imp_feats_115,imp_feats_116,imp_feats_117,imp_feats_118,imp_feats_119,imp_feats_120,imp_feats_121,imp_feats_122,imp_feats_123,imp_feats_124,imp_feats_125,imp_feats_126,imp_feats_127,imp_feats_128,imp_feats_129,imp_feats_130,imp_feats_131,imp_feats_132,imp_feats_133,imp_feats_134,imp_feats_135,imp_feats_136,imp_feats_137,imp_feats_138,imp_feats_139,imp_feats_140,imp_feats_141,imp_feats_142,imp_feats_143,imp_feats_144,imp_feats_145,imp_feats_146,imp_feats_147,imp_feats_1
48,imp_feats_149,imp_feats_150
0,0,0,0,0,73,66,68,75,70,75,78,79,79,79,79,50,52,54,59,63,63,65,66,65,70,70,53,54,54,58,58,63,62,63,64,64,64,67,68,68,69,70,71,72,72,73,73,69,70,70,56,56,56,57,59,62,53,53,55,54,54,55,58,58,58,59,60,62,62,62,62,64,64,66,67,59,59,59,54,56,56,58,58,47,50,50,50,50,51,50,51,51,51,51,52,53,42,44,44,45,45,39,30,30,31,32,32,34,35,37,36,30,30,29,21,22,24,15,15,16,18,20,22,24,25,25,26,28,30,30,30,31,31,32,26,26,27,27,27,27,27,27,27,27,27,27,27,27,27,27,27,27,27
1,0,0,0,1,22,18,17,3,5,1,5,5,6,6,6,9,10,12,13,8,9,10,12,13,12,13,18,20,20,21,23,24,27,28,30,33,33,35,39,38,39,41,42,43,43,44,45,33,34,39,40,40,43,45,46,35,35,35,39,27,27,28,30,30,33,20,21,23,24,24,26,30,32,33,34,34,34,35,37,37,29,30,30,30,23,17,18,18,19,17,17,17,17,17,18,19,18,19,19,19,19,19,20,21,23,23,26,26,26,26,30,19,20,21,22,23,14,14,14,15,17,18,19,21,22,23,24,27,29,29,29,30,23,23,24,25,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26
2,0,0,0,2,63,44,39,36,36,41,45,51,48,48,58,58,62,65,68,69,69,72,57,60,62,64,66,66,67,70,57,59,64,48,49,50,50,52,53,53,54,58,58,55,54,56,55,57,57,47,47,47,50,52,52,57,58,58,59,59,59,60,61,61,62,53,55,57,57,57,57,59,59,60,61,60,60,60,58,59,59,47,47,51,38,38,39,40,42,42,42,42,42,42,43,43,45,47,47,48,48,50,50,52,53,53,54,54,55,56,56,57,54,55,56,56,48,50,50,43,33,32,35,28,28,29,31,33,35,35,36,35,37,38,38,38,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40
3,0,0,0,3,44,62,60,51,44,57,20,23,16,16,22,22,25,30,33,49,55,58,59,59,57,60,62,63,63,68,71,71,72,73,72,74,74,66,52,55,57,60,61,59,59,60,59,62,63,65,66,66,68,72,71,71,70,70,70,69,70,68,69,69,71,66,64,65,65,65,65,68,68,62,64,65,65,65,66,66,66,66,66,68,68,68,68,70,70,72,73,73,73,73,73,74,74,74,74,74,75,76,75,76,77,75,71,73,72,65,66,67,67,67,67,67,68,70,70,70,71,71,71,71,71,71,72,73,73,74,75,75,76,76,76,76,77,77,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78
4,0,0,0,4,42,36,33,47,42,40,49,53,53,53,56,67,67,67,39,25,28,30,34,35,39,42,44,44,45,45,50,52,54,38,41,43,41,26,28,31,32,32,33,33,33,34,37,41,43,46,46,46,46,36,37,40,40,40,44,44,44,46,32,32,19,21,23,24,16,16,18,20,21,22,25,25,25,26,28,28,32,33,33,33,33,33,34,35,38,38,39,39,39,39,41,41,43,45,45,46,46,48,48,50,51,51,52,53,53,46,49,50,51,52,54,55,56,57,58,58,58,58,58,51,52,53,55,55,55,56,57,59,59,60,60,60,61,53,53,46,46,46,46,46,46,46,46,46,46,46,46,46,46


In [128]:
# Write both hash-based analysis frames to data/analysis, without the index.
_hash_csv_targets = {
    f'data/analysis/match_pos_df_hash_{nonce}_{k_bin_size}.csv': match_pos_df,
    f'data/analysis/k_order_df_hash_{nonce}_{k_bin_size}.csv': k_order_df,
}
for _csv_path, _csv_frame in _hash_csv_targets.items():
    _csv_frame.to_csv(_csv_path, index=False)

In [129]:
# imp_feats_count_list= [2]+list(range(2,151))

In [182]:
%%time
# imp_feats_count_list = [2,5,10,15,20,25,30,35,40,45,50,55,60]
# imp_feats_count_list= [2]+list(range(5,201,5))
# imp_feats_count_list= list(range(2,151))
# imp_feats_count_list= list(range(150,250,10))

# imp_feats_count_list = [2,5]
# Same match-position analysis as for the test hashes, but on the actual data:
# for each of the four (ratio_mean, roll_mean) sort-order combinations, record
# where the known-good k values (top_k_ele) land for every feature count.
imp_feats_size = len(imp_feats_count_list)
dictionary_list = []

# Kept as a notebook-level name because the ranking is truncated to it.
top_n = len(all_k)
print(f'\ntop_n:{top_n}')
# IMP_FEATS_IDX fixed rows (hash, two order flags, k) plus one row per selected
# feature count; one slab of top_n columns per sort-order combination.
k_order_arr_actual = np.zeros((IMP_FEATS_IDX+imp_feats_size_norm,
                        opt_count*top_n))
end = 0
hash_idx = -1  # marker value stored in the hash_idx row/column for the ACTUAL data
# The cutoff rows do not depend on the sort orders, so filter once.
actual_cutoff_df = df[df.index.isin(all_k)]
for ratio_mean_order in [False,True]:
    for roll_mean_order in [False,True]:

        print(f'****************** ACTUAL ratio_mean asc {ratio_mean_order} roll_mean asc {roll_mean_order} *******************')
        sim_cutoff_dfs_actual = None
        cutoff_pos_df_actual,sim_cutoff_dfs_actual,top_mean_actual = create_cutoff_pos_df(
                            test,xgb_models,
                            features_test,actual_cutoff_df,
                            imp_feats_count_list,test_all_k,is_compute_matches=False,
                            test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
                            low_high_record_condns=[(False,1,top_n)],
                            ratio_mean_asc=ratio_mean_order,
                            roll_mean_asc=roll_mean_order,#not(HIGH_ANALYSIS)
                            )
        k_order_arr_actual,end = create_k_order(end,top_n,hash_idx,ratio_mean_order,roll_mean_order,
                          all_k,imp_feats_count_list_norm_idx,imp_feats_count_list_norm_arr,
                           top_mean_actual,k_order_arr_actual)

        for feats_idx,imp_feats_count in enumerate(imp_feats_count_list):
            cur_top_data = top_mean_actual[feats_idx]
            mask = cur_top_data['k'].isin(top_k_ele)
            # 1-based positions of the known-good k values in this ranking.
            cur_positions = list(cur_top_data[mask].index+1)

            dictionary_data = {'imp_feats_count': imp_feats_count,
                               'hash_index':hash_idx,
                               'hash_feat_name': 'ACTUAL',
                               'ratio_mean_asc':ratio_mean_order,
                               'roll_mean_asc':roll_mean_order,
                               # NaN instead of an IndexError when nothing matched.
                               'first_match_pos':cur_positions[0] if cur_positions else np.nan,
                               'other_match_pos':cur_positions[1:]
                  }
            dictionary_list.append(dictionary_data)


match_pos_df_actual = pd.DataFrame(dictionary_list)

print()
print('First Match Pos Mean:',match_pos_df_actual.groupby(['ratio_mean_asc','roll_mean_asc'])['first_match_pos'].mean().reset_index())
print()
# match_pos_df_actual.head()

imp_feats_count_cols  = [f'imp_feats_{val}' for val in imp_feats_count_list_norm_arr]

# Cast this frame's own columns to int; the previous loop iterated the columns
# of k_order_df, a frame built in a different cell.
k_order_df_actual = pd.DataFrame(k_order_arr_actual.T,
                                 columns=['hash_idx','ratio_mean_asc','roll_mean_asc',
                                          'k']+imp_feats_count_cols).astype('int')
k_order_df_actual.head()



top_n:62
****************** ACTUAL ratio_mean asc False roll_mean asc False *******************
max_good_k=194
****************** ACTUAL ratio_mean asc False roll_mean asc True *******************
max_good_k=194
****************** ACTUAL ratio_mean asc True roll_mean asc False *******************
max_good_k=194
****************** ACTUAL ratio_mean asc True roll_mean asc True *******************
max_good_k=194

First Match Pos Mean:    ratio_mean_asc  roll_mean_asc  first_match_pos
0           False          False         3.879195
1           False           True         2.805369
2            True          False         8.496644
3            True           True         4.610738

CPU times: user 4.04 s, sys: 118 µs, total: 4.04 s
Wall time: 4.04 s


Unnamed: 0,hash_idx,ratio_mean_asc,roll_mean_asc,k,imp_feats_2,imp_feats_3,imp_feats_4,imp_feats_5,imp_feats_6,imp_feats_7,imp_feats_8,imp_feats_9,imp_feats_10,imp_feats_11,imp_feats_12,imp_feats_13,imp_feats_14,imp_feats_15,imp_feats_16,imp_feats_17,imp_feats_18,imp_feats_19,imp_feats_20,imp_feats_21,imp_feats_22,imp_feats_23,imp_feats_24,imp_feats_25,imp_feats_26,imp_feats_27,imp_feats_28,imp_feats_29,imp_feats_30,imp_feats_31,imp_feats_32,imp_feats_33,imp_feats_34,imp_feats_35,imp_feats_36,imp_feats_37,imp_feats_38,imp_feats_39,imp_feats_40,imp_feats_41,imp_feats_42,imp_feats_43,imp_feats_44,imp_feats_45,imp_feats_46,imp_feats_47,imp_feats_48,imp_feats_49,imp_feats_50,imp_feats_51,imp_feats_52,imp_feats_53,imp_feats_54,imp_feats_55,imp_feats_56,imp_feats_57,imp_feats_58,imp_feats_59,imp_feats_60,imp_feats_61,imp_feats_62,imp_feats_63,imp_feats_64,imp_feats_65,imp_feats_66,imp_feats_67,imp_feats_68,imp_feats_69,imp_feats_70,imp_feats_71,imp_feats_72,imp_feats_73,imp_feats_74,imp_feats_75,imp_feats_76,imp_feats_77,imp_feats_78,imp_feats_79,imp_feats_80,imp_feats_81,imp_feats_82,imp_feats_83,imp_feats_84,imp_feats_85,imp_feats_86,imp_feats_87,imp_feats_88,imp_feats_89,imp_feats_90,imp_feats_91,imp_feats_92,imp_feats_93,imp_feats_94,imp_feats_95,imp_feats_96,imp_feats_97,imp_feats_98,imp_feats_99,imp_feats_100,imp_feats_101,imp_feats_102,imp_feats_103,imp_feats_104,imp_feats_105,imp_feats_106,imp_feats_107,imp_feats_108,imp_feats_109,imp_feats_110,imp_feats_111,imp_feats_112,imp_feats_113,imp_feats_114,imp_feats_115,imp_feats_116,imp_feats_117,imp_feats_118,imp_feats_119,imp_feats_120,imp_feats_121,imp_feats_122,imp_feats_123,imp_feats_124,imp_feats_125,imp_feats_126,imp_feats_127,imp_feats_128,imp_feats_129,imp_feats_130,imp_feats_131,imp_feats_132,imp_feats_133,imp_feats_134,imp_feats_135,imp_feats_136,imp_feats_137,imp_feats_138,imp_feats_139,imp_feats_140,imp_feats_141,imp_feats_142,imp_feats_143,imp_feats_144,imp_feats_145,imp_feats_146,imp_feats_147,imp_feats_1
48,imp_feats_149,imp_feats_150
0,-1,0,0,6,49,55,55,52,57,55,52,55,59,59,54,53,56,58,57,59,57,58,58,59,61,49,51,52,52,55,56,56,55,57,59,59,59,60,57,58,55,56,57,43,43,44,47,51,51,41,41,41,43,43,42,47,48,48,36,37,37,38,42,42,44,44,45,47,50,50,51,51,52,52,52,52,52,53,54,54,49,50,50,41,42,43,43,45,45,50,49,49,51,51,52,52,52,53,53,54,54,55,55,55,51,51,42,42,44,44,44,44,45,42,42,42,42,43,43,44,44,45,45,45,45,46,41,41,41,41,41,43,45,45,46,45,37,39,41,42,42,42,42,42,42,42,42,42,42,42,42,42,42
1,-1,0,0,53,36,49,51,40,45,56,50,51,57,56,55,59,62,59,58,57,58,60,60,61,54,57,58,60,60,60,61,61,62,50,53,54,54,56,56,56,56,57,58,58,58,58,59,60,60,60,60,60,60,60,61,61,61,61,61,57,57,58,58,58,58,58,57,58,59,59,59,59,56,58,58,58,58,58,58,59,59,60,60,57,58,58,58,60,60,58,58,58,59,59,59,59,60,60,60,60,60,60,60,60,61,61,62,62,62,62,58,58,59,60,59,60,60,61,61,61,59,59,59,59,56,56,56,56,56,56,57,57,55,56,56,56,56,56,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52
2,-1,0,0,54,45,33,32,31,33,41,49,42,44,44,46,52,40,39,25,26,32,33,32,34,37,36,36,40,40,42,45,46,48,49,49,47,47,54,53,55,58,58,56,57,57,45,46,48,50,51,52,52,52,53,53,55,55,55,55,55,55,55,55,55,55,55,55,55,56,56,56,56,58,59,59,60,60,60,60,61,58,59,59,60,60,60,60,58,59,55,55,55,57,57,51,49,41,39,39,39,39,39,39,40,40,40,35,35,35,26,27,27,27,29,30,31,30,32,25,26,27,26,27,29,30,31,30,30,32,32,32,27,24,24,24,26,26,24,26,27,28,28,28,28,28,28,28,28,28,28,28,28,28
3,-1,0,0,195,33,53,43,56,56,31,34,16,17,17,18,21,21,22,21,23,25,29,27,30,16,21,20,21,21,24,12,7,9,4,4,4,4,4,4,5,5,5,4,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
4,-1,0,0,396,16,20,20,23,7,9,12,4,5,5,7,7,8,11,12,11,15,17,21,24,28,15,13,13,13,14,17,21,13,14,13,14,13,13,14,14,16,15,20,22,22,23,23,17,18,12,13,13,14,16,15,12,12,12,12,13,13,13,14,14,14,17,18,18,18,18,17,20,16,16,19,19,19,19,20,21,21,21,21,22,22,23,25,25,25,29,19,19,19,19,21,23,26,29,29,33,33,33,33,34,36,24,25,27,28,30,20,21,21,21,22,23,24,24,26,27,28,28,28,28,29,30,31,31,33,33,33,34,30,30,30,31,33,34,35,35,36,36,36,36,36,36,36,36,36,36,36,36,36


In [183]:
# Write both ACTUAL analysis frames to data/analysis, without the index.
_actual_csv_targets = {
    f'data/analysis/match_pos_df_actual_{nonce}_{k_bin_size}.csv': match_pos_df_actual,
    f'data/analysis/k_order_df_actual_{nonce}_{k_bin_size}.csv': k_order_df_actual,
}
for _csv_path, _csv_frame in _actual_csv_targets.items():
    _csv_frame.to_csv(_csv_path, index=False)

In [132]:
# Feature counts to keep when comparing match positions; any non-list value
# (e.g. None) keeps every feature count.
sel_feats = [2, *range(5, 61, 5)]
# sel_feats = None

if not isinstance(sel_feats, list):
    match_pos_filt_feats = match_pos_df
else:
    print('sel_feats:',sel_feats)
    mask = match_pos_df['imp_feats_count'].isin(sel_feats)
    match_pos_filt_feats = match_pos_df.loc[mask]
# mask_hash_count = match_pos_filt_feats['hash_index']<=2
# match_pos_filt_feats=   match_pos_filt_feats[mask_hash_count]
match_pos_filt_feats.head()

sel_feats: [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]


Unnamed: 0,imp_feats_count,hash_index,hash_feat_name,ratio_mean_asc,roll_mean_asc,first_match_pos,other_match_pos
0,2,0,roll_0_nonce,False,False,14,"[22, 39, 42, 44, 63, 70, 73]"
3,5,0,roll_0_nonce,False,False,3,"[19, 36, 47, 50, 51, 71, 75]"
8,10,0,roll_0_nonce,False,False,6,"[16, 29, 48, 53, 72, 73, 79]"
13,15,0,roll_0_nonce,False,False,12,"[30, 40, 49, 54, 65, 67, 79]"
18,20,0,roll_0_nonce,False,False,12,"[24, 34, 37, 57, 59, 66, 78]"


In [133]:
        
# Mean first-match position per hash and sort-order combination.
match_summary = (
    match_pos_filt_feats
    .groupby(['hash_index','ratio_mean_asc','roll_mean_asc'])
    .agg(first_match_pos_mean=('first_match_pos','mean'))
)
print(match_summary.reset_index())

# For each hash keep only the combination with the smallest mean position.
match_top_pos = (
    match_summary['first_match_pos_mean']
    .groupby('hash_index', group_keys=False)
    .nsmallest(1)
    .reset_index()
)
print(match_top_pos)


# How often each combination is the per-hash winner, most frequent first.
match_freq = (
    match_top_pos
    .groupby(['ratio_mean_asc','roll_mean_asc'])
    .size()
    .sort_values(ascending=False)
)
print(match_freq.reset_index())
# Despite its name this holds the (ratio_mean_asc, roll_mean_asc) pair.
top_ratio_mean_asc = match_freq.index[0]
# if len(match_value_counts)==2:
#     ratio_mean_order_equal = match_top_pos['ratio_mean_asc'].value_counts().iloc[0]==match_top_pos['ratio_mean_asc'].value_counts().iloc[1]
#     if ratio_mean_order_equal:
#         print('PROJECT WARNING: Both Ratio mean order ascending and descending are equal')
print(f'\nTop options by frequency : Ratio mean asc={top_ratio_mean_asc[0]}  Roll mean asc={top_ratio_mean_asc[1]}')

    hash_index  ratio_mean_asc  roll_mean_asc  first_match_pos_mean
0            0           False          False             17.000000
1            0           False           True             14.538462
2            0            True          False             10.153846
3            0            True           True              8.384615
4            1           False          False              7.923077
5            1           False           True              8.846154
6            1            True          False              6.153846
7            1            True           True              6.230769
8            2           False          False             13.538462
9            2           False           True              7.384615
10           2            True          False              8.384615
11           2            True           True              4.153846
12           3           False          False              9.846154
13           3           False           True   

In [134]:
# Mean first-match position per sort-order combination over all hashes,
# best (smallest) first.
match_mean = (
    match_pos_filt_feats
    .groupby(['ratio_mean_asc','roll_mean_asc'])['first_match_pos']
    .mean()
    .sort_values()
)
print(match_mean.reset_index())
# (ratio_mean_asc, roll_mean_asc) pair with the smallest mean position.
top_options = match_mean.index[0]
print(f'\nTop options by mean : Ratio mean asc={top_options[0]}  Roll mean asc={top_options[1]}')

   ratio_mean_asc  roll_mean_asc  first_match_pos
0            True           True         6.846154
1            True          False         9.030769
2           False           True        11.169231
3           False          False        13.476923

Top options by mean : Ratio mean asc=True  Roll mean asc=True


In [135]:
# Ordering flags used by the cells that follow.
# They are set by hand here; the data-driven alternative would be
#   sel_ratio_mean_asc, sel_roll_mean_asc = top_options[0], top_options[1]
# NOTE(review): the recorded "Top options by mean" output reports True/True,
# so this manual choice differs from it - confirm that this is intentional.
sel_ratio_mean_asc, sel_roll_mean_asc = True, False

In [136]:
# match_pos_df.groupby('hash_index')
# match_pos_df['rank'] =  match_pos_df['first_match_pos'].rank(method='average',
#                                                              ascending=True)
# mask = (match_pos_df['ratio_mean_asc']==top_ratio_mean_asc)
# pos_top_df= match_pos_df[mask].sort_values(['hash_index','first_match_pos'],ascending=True)\
#             .groupby(['hash_index'])\
#             .head(5)
# print(pos_top_df['imp_feats_count'].value_counts())
# pos_top_df

# Keep only the rows produced with the selected ordering flags.
mask = (match_pos_filt_feats['ratio_mean_asc']==sel_ratio_mean_asc) \
        & (match_pos_filt_feats['roll_mean_asc']==sel_roll_mean_asc)
# .copy() gives an independent frame, so adding the 'rank' column below is a
# well-defined write instead of a chained assignment on a slice of
# match_pos_filt_feats (warnings are globally silenced in this notebook, so the
# SettingWithCopyWarning would otherwise go unnoticed).
match_pos_df_filt=match_pos_filt_feats[mask].copy()
# Rank first_match_pos inside each hash_index; ties share the lowest rank.
match_pos_df_filt['rank'] = match_pos_df_filt.groupby('hash_index')['first_match_pos'].rank(method="min")
# Keep the rows ranked 5 or better within each hash (ties can yield more than five rows).
mask2=match_pos_df_filt['rank']<=5
match_pos_df_filt=match_pos_df_filt[mask2]
# How often each imp_feats_count appears among the top-ranked rows.
print(match_pos_df_filt['imp_feats_count'].value_counts())
match_pos_df_filt
# match_pos_df.sort_values(['first_match_pos'],ascending=True)\
#             .groupby(['hash_index','ratio_mean_asc'])\
#             .head(5)\
#             .sort_values(['hash_index','ratio_mean_asc'],ascending=True)
    

25    4
20    4
5     3
30    3
40    3
60    3
35    2
45    2
10    1
50    1
55    1
Name: imp_feats_count, dtype: int64


Unnamed: 0,imp_feats_count,hash_index,hash_feat_name,ratio_mean_asc,roll_mean_asc,first_match_pos,other_match_pos,rank
301,5,0,roll_0_nonce,True,False,7,"[18, 21, 22, 42, 46, 66, 79]",3.0
306,10,0,roll_0_nonce,True,False,3,"[8, 27, 28, 34, 36, 49, 78]",1.0
321,25,0,roll_0_nonce,True,False,9,"[12, 15, 27, 37, 54, 65, 67]",5.0
326,30,0,roll_0_nonce,True,False,7,"[17, 22, 25, 27, 43, 59, 62]",3.0
336,40,0,roll_0_nonce,True,False,9,"[25, 26, 29, 36, 42, 45, 48]",5.0
356,60,0,roll_0_nonce,True,False,4,"[16, 22, 35, 37, 49, 63, 69]",2.0
897,5,1,roll_56_nonce,True,False,4,"[10, 12, 18, 22, 27, 35, 39, 40, 65, 78]",3.0
912,20,1,roll_56_nonce,True,False,4,"[25, 34, 35, 41, 44, 52, 62, 64, 68, 72]",3.0
917,25,1,roll_56_nonce,True,False,1,"[18, 27, 30, 40, 43, 46, 49, 59, 66, 73]",1.0
922,30,1,roll_56_nonce,True,False,1,"[12, 22, 31, 34, 35, 55, 57, 61, 66, 68]",1.0


In [137]:
# match_pos_df.groupby('hash_index')
# match_pos_df['rank'] =  match_pos_df['first_match_pos'].rank(method='average',
#                                                              ascending=True)
# Rows generated with the selected ordering flags.
mask = (
    (match_pos_filt_feats['ratio_mean_asc']==sel_ratio_mean_asc)
    & (match_pos_filt_feats['roll_mean_asc']==sel_roll_mean_asc)
)
# Summary statistics of first_match_pos for every imp_feats_count.
pos_top_df = (
    match_pos_filt_feats[mask]
    .sort_values(['hash_index','first_match_pos'],ascending=True)
    .groupby(['imp_feats_count'])
    .agg(
        mean=('first_match_pos','mean'),
        std=('first_match_pos','std'),
        median=('first_match_pos','median'),
        minimum=('first_match_pos','min'),
        maximum=('first_match_pos','max'),
    )
)
# Lowest mean first; minimum and then maximum break ties.
pos_mean_summary = (
    pos_top_df
    .sort_values(['mean','minimum','maximum'],ascending=[True,True,True])
    .reset_index()
)
pos_mean_summary
# match_pos_df.sort_values(['first_match_pos'],ascending=True)\
#             .groupby(['hash_index','ratio_mean_asc'])\
#             .head(5)\
#             .sort_values(['hash_index','ratio_mean_asc'],ascending=True)
    

Unnamed: 0,imp_feats_count,mean,std,median,minimum,maximum
0,25,5.2,3.03315,5.0,1,9
1,60,5.4,2.607681,4.0,4,10
2,20,5.6,3.361547,5.0,2,11
3,30,6.6,3.911521,7.0,1,12
4,35,8.0,4.527693,9.0,3,14
5,40,8.4,4.878524,9.0,2,13
6,55,8.4,2.50998,7.0,6,12
7,10,8.8,4.147288,11.0,3,13
8,45,9.0,5.0,9.0,2,16
9,15,9.2,3.271085,9.0,5,14


In [139]:
# pos_mean_summary is sorted with the lowest mean first_match_pos on top, so the
# first row carries the feature count to use from here on.
sel_feats_count = pos_mean_summary['imp_feats_count'].iat[0]
sel_feats_count

25

In [140]:
# pos_top_imp_feats = list(pos_top_df['imp_feats_count'].value_counts().index)
# imp_feats_count_arr = np.array(imp_feats_count_list)
# # itemindex = np.where(imp_feats_count_arr in pos_top_imp_feats)
# print(pos_top_imp_feats)
# itemindex= np.searchsorted(imp_feats_count_arr, pos_top_imp_feats)
# itemindex

In [184]:
# Restrict the actual-seed match positions to the selected feature counts;
# when sel_feats is not a list, the frame is used unfiltered.
if not isinstance(sel_feats, list):
    match_pos_actual_filt_feats = match_pos_df_actual
else:
    print('sel_feats:',sel_feats)
    mask = match_pos_df_actual['imp_feats_count'].isin(sel_feats)
    match_pos_actual_filt_feats = match_pos_df_actual[mask]

# match_pos_actual_filt_feats.head()

sel_feats: [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]


In [185]:
# Actual-seed rows for the selected ordering flags, displayed best position first.
mask = (
    (match_pos_actual_filt_feats['ratio_mean_asc']==sel_ratio_mean_asc)
    & (match_pos_actual_filt_feats['roll_mean_asc']==sel_roll_mean_asc)
)
match_pos_actual_filt = match_pos_actual_filt_feats[mask]
match_pos_actual_filt.sort_values('first_match_pos',ascending=True)

Unnamed: 0,imp_feats_count,hash_index,hash_feat_name,ratio_mean_asc,roll_mean_asc,first_match_pos,other_match_pos
316,20,-1,ACTUAL,True,False,4,"[20, 35, 39, 58, 61]"
351,55,-1,ACTUAL,True,False,4,"[12, 22, 40, 55, 61]"
346,50,-1,ACTUAL,True,False,6,"[14, 25, 33, 56, 62]"
341,45,-1,ACTUAL,True,False,7,"[16, 28, 30, 55, 62]"
298,2,-1,ACTUAL,True,False,8,"[19, 31, 33, 50, 62]"
356,60,-1,ACTUAL,True,False,8,"[9, 26, 43, 51, 59]"
301,5,-1,ACTUAL,True,False,9,"[16, 34, 55, 56, 62]"
326,30,-1,ACTUAL,True,False,11,"[22, 23, 30, 59, 62]"
311,15,-1,ACTUAL,True,False,12,"[13, 27, 37, 45, 53]"
336,40,-1,ACTUAL,True,False,12,"[22, 23, 24, 53, 62]"


In [186]:
# Show the filtered actual-seed frame in its stored row order (no sorting applied).
match_pos_actual_filt

Unnamed: 0,imp_feats_count,hash_index,hash_feat_name,ratio_mean_asc,roll_mean_asc,first_match_pos,other_match_pos
298,2,-1,ACTUAL,True,False,8,"[19, 31, 33, 50, 62]"
301,5,-1,ACTUAL,True,False,9,"[16, 34, 55, 56, 62]"
306,10,-1,ACTUAL,True,False,17,"[20, 25, 36, 51, 60]"
311,15,-1,ACTUAL,True,False,12,"[13, 27, 37, 45, 53]"
316,20,-1,ACTUAL,True,False,4,"[20, 35, 39, 58, 61]"
321,25,-1,ACTUAL,True,False,15,"[17, 29, 40, 50, 62]"
326,30,-1,ACTUAL,True,False,11,"[22, 23, 30, 59, 62]"
331,35,-1,ACTUAL,True,False,16,"[25, 26, 28, 56, 62]"
336,40,-1,ACTUAL,True,False,12,"[22, 23, 24, 53, 62]"
341,45,-1,ACTUAL,True,False,7,"[16, 28, 30, 55, 62]"


In [187]:
# Mean first_match_pos in match_pos_actual_filt_feats for every
# (ratio_mean_asc, roll_mean_asc) combination.
# The former bare `len(match_pos_df_actual)` was removed: it was not the last
# expression of the cell, so its value was computed and silently discarded.
print('First Match Pos Mean:',match_pos_actual_filt_feats.groupby(['ratio_mean_asc','roll_mean_asc'])['first_match_pos'].mean().reset_index())

First Match Pos Mean:    ratio_mean_asc  roll_mean_asc  first_match_pos
0           False          False         3.384615
1           False           True         2.230769
2            True          False         9.923077
3            True           True         4.153846


In [188]:
%%time
# imp_feats_count_list = [2]
imp_feats_count_list = [sel_feats_count]

# imp_feats_count_list= [2]+list(range(5,201,5))
# imp_feats_count_list= list(range(2,151))

sim_cutoff_dfs_actual=None
top_n = len(all_k)
cutoff_pos_df_actual,sim_cutoff_dfs_actual,top_mean_actual=create_cutoff_pos_df(test,xgb_models,
                    features_test,df[df.index.isin(all_k)],
                    imp_feats_count_list,test_all_k,is_compute_matches=False,
                   test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
                    low_high_record_condns=[(False,1,top_n)],
                    ratio_mean_asc=sel_ratio_mean_asc,
                    roll_mean_asc=sel_roll_mean_asc,
                    top_k_ele=top_k_ele,is_print=False)
cutoff_pos_df_actual

max_good_k=194

******success data *****
      k  ratio_mean    roll_mean
14   53        0.04  4864.984177
16  552        0.04  4665.095186
28  420        0.08  4433.624872
39  608        0.12  4425.380593
49  476        0.16  4193.731390
61  631        0.28  3781.075965

******success positions *****
[15, 17, 29, 40, 50, 62]
CPU times: user 10.4 ms, sys: 3.93 ms, total: 14.3 ms
Wall time: 12.8 ms


Unnamed: 0,imp_feats_count,condn_max,condn_val,k_top_1,k_top_2,k_top_3,k_top_4,k_top_5,mean_top_1,mean_top_2,mean_top_3,mean_top_4,mean_top_5,match_pos_1,match_total_top_5,match_total_top_10,match_ks_top_5
0,25,False,1,404,456,471,461,412,0.04,0.04,0.04,0.04,0.04,,,,


In [189]:
%%time
k_list = top_mean_actual[0][:5]['k'].values
test_sel=test_all_k[test_all_k['k'].isin(k_list)]
print(k_list)
test_sel[['k','roll_actual','client_seed','roll_0','roll_1']]

[404 456 471 461 412]
CPU times: user 2.06 ms, sys: 0 ns, total: 2.06 ms
Wall time: 1.91 ms


Unnamed: 0,k,roll_actual,client_seed,roll_0,roll_1
3929,404,3878,woxpwoxpwoxpwoxpwoxp98c588b2a97dc0356e6003ea0f004e8fa4e21f675708ab1290eb934fd93ae317,6003,6464
2710,412,9582,woxpwoxpwoxpwoxpwoxpe828a19d6b5d480146cb51de754a0089dacfa96ec80fc799227ef45ceec14ff5,6003,5025
2985,456,7145,woxpwoxpwoxpwoxpwoxp1a502d2bfa719385ac6fd0cfdadf4bf21d3889c2b8dfac8af26577f7e1e18d88,9006,5882
6658,461,8204,woxpwoxpwoxpwoxpwoxpfe6e70ffe4b56be744afae912ab03896fdcebd2776d0bdbcc6c5acbc819f67d7,7008,7742
2510,471,7249,woxpwoxpwoxpwoxpwoxpf4d93ea4759fa65ac29193778ff396d2ec084e49e0072d01d8730c8d29037e14,3007,379


In [190]:
# Display top_k_ele_imp; the recorded output is a list of lists of k values
# (presumably one inner list per important hash - confirm against where it is built).
top_k_ele_imp

[[0, 1, 2, 3, 4, 5, 6, 7],
 [52, 383, 418, 430, 431, 445, 465, 474, 482, 551, 568],
 [53, 471, 492, 565, 696],
 [448, 513, 517, 559, 564, 574],
 [402, 410, 508, 531, 547, 629]]

In [191]:
# Column of k_order_df_actual that belongs to the selected feature count.
col_to_sort = f'imp_feats_{sel_feats_count}'
cols = ['hash_idx','k',col_to_sort]
# Rows for the selected ordering flags, sorted by the selected column (ascending).
mask_temp = (
    (k_order_df_actual['ratio_mean_asc']==sel_ratio_mean_asc)
    & (k_order_df_actual['roll_mean_asc']==sel_roll_mean_asc)
)
k_order_df_filt_actual = k_order_df_actual[mask_temp].sort_values(col_to_sort)
# The five k values at the top of that ordering.
k_list = k_order_df_filt_actual['k'][:5].to_numpy()
print(k_list)
k_order_df_filt_actual[cols].head(5)

[404 456 471 461 412]


Unnamed: 0,hash_idx,k,imp_feats_25
129,-1,404,1
138,-1,456,2
144,-1,471,3
140,-1,461,4
131,-1,412,5


In [192]:
# # top_k_ele_imp
# top_k_ele_all=[val for sublist in top_k_ele_imp for val in sublist]
# all_k_ele_imp=[val for sublist in all_k_imp for val in sublist]
# # sorted(x)

# # diff = set(all_k).difference(set(top_k_ele_all))
# diff = set(all_k).difference(set(all_k_ele_imp))
# print(len(all_k),len(diff))
# print(diff)

In [193]:
# Where do the k values in k_list sit in k_order_df under the selected ordering flags?
cols = ['hash_idx','k',col_to_sort,'ratio_mean_asc','roll_mean_asc']
mask_temp = (k_order_df['ratio_mean_asc']==sel_ratio_mean_asc) & (k_order_df['roll_mean_asc']==sel_roll_mean_asc) \
            & (k_order_df['k'].isin(k_list))
# mask_temp = (k_order_df['k'].isin(k_list))
# .copy() makes the filtered frame independent of k_order_df, so adding the
# 'imp_mean' column below is not a chained assignment on a slice.
k_order_df_filt = k_order_df[mask_temp].copy()
imp_feats_cols= [col for col in k_order_df_filt.columns if col.startswith('imp_feats')]
# Row-wise mean over all imp_feats_* columns.
k_order_df_filt['imp_mean']=k_order_df_filt[imp_feats_cols].mean(axis=1)

k_order_df_filt = k_order_df_filt.sort_values(col_to_sort)
print(k_order_df_filt['k'].unique())
k_order_df_filt[['imp_mean']+cols]


[471 456 404 412]


Unnamed: 0,imp_mean,hash_idx,k,imp_feats_25,ratio_mean_asc,roll_mean_asc
795,5.483221,2,471,7,1,0
509,12.973154,1,456,14,1,0
1036,22.926174,3,471,51,1,0
781,50.154362,2,404,61,1,0
169,45.234899,0,412,62,1,0


In [151]:
# Previous-seed matches in k order: rows of k_order_df whose k is in top_k_ele,
# across all ordering-flag combinations.
cols = ['hash_idx','k',col_to_sort,'ratio_mean_asc','roll_mean_asc']
mask_temp = (k_order_df['k'].isin( top_k_ele )) # prev seed success

# .copy() makes the filtered frame independent of k_order_df, so adding the
# 'imp_mean' column below is not a chained assignment on a slice.
k_order_df_filt = k_order_df[mask_temp].copy()
imp_feats_cols= [col for col in k_order_df_filt.columns if col.startswith('imp_feats')]
# Row-wise mean over all imp_feats_* columns.
k_order_df_filt['imp_mean']=k_order_df_filt[imp_feats_cols].mean(axis=1)

k_order_df_filt = k_order_df_filt.sort_values('imp_mean')
print(k_order_df_filt['k'].unique())
k_order_df_filt[['imp_mean']+cols]


[600 523 420 482]


Unnamed: 0,imp_mean,hash_idx,k,imp_feats_25,ratio_mean_asc,roll_mean_asc
1262,12.033557,4,600,10,0,1
1189,14.241611,4,600,8,0,0
1049,16.275168,3,523,33,1,0
1106,17.040268,3,523,35,1,1
847,23.483221,2,420,9,1,1
783,30.342282,2,420,17,1,0
437,31.771812,1,482,33,0,1
719,34.657718,2,420,48,0,1
356,37.926174,1,482,43,0,0
935,40.959732,3,523,23,0,0


###### Test Hash Imp Based Match Pos Generation

In [None]:
# test_hash = test_hash_imp[0].copy()

In [None]:
# %%time

# k_bin_size=700

# print(f'***************** Bin {k_bin_size} ***************** ')
# df= gen_hash_cutoff_df(xgb_models,features_test,test_hash,
#                             k_bin_size=k_bin_size,quant=0.95)
# print('Test Hash Cutoff Df Stats')
# print('Total size:',len(df[df['total']==1]))
# print('Success size:',len(df[(df['total']==1) & (df['ratio']>0)]))

# all_k,top_k,test_all_k = gen_all_k(xgb_models,features_test,
#                                 test,df,target_total=1)

# top_k_ele = [x[0] for x in top_k]
# print(top_k_ele)
# print('Test Size:',len(test_all_k))
# test_all_k.head()

In [None]:
# df[df['total']==1]

In [None]:
# key_cols = ['k','roll_actual','probs','roll_0','roll_1','client_seed']
# other_cols = [col for col in test_all_k if col not in key_cols]
# test_all_k[key_cols+other_cols]

In [None]:
# # key_cols = ['k','roll_actual','client_seed','probs','roll_0','roll_1']
# other_cols = [col for col in test_all_k if col not in key_cols]
# test_all_k[test_all_k['k'].isin(top_k_ele)][key_cols+other_cols]

Important Feats Top K Elements

In [None]:
# %%time

# # k_bin_size=600

# all_k_imp =[-1]*hash_imp_count
# top_k_imp =[-1]*hash_imp_count
# test_all_k_imp =[-1]*hash_imp_count
# top_k_ele_imp =[-1]*hash_imp_count

# df= gen_hash_cutoff_df(xgb_models,features_test,test_hash,
#                             k_bin_size=k_bin_size,quant=0.95)
# print('Test Hash Cutoff Df Stats')
# print('Total size:',len(df[df['total']==1]))
# print('Success size:',len(df[(df['total']==1) & (df['ratio']>0)]))

# for hash_idx in range(hash_imp_count):

#     print(f'***************** Test Hash {hash_imp_names[hash_idx]} ***************** ')

#     all_k_imp[hash_idx],top_k_imp[hash_idx],test_all_k_imp[hash_idx] = gen_all_k(xgb_models,features_test,
#                                                                         test_hash_imp[hash_idx],
#                                                                         df,target_total=1)

#     top_k_ele_imp[hash_idx] = [x[0] for x in top_k_imp[hash_idx]]


Check top important hash test data for ascending or descending ratio mean performance

In [None]:
# %%time
# first_match_pos_arr= np.zeros((hash_imp_count,2,imp_feats_size))
# dictionary_list=[]

# total_k_all_hash=0
# for hash_idx in range(hash_imp_count):
#     total_k_all_hash += len(all_k_imp[hash_idx])
    
# k_order_arr = np.zeros((4+imp_feats_size_norm,
#                         opt_count*total_k_all_hash))


# end =0

# for hash_idx in range(hash_imp_count):
#     k_count = len(all_k_imp[hash_idx])
#     top_n = k_count
    
#     print(f'\ntop_n:{top_n}')
#     for ratio_mean_order in [False,True]:
#         for roll_mean_order in [False,True]:
#             print(f'********************** Test Hash idx:{hash_idx} {hash_imp_names[hash_idx]} (ratio_mean order) {ratio_mean_order} (roll_mean order) {roll_mean_order} ***********************')
#             sim_cutoff_dfs_actual=None
#             cutoff_pos_df_actual,sim_cutoff_dfs_actual,top_mean_actual=create_cutoff_pos_df(test_hash_imp[hash_idx],xgb_models,
#                                 features_test,df[df.index.isin(all_k_imp[hash_idx])],
#                                 imp_feats_count_list,test_all_k_imp[hash_idx],
#                                 is_compute_matches=False,
#                                test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
#                                 low_high_record_condns=[(False,1,top_n)],
#                                 ratio_mean_asc=ratio_mean_order,
#                                 roll_mean_asc=roll_mean_order,
#                                 is_print=False,     
#                                 )
            
#             k_order_arr,end = create_k_order(end,top_n,hash_idx,ratio_mean_order,roll_mean_order,
#                   all_k_imp[hash_idx],imp_feats_count_list_norm_idx,imp_feats_count_list_norm_arr,
#                    top_mean_actual,k_order_arr)
            
#             first_match_pos_list=[]
#             match_pos_list=[]
#             for feats_idx in range(imp_feats_size):
#                 cur_top_data = top_mean_actual[feats_idx]
# #                 print(f'\n******{imp_feats_count_list[feats_idx]} feats success data *****')
#                 mask= cur_top_data['k'].isin(top_k_ele_imp[hash_idx])
# #                 print(cur_top_data[mask]['k'].to_numpy())
#                 cur_positions = list(cur_top_data[mask].index+1)

#                 dictionary_data = {'imp_feats_count': imp_feats_count_list[feats_idx],
#                                    'hash_index':hash_idx, 
#                                    'hash_feat_name': hash_imp_names[hash_idx], 
#                                     'ratio_mean_asc':ratio_mean_order,
#                                     'roll_mean_asc':roll_mean_order,
#                                     'first_match_pos':cur_positions[0],
#                                     'other_match_pos':cur_positions[1:]
#                       }
#                 dictionary_list.append(dictionary_data)
# #                 print(f'\n {imp_feats_count_list[feats_idx]} feats success positions')
# #                 print(cur_positions)
        
        
# match_pos_df = pd.DataFrame.from_dict(dictionary_list)

# imp_feats_count_cols  = [f'imp_feats_{val}' for val in imp_feats_count_list_norm_arr]

# k_order_df = pd.DataFrame(k_order_arr.T,columns=['hash_idx','ratio_mean_asc','roll_mean_asc',
#                                               'k']+imp_feats_count_cols)
# for col in k_order_df.columns:
#     k_order_df[col]= k_order_df[col].astype('int')
# k_order_df.head()
# # match_pos_df        

In [None]:
# match_pos_df.to_csv(f'data/analysis/match_pos_df_hash_{nonce}_{k_bin_size}_hashimp.csv',index=False)
# k_order_df.to_csv(f'data/analysis/k_order_df_hash_{nonce}_{k_bin_size}_hashimp.csv',index=False)

In [None]:
# imp_feats_count_list= [2]+list(range(2,151))

In [None]:
# %%time
# # imp_feats_count_list = [2,5,10,15,20,25,30,35,40,45,50,55,60]
# # imp_feats_count_list= [2]+list(range(5,201,5))
# # imp_feats_count_list= list(range(2,151))
# # imp_feats_count_list= list(range(150,250,10))

# # imp_feats_count_list = [2,5]
# imp_feats_size = len(imp_feats_count_list)
# dictionary_list=[]

# top_n = len(all_k)
# print(f'\ntop_n:{top_n}')
# k_order_arr_actual = np.zeros((4+imp_feats_size_norm,
#                         opt_count*top_n))
# end = 0
# hash_idx =-1
# for ratio_mean_order in [False,True]:
#     for roll_mean_order in [False,True]:

#         print(f'****************** ACTUAL ratio_mean asc {ratio_mean_order} roll_mean asc {roll_mean_order} *******************')
#         sim_cutoff_dfs_actual=None
#         cutoff_pos_df_actual,sim_cutoff_dfs_actual,top_mean_actual=create_cutoff_pos_df(test,xgb_models,
#                             features_test,df[df.index.isin(all_k)],
#                             imp_feats_count_list,test_all_k,is_compute_matches=False,
#                            test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
#                             low_high_record_condns=[(False,1,top_n)],
#                             ratio_mean_asc=ratio_mean_order,
#                             roll_mean_asc=roll_mean_order,#not(HIGH_ANALYSIS)
#                             )
#         k_order_arr_actual,end = create_k_order(end,top_n,hash_idx,ratio_mean_order,roll_mean_order,
#                           all_k,imp_feats_count_list_norm_idx,imp_feats_count_list_norm_arr,
#                            top_mean_actual,k_order_arr_actual)

#         for feats_idx in range(imp_feats_size):
#             cur_top_data = top_mean_actual[feats_idx]
# #             print(f'\n******{imp_feats_count_list[feats_idx]} feats success data *****')
#             mask= cur_top_data['k'].isin(top_k_ele)
# #             print(cur_top_data[mask])
#             cur_positions = list(cur_top_data[mask].index+1)

#             dictionary_data = {'imp_feats_count': imp_feats_count_list[feats_idx],
#                                'hash_index':hash_idx, 
#                                'hash_feat_name': 'ACTUAL', 
#                                 'ratio_mean_asc':ratio_mean_order,
#                                 'roll_mean_asc':roll_mean_order,
#                                 'first_match_pos':cur_positions[0],
#                                 'other_match_pos':cur_positions[1:]
#                   }
#             dictionary_list.append(dictionary_data)
# #             print(f'\n {imp_feats_count_list[feats_idx]} feats success positions')
# #             print(cur_positions)


# match_pos_df_actual = pd.DataFrame.from_dict(dictionary_list)

# print()
# print('First Match Pos Mean:',match_pos_df_actual.groupby(['ratio_mean_asc','roll_mean_asc'])['first_match_pos'].mean().reset_index())
# print()
# # match_pos_df_actual.head()  

# imp_feats_count_cols  = [f'imp_feats_{val}' for val in imp_feats_count_list_norm_arr]

# k_order_df_actual = pd.DataFrame(k_order_arr_actual.T,
#                                  columns=['hash_idx','ratio_mean_asc','roll_mean_asc',
#                                           'k']+imp_feats_count_cols)
# for col in k_order_df_actual.columns:
#     k_order_df_actual[col]= k_order_df_actual[col].astype('int')
# k_order_df_actual.head()


In [None]:
# match_pos_df_actual.to_csv(f'data/analysis/match_pos_df_actual_{nonce}_{k_bin_size}_hashimp.csv',index=False)
# k_order_df_actual.to_csv(f'data/analysis/k_order_df_actual_{nonce}_{k_bin_size}_hashimp.csv',index=False)

In [None]:
# sel_feats = [2]+list(range(5,61,5))
# # sel_feats = None

# if isinstance(sel_feats, list):
#     print('sel_feats:',sel_feats)
#     mask = match_pos_df['imp_feats_count'].isin(sel_feats)
#     match_pos_filt_feats = match_pos_df[mask]
# else:
#     match_pos_filt_feats = match_pos_df
# # mask_hash_count = match_pos_filt_feats['hash_index']<=2
# # match_pos_filt_feats=   match_pos_filt_feats[mask_hash_count]
# match_pos_filt_feats.head()

In [None]:
        
# match_summary =match_pos_filt_feats.groupby(['hash_index','ratio_mean_asc','roll_mean_asc'])\
#                             .agg(first_match_pos_mean =('first_match_pos','mean'))
                                                                        
# print(match_summary.reset_index())
# match_top_pos = match_summary['first_match_pos_mean'].groupby('hash_index', group_keys=False).nsmallest(1)
# match_top_pos = match_top_pos.reset_index()
# print(match_top_pos)


# match_freq = match_top_pos.groupby(['ratio_mean_asc','roll_mean_asc']).size().sort_values(ascending=False)
# print(match_freq.reset_index())
# top_ratio_mean_asc = match_freq.index[0]  
# # if len(match_value_counts)==2:
# #     ratio_mean_order_equal = match_top_pos['ratio_mean_asc'].value_counts().iloc[0]==match_top_pos['ratio_mean_asc'].value_counts().iloc[1]
# #     if ratio_mean_order_equal:
# #         print('PROJECT WARNING: Both Ratio mean order ascending and descending are equal')
# print(f'\nTop options by frequency : Ratio mean asc={top_ratio_mean_asc[0]}  Roll mean asc={top_ratio_mean_asc[1]}')

In [None]:
# match_mean = match_pos_filt_feats.groupby(['ratio_mean_asc','roll_mean_asc'])['first_match_pos'].mean().sort_values(ascending=True)
# print(match_mean.reset_index())
# top_options = match_mean.index[0]  
# print(f'\nTop options by mean : Ratio mean asc={top_options[0]}  Roll mean asc={top_options[1]}')

In [None]:
# # sel_ratio_mean_asc = top_options[0]
# # sel_roll_mean_asc = top_options[1]

# sel_ratio_mean_asc =  False 
# sel_roll_mean_asc = False

In [None]:
# # match_pos_df.groupby('hash_index')
# # match_pos_df['rank'] =  match_pos_df['first_match_pos'].rank(method='average',
# #                                                              ascending=True)
# # mask = (match_pos_df['ratio_mean_asc']==top_ratio_mean_asc)
# # pos_top_df= match_pos_df[mask].sort_values(['hash_index','first_match_pos'],ascending=True)\
# #             .groupby(['hash_index'])\
# #             .head(5)
# # print(pos_top_df['imp_feats_count'].value_counts())
# # pos_top_df

# mask = (match_pos_filt_feats['ratio_mean_asc']==sel_ratio_mean_asc) \
#         & (match_pos_filt_feats['roll_mean_asc']==sel_roll_mean_asc)
# match_pos_df_filt=match_pos_filt_feats[mask]
# match_pos_df_filt['rank'] = match_pos_df_filt.groupby('hash_index')['first_match_pos'].rank("min")
# mask2=match_pos_df_filt['rank']<=5
# match_pos_df_filt=match_pos_df_filt[mask2]
# print(match_pos_df_filt['imp_feats_count'].value_counts())
# match_pos_df_filt
# # match_pos_df.sort_values(['first_match_pos'],ascending=True)\
# #             .groupby(['hash_index','ratio_mean_asc'])\
# #             .head(5)\
# #             .sort_values(['hash_index','ratio_mean_asc'],ascending=True)
    

In [None]:
# # match_pos_df.groupby('hash_index')
# # match_pos_df['rank'] =  match_pos_df['first_match_pos'].rank(method='average',
# #                                                              ascending=True)
# mask = (match_pos_filt_feats['ratio_mean_asc']==sel_ratio_mean_asc) \
#         & (match_pos_filt_feats['roll_mean_asc']==sel_roll_mean_asc)
# # mask = (match_pos_df['ratio_mean_asc']==False)
# pos_top_df= match_pos_filt_feats[mask].sort_values(['hash_index','first_match_pos'],ascending=True)\
#             .groupby(['imp_feats_count'])\
#             .agg(mean =('first_match_pos','mean'),\
#                  std =('first_match_pos','std'),
#                 median = ('first_match_pos','median'),
#                 minimum = ('first_match_pos','min'),
#                 maximum = ('first_match_pos','max'),)
# #             .sort_values('imp_feats_count')
# # print(pos_top_df['imp_feats_count'].value_counts())
# pos_mean_summary = pos_top_df.sort_values(['mean','minimum','maximum'],ascending=[True,True,True]).reset_index()
# pos_mean_summary
# # match_pos_df.sort_values(['first_match_pos'],ascending=True)\
# #             .groupby(['hash_index','ratio_mean_asc'])\
# #             .head(5)\
# #             .sort_values(['hash_index','ratio_mean_asc'],ascending=True)
    

In [None]:
# pos_top_imp_feats = list(pos_top_df['imp_feats_count'].value_counts().index)
# imp_feats_count_arr = np.array(imp_feats_count_list)
# # itemindex = np.where(imp_feats_count_arr in pos_top_imp_feats)
# print(pos_top_imp_feats)
# itemindex= np.searchsorted(imp_feats_count_arr, pos_top_imp_feats)
# itemindex

In [None]:
# if isinstance(sel_feats, list):
#     print('sel_feats:',sel_feats)
#     mask = match_pos_df_actual['imp_feats_count'].isin(sel_feats)
#     match_pos_actual_filt_feats = match_pos_df_actual[mask]
# else:
#     match_pos_actual_filt_feats = match_pos_df_actual
    
# # match_pos_actual_filt_feats.head()

In [None]:
# mask = (match_pos_actual_filt_feats['ratio_mean_asc']==sel_ratio_mean_asc) \
#         & (match_pos_actual_filt_feats['roll_mean_asc']==sel_roll_mean_asc)
# match_pos_actual_filt=match_pos_actual_filt_feats[mask]
# match_pos_actual_filt.sort_values('first_match_pos',ascending=True)

In [None]:
# match_pos_actual_filt

In [None]:
# # len(match_pos_df_actual)
# print('First Match Pos Mean:',match_pos_actual_filt_feats.groupby(['ratio_mean_asc','roll_mean_asc'])['first_match_pos'].mean().reset_index())

In [None]:
# %%time
# # imp_feats_count_list = [2]
# imp_feats_count_list_cur = [sel_feats_count]

# # imp_feats_count_list= [2]+list(range(5,201,5))
# # imp_feats_count_list= list(range(2,151))

# sim_cutoff_dfs_actual=None
# top_n = len(all_k)
# cutoff_pos_df_actual,sim_cutoff_dfs_actual,top_mean_actual=create_cutoff_pos_df(test,xgb_models,
#                     features_test,df[df.index.isin(all_k)],imp_feats_count_list_cur,
#                     test_all_k,is_compute_matches=False,
#                    test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
#                     low_high_record_condns=[(False,1,top_n)],
#                     ratio_mean_asc=sel_ratio_mean_asc,
#                     roll_mean_asc=sel_roll_mean_asc,
#                     top_k_ele=top_k_ele, is_print=True)
# cutoff_pos_df_actual

In [None]:
# %%time
# k_list = top_mean_actual[0][:5]['k'].values
# test_sel=test_all_k[test_all_k['k'].isin(k_list)]
# test_sel[['k','roll_actual','client_seed','roll_0','roll_1']]

In [None]:
# cols = ['hash_idx','k','imp_feats_40','imp_feats_35']
# col_to_sort = 'imp_feats_5'
# mask_temp = (k_order_df_actual['ratio_mean_asc']==0) & (k_order_df_actual['roll_mean_asc']==0) 
# k_order_df_filt_actual = k_order_df_actual[mask_temp]
# k_order_df_filt_actual = k_order_df_filt_actual.sort_values(col_to_sort)
# k_list = k_order_df_filt_actual['k'][:5].to_numpy()
# print(k_list)
# k_order_df_filt_actual[cols].head(5)

In [None]:
# cols = ['hash_idx','k','imp_feats_10','imp_feats_35','imp_feats_40','ratio_mean_asc','roll_mean_asc']
# mask_temp = (k_order_df['ratio_mean_asc']==0) & (k_order_df['roll_mean_asc']==0) \
#             & (k_order_df['k'].isin(k_list)) \
#             & (k_order_df['imp_feats_40']<=100)

# k_order_df_filt = k_order_df[mask_temp]
# imp_feats_cols= [col for col in k_order_df_filt.columns if col.startswith('imp_feats')]
# k_order_df_filt['imp_mean']=k_order_df_filt[imp_feats_cols].mean(axis=1)
# k_order_df_filt = k_order_df_filt.sort_values(col_to_sort)
# k_order_df_filt[['imp_mean']+cols]


In [None]:
# sel_imp_feats_top=[]
# for i in itemindex:
#     sel_imp_feats_top.extend(top_mean_actual[i][:5]['k'].values)
# sel_imp_feats_top=np.array(sel_imp_feats_top)
# np.array(np.unique(sel_imp_feats_top, return_counts=True)).T

In [None]:
# %%time
# # imp_feats_count_list = [2,5,10,15,20,25,30,40]
# sim_cutoff_dfs_actual=None
# cutoff_pos_df_actual,sim_cutoff_dfs_actual,test_probs=create_cutoff_pos_df(test,xgb_models,
#                     features_test,df[df.index.isin(trans_k)],
#                     imp_feats_count_list,is_compute_matches=False,
#                    test_ratio_limit=0.12,sim_cutoff_dfs=sim_cutoff_dfs_actual,
#                     low_high_record_condns=[(False,2,20)])
# cutoff_pos_df_actual

In [None]:
# # k_sel,test_sel,cutoff_pos_best =get_test_sel_cutoff_pos(test_probs,cutoff_pos_df_hash,cutoff_pos_df_actual)
# # imp_feats_count_list = [2,5,10,15,20,25]

# k_sel,test_sel,top_mean_best =get_test_sel_cutoff_pos(test_probs,xgb_models,features_test,
#                                                       df,
#                             cutoff_pos_df_hash,
#                             is_min_total=False,total_cri=2,top_n=10,
#                             sim_cutoff_dfs=sim_cutoff_dfs_actual,
#                             imp_feats_count_list=imp_feats_count_list)
# print(f'{k_sel=}')
# print('cutoff best')
# print(top_mean_best)
# print(' ** selected test **')
# test_sel[['k','client_seed','probs']]

In [None]:
# actual_seed="e1dbc502401294d4a38283e39d60e4d58cbaf5786f1b0770b3f6cf4f071bbeaa"
# #     actual_seed = cur_hash_list[25]
# print(actual_seed)
# # Vectorize the function
# vectorized_calculate_roll = np.vectorize(calculate_roll)

# # Compute the roll values for the input arrays
# roll_array = vectorized_calculate_roll(actual_seed,
#                                        test_all_k['client_seed'],
#                                        nonce)

# test_all_k['roll_actual_prev']=roll_array

In [None]:
# %%time
# k_list = top_mean_actual[2][:5]['k'].values
# test_sel=test_all_k[test_all_k['k'].isin(k_list)]
# test_sel[['k','roll_actual_prev','roll_actual','client_seed','roll_0','roll_1']]

In [None]:
# test_sel=None
# #array([ 10, 536, 371, 333, 518])
# k_list = top_mean_actual[37][:5]['k'].values
# print(k_list)
# for k_sel in k_list:
#     row = df.iloc[k_sel]
# #     print(row)
#     mask = (test['probs']>=row['cutoff']) & (test['probs']<=row['cutoff_2'])
#     cur_test_sel = test[mask]
#     cur_test_sel['k']=k_sel
#     if test_sel is None:
#         test_sel = cur_test_sel
#     else:
#         test_sel = pd.concat([test_sel,cur_test_sel],axis=0)
#     print('Test Size:',len(test_sel))
# test_sel[['k','roll_actual','client_seed','roll_0','roll_1']]

In [None]:
# test_sel

END

In [None]:
# Feature names from imp_df (in its row order) that start with 'roll_'.
imp_feats = [
    col
    for col in imp_df['feature'].values
    if col.startswith('roll_')
]
# Only the first ten are shown next to k.
imp_feats = imp_feats[:10]
test_sel[['k', *imp_feats]]

In [None]:
# Inspect the sim_hash_group rows for one hard-coded k (24).
sim_hash_group[sim_hash_group['k']==24]

In [None]:
# Inspect the sim_cutoff_df rows for the same hard-coded k (24).
sim_cutoff_df[sim_cutoff_df['k']==24]

In [None]:
def compute_total(data,cutoffs,cols):
    """Keep the rows of ``data`` that reach every per-column lower bound.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    cutoffs : sequence
        Lower bounds, paired positionally with ``cols``.
    cols : sequence of str
        Column names; a row is kept only if ``data[col] >= cutoff`` holds for
        every (cutoff, col) pair.

    Returns
    -------
    tuple
        ``(total, data_filt)`` where ``data_filt`` holds the surviving rows
        and ``total`` is ``len(data_filt)``.

    Notes
    -----
    With no (cutoff, col) pairs there is nothing to filter on, so every row is
    returned (as a copy). Previously this case indexed the frame with ``None``
    and raised ``KeyError``.
    """
    mask = None
    for cutoff, col in zip(cutoffs, cols):
        col_mask = data[col] >= cutoff
        # AND the conditions together: a row must pass all of them.
        mask = col_mask if mask is None else (mask & col_mask)

    if mask is None:
        # No conditions supplied: keep everything, but hand back a new object
        # like the filtered path does so callers can mutate it freely.
        data_filt = data.copy()
    else:
        data_filt = data[mask]
    return len(data_filt), data_filt

In [None]:
# Feature columns that the cutoff-search cells below iterate over.
# The candidate sets tried earlier are kept as comments for reference; they were
# previously live assignments that were each overwritten by the next line, so
# only the final assignment ever took effect.
# cols=['roll_20_nonce','roll_14_nonce','roll_3_nonce','roll_16_nonce','roll_11_nonce']
# cols=['roll_43_nonce','roll_56_nonce','roll_31_nonce','roll_20_nonce','roll_45_nonce']
# cols=['roll_31_nonce','roll_52_nonce','roll_15_nonce','roll_45_nonce','roll_2_nonce']
# cols=['roll_31_nonce','roll_52_nonce']
# cols=['roll_52_nonce','roll_31_nonce','roll_24_nonce','roll_49_nonce','roll_41_nonce']
# cols=['roll_mean_25_50_nonce','roll_52_nonce','roll_31_nonce']
# cols=['roll_52_nonce','roll_31_nonce']
# cols=['roll_46_nonce','roll_25_nonce']
cols=['roll_42_nonce','roll_37_nonce']

# cols=['roll_20_nonce','roll_15_nonce','roll_13_nonce','roll_12_nonce','roll_18_nonce']
# cols=['roll_20_nonce','roll_15_nonce','roll_13_nonce']
# cols=['roll_6_nonce','roll_11_nonce','roll_12_nonce','roll_10_nonce',]




In [None]:
def get_cutoff(data,data_hash,col,quantile):
    """Return a cutoff for ``col`` derived from the target rows of ``data_hash``.

    The ``quantile`` of ``col`` is taken over the rows of ``data_hash`` selected
    by ``create_target_mask(data_hash)``; that value is then shifted by the
    difference between the mean of ``data[col]`` and the mean of
    ``data_hash[col]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column mean the cutoff is shifted towards.
    data_hash : pandas.DataFrame
        Frame supplying the target rows and the reference mean.
    col : str
        Column to compute the cutoff for.
    quantile : float
        Quantile passed to ``Series.quantile`` on the target rows.

    Returns
    -------
    The shifted quantile value.
    """
    target_rows = data_hash[create_target_mask(data_hash)]
    mean_shift = data[col].mean() - data_hash[col].mean()
    return mean_shift + target_rows[col].quantile(quantile)



# Quantile sweep: for 50 quantiles in [0.01, 0.95], build one cutoff per column in
# `cols` (from test / test_hash), apply the cutoffs to test_filt_hash ("Hash") and to
# test_filt ("Actuals"), and remember the quantile with the highest Hash success ratio.
# All names assigned here are notebook globals that later cells read (e.g. best_actual).
best_ratio =0
best_success = 0
best_total = 0
best_cutoffs = []
best_quantile = 0
# Fallback: if no quantile is ever accepted, best_actual stays the unfiltered test_filt.
best_actual = test_filt
# for quantile in np.linspace(0.01,0.95,50):
for quantile in np.linspace(0.01,0.95,50):
    cutoffs=[]
    for col in cols:
        #use entire hash to determine cutoff
        cutoff = get_cutoff(test,test_hash,col,quantile)
        cutoffs.append(cutoff)
    
#     print(cutoffs)
    # Hash side: rows of test_filt_hash passing every cutoff, and how many hit the target.
    total,filt_inter = compute_total(test_filt_hash,cutoffs,cols)
    mask2=create_target_mask(filt_inter)
#     mask2 = (filt_inter['roll_actual']>=9000)
    success = len(filt_inter[mask2])
    
    # Actual side: same cutoffs applied to test_filt; mask2 is reused for this frame.
    total_actual,filt_inter_actual = compute_total(test_filt,cutoffs,cols)
    mask2=create_target_mask(filt_inter_actual)

#     mask2 = (filt_inter_actual['roll_actual']>=9000)
    
    # Guard the division when nothing on the actual side passes the cutoffs.
    if total_actual==0:
        success_actual=0
        ratio_actual=0
    else:
        success_actual = len(filt_inter_actual[mask2])
        ratio_actual = success_actual / total_actual
    print('\nquantile:',quantile)
    print('Actuals:',ratio_actual,success_actual,total_actual)
    
    if total==0:
        ratio=0
    else:
        ratio = success / total
    print('Hash:',ratio,success,total)
#     print(quantile,cutoffs)
    print(total,total_actual)
#     print(total,success)
    # Stops the whole sweep (break, not continue) the first time the Hash selection is
    # empty or contains no target rows; later quantiles are never evaluated.
    if (total==0) or (success==0) : #or (total_actual<1):
        break

#     print(ratio,success,total)
    # Selection is driven by the Hash ratio only; `>=` means ties go to the later quantile.
    if ratio >= best_ratio:
        best_ratio = ratio
        best_success = success
        best_total = total
        best_cutoffs = cutoffs
        best_quantile = quantile
        # Actual-side rows obtained with the cutoffs of the best Hash quantile.
        best_actual = filt_inter_actual
print(best_quantile)        
print(best_ratio,best_success,best_total)        
print(best_cutoffs)
# print(test_filt[mask]['roll_actual'].describe())

In [None]:
# Score the actual-side rows kept by the quantile sweep: share of rows flagged by
# create_target_mask.
mask2=create_target_mask(best_actual)
total = len(best_actual)
if total==0:
    # Empty selection: report zeros instead of dividing by zero, matching the
    # guards used by the other scoring cells in this notebook.
    success = 0
    ratio = 0
else:
    success = len(best_actual[mask2])
    ratio = success / total
print(ratio,success,total)

In [None]:
# Option 1 for test_sel: the actual-side rows chosen by the quantile sweep.
# NOTE(review): test_sel is reassigned by later cells too; whichever ran last wins.
test_sel = best_actual.copy()

In [None]:


def get_actual_based_data(data,data_hash,data_filt,cols,total_cutoff=3):
    """Tighten quantile-based cutoffs on ``data_filt`` while enough rows remain.

    Walks 50 quantiles from 0.01 to 0.95. For each one, a cutoff per column in
    ``cols`` is obtained from ``get_cutoff(data, data_hash, col, quantile)`` and
    applied to ``data_filt`` via ``compute_total``. The walk stops at the first
    quantile that leaves fewer than ``total_cutoff`` rows, and the result of the
    last quantile that still satisfied the requirement is returned.

    Parameters
    ----------
    data, data_hash : pandas.DataFrame
        Frames handed to ``get_cutoff`` to derive the cutoffs.
    data_filt : pandas.DataFrame
        Frame that is actually filtered.
    cols : sequence of str
        Columns to put a cutoff on.
    total_cutoff : int, default 3
        Minimum number of surviving rows for a quantile to be accepted.

    Returns
    -------
    tuple
        ``(rows, total, quantile)`` for the last accepted quantile. When even
        the first quantile is rejected this is ``(data, 0, 0)`` - note that the
        frame is then the unfiltered ``data`` argument, not ``data_filt``.
    """
    # Defaults returned when no quantile is accepted at all.
    accepted_rows, accepted_total, accepted_quantile = data, 0, 0

    for q in np.linspace(0.01,0.95,50):
        thresholds = [get_cutoff(data,data_hash,col,q) for col in cols]
        n_rows, rows = compute_total(data_filt,thresholds,cols)
        if n_rows < total_cutoff:
            # Too few rows left: keep the previously accepted result.
            break
        accepted_rows, accepted_total, accepted_quantile = rows, n_rows, q

    return accepted_rows, accepted_total, accepted_quantile

In [None]:
# For each minimum-row requirement from 0 to 10, fetch the tightest selection that
# get_actual_based_data accepts and print: requirement, hit ratio, hits, rows.
for cutoff in range(0,11):
    best_filt_inter,best_total,best_quantile=get_actual_based_data(
        test,test_hash,test_filt,cols,total_cutoff=cutoff)
    mask2=create_target_mask(best_filt_inter)
    if best_total:
        success = len(best_filt_inter[mask2])
        ratio = success / best_total
    else:
        # No accepted selection: report zeros rather than dividing by zero.
        ratio=0
        success=0
    print(cutoff, ratio,success,best_total)

In [None]:
# Option 2 for test_sel: the selection left in best_filt_inter by the cell above
# (after the loop this is the result of its last iteration).
test_sel =best_filt_inter.copy()

In [None]:
# Same search, but the cutoffs are derived from the filtered frames
# (test_filt / test_filt_hash) and at least 4 rows must survive.
best_filt_inter,best_total,best_quantile=get_actual_based_data(
    test_filt,test_filt_hash,test_filt,cols,total_cutoff=4)
print('quantile:',best_quantile)
mask2=create_target_mask(best_filt_inter)
if best_total:
    success = len(best_filt_inter[mask2])
    ratio = success / best_total
else:
    # No accepted selection: report zeros rather than dividing by zero.
    ratio=0
    success=0
print(ratio,success,best_total)

In [None]:
# Alternative kept for reference: select by the probability band on the actual frame
# and map the index onto the hash frame.
# mask_actual = (test_probs['probs']>=row['cutoff']) & (test_probs['probs']<=row['cutoff_2'])
# idx= test_probs[mask_actual].index
# mask_hash = test_hash_probs.index.isin(idx)
# test_filt_hash_pos = test_hash_probs.loc[mask_hash]

# Active variant: pick the hash rows whose 'probs' lies in [row['cutoff'], row['cutoff_2']],
# then keep the rows of test_probs that share an index label with them.
# NOTE(review): `row` is not defined in this cell; it must come from an earlier cell - confirm.
mask_hash = (test_hash_probs['probs']>=row['cutoff']) & (test_hash_probs['probs']<=row['cutoff_2'])
idx= test_hash_probs[mask_hash].index
mask_actual = test_probs.index.isin(idx)
test_filt_pos = test_probs.loc[mask_actual]

# Tighten the column cutoffs on that subset while at least 37 rows survive
# (37 is hard-coded here; its rationale is not visible in this notebook section).
best_filt_inter,best_total,best_quantile=get_actual_based_data(test,test_hash,
                                                               test_filt_pos,cols,
                                                              total_cutoff=37)
print('quantile:',best_quantile)
mask2=create_target_mask(best_filt_inter)
# mask2 = (best_filt_inter['roll_actual']>=9000)
# Guard the division when no quantile was accepted (best_total stays 0).
if best_total==0:
    ratio=0
    success=0
else:
    success = len(best_filt_inter[mask2])
    ratio = success / best_total
print(ratio,success,best_total)
# print(test_filt[mask]['roll_actual'].describe())

In [None]:
# Display the selection produced by the most recently run get_actual_based_data cell.
best_filt_inter

In [None]:
# Option 3 for test_sel: copy of the current best_filt_inter. test_sel is what the
# "Production Code" cell passes to get_random_client_seed.
test_sel=best_filt_inter.copy()

In [None]:
# analysis_df_copy  = analysis_df.copy()

In [None]:
def report_exceptional_teratio(analysis_df,digitlist,
                     ratio_cutoff,count_cutoff_min,count_cutoff_max,
                     pattern,bHighCheck,
                    zerocolcutoffs,is_ratio_cri_lessthan=False,
                               is_aftval_opp=False,is_aftval_sum=False,
                               aftval_sum_cutoff=36):
    """Return the rows of ``analysis_df`` that satisfy all selection criteria.

    Parameters
    ----------
    analysis_df : pandas.DataFrame
        Must contain the columns ``digit``, ``tr_ratio``, ``aft_all``,
        ``aft_1``, ``aft_2``, ``te_total``, ``pattern_99``, ``target_high``
        and ``zero_col_cutoff``.
    digitlist : list-like
        Accepted values of ``digit``.
    ratio_cutoff : number
        Threshold for ``tr_ratio`` (and for the ``aft_*`` columns when
        ``is_aftval_opp`` is set).
    count_cutoff_min, count_cutoff_max : number
        Inclusive bounds on ``te_total``.
    pattern
        Required value of ``pattern_99``.
    bHighCheck
        Required value of ``target_high``.
    zerocolcutoffs : list-like
        Accepted values of ``zero_col_cutoff``.
    is_ratio_cri_lessthan : bool, default False
        If True keep ``tr_ratio < ratio_cutoff``; otherwise keep
        ``tr_ratio >= ratio_cutoff``.
    is_aftval_opp : bool, default False
        If True additionally require at least one of ``aft_all``, ``aft_1``,
        ``aft_2`` to fall on the opposite side of ``ratio_cutoff`` from
        ``tr_ratio``.
    is_aftval_sum : bool, default False
        If True additionally require
        ``aft_all + aft_1 + aft_2 >= aftval_sum_cutoff``.
    aftval_sum_cutoff : number, default 36
        Threshold used by ``is_aftval_sum`` (previously hard-coded as 36).

    Returns
    -------
    pandas.DataFrame
        The matching rows; rows where any ``aft_*`` column equals 0 are always
        excluded.
    """
    aft_all = analysis_df['aft_all']
    aft_1 = analysis_df['aft_1']
    aft_2 = analysis_df['aft_2']

    # Ratio criterion, optionally paired with an "opposite side" check on aft_*.
    if is_ratio_cri_lessthan:
        ratio_mask = analysis_df['tr_ratio'] < ratio_cutoff
        if is_aftval_opp:
            ratio_mask = ratio_mask & (
                (aft_all >= ratio_cutoff) | (aft_1 >= ratio_cutoff) | (aft_2 >= ratio_cutoff)
            )
    else:
        ratio_mask = analysis_df['tr_ratio'] >= ratio_cutoff
        if is_aftval_opp:
            ratio_mask = ratio_mask & (
                (aft_all < ratio_cutoff) | (aft_1 < ratio_cutoff) | (aft_2 < ratio_cutoff)
            )

    mask = (
        analysis_df['digit'].isin(digitlist)
        & ratio_mask
        & ((aft_all != 0) & (aft_1 != 0) & (aft_2 != 0))
        & (analysis_df['te_total'] >= count_cutoff_min)
        & (analysis_df['te_total'] <= count_cutoff_max)
        & (analysis_df['pattern_99'] == pattern)
        & (analysis_df['target_high'] == bHighCheck)
        & (analysis_df['zero_col_cutoff'].isin(zerocolcutoffs))
    )

    if is_aftval_sum:
        # Explicit addition (not DataFrame.sum) so a NaN in any aft_* column
        # keeps failing the comparison, as before.
        mask = mask & ((aft_all + aft_1 + aft_2) >= aftval_sum_cutoff)

    return analysis_df[mask]

##### Production Code

In [167]:
# def get_random_client_seed(test):
#     size = len(test)
#     test = test.reset_index(drop=True).sample(n=size).reset_index(drop=True)
#     rand_pos = random.randint(0,size-1)
#     print(f'rand_pos:{rand_pos}')
#     return test.iloc[rand_pos]['client_seed'],test

def get_random_client_seed(test):
    """Shuffle ``test`` and pick the client seed of one randomly chosen row.

    The frame is shuffled with ``DataFrame.sample`` (original index labels are
    kept), then a position is drawn with ``random.randint`` and printed.

    Returns
    -------
    tuple
        ``(client_seed, shuffled_frame)`` where ``client_seed`` is the
        ``client_seed`` value of the row at the drawn position of the
        shuffled frame.
    """
    n_rows = len(test)
    shuffled = test.sample(n=n_rows)
    rand_pos = random.randint(0, n_rows - 1)
    # Positional lookup on the shuffled frame; same row the drawn position
    # would address after a reset_index(drop=True).
    picked_row = shuffled.iloc[rand_pos]
    print(f'rand_pos:{rand_pos}')
    return picked_row['client_seed'], shuffled

In [168]:
# Shuffle the selected candidates and draw one client seed from them.
client_seed, final_test = get_random_client_seed(test_sel)
print('Client Seed Selected', client_seed, sep='\n')
# final_test.to_csv(f'data/analysis/final_test_{nonce}.csv')
# final_test.head()

rand_pos:2
Client Seed Selected
woxpwoxpwoxpwoxpwoxp98c588b2a97dc0356e6003ea0f004e8fa4e21f675708ab1290eb934fd93ae317


In [169]:
final_test

Unnamed: 0,roll_actual,seed,hash,client_seed,client_index,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,rol
l_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_nonce,count_lt_100
0_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce,probs,k
2710,5347,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxpe828a19d6b5d480146cb51de754a0089dacfa96ec80fc799227ef45ceec14ff5,670064,6003,5025,2616,9528,1695,4249,6446,3583,9251,4500,3551,6650,8499,7123,5821,983,4906,9029,9170,4604,8726,6142,5844,5179,5443,2709,1867,4523,5614,2750,6497,5114,2632,3002,5319,4920,282,5079,3906,964,1483,7724,3492,1616,4904,5159,4413,8089,3018,7946,910,9939,894,1390,1192,8849,1836,6217,7125,3357,6935,1,4,2,1,2,2,4,2,1,1,10,8,18,1,0,11,8,5797.75,2649.911873,3,0,1,2,1,5,4,4,3,3,6,15,21,0,0,13,11,3997.384615,2182.475843,2,1,1,1,2,0,0,1,0,3,5,6,11,1,2,9,2,4422.181818,3451.195353,5,5,4,4,6,10,8,7,4,7,24,28,52,1,0,33,20,4837.15,2597.48416,6003,4113,4809,6603,3432,9246,9946,4440,9865,9125,5139,3148,5108,6692,4084,3664,3279,6611,4525,4158,2334,8474,4466,9350,260,6446,1103,2232,1903,2338,2172,9489,9323,5171,8971,7125,9679,2143,4436,8758,5176,860,1923,9607,3500,2922,5977,4256,8814,9509,3979,9459,9233,2927,3088,5724,5119,6150,1400,9739,6463,0,4,0,0,3,2,6,4,1,0,7,10,17,1,0,9,5,5516.05,2368.224129,1,5,3,1,1,3,2,2,5,3,11,13,24,0,0,15,7,5300.461538,3116.253574,0,3,0,0,2,2,0,2,1,1,6,4,10,2,0,9,4,5752.818182,2828.104023,2,13,4,1,6,7,9,7,7,4,26,28,54,2,0,32,18,5499.75,2855.095918,0.384136,412
6658,7219,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxpfe6e70ffe4b56be744afae912ab03896fdcebd2776d0bdbcc6c5acbc819f67d7,186662,7008,7742,8558,8057,7183,1625,7778,5918,5113,5027,3050,3952,9528,8558,5886,947,5887,4133,2841,8578,6710,5538,3519,9900,4123,8105,7621,8837,4849,1080,3442,1819,1696,8804,9616,6654,7884,1681,7427,8704,8346,3574,9688,5965,3478,812,2539,4947,1050,804,6247,9948,7866,7441,1711,9678,9542,3282,8719,1092,8016,1,1,4,3,1,5,1,2,1,1,12,6,18,0,0,4,5,5853.55,2480.26447,2,2,5,3,2,1,2,3,1,5,13,11,24,0,0,8,8,5218.038462,3118.477263,0,3,2,2,1,0,0,1,0,2,8,3,11,2,0,6,2,6685.636364,3216.166267,3,7,11,8,3,7,4,7,2,8,34,22,56,0,0,21,17,5718.583333,2926.296834,7008,3963,168,7474,2647,6161,8994,3059,5852,5829,8312,6848,317,8678,7666,8318,8386,6568,2343,5698,6952,7874,7312,6830,7236,2380,5001,9236,1932,6556,900,7263,8857,594,3022,9908,1988,9006,3338,4201,6957,6076,8392,6705,6954,4746,4894,7018,6130,1882,8228,3380,9555,6285,580,2255,7131,7864,419,1671,9511,2,0,5,2,4,3,0,2,2,0,14,6,20,0,1,7,1,5711.65,2730.650824,2,3,3,2,6,1,3,2,1,3,14,10,24,1,1,12,14,5467.846154,2774.732545,2,2,1,2,1,0,0,1,1,1,6,5,11,1,1,3,2,5170.818182,3568.67863,6,5,8,9,12,4,3,5,4,4,37,21,58,0,1,22,18,5571.666667,2821.14366,0.363597,461
3929,5951,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxp98c588b2a97dc0356e6003ea0f004e8fa4e21f675708ab1290eb934fd93ae317,519787,6003,6464,6325,6859,7841,7501,2552,9997,7843,4017,8614,7566,6680,6122,8508,6415,5355,3747,7944,8470,5991,1498,8301,1728,4543,9465,7189,8499,3820,4222,7965,3096,1181,466,7003,8823,1150,9743,882,8408,490,103,7745,8989,9066,8565,5091,7374,272,344,1209,7212,5048,9464,8200,5774,2599,9530,9185,8661,6240,0,1,3,5,6,2,1,1,1,0,17,3,20,0,0,7,5,6740.55,1814.031929,6,3,5,5,0,1,1,2,0,3,13,12,25,1,0,13,11,5044.615385,3644.453891,0,3,2,1,1,2,0,0,1,1,8,2,10,1,0,6,6,6647.454545,2809.245036,6,7,11,11,7,5,3,3,2,5,39,19,58,0,0,26,22,5899.233333,3011.776149,6003,16,6295,9912,2405,6612,4091,3649,8289,6048,2795,4252,2120,7498,3510,9761,5861,3729,75,3360,3514,2492,4077,7985,3289,9566,9942,8755,1764,6667,2138,8910,5189,5508,1495,4622,220,3264,3132,669,3165,6762,2577,4246,9137,8100,5949,4536,7817,5873,4402,5192,7660,9638,2511,5957,6408,7228,5753,7115,6832,2,2,1,1,3,1,2,5,3,0,8,12,20,1,1,10,6,4689.6,2800.105964,2,3,3,1,2,4,4,3,2,2,12,13,25,2,0,10,6,5169.423077,2870.492239,0,1,0,3,2,3,1,0,1,0,8,2,10,1,0,3,3,6245.090909,1854.244399,4,6,4,6,7,8,7,9,7,2,29,29,58,1,1,26,15,5172.233333,2686.800607,0.387194,404
2510,6491,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxpf4d93ea4759fa65ac29193778ff396d2ec084e49e0072d01d8730c8d29037e14,693703,3007,379,9570,1073,6863,3887,6096,9634,3124,96,5099,4315,5021,4212,549,6778,7687,6926,9634,9605,9630,8234,1776,7123,35,7202,2485,5046,6689,6452,4221,971,1246,511,2716,3917,6511,9960,1600,519,5572,8256,8703,1121,5514,4048,4773,4443,7930,7524,743,9999,7616,7420,3652,9591,9342,6611,4209,3736,9007,3,5,0,1,4,2,2,2,0,1,10,8,18,1,1,12,7,5508.9,3297.151959,4,1,2,3,3,3,4,1,2,3,11,13,24,0,0,7,8,4564.346154,2838.991637,1,4,0,2,1,0,1,2,0,0,7,4,11,2,1,9,4,6538.727273,3039.564182,8,10,3,7,8,5,7,5,2,5,30,26,56,1,1,28,19,5286.7,3090.596752,3007,2921,3658,2262,945,6638,7463,372,6036,9672,1198,660,6014,6552,2554,5262,3791,4284,9408,6119,1132,2326,5097,7851,2644,8310,9679,9450,6586,2813,1334,5072,7090,3844,269,69,4899,6073,1616,807,6703,108,8634,4016,7036,3996,483,5801,1938,6228,3780,6102,6539,3506,2762,7550,2169,8202,7297,7646,8096,3,2,0,1,5,1,1,2,3,2,9,11,20,0,1,7,4,4347.05,2856.224292,5,2,2,2,4,2,2,3,1,3,11,13,24,2,0,11,13,4485.923077,3024.538985,0,0,2,3,2,0,0,2,2,0,7,4,11,0,0,4,6,5786.272727,2283.509102,8,4,4,7,11,4,3,6,8,5,28,29,57,0,1,23,23,4689.366667,2827.232893,0.359592,471
2985,5009,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxp1a502d2bfa719385ac6fd0cfdadf4bf21d3889c2b8dfac8af26577f7e1e18d88,635567,9006,5882,6663,4310,4129,6342,4301,327,5308,2133,5406,66,4527,2012,796,1290,6796,4155,2830,3884,1240,9831,6314,8712,8804,7485,6755,2429,859,6934,4381,8038,2365,8333,845,1092,6989,8624,2595,2517,1776,6284,9348,7395,4120,8579,7739,9593,4577,670,4525,4001,4842,3607,5574,4683,1898,2073,1300,9041,4860,3,0,0,0,3,3,5,1,3,2,6,14,20,0,0,4,8,3619.85,2155.206408,3,2,4,3,4,0,4,0,4,2,13,13,26,0,1,13,4,5186.423077,2987.365263,0,1,0,0,0,1,5,1,1,2,2,7,9,0,0,2,8,4218.545455,2124.397202,6,4,6,3,8,4,13,2,8,6,25,33,58,0,0,20,21,4713.066667,2762.825555,9006,558,7947,9371,8736,9788,6208,8756,8617,8467,1999,613,1362,7599,9045,978,8492,3260,8727,9974,6482,7982,6132,5555,8580,4246,4090,9129,5752,9332,846,4348,8309,844,2754,2152,8302,4869,6629,688,7787,513,824,4665,8998,9867,9868,941,7265,8892,6067,9338,7366,71,2512,974,1653,4312,935,6539,2349,3,4,6,2,2,0,0,1,0,2,14,6,20,2,1,13,3,6348.95,3445.134074,6,4,4,2,2,1,5,0,2,0,13,12,25,2,1,13,5,5306.807692,3306.163348,3,1,0,1,2,0,1,0,2,1,4,7,11,1,2,5,1,3828.727273,3078.216565,12,9,11,6,6,2,6,1,4,3,34,25,59,2,1,32,9,5487.566667,3332.835904,0.365111,456


In [170]:
# final_test.reset_index()[final_test.reset_index()['index']==6826]

After Actual SEED is known

In [171]:
# Seed value entered by hand once it is known (see the note above this cell).
# It is passed as the first argument to calculate_roll in the next cell.
actual_seed="f9e1702bb3393f834aea8907e7e346bf9641e4c39131de555a19d56cf1b66989"
# actual_seed=cur_hash_list[4]
# print(f'{actual_seed=}')

In [172]:
# Wrap calculate_roll with np.vectorize so it can be called once with the
# whole client_seed column; actual_seed and nonce are passed as scalars and
# broadcast against it.
vectorized_calculate_roll = np.vectorize(calculate_roll)

# Compute the roll values for the input arrays
roll_array = vectorized_calculate_roll(actual_seed,
                                       final_test['client_seed'],
                                       nonce)
# Back up the previous roll_actual values only the first time this cell runs.
# Without the guard, re-running the cell would overwrite the backup with the
# already-recomputed rolls and the original values would be lost.
if 'roll_actual_dummy' not in final_test.columns:
    final_test['roll_actual_dummy']=final_test['roll_actual'].copy()
final_test['roll_actual']=roll_array
# final_test.to_csv(f'data/analysis/final_test_{nonce}.csv')

In [173]:
final_test

Unnamed: 0,roll_actual,seed,hash,client_seed,client_index,roll_0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,roll_11,roll_12,roll_13,roll_14,roll_15,roll_16,roll_17,roll_18,roll_19,roll_20,roll_21,roll_22,roll_23,roll_24,roll_25,roll_26,roll_27,roll_28,roll_29,roll_30,roll_31,roll_32,roll_33,roll_34,roll_35,roll_36,roll_37,roll_38,roll_39,roll_40,roll_41,roll_42,roll_43,roll_44,roll_45,roll_46,roll_47,roll_48,roll_49,roll_50,roll_51,roll_52,roll_53,roll_54,roll_55,roll_56,roll_57,roll_58,roll_59,roll_60,count_lt_1000,count_gt_9000,count_gt_8000,count_gt_7000,count_gt_6000,count_gt_5000,count_gt_4000,count_gt_3000,count_gt_2000,count_gt_1000,count_hi,count_lo,count_lo_hi,count_gt_9000_k,count_lt_1000_k,total_dig_9,total_dig_0,roll_mean,roll_std,count_lt_1000_25_50,count_gt_9000_25_50,count_gt_8000_25_50,count_gt_7000_25_50,count_gt_6000_25_50,count_gt_5000_25_50,count_gt_4000_25_50,count_gt_3000_25_50,count_gt_2000_25_50,count_gt_1000_25_50,count_hi_25_50,count_lo_25_50,count_lo_hi_25_50,count_gt_9000_k_25_50,count_lt_1000_k_25_50,total_dig_9_25_50,total_dig_0_25_50,roll_mean_25_50,roll_std_25_50,count_lt_1000_50_60,count_gt_9000_50_60,count_gt_8000_50_60,count_gt_7000_50_60,count_gt_6000_50_60,count_gt_5000_50_60,count_gt_4000_50_60,count_gt_3000_50_60,count_gt_2000_50_60,count_gt_1000_50_60,count_hi_50_60,count_lo_50_60,count_lo_hi_50_60,count_gt_9000_k_50_60,count_lt_1000_k_50_60,total_dig_9_50_60,total_dig_0_50_60,roll_mean_50_60,roll_std_50_60,count_lt_1000_1_60,count_gt_9000_1_60,count_gt_8000_1_60,count_gt_7000_1_60,count_gt_6000_1_60,count_gt_5000_1_60,count_gt_4000_1_60,count_gt_3000_1_60,count_gt_2000_1_60,count_gt_1000_1_60,count_hi_1_60,count_lo_1_60,count_lo_hi_1_60,count_gt_9000_k_1_60,count_lt_1000_k_1_60,total_dig_9_1_60,total_dig_0_1_60,roll_mean_1_60,roll_std_1_60,roll_0_nonce,roll_1_nonce,roll_2_nonce,roll_3_nonce,roll_4_nonce,roll_5_nonce,roll_6_nonce,roll_7_nonce,roll_8_nonce,roll_9_nonce,roll_10_nonce,roll_11_nonce,rol
l_12_nonce,roll_13_nonce,roll_14_nonce,roll_15_nonce,roll_16_nonce,roll_17_nonce,roll_18_nonce,roll_19_nonce,roll_20_nonce,roll_21_nonce,roll_22_nonce,roll_23_nonce,roll_24_nonce,roll_25_nonce,roll_26_nonce,roll_27_nonce,roll_28_nonce,roll_29_nonce,roll_30_nonce,roll_31_nonce,roll_32_nonce,roll_33_nonce,roll_34_nonce,roll_35_nonce,roll_36_nonce,roll_37_nonce,roll_38_nonce,roll_39_nonce,roll_40_nonce,roll_41_nonce,roll_42_nonce,roll_43_nonce,roll_44_nonce,roll_45_nonce,roll_46_nonce,roll_47_nonce,roll_48_nonce,roll_49_nonce,roll_50_nonce,roll_51_nonce,roll_52_nonce,roll_53_nonce,roll_54_nonce,roll_55_nonce,roll_56_nonce,roll_57_nonce,roll_58_nonce,roll_59_nonce,roll_60_nonce,count_lt_1000_nonce,count_gt_9000_nonce,count_gt_8000_nonce,count_gt_7000_nonce,count_gt_6000_nonce,count_gt_5000_nonce,count_gt_4000_nonce,count_gt_3000_nonce,count_gt_2000_nonce,count_gt_1000_nonce,count_hi_nonce,count_lo_nonce,count_lo_hi_nonce,count_gt_9000_k_nonce,count_lt_1000_k_nonce,total_dig_9_nonce,total_dig_0_nonce,roll_mean_nonce,roll_std_nonce,count_lt_1000_25_50_nonce,count_gt_9000_25_50_nonce,count_gt_8000_25_50_nonce,count_gt_7000_25_50_nonce,count_gt_6000_25_50_nonce,count_gt_5000_25_50_nonce,count_gt_4000_25_50_nonce,count_gt_3000_25_50_nonce,count_gt_2000_25_50_nonce,count_gt_1000_25_50_nonce,count_hi_25_50_nonce,count_lo_25_50_nonce,count_lo_hi_25_50_nonce,count_gt_9000_k_25_50_nonce,count_lt_1000_k_25_50_nonce,total_dig_9_25_50_nonce,total_dig_0_25_50_nonce,roll_mean_25_50_nonce,roll_std_25_50_nonce,count_lt_1000_50_60_nonce,count_gt_9000_50_60_nonce,count_gt_8000_50_60_nonce,count_gt_7000_50_60_nonce,count_gt_6000_50_60_nonce,count_gt_5000_50_60_nonce,count_gt_4000_50_60_nonce,count_gt_3000_50_60_nonce,count_gt_2000_50_60_nonce,count_gt_1000_50_60_nonce,count_hi_50_60_nonce,count_lo_50_60_nonce,count_lo_hi_50_60_nonce,count_gt_9000_k_50_60_nonce,count_lt_1000_k_50_60_nonce,total_dig_9_50_60_nonce,total_dig_0_50_60_nonce,roll_mean_50_60_nonce,roll_std_50_60_nonce,count_lt_100
0_1_60_nonce,count_gt_9000_1_60_nonce,count_gt_8000_1_60_nonce,count_gt_7000_1_60_nonce,count_gt_6000_1_60_nonce,count_gt_5000_1_60_nonce,count_gt_4000_1_60_nonce,count_gt_3000_1_60_nonce,count_gt_2000_1_60_nonce,count_gt_1000_1_60_nonce,count_hi_1_60_nonce,count_lo_1_60_nonce,count_lo_hi_1_60_nonce,count_gt_9000_k_1_60_nonce,count_lt_1000_k_1_60_nonce,total_dig_9_1_60_nonce,total_dig_0_1_60_nonce,roll_mean_1_60_nonce,roll_std_1_60_nonce,probs,k,roll_actual_dummy
2710,9582,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxpe828a19d6b5d480146cb51de754a0089dacfa96ec80fc799227ef45ceec14ff5,670064,6003,5025,2616,9528,1695,4249,6446,3583,9251,4500,3551,6650,8499,7123,5821,983,4906,9029,9170,4604,8726,6142,5844,5179,5443,2709,1867,4523,5614,2750,6497,5114,2632,3002,5319,4920,282,5079,3906,964,1483,7724,3492,1616,4904,5159,4413,8089,3018,7946,910,9939,894,1390,1192,8849,1836,6217,7125,3357,6935,1,4,2,1,2,2,4,2,1,1,10,8,18,1,0,11,8,5797.75,2649.911873,3,0,1,2,1,5,4,4,3,3,6,15,21,0,0,13,11,3997.384615,2182.475843,2,1,1,1,2,0,0,1,0,3,5,6,11,1,2,9,2,4422.181818,3451.195353,5,5,4,4,6,10,8,7,4,7,24,28,52,1,0,33,20,4837.15,2597.48416,6003,4113,4809,6603,3432,9246,9946,4440,9865,9125,5139,3148,5108,6692,4084,3664,3279,6611,4525,4158,2334,8474,4466,9350,260,6446,1103,2232,1903,2338,2172,9489,9323,5171,8971,7125,9679,2143,4436,8758,5176,860,1923,9607,3500,2922,5977,4256,8814,9509,3979,9459,9233,2927,3088,5724,5119,6150,1400,9739,6463,0,4,0,0,3,2,6,4,1,0,7,10,17,1,0,9,5,5516.05,2368.224129,1,5,3,1,1,3,2,2,5,3,11,13,24,0,0,15,7,5300.461538,3116.253574,0,3,0,0,2,2,0,2,1,1,6,4,10,2,0,9,4,5752.818182,2828.104023,2,13,4,1,6,7,9,7,7,4,26,28,54,2,0,32,18,5499.75,2855.095918,0.384136,412,5347
6658,8204,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxpfe6e70ffe4b56be744afae912ab03896fdcebd2776d0bdbcc6c5acbc819f67d7,186662,7008,7742,8558,8057,7183,1625,7778,5918,5113,5027,3050,3952,9528,8558,5886,947,5887,4133,2841,8578,6710,5538,3519,9900,4123,8105,7621,8837,4849,1080,3442,1819,1696,8804,9616,6654,7884,1681,7427,8704,8346,3574,9688,5965,3478,812,2539,4947,1050,804,6247,9948,7866,7441,1711,9678,9542,3282,8719,1092,8016,1,1,4,3,1,5,1,2,1,1,12,6,18,0,0,4,5,5853.55,2480.26447,2,2,5,3,2,1,2,3,1,5,13,11,24,0,0,8,8,5218.038462,3118.477263,0,3,2,2,1,0,0,1,0,2,8,3,11,2,0,6,2,6685.636364,3216.166267,3,7,11,8,3,7,4,7,2,8,34,22,56,0,0,21,17,5718.583333,2926.296834,7008,3963,168,7474,2647,6161,8994,3059,5852,5829,8312,6848,317,8678,7666,8318,8386,6568,2343,5698,6952,7874,7312,6830,7236,2380,5001,9236,1932,6556,900,7263,8857,594,3022,9908,1988,9006,3338,4201,6957,6076,8392,6705,6954,4746,4894,7018,6130,1882,8228,3380,9555,6285,580,2255,7131,7864,419,1671,9511,2,0,5,2,4,3,0,2,2,0,14,6,20,0,1,7,1,5711.65,2730.650824,2,3,3,2,6,1,3,2,1,3,14,10,24,1,1,12,14,5467.846154,2774.732545,2,2,1,2,1,0,0,1,1,1,6,5,11,1,1,3,2,5170.818182,3568.67863,6,5,8,9,12,4,3,5,4,4,37,21,58,0,1,22,18,5571.666667,2821.14366,0.363597,461,7219
3929,3878,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxp98c588b2a97dc0356e6003ea0f004e8fa4e21f675708ab1290eb934fd93ae317,519787,6003,6464,6325,6859,7841,7501,2552,9997,7843,4017,8614,7566,6680,6122,8508,6415,5355,3747,7944,8470,5991,1498,8301,1728,4543,9465,7189,8499,3820,4222,7965,3096,1181,466,7003,8823,1150,9743,882,8408,490,103,7745,8989,9066,8565,5091,7374,272,344,1209,7212,5048,9464,8200,5774,2599,9530,9185,8661,6240,0,1,3,5,6,2,1,1,1,0,17,3,20,0,0,7,5,6740.55,1814.031929,6,3,5,5,0,1,1,2,0,3,13,12,25,1,0,13,11,5044.615385,3644.453891,0,3,2,1,1,2,0,0,1,1,8,2,10,1,0,6,6,6647.454545,2809.245036,6,7,11,11,7,5,3,3,2,5,39,19,58,0,0,26,22,5899.233333,3011.776149,6003,16,6295,9912,2405,6612,4091,3649,8289,6048,2795,4252,2120,7498,3510,9761,5861,3729,75,3360,3514,2492,4077,7985,3289,9566,9942,8755,1764,6667,2138,8910,5189,5508,1495,4622,220,3264,3132,669,3165,6762,2577,4246,9137,8100,5949,4536,7817,5873,4402,5192,7660,9638,2511,5957,6408,7228,5753,7115,6832,2,2,1,1,3,1,2,5,3,0,8,12,20,1,1,10,6,4689.6,2800.105964,2,3,3,1,2,4,4,3,2,2,12,13,25,2,0,10,6,5169.423077,2870.492239,0,1,0,3,2,3,1,0,1,0,8,2,10,1,0,3,3,6245.090909,1854.244399,4,6,4,6,7,8,7,9,7,2,29,29,58,1,1,26,15,5172.233333,2686.800607,0.387194,404,5951
2510,7249,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxpf4d93ea4759fa65ac29193778ff396d2ec084e49e0072d01d8730c8d29037e14,693703,3007,379,9570,1073,6863,3887,6096,9634,3124,96,5099,4315,5021,4212,549,6778,7687,6926,9634,9605,9630,8234,1776,7123,35,7202,2485,5046,6689,6452,4221,971,1246,511,2716,3917,6511,9960,1600,519,5572,8256,8703,1121,5514,4048,4773,4443,7930,7524,743,9999,7616,7420,3652,9591,9342,6611,4209,3736,9007,3,5,0,1,4,2,2,2,0,1,10,8,18,1,1,12,7,5508.9,3297.151959,4,1,2,3,3,3,4,1,2,3,11,13,24,0,0,7,8,4564.346154,2838.991637,1,4,0,2,1,0,1,2,0,0,7,4,11,2,1,9,4,6538.727273,3039.564182,8,10,3,7,8,5,7,5,2,5,30,26,56,1,1,28,19,5286.7,3090.596752,3007,2921,3658,2262,945,6638,7463,372,6036,9672,1198,660,6014,6552,2554,5262,3791,4284,9408,6119,1132,2326,5097,7851,2644,8310,9679,9450,6586,2813,1334,5072,7090,3844,269,69,4899,6073,1616,807,6703,108,8634,4016,7036,3996,483,5801,1938,6228,3780,6102,6539,3506,2762,7550,2169,8202,7297,7646,8096,3,2,0,1,5,1,1,2,3,2,9,11,20,0,1,7,4,4347.05,2856.224292,5,2,2,2,4,2,2,3,1,3,11,13,24,2,0,11,13,4485.923077,3024.538985,0,0,2,3,2,0,0,2,2,0,7,4,11,0,0,4,6,5786.272727,2283.509102,8,4,4,7,11,4,3,6,8,5,28,29,57,0,1,23,23,4689.366667,2827.232893,0.359592,471,6491
2985,7145,67f2894be30edb8cb5f8684ec7946bf8c8a9b38085730abd80ad03b45db48f75,6fefa69a032912bfa31127d7d145121caa236c2f285cfac3166c6e392c15bd87,woxpwoxpwoxpwoxpwoxp1a502d2bfa719385ac6fd0cfdadf4bf21d3889c2b8dfac8af26577f7e1e18d88,635567,9006,5882,6663,4310,4129,6342,4301,327,5308,2133,5406,66,4527,2012,796,1290,6796,4155,2830,3884,1240,9831,6314,8712,8804,7485,6755,2429,859,6934,4381,8038,2365,8333,845,1092,6989,8624,2595,2517,1776,6284,9348,7395,4120,8579,7739,9593,4577,670,4525,4001,4842,3607,5574,4683,1898,2073,1300,9041,4860,3,0,0,0,3,3,5,1,3,2,6,14,20,0,0,4,8,3619.85,2155.206408,3,2,4,3,4,0,4,0,4,2,13,13,26,0,1,13,4,5186.423077,2987.365263,0,1,0,0,0,1,5,1,1,2,2,7,9,0,0,2,8,4218.545455,2124.397202,6,4,6,3,8,4,13,2,8,6,25,33,58,0,0,20,21,4713.066667,2762.825555,9006,558,7947,9371,8736,9788,6208,8756,8617,8467,1999,613,1362,7599,9045,978,8492,3260,8727,9974,6482,7982,6132,5555,8580,4246,4090,9129,5752,9332,846,4348,8309,844,2754,2152,8302,4869,6629,688,7787,513,824,4665,8998,9867,9868,941,7265,8892,6067,9338,7366,71,2512,974,1653,4312,935,6539,2349,3,4,6,2,2,0,0,1,0,2,14,6,20,2,1,13,3,6348.95,3445.134074,6,4,4,2,2,1,5,0,2,0,13,12,25,2,1,13,5,5306.807692,3306.163348,3,1,0,1,2,0,1,0,2,1,4,7,11,1,2,5,1,3828.727273,3078.216565,12,9,11,6,6,2,6,1,4,3,34,25,59,2,1,32,9,5487.566667,3332.835904,0.365111,456,5009


In [None]:
# Write final_test to CSV under data/; the file name embeds the current
# nonce and file_pattern_str values.
final_test.to_csv(f'data/final_test_{nonce}_{file_pattern_str}_pattern.csv')

In [None]:
LOW_TARGET

In [None]:
mask = create_target_mask(final_test)
success_pos = final_test[mask].index

# NOTE(review): the success_pos computed from `mask` above is replaced right
# away by the explicit threshold check below -- confirm which of the two is
# meant to be authoritative.
success_pos = (
    final_test[final_test['roll_actual']>=HIGH_TARGET].index
    if HIGH_ANALYSIS
    else final_test[final_test['roll_actual']<LOW_TARGET].index
)
print(len(success_pos))
print(f'success_pos:{success_pos}')

# target = set([17, 19, 34, 35, 44, 47])
# matched  = set(list(success_pos)).intersection(target)
# print(matched)

# print(len(matched)/len(success_pos))

In [None]:
test_sel.head()

In [None]:
final_test

In [None]:
# Vectorize the function
# NOTE(review): this rebinds vectorized_calculate_roll, which an earlier cell
# already defines in exactly the same way.
vectorized_calculate_roll = np.vectorize(calculate_roll)

# Compute the roll values for the input arrays
# (same call as for final_test, but over the client_seed column of results_df_test)
roll_array = vectorized_calculate_roll(actual_seed,
                                       results_df_test['client_seed'],
                                       nonce)

# Overwrites roll_actual on results_df_test in place; both feature builders
# below receive the frame after this update.
results_df_test['roll_actual']=roll_array
test_manual = generate_test_features(results_df_test)
test = generate_features_full(results_df_test,False,feature_chain_length)


# gen_analysis returns a pair; the second element replaces `test`.
# NOTE(review): relies on train_manual, xgb_models, tr_last_cutoffs and
# val_cutoffs being defined by earlier cells.
analysis_df,test = gen_analysis(nonce,train_manual,test_manual,test,xgb_models,tr_last_cutoffs,val_cutoffs,
                multi_models=True,file_suffix="")
analysis_df.head()

#### END

Finalize Training

In [None]:
tr_index

In [None]:
train.index

In [None]:
# tr_index = train[30 * train_client_size:(30 + 5) * train_client_size].index
# Training rows: everything in `train` after the first train_client_size rows.
tr_index = train[train_client_size:].index
# NOTE(review): train_ignored is only referenced by the commented-out line
# below within this cell.
train_ignored = train[0:30*train_client_size]
# tr_index = list(tr_index) + list(train_ignored[train_ignored['target']==1].index)
print(len(tr_index))
xgb_model_final = xgb.XGBClassifier(**params)

# NOTE(review): tr_index holds index *labels* of `train` but is used with
# positional .iloc here; this is only correct if those labels coincide with
# row positions in X and y -- TODO confirm.
X_tr,y_tr = X.iloc[tr_index],y.iloc[tr_index]
# Fixed random_state so the resampling is repeatable.
oversample = RandomOverSampler(sampling_strategy='minority',random_state=123)
# fit and apply the transform
X_tr, y_tr = oversample.fit_resample(X_tr, y_tr)
# Show the class counts after resampling.
print(pd.Series(y_tr).value_counts())
xgb_model_final.fit(X_tr,y_tr)

In [None]:
len(X_tr)

In [None]:
# Second column of predict_proba, converted to labels by convert_probtolabels.
test_probs = xgb_model_final.predict_proba(test)[:,1]
test_labels = convert_probtolabels(test_probs)

# One row per client seed with its predicted label and probability.
subm = results_df_test[['client_seed']].assign(preds=test_labels, prob=test_probs)

score = accuracy_score(y_test,test_labels)
print(f'test score:{score}')

In [None]:
# Rows predicted as label 1 whose probability is below 0.52.
mask = subm['preds'].eq(1) & subm['prob'].lt(0.52)
print(int(mask.sum()))
# Accuracy restricted to that subset.
score = accuracy_score(y_test[mask], test_labels[mask])
print(f'test score:{score}')