Install the **PuLP** library, which is used below for linear-programming modelling

In [None]:
!pip install pulp

Input DataFrame **in_df**

In [23]:
import pandas as pd
# One-row input frame describing a single assignment job. Operator names, scores,
# ids and APIs are each stored as one comma-separated string; scalar fields are
# broadcast to the single row by the DataFrame constructor.
in_df = pd.DataFrame({
    'pr_id': 1,
    'pr_code': 'ABC',
    'blockchain': 'ethereum',
    'node_operator': ['anyblockAnalytics,omniscience,staked,ztake,TSystems,everstake,secureDataLinks,xbto,newRoad,wetez,prophet,huobi,validationCapital,dMakers,kaiko,syncnode,p2pOrg,blocksizeCapital,chorusOne,youbi,kyber,dexTrac,kytzu,inotel,linkForest,linkPool,simplyVC,figmentNetworks,cosmostation,vulcan,snzPool,easy2Stake,chainlayer,stakingFacilities,alphachain,frameworkVentures,infinityStones,onchainTech,stakeFish,01node,bHarvest,fiews'],
    'ranking_score': ['9.75,8.86,8.29,8.88,8.43,8.14,4.38,10,9.29,8.57,9,5.57,8.86,4.89,4.3,9.14,8.86,8,8.3,6.7,7.14,9.75,8.57,9.7,10,10,9.75,8.29,8.43,8.14,8,8.29,10,9.57,9,9.88,3.7,7.38,7.14,9.43,7.14,10'],
    'node_operator_id': ['1,2,3,7,8,9,10,12,14,15,21,24,25,28,30,31,34,36,38,41,42,43,44,46,48,49,53,55,57,61,64,65,66,67,71,73,74,76,77,78,79,82'],
    'target_mean': 8.5,
    'target_std': 1.25,
    'oracles_num': 16,
    'api': ['coinapi,coingecko,coinmarketcap,nomics'],
    'json_file': ['D:\\operators-apis.json'],
})
display(in_df)

Unnamed: 0,pr_id,pr_code,blockchain,node_operator,ranking_score,node_operator_id,target_mean,target_std,oracles_num,api,json_file
0,1,ABC,ethereum,"anyblockAnalytics,omniscience,staked,ztake,TSy...","9.75,8.86,8.29,8.88,8.43,8.14,4.38,10,9.29,8.5...","1,2,3,7,8,9,10,12,14,15,21,24,25,28,30,31,34,3...",8.5,1.25,16,"coinapi,coingecko,coinmarketcap,nomics",D:\operators-apis.json


Main Code

In [24]:
import numpy as np
import pandas as pd
import pulp as lp

def get_whole_frame(operators_apis_json_path_or_buf):
    """Load the operators/APIs JSON and pivot it into an indicator table.

    The JSON is read with ``orient='index'``, so its top-level keys become the
    row labels. Every distinct cell value found in the rows becomes a column of
    the result, holding 1 where that value occurs for the row and NaN elsewhere.
    (Presumably keys are operator names and values are API names; callers index
    the result as ``[operator, api]``.)

    Parameters
    ----------
    operators_apis_json_path_or_buf : path or file-like
        Passed straight to ``pandas.read_json``.

    Returns
    -------
    pandas.DataFrame
        Pivoted table indexed by the JSON keys, one column per distinct value.
    """
    raw = pd.read_json(operators_apis_json_path_or_buf, orient='index')
    # Relabel every column as 1: after melting, 'variable' is then a constant
    # presence flag that the pivot can aggregate.
    raw.columns = [1] * len(raw.columns)
    long_form = pd.melt(raw.reset_index(), id_vars='index', value_name='operator')
    return long_form.pivot_table(
        index='index',
        columns='operator',
        values='variable',
        aggfunc='max',
    )

def get_operator_list(operator_str, ranking_str, operator_id_str, target_mean, target_deviation, n_samples):
    """Randomly sample operators so that their scores follow a normal target.

    ``n_samples`` values are drawn from ``N(target_mean, target_deviation)``.
    For each draw, the not-yet-selected operator whose ranking score is closest
    to the draw is selected. A tiny random jitter is added to every distance so
    that operators with identical scores are chosen at random instead of always
    in input order.

    Parameters
    ----------
    operator_str : str
        Comma-separated operator names.
    ranking_str : str
        Comma-separated ranking scores, positionally matching ``operator_str``.
        An empty field falls back to ``target_mean``.
    operator_id_str : str
        Comma-separated operator ids, positionally matching ``operator_str``.
    target_mean, target_deviation : float
        Mean and standard deviation of the normal distribution to sample from.
    n_samples : int
        Number of draws; at most this many operators are selected.

    Returns
    -------
    ranking_dict : dict
        Selected operator name -> ranking score, in selection order.
    operator_dict : dict
        Selected operator name -> id string, in input order.
    """
    operator_list = operator_str.split(',')
    # Split the scores and cast to float; an empty field falls back to target_mean
    ranking_list = [float(score) if score else target_mean for score in ranking_str.split(',')]
    operator_id_list = operator_id_str.split(',')

    trials_list = np.random.normal(target_mean, target_deviation, n_samples)

    remaining = dict(zip(operator_list, ranking_list))
    ranking_dict = {}
    for t in trials_list:
        if not remaining:
            break  # every operator has already been selected
        # Choose by operator (not by score value) so the jitter really breaks
        # ties between operators that share the same score.
        chosen = min(
            remaining,
            key=lambda op: abs(float(remaining[op]) - t) + np.random.normal(0, 10e-6),
        )
        ranking_dict[chosen] = remaining.pop(chosen)

    # Keep ids only for the selected operators; tolerant of a short id list.
    operator_dict = {
        op: op_id
        for op, op_id in zip(operator_list, operator_id_list)
        if op in ranking_dict
    }

    return ranking_dict, operator_dict
    
def solve_it (df, operator_list, api_list, maxNum):
    """Build and solve the operator-to-API assignment model with PuLP's CBC solver.

    One binary variable is created per (operator, API) pair; each is weighted by
    ``df.loc[operator, api]``. The model maximises the weighted sum subject to:
    every operator's weighted sum equals 3, and every API's weighted sum is at
    least ``maxNum``.

    Parameters
    ----------
    df : pandas.DataFrame
        Weights indexed by operator (rows) and API (columns).
    operator_list, api_list : list of str
        Row and column labels to include, in model order.
    maxNum : number
        Lower bound on the weighted assignments per API.

    Returns
    -------
    pulp.LpProblem
        The model after ``solve`` has been called (10 s time limit, no log output).
    """
    # NOTE(review): any falsy maxNum (None, but also 0) is replaced by half the
    # operator count -- confirm that remapping 0 is intended.
    min_per_api = maxNum if maxNum else np.floor(len(operator_list)/2)

    problem = lp.LpProblem(name="assignment_problem", sense=lp.LpMaximize)

    operator_idx = range(len(operator_list))
    api_idx = range(len(api_list))
    pairs = [(i, j) for i in operator_idx for j in api_idx]
    assign = lp.LpVariable.dicts("op_to_api", pairs, cat='Binary')

    def weighted(i, j):
        # Assignment variable scaled by the weight stored in df for this pair
        return assign[i, j]*df.loc[operator_list[i], api_list[j]]

    # Objective: total weighted number of assignments
    problem += lp.lpSum([weighted(i, j) for i, j in pairs])

    for i, operator in enumerate(operator_list):
        problem += (lp.lpSum([weighted(i, j) for j in api_idx]) == 3, "Triplet"+operator)
    for j, api in enumerate(api_list):
        problem += (lp.lpSum([weighted(i, j) for i in operator_idx]) >= min_per_api, "Each Api"+api)

    cbc = lp.getSolver('PULP_CBC_CMD', timeLimit=10, msg = False)
    problem.solve(cbc)

    return problem

def solve_optimum(df, operator_list, api_list, ignore_json = False):
    """Search for the most evenly spread assignment over decreasing per-API minimums.

    ``solve_it`` is run for every ``maxNum`` from ``int((len(operator_list)-1)/2)``
    down to 0. Each solution is written into a copy of ``df`` and scored by the
    standard deviation of its per-API column sums. A solution is accepted when
    that deviation does not exceed the best one so far (initially 1, or 2 when
    ``ignore_json`` is set), or when it is below 1 and no API column sum exceeds
    the starting ``maxNum``.

    Returns
    -------
    (pandas.DataFrame, float)
        The last accepted solution (an empty DataFrame if none was accepted) and
        its standard deviation (the initial threshold if none was accepted).
    """
    maxNumPerDataSource = int((len(operator_list)-1)/2)
    opt_std = 2 if ignore_json else 1
    opt_solution = pd.DataFrame()

    for maxNum in range(maxNumPerDataSource, -1, -1):
        model = solve_it(df, operator_list, api_list, maxNum)
        variables = model.variables()

        if len(variables) <= 1:  # solution is empty
            continue

        df_answer = df.copy()
        for v in variables:
            # Variable names embed their indices as "...(i,_j)"
            inner = v.name[v.name.index('(')+1:v.name.index(')')]
            row_txt, _, col_txt = inner.partition(',_')
            df_answer.iloc[int(row_txt), int(col_txt)] = int(v.varValue)

        api_count_list = df_answer[df_answer.columns].sum()
        v_std = np.std(api_count_list)
        v_maxcnt = np.max(api_count_list)

        # Discard solutions that contain a negative variable value
        if min([v.varValue for v in variables]) < 0:
            continue

        improves = v_std <= opt_std
        tight_and_capped = v_std < 1 and v_maxcnt <= maxNumPerDataSource
        if improves or tight_and_capped:
            opt_solution = df_answer
            opt_std = v_std

    return opt_solution, opt_std

def try_it(row, df_whole, df_answer, ignore_json=False):
    """Run one random trial: sample operators, then assign APIs to them.

    Parameters
    ----------
    row : mapping
        Input row providing 'api', 'node_operator', 'ranking_score',
        'node_operator_id', 'target_mean', 'target_std' and 'oracles_num'.
    df_whole : pandas.DataFrame
        Operator-by-API table; missing entries are treated as 0.
    df_answer : pandas.DataFrame
        Returned unchanged when no operator was sampled.
    ignore_json : bool
        When true, every (operator, API) pair is treated as allowed.

    Returns
    -------
    (pandas.DataFrame, dict, dict)
        Long-form frame with columns 'operator' and 'api' (one row per
        assignment), plus the ranking and id dictionaries returned by
        ``get_operator_list``.
    """
    api_list = row['api'].split(',')

    ranking_dict, operator_dict = get_operator_list(
        row['node_operator'],
        row['ranking_score'],
        row['node_operator_id'],
        row['target_mean'],
        row['target_std'],
        row['oracles_num'],
    )
    operator_list = list(ranking_dict.keys())

    # Restrict the table to the sampled operators and requested APIs as 0/1 ints
    allowed = (
        df_whole.reindex(operator_list)
                .loc[operator_list, api_list]
                .fillna(0)
                .astype(int)
    )
    if ignore_json:
        allowed.loc[:,:] = 1

    if len(allowed) == 0:
        return df_answer, ranking_dict, operator_dict

    solution, _ = solve_optimum(allowed, operator_list, api_list, ignore_json)
    # Multiplying by the column labels turns 1 into the API name and 0 into ''
    labelled = solution.mul(solution.columns, axis = 'columns')
    pairs = labelled.reset_index().melt(id_vars=['index'])
    if len(pairs['value'].values)>0:
        pairs = pairs[pairs['value'].values !='']
    pairs = pairs.drop(columns=['value'])
    pairs.columns = ['operator','api']

    return pairs, ranking_dict, operator_dict

# Read the operators-apis.json file referenced by the first input row
#df_whole = get_whole_frame('D:\\operators-apis.json')
df_whole = get_whole_frame(in_df.loc[0,'json_file'])

MAX_TRIALS = 200   # random sampling attempts per input row
JSON_TRIALS = 100  # attempts that honour the JSON constraints before they are ignored

# Log and result rows are collected as plain records and turned into a frame
# once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row is quadratic.
out_records = []
i = 0
in_len = len(in_df)

for index, row in in_df.iterrows():
    i += 1
    pr_id = float(row['pr_id'])
    out_records.append({'pr_id': row['pr_id'], 'msg': 'start'})

    df_answer = pd.DataFrame()
    n_trials = 0
    ignore_json = False # Use operators-apis.json constraints
    while n_trials < MAX_TRIALS:
        n_trials += 1
        df_answer = pd.DataFrame()

        if n_trials > JSON_TRIALS:
            ignore_json = True #Ignore json from 100th trial: https://app.asana.com/0/1199613136501554/1200764619184594/f

        df_answer, ranking_dict, operator_dict = try_it(row, df_whole, df_answer, ignore_json)
        # Trial details
        out_records.append({
            'pr_id': pr_id,
            'msg': f'end trial={n_trials}, ignore_json={ignore_json}, df_answer_len={len(df_answer)}, oracles={row["oracles_num"]}',
        })
        if len(df_answer) == row['oracles_num']*3: # Success! Each NOp has 3 api assigned
            break

    if len(df_answer) == row['oracles_num']*3: #success
        df_answer['pr_id'] = pr_id
        df_answer['pr_code'] = row['pr_code']
        df_answer['blockchain'] = row['blockchain']
        df_answer['ranking_str'] = row['ranking_score']
        df_answer['ranking_score'] = df_answer['operator'].map(lambda x: ranking_dict[x])
        df_answer['operator_id'] = df_answer['operator'].map(lambda x: float(operator_dict[x]))
        df_answer['msg'] = 'OK'

        df_answer['comment'] = f'auto assigned with ignore_json={ignore_json} ({n_trials} random trials)'
        out_records.extend(df_answer.to_dict('records'))

    comment = f'{i}, df_answer_len={len(df_answer)}, n_trials={n_trials}'
    out_records.append({'pr_id': pr_id, 'msg': 'end', 'comment': comment})

out_df = pd.DataFrame(out_records)

Output DataFrame **out_df**

In [25]:
# 0/1 matrix of the successful assignment: one row per operator, one column per API
display(
    out_df.loc[out_df['msg'] == 'OK', ['operator', 'api']]
          .pivot(index='operator', columns='api', values='operator')
          .apply(pd.notna)
          .astype('int8')
)

api,coinapi,coingecko,coinmarketcap,nomics
operator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TSystems,1,0,1,1
anyblockAnalytics,1,0,1,1
blocksizeCapital,0,1,1,1
chorusOne,1,1,1,0
cosmostation,1,1,0,1
dMakers,1,0,1,1
huobi,0,1,1,1
kytzu,1,1,1,0
linkForest,1,1,0,1
newRoad,1,1,1,0


In [26]:
# Successful assignments as a flat list, ordered by operator name
display(
    out_df.loc[out_df['msg'] == 'OK']
          .sort_values(by='operator')
          .reset_index()
          .loc[:, ['pr_code', 'operator', 'api']]
)

Unnamed: 0,pr_code,operator,api
0,ABC,TSystems,coinapi
1,ABC,TSystems,nomics
2,ABC,TSystems,coinmarketcap
3,ABC,anyblockAnalytics,coinmarketcap
4,ABC,anyblockAnalytics,nomics
5,ABC,anyblockAnalytics,coinapi
6,ABC,blocksizeCapital,coinmarketcap
7,ABC,blocksizeCapital,coingecko
8,ABC,blocksizeCapital,nomics
9,ABC,chorusOne,coingecko


In [27]:
# Full output frame: the 'start' / per-trial / 'end' log rows plus the 'OK' assignment rows
display(out_df)

Unnamed: 0,pr_id,msg,operator,api,pr_code,blockchain,ranking_str,ranking_score,operator_id,comment
0,1.0,start,,,,,,,,
1,1.0,"end trial=1, ignore_json=False, df_answer_len=...",,,,,,,,
2,1.0,"end trial=2, ignore_json=False, df_answer_len=...",,,,,,,,
3,1.0,"end trial=3, ignore_json=False, df_answer_len=...",,,,,,,,
4,1.0,"end trial=4, ignore_json=False, df_answer_len=...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
146,1.0,OK,TSystems,nomics,ABC,ethereum,"9.75,8.86,8.29,8.88,8.43,8.14,4.38,10,9.29,8.5...",8.43,8.0,auto assigned with ignore_json=True (101 rando...
147,1.0,OK,linkForest,nomics,ABC,ethereum,"9.75,8.86,8.29,8.88,8.43,8.14,4.38,10,9.29,8.5...",10.00,48.0,auto assigned with ignore_json=True (101 rando...
148,1.0,OK,cosmostation,nomics,ABC,ethereum,"9.75,8.86,8.29,8.88,8.43,8.14,4.38,10,9.29,8.5...",8.43,57.0,auto assigned with ignore_json=True (101 rando...
149,1.0,OK,huobi,nomics,ABC,ethereum,"9.75,8.86,8.29,8.88,8.43,8.14,4.38,10,9.29,8.5...",5.57,24.0,auto assigned with ignore_json=True (101 rando...
