In [1]:
import numpy as np
import subprocess
import os
os.chdir(os.path.expanduser('..'))
import shap
import scipy.stats as stats
from multiprocessing import Pool
from utils.rerank import write_average, rerank_ndcg, rerank_matrix,write_tau,write_ratio
from utils.readdata import get_microsoft_data, rewrite
from utils.separate_set import separate_set
from utils.explainer_tools import rand_row, evaluate, get_pairsname, get_rankedduculist, get_set_cover
from itertools import combinations
import datetime

In [2]:
def score(X):
    """
    Score the query-document pairs in X with the RankLib model.

    The feature matrix is written to a temporary RankLib input file via ``rewrite``, RankLib is run as a
    subprocess to produce an indri-style score file, and the scores are read back from that file. Afterwards
    the qid column of the input file is reset to the id of the current query. Both temporary files are left
    on disk; ``loop_query`` removes them once it is finished with the query.

    Relies on module-level state: ``temp_path``, ``model``, ``tmp_test_y_query`` and ``tmp_test_Query``.
    :param X: input feature matrix
    :return: numpy array of scores, one per line of the RankLib score file
    :raises subprocess.CalledProcessError: if the RankLib process exits with a non-zero status
    """
    query_id = tmp_test_y_query[0].split(':')[-1].split()[0]
    scorefile_path = temp_path + 'scorefile_ideavalidity_{}.txt'.format(query_id)
    restore_path = temp_path + 'restore_ideavalidity_{}.txt'.format(query_id)
    rewrite(X, tmp_test_y_query, tmp_test_Query, restore_path)
    args = ['java', '-jar', 'RankLib-2.12.jar', '-rank', restore_path, '-load', model,
            '-indri', scorefile_path]
    subprocess.check_output(args, stderr=subprocess.STDOUT)

    # rerank the scorefile according the original index
    # NOTE(review): the key is the second whitespace-separated field compared as a string;
    # confirm this is the intended column of RankLib's output (loop_query sorts on int(field 0)).
    with open(scorefile_path, 'r') as f:
        score_lines = sorted(f, key=lambda s: s.split()[1], reverse=False)
    with open(scorefile_path, 'w') as f:
        f.writelines(score_lines)
    # the score is taken from the second-to-last field of every line
    A = [float(line.split()[-2]) for line in score_lines]

    # reset the index to be original otherwise can not get the right NDCG
    with open(restore_path, 'r') as f:
        restore_context = f.readlines()
    qid_token = 'qid:{}'.format(query_id)
    with open(restore_path, 'w') as f:
        for row in restore_context:
            fields = row.split()
            fields[1] = qid_token
            # every token, including the last one, is followed by a single space
            f.write(''.join(token + ' ' for token in fields) + '\n')
    return np.array(A)


def loop_query(query_index):
    """
    Find, for one test query, the 3-feature subset that best preserves the model's ranking.

    The documents of the query are first scored with ``score`` to obtain the reference ranking. Then, for
    every combination of three feature indices, a copy of the query's feature matrix is built in which all
    other features are replaced by the corresponding entries of the module-level ``expected_value``. All
    perturbed rows are scored in a single RankLib call, the resulting rankings are compared with the
    reference ranking using Kendall's tau, and the combination with the highest tau is appended as one line
    to the result file under ``ideafeatures/``. Temporary files created for the query are removed.

    Sets the module-level ``tmp_test_data``, ``tmp_test_y_query`` and ``tmp_test_Query`` (read by ``score``)
    and reads ``test_data``, ``test_y_query``, ``test_Query``, ``q_d_len``, ``expected_value``, ``temp_path``,
    ``model``, ``dataname`` and ``modelname``.
    :param query_index: the index of query
    :return: None
    :raises subprocess.CalledProcessError: if a RankLib process exits with a non-zero status
    """
    # get data for this query
    global tmp_test_data
    global tmp_test_y_query
    global tmp_test_Query
    tmp_test_data = test_data[query_index]
    tmp_test_y_query = test_y_query[query_index]
    tmp_test_Query = test_Query[query_index]
    query_id = tmp_test_y_query[0].split(':')[-1].split()[0]

    # calculate the scores for the q-d pairs and get the reference ranking
    restore_path = temp_path + 'restore_ideavalidity_{}.txt'.format(query_id)
    scorefile_path = temp_path + 'scorefile_ideavalidity_{}.txt'.format(query_id)
    scores = score(tmp_test_data).reshape(-1, 1)
    rankedduculist1 = get_rankedduculist(scores, query_index, q_d_len)

    n_docs, n_features = tmp_test_data.shape
    top_k_idx_list = np.array([c for c in combinations(range(n_features), 3)])
    all_features = set(range(n_features))
    changed_path = temp_path + 'changed_list_WMATRIXideavalidity{}.txt'.format(query_index)
    changed_score_path = temp_path + 'changed_list_WMATRIXideavalidity_score{}.txt'.format(query_index)

    # write one RankLib input row per (feature combination, document); each row gets its own qid
    # (a running row counter) so the scores can be put back into this order afterwards
    with open(changed_path, 'w') as f:
        row_id = 0
        for top_k_idx in top_k_idx_list:
            complement_idx = list(all_features - set(top_k_idx))
            features_to_change = tmp_test_data.copy()
            features_to_change[:, complement_idx] = expected_value[complement_idx]
            for row in features_to_change:
                line = "0 qid:{} ".format(str(row_id))
                line += ''.join(str(j + 1) + ":" + str(row[j]) + " " for j in range(n_features))
                f.write(line + "\n")
                row_id += 1

    args = ['java', '-jar', 'RankLib-2.12.jar', '-rank', changed_path, '-load', model,
            '-indri', changed_score_path]
    subprocess.check_output(args, stderr=subprocess.STDOUT)

    # restore the row order by the numeric qid in the first field, then take the score
    # from the second-to-last field of every line
    with open(changed_score_path, 'r') as f:
        score_lines = sorted(f, key=lambda s: int(s.split()[0]), reverse=False)
    changed_list_score = [float(line.split()[-2]) for line in score_lines]
    # regroup the flat score list into one chunk of n_docs scores per feature combination
    changed_list_score = [changed_list_score[i:i + n_docs]
                          for i in range(0, len(changed_list_score), n_docs)]
    os.remove(changed_path)
    os.remove(changed_score_path)

    # compare every perturbed ranking with the reference ranking
    tau_set = []
    for combination_scores in changed_list_score:
        this_score_list = np.array(combination_scores).reshape(-1, 1)
        rankedduculist2 = get_rankedduculist(this_score_list, query_index, q_d_len)
        tau, p_value = stats.kendalltau(rankedduculist1, rankedduculist2)
        tau_set.append(tau)
    best_tau = max(tau_set)
    best_index = tau_set.index(best_tau)
    best_top_k_idx = top_k_idx_list[best_index]

    resultfile_idea = 'ideafeatures/' + '{}_{}_validity_ideafeatures.txt'.format(dataname, modelname)
    with open(resultfile_idea, 'a') as IDEA_FILE:
        idea_line = tmp_test_y_query[0].split(':')[-1] + '  ' \
                    + 'changed feature:' + str(best_top_k_idx) + ' ' + 'kendalltau=' + str(round(best_tau, 4)) + "\n"
        IDEA_FILE.write(idea_line)

    # remove the temporary files left behind by score()
    os.remove(scorefile_path)
    os.remove(restore_path)
            

In [3]:
if __name__ == '__main__':
    # ---- parameters to be set ----
    dataset = 'mq2008'
    model_path = 'model/'
    dataset_path = 'MQ2008/'
    temp_path = 'temp_file/'
    modelname_index = 0
    model_set = ['LambdaMART_model.txt', 'Ranknet_model.txt', 'Linearregression_model.txt']

    for model_file in model_set:
        # `model` and `modelname` are read as globals by score() / loop_query()
        model = model_path + model_file
        modelname = model.split("_")[modelname_index].split("/")[-1]

        for fold in range(1, 2):
            # paths of the data for this fold
            datapath = dataset_path + 'Fold{}/'.format(fold)
            train_path = datapath + 'train.txt'
            test_path = datapath + 'test.txt'
            path_parts = datapath.split('/')
            dataname = '_'.join([path_parts[0], path_parts[1].split('Fold')[1]])

            # load train and test data; the per-feature mean of the training matrix is the
            # replacement value loop_query() uses for the features that are not kept
            X_train, y_query_train, Query_train = get_microsoft_data(train_path)
            X_train = np.array(X_train)
            X_test, y_query_test, Query_test = get_microsoft_data(test_path)
            X_test = np.array(X_test)
            expected_value = X_train.mean(axis=0)

            # separate the test set; the results are read as globals by loop_query()
            test_data, test_y_query, test_Query, q_d_len = separate_set(y_query_test, X_test, Query_test)

            resultfile_idea = 'ideafeatures/' + '{}_{}_validity_ideafeatures.txt'.format(dataname, modelname)

            # one task per test query, spread over 10 worker processes
            with Pool(10) as p:
                pool_results = p.map(loop_query, list(range(len(test_data))))
                print(pool_results)

            rerank_ndcg(resultfile_idea)
            tau = write_tau(resultfile_idea)



Process ForkPoolWorker-1:
Process ForkPoolWorker-10:
Process ForkPoolWorker-2:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Process ForkPoolWorker-7:
Process ForkPoolWorker-3:
Process ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-8:
  File "/usr/lib64/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib64/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib64/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-9:
Process ForkPoolWorker-4:
Traceback (most recent call last):
  File "/usr/lib64/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib64/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs

KeyboardInterrupt: 