# In [1]:
import numpy as np
import subprocess
import os
os.chdir(os.path.expanduser('..'))
import shap
import scipy.stats as stats
from multiprocessing import Pool
from utils.rerank import write_average, rerank_ndcg, rerank_matrix,write_tau,write_ratio,write_average2,write_tau2,write_ratio2
from utils.readdata import get_microsoft_data, rewrite
from utils.separate_set import separate_set
from utils.explainer_tools import rand_row, evaluate, get_pairsname, get_rankedduculist,small_get_pairsname, get_set_cover, get_set_cover_beam

# In [2]:
def score(X):
    """
    Score the query-document pairs of the current query with the RankLib model.

    The feature matrix is written to a per-query RankLib input file with ``rewrite``, RankLib is run
    as a subprocess to produce a score file, and the scores are read back from it. Afterwards the
    qid column of the input file is reset to the original query id, otherwise the NDCG computed from
    that file later would not be right.

    Both files are left on disk; removing them is up to the caller.

    Relies on the module-level names ``temp_path``, ``model``, ``tmp_test_y_query`` and
    ``tmp_test_Query``.

    :param X: input feature matrix
    :return: numpy array with the scores of the q-d pairs, in the order of the lines of the score file
    :raises subprocess.CalledProcessError: if the RankLib process exits with a non-zero status
    """
    # the query id is the first whitespace-separated token after the last ':' of the first entry
    query_id = tmp_test_y_query[0].split(':')[-1].split()[0]
    scorefile_path = temp_path + 'scorefile_validitybeamsearch_{}.txt'.format(query_id)
    restore_path = temp_path + 'restore_validitybeamsearch_{}.txt'.format(query_id)

    rewrite(X, tmp_test_y_query, tmp_test_Query, restore_path)
    args = ['java', '-jar', 'RankLib-2.12.jar', '-rank', restore_path, '-load', model,
            '-indri', scorefile_path]
    subprocess.check_output(args, stderr=subprocess.STDOUT)

    # the score is the second-to-last token of every line of the score file
    with open(scorefile_path, 'r') as f:
        scores = [float(line.split()[-2]) for line in f]

    # reset the index to be original otherwise can not get the right NDCG
    with open(restore_path, 'r') as f:
        restore_context = f.readlines()
    qid_token = 'qid:{}'.format(query_id)
    with open(restore_path, 'w') as f:
        for line in restore_context:
            tokens = line.split()
            tokens[1] = qid_token
            # every token is followed by a single space, including the last one
            f.write(' '.join(tokens) + ' \n')

    return np.array(scores)


def loop_query(query_index):
    """
    Process one test query: score its documents, select features by beam search over the pairwise
    validity matrix, and append the resulting Kendall tau / NDCG statistics to the log files.

    Reads the module-level names test_data, test_y_query, test_Query, q_d_len, dataset, beam_size,
    expected_value, model, modelname, dataname, temp_path and NDCGdata_path, and publishes the data of
    this query through the globals tmp_test_data, tmp_test_y_query and tmp_test_Query (read by ``score``).

    :param query_index: the index of query
    :return: None; the results are appended to the complement, NDCG, matrix and ranklist files
    """
    # get data for this query
    global tmp_test_data
    global tmp_test_y_query
    global tmp_test_Query
    tmp_test_data =test_data[query_index]
    tmp_test_y_query = test_y_query[query_index]
    tmp_test_Query = test_Query[query_index]
    # first whitespace-separated token after the last ':' of the first entry
    query_id = tmp_test_y_query[0].split(':')[-1].split()[0]

    # calculate the scores for the q-d pairs
    # (same file names as the ones built inside score(); used below for evaluate() and cleanup)
    restore_path = temp_path +  'restore_validitybeamsearch_{}.txt'.format(query_id)
    scorefile_path = temp_path + 'scorefile_validitybeamsearch_{}.txt'.format(query_id)
    scores = score(tmp_test_data).reshape(-1, 1)


    # reranking the test_data according to the scores and get the list of ranking:
    # append the scores as last column, sort the rows by descending score, drop the score column again
    test_data_score = np.append(tmp_test_data,scores,axis=1)
    ranked_test_data = np.array((test_data_score[(-test_data_score[:,-1]).argsort()])[:,:-1])
    rankedduculist1 = get_rankedduculist(scores, query_index, q_d_len)
    # NDCG of the unchanged features, evaluated on the file score() just wrote
    NDCG_before = evaluate(model, restore_path)

    # get pairsname: for mq2008 we ask for 50 pairs, for the other dataset (MSLR) 100 pairs;
    # queries below the length threshold (11 resp. 15 documents) fall back to small_get_pairsname
    # pairsname is global and is also read by the nested get_matrix()
    global pairsname
    if dataset == 'mq2008':
        if q_d_len[query_index] >= 11:
            pairnumbers = 50
            pairsname = get_pairsname(ranked_test_data, pairnumbers)
        else:
            pairsname = small_get_pairsname(ranked_test_data)
    else:
        if q_d_len[query_index] >= 15:
            pairnumbers = 100
            pairsname = get_pairsname(ranked_test_data, pairnumbers)
        else:
            pairsname = small_get_pairsname(ranked_test_data)

    # keep the full pair list so every beam can start from it again
    original_pairsname = pairsname.copy()


    def get_score_matrix(feature_matrix):
        """
        Generate the validity scores: for every document and every feature, score a vector in which
        that feature and the already selected features (top_k_idx of the enclosing function) keep the
        document's values and all other features are set to their expected values.

        param: feature matrix, each row refers to a document and each column refers to a feature
        return: list of rows, one per document, each holding one score per feature
        """
        changed_list = []
        for i in range(feature_matrix.shape[0]):
            for m in range(tmp_test_data.shape[1]):
                temp = expected_value.copy()
                temp[m] = feature_matrix[i,m]
                temp[top_k_idx] = feature_matrix[i,top_k_idx]
                changed_list.append(temp)
        changed_list = np.array(changed_list)
        # write one RankLib line per vector; every line gets its own qid (its running index)
        with open(temp_path+'changed_list_validitybeamsearch{}.txt'.format(query_index),'w') as f:
            for i in range(feature_matrix.shape[0]*tmp_test_data.shape[1]):
                line = ""
                line += "0 qid:{} ".format(str(i))
                for j in range(len(changed_list[i])):
                    line += ((str(j+1))+":"+str(changed_list[i][j])+" ")
                line += '#docid = GX008-86-4444840 inc = 1 prob = 0.086622 ' + "\n"
                f.write(line)
        args = ['java', '-jar', 'RankLib-2.12.jar', '-rank', temp_path+'changed_list_validitybeamsearch{}.txt'.format(query_index), '-load', model,
                '-indri', temp_path+'changed_list_validitybeamsearch_score{}.txt'.format(query_index)]
        subprocess.check_output(args, stderr=subprocess.STDOUT)
        # sort the score lines numerically by their first token and write them back
        # NOTE(review): the file object passed to sorted() is never closed explicitly
        A = ''.join(sorted(open(temp_path+'changed_list_validitybeamsearch_score{}.txt'.format(query_index)), key=lambda s: int(s.split()[0]), reverse=False))
        with open(temp_path+'changed_list_validitybeamsearch_score{}.txt'.format(query_index),'w') as f:
            f.write(A)
        # the score is the second-to-last token of every line
        changed_list_score = []
        with open(temp_path+'changed_list_validitybeamsearch_score{}.txt'.format(query_index),'r') as f:
            for line in f:
                changed_list_score.append(float(line.split()[-2]))
        # regroup the flat score list into one row (number-of-features entries) per document
        changed_list_score =  [changed_list_score[i:i + tmp_test_data.shape[1]] for i in range(0, len(changed_list_score), tmp_test_data.shape[1])]
        os.remove(os.path.join(temp_path, 'changed_list_validitybeamsearch{}.txt'.format(query_index)))
        os.remove(os.path.join(temp_path, 'changed_list_validitybeamsearch_score{}.txt'.format(query_index)))
        return changed_list_score


    def get_matrix(ranked_test_data):
        """
        Get the score matrix of (d1 - d2) for the pairs currently in the global pairsname.

        param: feature matrix
        return: list of rows, one per pair, each holding the per-feature score difference of the
                two documents of the pair, rounded to 10 decimals
        """
        score_values = get_score_matrix(ranked_test_data)
        matrix = []
        for i in range(len(pairsname)):
            # element 1 and the last element of a pair name are used as 1-based row numbers
            # NOTE(review): presumably the ranks of the two documents - confirm against get_pairsname
            index1 = int(pairsname[i][1])
            index2 = int(pairsname[i][-1])
            row = [round((score_values[index1-1][j]-score_values[index2-1][j]),10) for j in range(tmp_test_data.shape[1])]
            matrix.append(row)
        return matrix


    # in the following we are going to do beam search to select up to 10 features per beam
    top_k_idx_set = []
    pairs_set = []
    matrix_set = []
    for i in range(beam_size):
        pairsname = original_pairsname.copy()
        top_k_idx = []
        matrix  = get_matrix(ranked_test_data)
        # the i-th entry returned by get_set_cover_beam is the first feature of beam i
        # NOTE: the name i is reused by the nested loops below; the beam index is only needed here
        temp_index = get_set_cover_beam(matrix)[i]
        top_k_idx.append(temp_index)
        temp_tmp_test_data = tmp_test_data.copy()
        # temp2_index is the column of the current (possibly reduced) matrix of the latest feature
        temp2_index = temp_index

        # get up to 9 further indexes
        for i in range(9):
            changedpairs = []
            threshold = 0
            this_feature=[x[temp2_index] for x in matrix]
            # 1 for every pair whose value for the latest feature reaches the threshold
            drop_pairs = np.array([this_feature[i] >= threshold for i in range(len(this_feature))]).astype(int)

            for i in range(len(drop_pairs)):
                if drop_pairs[i] == 1:
                    changedpairs.append(pairsname[i])

            # if all the pairs have been covered, then stop selecting
            if len(changedpairs) >= len(pairsname): break
            # otherwise drop the covered pairs and continue with the remaining ones
            for i in range(len(changedpairs)):
                if changedpairs[i] in pairsname:
                    pairsname.remove(changedpairs[i])

            # recompute the matrix for the remaining pairs and the features selected so far
            temp_matrix  = get_matrix(ranked_test_data)

            # delete the features we already selected: transpose, keep the unselected columns, transpose back
            # NOTE(review): left_idx comes from a set difference, so the column order relies on set
            # iteration order; the index recovery below looks like it assumes ascending order - confirm
            all_features = [i for i in range(tmp_test_data.shape[1])]
            left_idx = list(set(all_features) - set(top_k_idx))
            temp_matrix = list(map(list, zip(*temp_matrix)))
            matrix = []
            for i in left_idx:
                matrix.append(temp_matrix[i])
            matrix = list(map(list, zip(*matrix)))
            temp_index= get_set_cover(matrix)
            temp2_index = temp_index
            temp_top_k_idx = top_k_idx.copy()
            # recover the original feature index, which was shifted by the feature deletion
            for i in temp_top_k_idx:
                if i <= temp_index:
                    temp_index +=1
                    while temp_index in temp_top_k_idx:
                        temp_top_k_idx.remove(temp_index)
                        temp_index +=1

            top_k_idx.append(temp_index)
        # remember the selected features, the pairs left uncovered and the last matrix of this beam
        top_k_idx_set.append(top_k_idx)
        pairs_set.append(pairsname)
        matrix_set.append(matrix)


    complement_NDCG_file_name = NDCGdata_path + '{}_validitybeamsearch_complement'.format(dataname) + modelname + '.txt'
    complement_ranklist_file = NDCGdata_path + '{}_ranklist_validitybeamsearch_complement'.format(dataname)  + modelname + '.txt'

    # for validity, the worst tau is -1, so start the search for the best beam from there
    best_tau = -1
    best_tau2 = -1


    # In the following loop, we select the features list which has the biggest tau
    # ('>=' is used, so on a tie the later beam wins)
    # NOTE(review): if kendalltau yields NaN for every beam the comparisons are never true and the
    # best_* names stay unassigned - confirm this cannot happen for the queries used
    for i in range(beam_size):
        features_to_change = tmp_test_data.copy()
        top_k_idx = top_k_idx_set[i]
        all_features = [i for i in range(tmp_test_data.shape[1])]
        # keep only the first 5 selected features, set all the other features to their expected values
        complement_idx = list(set(all_features) - set(top_k_idx[:5]))
        features_to_change[:,complement_idx] = expected_value[complement_idx]
        restore_path = temp_path +  'restore_validitybeamsearch_{}.txt'.format(query_id)
        scorefile_path = temp_path + 'scorefile_validitybeamsearch_{}.txt'.format(query_id)
        # get scores of the changed features
        scores3 = score(features_to_change).reshape(-1,1)
        rankedduculist3 = get_rankedduculist(scores3, query_index, q_d_len)
        NDCG_after2 = evaluate(model, restore_path)
        delta_NDCG2 = abs(float(NDCG_before) - float(NDCG_after2))
        # avoid the division by zero when the original NDCG is 0
        if float(NDCG_before)  == 0:
            ratio_NDCG2 = 0
        else:
            ratio_NDCG2 = delta_NDCG2/float(NDCG_before)
        # rank correlation between the original ranking and the ranking of the changed features
        tau2, p_value2 = stats.kendalltau(rankedduculist1, rankedduculist3)
        if tau2 >= best_tau:
            best_tau = tau2
            best_ratio = ratio_NDCG2
            best_delta_NDCG = delta_NDCG2
            best_complement = complement_idx
            best_top_k_idx = top_k_idx.copy()
            best_rankedduculist = rankedduculist3
            best_pairsname = pairs_set[i]
            best_matrix = matrix_set[i]
        if dataset == 'mslr':
            # same evaluation again, now keeping the first 10 selected features
            features_to_change = tmp_test_data.copy()
            complement_idx = list(set(all_features) - set(top_k_idx[:10]))
            features_to_change[:,complement_idx] = expected_value[complement_idx]
            restore_path = temp_path +  'restore_validitybeamsearch_{}.txt'.format(query_id)
            scorefile_path = temp_path + 'scorefile_validitybeamsearch_{}.txt'.format(query_id)
            # get scores of the changed features
            scores3 = score(features_to_change).reshape(-1,1)
            rankedduculist3 = get_rankedduculist(scores3, query_index, q_d_len)
            NDCG_after2 = evaluate(model, restore_path)
            delta_NDCG2 = abs(float(NDCG_before) - float(NDCG_after2))
            if float(NDCG_before)  == 0:
                ratio_NDCG2 = 0
            else:
                ratio_NDCG2 = delta_NDCG2/float(NDCG_before)
            tau2, p_value2 = stats.kendalltau(rankedduculist1, rankedduculist3)
            if tau2 >= best_tau2:
                best_tau2 = tau2
                best_ratio2 = ratio_NDCG2
                best_delta_NDCG2 = delta_NDCG2
                best_complement2 = complement_idx
                best_top_k_idx2 = top_k_idx.copy()
                best_rankedduculist2 = rankedduculist3
                # NOTE(review): this overwrites the best_matrix chosen by the 5-feature comparison
                # above; there is no separate 10-feature variable - confirm this is intended
                best_matrix = matrix_set[i]

    # append the complement results of this query to the log files; the 'changed ...' fields hold
    # the complement index lists, i.e. the features which were set to their expected values
    if dataset == 'mslr':
        with open(complement_NDCG_file_name, 'a') as NDCG_FILE:
            NDCG_line =  tmp_test_y_query[0].split(':')[-1]+'  ' \
             + 'changed 5features:'+ str(best_complement)+' '+'kendalltau='+str(round(best_tau,4)) + ' '+'ratioNDCG:'+ str(round(best_ratio,4))+\
            ' ' + 'delta_NDCG ='+'  '+str(round(best_delta_NDCG,4))+ \
            ' '+'changed 10features:'+ str(best_complement2)+' '+'kendalltau2='+str(round(best_tau2,4)) + '  '+'ratioNDCG2:'\
            + str(round(best_ratio2,4)) + ' ' + 'delta_NDCG2 ='+'  '+str(round(best_delta_NDCG2,4))\
            + '  '+ 'pairnames:'+' '+str(best_pairsname) +"\n"
            NDCG_FILE.write(NDCG_line)

        with open(complement_ranklist_file, 'a') as ranklist:
            ranklist_line = tmp_test_y_query[0].split(':')[-1] + '  ' + 'ranklist before:' + str(
                rankedduculist1) + '  ' + 'ranklist after:' + '  ' + str(best_rankedduculist) +' '+ 'ranklist after2:' + '  ' + str(best_rankedduculist2) + "\n"
            ranklist.write(ranklist_line)
        # remove the files left behind by the last score() call
        os.remove(scorefile_path)
        os.remove(restore_path)
    else:
        with open(complement_NDCG_file_name, 'a') as NDCG_FILE:
            NDCG_line =  tmp_test_y_query[0].split(':')[-1]+'  ' \
                        + 'changed feature:'+ str(best_complement)+' '+'kendalltau='+str(round(best_tau,4)) + '  '+'ratioNDCG:'+ str(round(best_ratio,4))+ '  ' + 'pairnames:'+' '+str(best_pairsname) + \
                        '   ' + 'delta_NDCG ='+'  '+str(round(best_delta_NDCG,4))+ "\n"
            NDCG_FILE.write(NDCG_line)

        with open(complement_ranklist_file, 'a') as ranklist:
            ranklist_line = tmp_test_y_query[0].split(':')[-1] + '  ' + 'ranklist before:' + str(
                rankedduculist1) + '  ' + 'ranklist after:' + '  ' + str(best_rankedduculist) + "\n"
            ranklist.write(ranklist_line)
        # remove the files left behind by the last score() call
        os.remove(scorefile_path)
        os.remove(restore_path)


    def feature_k_loop(feature_number):
        """
        Set the first feature_number features of the best beam to their expected values, score the
        result again and append the tau / NDCG line, the matrix line and the ranklist line of this
        query to the log files of this feature number.

        param: feature_number, the number of selected features to change (called with 5 and 10)
        return: None
        """
        nonlocal best_top_k_idx
        nonlocal best_top_k_idx2
        NDCG_file_name = NDCGdata_path + '{}_validitybeamsearch_{}features'.format(dataname,feature_number) + modelname + '.txt'
        NDCG_file_matrix = NDCGdata_path + '{}_validitybeamsearch_matrix_{}features'.format(dataname,feature_number)  + modelname + '.txt'
        ranklist_file = NDCGdata_path + '{}_ranklist_validitybeamsearch_{}features'.format(dataname,feature_number)  + modelname + '.txt'
        # for mq2008 the 10-feature comparison above was not run, so reuse the 5-feature result
        if dataset == 'mq2008':
            best_top_k_idx2 = best_top_k_idx.copy()
        features_to_change = tmp_test_data.copy()
        # for 10 features switch to the beam which won the 10-feature comparison
        if feature_number == 10:
            best_top_k_idx = best_top_k_idx2.copy()
        # the beam may hold fewer features than requested (the selection stops early once all pairs are covered)
        if len(best_top_k_idx)<= feature_number:
            feature_number = len(best_top_k_idx)
        features_to_change[:, best_top_k_idx[:feature_number]] = expected_value[best_top_k_idx[:feature_number]]
        restore_path = temp_path +  'restore_validitybeamsearch_{}.txt'.format(query_id)
        scorefile_path = temp_path + 'scorefile_validitybeamsearch_{}.txt'.format(query_id)
        scores2 = score(features_to_change).reshape(-1,1)
        rankedduculist2 = get_rankedduculist(scores2, query_index, q_d_len)
        NDCG_after = evaluate(model, restore_path)
        delta_NDCG = abs(float(NDCG_before) - float(NDCG_after))
        # avoid the division by zero when the original NDCG is 0
        if float(NDCG_before)  == 0:
            ratio_NDCG = 0
        else:
            ratio_NDCG = delta_NDCG/float(NDCG_before)
        tau, p_value = stats.kendalltau(rankedduculist1, rankedduculist2)


        with open(NDCG_file_name, 'a') as NDCG_FILE:
            NDCG_line =  tmp_test_y_query[0].split(':')[-1]+'  ' \
                        + 'changed feature:'+ str(best_top_k_idx[:feature_number])+' '+'kendalltau='+str(round(tau,4)) + '  '+'ratioNDCG:'+ str(round(ratio_NDCG,4))+ '  ' + 'pairnames:'+' '+str(best_pairsname) + \
                        '   ' + 'delta_NDCG ='+'  '+str(round(delta_NDCG,4))+ "\n"
            NDCG_FILE.write(NDCG_line)
        with open(NDCG_file_matrix, 'a') as matrix_FILE:
            matrix_line = 'matrix for {}'.format(tmp_test_y_query[0].split(':')[-1].split()[0]) \
                          + '  ' + str(best_matrix) + '  ' + "\n"
            matrix_FILE.write(matrix_line)
        with open(ranklist_file, 'a') as ranklist:
            ranklist_line = tmp_test_y_query[0].split(':')[-1] + '  ' + 'ranklist before:' + str(
                rankedduculist1) + '  ' + 'ranklist after:' + '  ' + str(rankedduculist2) + "\n"
            ranklist.write(ranklist_line)
        # remove the files written by the score() call above
        os.remove(scorefile_path)
        os.remove(restore_path)


    # the order matters: the call with 10 rebinds best_top_k_idx to the 10-feature beam
    feature_k_loop(5)
    feature_k_loop(10)

# In [None]:
if __name__ == '__main__':
    # Experiment driver: for every fold and every model, run loop_query over the test queries in a
    # process pool, aggregate the per-query logs into the result files, and finally merge the
    # 10-feature and 5-feature logs into one file each.
    #
    # NOTE(review): loop_query and score read the names assigned below (dataset, beam_size, model,
    # temp_path, ...) as module globals. The pool workers only see them when the process start
    # method is fork; with spawn or forkserver they would be undefined - confirm the target platform.

    def _append_result(path, value):
        """Append ``str(value)`` followed by a newline to the result file at ``path``."""
        with open(path, 'a') as result_file:
            result_file.write(str(value) + "\n")

    #parameters to be set
    dataset = 'mslr'
    beam_size = 5
    # upper bound on the number of test queries processed for datasets other than mq2008
    max_queries = 500
    if dataset == 'mq2008':
        model_path = 'model/'
        dataset_path = 'MQ2008/'
        modelname_index = 0
        model_set  =['LambdaMART_model.txt','Ranknet_model.txt','Linearregression_model.txt']
    else:
        model_path = 'MSLR-WEB10K_model/'
        dataset_path = 'MSLR-WEB10K/'
        modelname_index = 1
        model_set  =['LambdaMART_model.txt','RankBoost_model.txt','LinearRegression_model.txt']

    for fold in range(1,2):
        # the path of data
        datapath =dataset_path+'Fold{}/'.format(fold)
        train_path = datapath + 'train.txt'
        test_path = datapath + 'test.txt'
        # '<dataset directory>_<fold number>'
        dataname = datapath.split('/')[0] +'_'+ datapath.split('/')[1].split('Fold')[1]
        # saving path and save files
        NDCGdata_path = 'logs/'
        temp_path = 'temp_file/'

        # get train data and test data
        X_train, y_query_train, Query_train = get_microsoft_data(train_path)
        X_train = np.array(X_train)
        X_test, y_query_test, Query_test = get_microsoft_data(test_path)
        X_test = np.array(X_test)
        # per-feature mean over the training set, used as the "expected value" of a feature
        expected_value = np.mean(X_train, axis=0)

        # separate the test set
        test_data, test_y_query, test_Query, q_d_len = separate_set(y_query_test, X_test, Query_test)

        for MODEL in model_set:
            model = model_path + MODEL
            modelname = model.split("_")[modelname_index].split("/")[-1]
            resultfile_NDCG = 'resultfile/' + '{}_{}_validitybeamsearch_NDCG.txt'.format(dataname,modelname)
            resultfile_tau = 'resultfile/' + '{}_{}_validitybeamsearch_tau.txt'.format(dataname,modelname)
            resultfile_ratio =  'resultfile/' + '{}_{}_validitybeamsearch_ratio.txt'.format(dataname,modelname)
            complement_resultfile_NDCG = 'resultfile/' + '{}_{}_validitybeamsearchcomplement_NDCG.txt'.format(dataname,modelname)
            complement_resultfile_tau = 'resultfile/' + '{}_{}_validitybeamsearchcomplement_tau.txt'.format(dataname,modelname)
            complement_resultfile_ratio =  'resultfile/' + '{}_{}_validitybeamsearchcomplement_ratio.txt'.format(dataname,modelname)
            # the complement logs do not depend on the feature number
            complement_NDCG_file_name =  NDCGdata_path + '{}_validitybeamsearch_complement'.format(dataname) + modelname + '.txt'
            complement_ranklist_file = NDCGdata_path + '{}_ranklist_validitybeamsearch_complement'.format(dataname)  + modelname + '.txt'

            # mq2008 uses every test query; the other dataset is capped at max_queries, but never
            # beyond the number of queries actually available (avoids an IndexError in the workers)
            if dataset == 'mq2008':
                query_count = len(test_data)
            else:
                query_count = min(max_queries, len(test_data))
            with Pool(10) as p:
                print(p.map(loop_query, list(range(query_count))))

            # aggregate the per-query logs of the 5-feature and the 10-feature run
            for feature_number in (5,10):
                NDCG_file_name = NDCGdata_path + '{}_validitybeamsearch_{}features'.format(dataname,feature_number) + modelname + '.txt'
                NDCG_file_matrix = NDCGdata_path + '{}_validitybeamsearch_matrix_{}features'.format(dataname,feature_number)  + modelname + '.txt'
                ranklist_file = NDCGdata_path + '{}_ranklist_validitybeamsearch_{}features'.format(dataname,feature_number)  + modelname + '.txt'

                rerank_ndcg(NDCG_file_name)
                NDCG =  write_average(NDCG_file_name)
                rerank_ndcg(ranklist_file)
                rerank_matrix(NDCG_file_matrix)
                ratio = write_ratio(NDCG_file_name)
                tau = write_tau(NDCG_file_name)
                _append_result(resultfile_NDCG, NDCG)
                _append_result(resultfile_tau, tau)
                _append_result(resultfile_ratio, ratio)

            # aggregate the complement logs
            if dataset == 'mq2008':
                rerank_ndcg(complement_NDCG_file_name)
                NDCG =  write_average(complement_NDCG_file_name)
                rerank_ndcg(complement_ranklist_file)
                ratio = write_ratio(complement_NDCG_file_name)
                tau = write_tau(complement_NDCG_file_name)
                _append_result(complement_resultfile_NDCG, NDCG)
                _append_result(complement_resultfile_tau, tau)
                _append_result(complement_resultfile_ratio, ratio)
            else:
                rerank_ndcg(complement_NDCG_file_name)
                rerank_ndcg(complement_ranklist_file)
                for feature_number in (5,10):
                    NDCG =  write_average2(complement_NDCG_file_name,feature_number)
                    ratio = write_ratio2(complement_NDCG_file_name,feature_number)
                    tau = write_tau2(complement_NDCG_file_name,feature_number)
                    _append_result(complement_resultfile_NDCG, NDCG)
                    _append_result(complement_resultfile_tau, tau)
                    _append_result(complement_resultfile_ratio, ratio)

            # merge the 10-feature logs (*_1) and the 5-feature logs (*_2) line by line
            # NOTE(review): this assumes line i of both files belongs to the same query - presumably
            # guaranteed by rerank_ndcg / rerank_matrix above; confirm
            NDCG_file_name_1 = NDCGdata_path + '{}_validitybeamsearch_10features'.format(dataname) + modelname + '.txt'
            ranklist_file_1 = NDCGdata_path + '{}_ranklist_validitybeamsearch_10features'.format(dataname)  + modelname + '.txt'
            NDCG_file_matrix_1 = NDCGdata_path + '{}_validitybeamsearch_matrix_10features'.format(dataname)  + modelname + '.txt'
            NDCG_file_name_2 = NDCGdata_path + '{}_validitybeamsearch_5features'.format(dataname) + modelname + '.txt'
            ranklist_file_2 = NDCGdata_path + '{}_ranklist_validitybeamsearch_5features'.format(dataname)  + modelname + '.txt'
            NDCG_file_matrix_2 = NDCGdata_path + '{}_validitybeamsearch_matrix_5features'.format(dataname)  + modelname + '.txt'
            NDCG_file_name = NDCGdata_path + '{}_validitybeamsearch'.format(dataname) + modelname + '.txt'
            ranklist_file = NDCGdata_path + '{}_ranklist_validitybeamsearch'.format(dataname)  + modelname + '.txt'
            NDCG_file_matrix = NDCGdata_path + '{}_validitybeamsearch_matrix'.format(dataname)  + modelname + '.txt'

            # NDCG / tau log: relabel the fields of every line with the feature number
            first_part_set = []
            second_part_set = []
            with open(NDCG_file_name_1,'r') as fa:
                for linea in fa:
                    first_part = linea.split()[0]+' '+'changed 10features:='+linea.split('changed feature:')[1].split('kendalltau=')[0] +' '+'kendalltau10='+linea.split('kendalltau')[1].split()[0]+' '+'ratioNDCG10:'+linea.split('ratioNDCG:')[1].split()[0]+' '+\
                    'pairnames10: '+linea.split('pairnames: ')[1].split('delta_NDCG')[0]+ 'delta_NDCG10 ='+ linea.split()[-1] + ' '
                    first_part_set.append(first_part)

            with open(NDCG_file_name_2, 'r') as fb:
                for lineb in fb:
                    second_part = 'changed 5features:='+lineb.split('changed feature:')[1].split('kendalltau=')[0]+ 'kendalltau5='+lineb.split('kendalltau')[1].split()[0]+' '+'ratioNDCG5:'+lineb.split('ratioNDCG:')[1].split()[0]+' '+\
                    'pairnames5: '+lineb.split('pairnames: ')[1].split('delta_NDCG')[0]+ 'delta_NDCG5='+ lineb.split()[-1] + ' '
                    second_part_set.append(second_part)

            with open(NDCG_file_name,'w') as fc:
                for i, first_part in enumerate(first_part_set):
                    fc.write(first_part)
                    fc.write(second_part_set[i]+'\n')

            # ranklist log
            list_set1 = []
            list_set2 = []
            with open(ranklist_file_1,'r') as fa:
                for linea in fa:
                    first_part = linea.split()[0]+' '+'ranklist before:'+linea.split('ranklist before:')[1].split('ranklist after:')[0] +' '+ 'ranklist after10:'+linea.split('ranklist after:')[1].split('\n')[0]+' '
                    list_set1.append(first_part)
            with open(ranklist_file_2,'r') as fb:
                for lineb in fb:
                    second_part ='ranklist after5:'+lineb.split('ranklist after:')[1].split('\n')[0]
                    list_set2.append(second_part)
            with open(ranklist_file,'w') as fc:
                for i, first_part in enumerate(list_set1):
                    fc.write(first_part)
                    fc.write(list_set2[i]+'\n')

            # matrix log: the first three tokens are the line header, the rest is the matrix
            matrix_set1 = []
            matrix_set2 = []
            with open(NDCG_file_matrix_1,'r') as fa:
                for linea in fa:
                    first_part = ' '.join(linea.split()[:3])+' '+'matrix for 10 features:'+''.join(linea.split()[3:])+' '
                    matrix_set1.append(first_part)
            with open(NDCG_file_matrix_2,'r') as fb:
                for lineb in fb:
                    second_part ='matrix for 5 features:'+''.join(lineb.split()[3:])
                    matrix_set2.append(second_part)
            with open(NDCG_file_matrix,'w') as fc:
                for i, first_part in enumerate(matrix_set1):
                    fc.write(first_part)
                    fc.write(matrix_set2[i]+'\n')

            # the per-feature-number logs are merged now, remove them
            os.remove(NDCG_file_name_1)
            os.remove(NDCG_file_name_2)
            os.remove(ranklist_file_1)
            os.remove(ranklist_file_2)
            os.remove(NDCG_file_matrix_1)
            os.remove(NDCG_file_matrix_2)