In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import os
import glob
import pandas as pd
import subprocess
from collections import defaultdict
import numpy as np
from IPython.display import display, HTML

In [3]:
def create_trec_results_per_number_of_target_per_source(rel_file, top_file, result_file):
    gt = defaultdict(list)
    with open(rel_file, 'r') as f:
        for line in f:
            gt[line.split(' ')[0]].append(line)
    for i in range(1,11):
        rel_file_i = rel_file + '_' + str(i)
        with open(rel_file_i, 'w') as f:
            for source in gt:
                if len(gt[source]) == i:
                    f.writelines(gt[source])
        res_file = result_file + '_' + str(i)
#         print('Processing', res_file)
#         print(' '.join(['trec_eval -m all_trec', rel_file_i, top_file, '>', res_file,'2>&1']))
        subprocess.call(' '.join(['trec_eval -m all_trec', rel_file_i, top_file, '>', res_file,'2>&1']), shell=True)
#         print('Done')


def clean_top_file(rel_file, top_file):
    """
    Clean the top file such that only queries that exist in the
    rel_file are in the top file
    """
    keep_doc_set = set()
    keep_lines = []
    with open(rel_file, 'r') as f:
        for line in f:
            keep_doc_set.add(line.split(' ')[0])
    with open(top_file, 'r') as f:
        for line in f:
            if line.split(' ')[0] in keep_doc_set:
                keep_lines.append(line)
    with open(top_file, 'w') as f:
        f.writelines(keep_lines)        

In [78]:
exp_folders = glob.glob('../experiments/*/')
params = {}
fold_results = []
wrong_list = ['trec_eval: No queries with both results and rel', 'trec_eval.get_results: Cannot read results file']
folds = []
for exp_folder in exp_folders:
    exp_name = exp_folder.split(os.sep)[2]
    if not os.path.exists(os.path.join(exp_folder, 'params.p')):
        continue
    with open(os.path.join(exp_folder, 'params.p'), 'rb') as params_file:
        params[exp_name] = pickle.load(params_file)
    fold_folders = glob.glob(os.path.join(exp_folder,'fold*'))
    for fold_folder in fold_folders:
        result_file = os.path.join(fold_folder, 'trec_results')
        if not os.path.exists(result_file):
            rel_file = os.path.join(fold_folder, 'trec_rel_file.tmp')
            top_file = os.path.join(fold_folder, 'trec_top_file.tmp')
            print('Processing', fold_folder)
            clean_top_file(rel_file, top_file)
            create_trec_results_per_number_of_target_per_source(rel_file, top_file, result_file)
            subprocess.call(' '.join(['trec_eval -m all_trec', rel_file, top_file, '>', result_file,'2>&1']), shell=True)
            print('Done')
        result_files = glob.glob(os.path.join(fold_folder,'trec_result*'))
        for rf in result_files:
            with open(rf, 'r') as f:
                ntps = rf.split('_')[-1]
                first_line = f.readline()
                if first_line[:47] in wrong_list:
#                     print(first_line)
#                     print("Error parsing: " + rf)
                    continue
                f.seek(0)
                if ntps == 'results':
                    ntps = '0'
                temp_result_df = pd.read_csv(f, header=None, delimiter=r"\s+")
                temp_result_df = temp_result_df.pivot(index=1, columns=0, values=2)
                temp_result_df['ntps'] = ntps
                folds.append(fold_folder.split('_')[-1])
                temp_result_df.index = temp_result_df[['runid']] + '_ntps_' + ntps + "_fold_" + fold_folder.split('_')[-1] 
                fold_results.append(temp_result_df.copy())

params_df = pd.DataFrame()
fold_results = pd.concat(fold_results, axis=0)
fold_results
sel = np.logical_and(fold_results.columns != 'ntps',fold_results.columns != 'runid')
# fold_results.loc[:, sel] = fold_results.loc[:, sel].apply(pd.to_numeric)
fold_results['fold'] = folds
params_df = pd.DataFrame.from_dict(params)
unique_names = set([name[:-7] for name in fold_results.index])
# print(unique_names)
results = []
for unique_name in unique_names:
    res = fold_results[fold_results.index.str.contains(unique_name + '_')]['ntps'][0:1]
#     print(res.values[0])
    rest = fold_results[fold_results.index.str.contains(unique_name + '_')].drop('ntps', axis=1).apply(pd.to_numeric, errors='ignore').mean()
    rest['ntps'] = int(res.values[0])
    rest.name = '_'.join(unique_name.split('_')[:-2])
#     print(rest)
    rest = rest.to_frame()
    res = pd.concat([rest.T, params_df.T,], axis=1, join='inner')
#     print(res)
    res.index = [unique_name]
    results.append(res)

results = pd.concat(results, axis=0)
del results['fold']
# results.rename(columns={'architecture':'algorithm'}, inplace=True)

Processing ../experiments/random_dblp_v10_10000_20_300_300_cosine_0_expand/fold_4
Done
Processing ../experiments/random_dblp_v10_10000_20_300_300_cosine_0_expand/fold_3
Done
Processing ../experiments/random_dblp_v10_10000_20_300_300_cosine_0_expand/fold_2
Done
Processing ../experiments/random_dblp_v10_10000_20_300_300_cosine_0_expand/fold_0
Done
Processing ../experiments/random_dblp_v10_10000_20_300_300_cosine_0_expand/fold_1
Done
Processing ../experiments/doc2vec-gensim_dblp_v10_10000_20_300_300_cosine_0_expand/fold_4
Done
Processing ../experiments/doc2vec-gensim_dblp_v10_10000_20_300_300_cosine_0_expand/fold_3
Done
Processing ../experiments/doc2vec-gensim_dblp_v10_10000_20_300_300_cosine_0_expand/fold_2
Done
Processing ../experiments/doc2vec-gensim_dblp_v10_10000_20_300_300_cosine_0_expand/fold_0
Done
Processing ../experiments/doc2vec-gensim_dblp_v10_10000_20_300_300_cosine_0_expand/fold_1
Done
Processing ../experiments/doc2vec-gensim_dblp_v10_10000_10_300_300_cosine_0_expand/fold_4


In [79]:
results.query('ntps == 0')[['algorithm', 'num_rel_ret', 'iterations','num_q','n_docs','n_links','ntps','Rprec','ndcg_cut_100','map_cut_100','recip_rank']]

Unnamed: 0,algorithm,num_rel_ret,iterations,num_q,n_docs,n_links,ntps,Rprec,ndcg_cut_100,map_cut_100,recip_rank
doc2vec-gensim_dblp_v10_10000_20_300_300_cosine_0_expand_ntps_0,doc2vec-gensim,14.8,20,883.4,10000.0,13586,0.0,0.00016,0.00184,0.0004,0.00132
doc2vec-gensim_dblp_v10_10000_10_300_300_cosine_0_expand_ntps_0,doc2vec-gensim,13.2,10,883.4,10000.0,13586,0.0,0.0003,0.00162,0.00038,0.0014
doc2vec-gensim_dblp_v10_10000_5_300_300_cosine_0_expand_ntps_0,doc2vec-gensim,15.0,5,883.4,10000.0,13586,0.0,0.00028,0.0018,0.00046,0.00148
aai_dblp_v10_10000_5_300_300_cosine_0_expand_ntps_0,aai,593.4,5,882.6,,13586,0.0,0.01312,0.08182,0.02448,0.04728
random_dblp_v10_10000_20_300_300_cosine_0_expand_ntps_0,random,12.6,20,883.4,,13586,0.0,0.00016,0.00168,0.00042,0.00128


In [54]:
results.query('ntps == 6').sort_values(by='ntps')[['algorithm', 'num_rel_ret','num_q','n_docs','n_links','ntps','Rprec','ndcg_cut_100','map_cut_100','recip_rank']]

Unnamed: 0,algorithm,algorithm.1,num_rel_ret,num_q,n_docs,n_links,ntps,Rprec,ndcg_cut_100,map_cut_100,recip_rank
aai_dblp_v10_10000_5_300_300_cosine_0_expand_ntps_6,doc2vec-gensim,doc2vec-gensim,41.2,32.8,10000,13586,6.0,0.03114,0.10448,0.03026,0.1018
random_dblp_v10_10000_5_300_300_cosine_0_expand_ntps_6,doc2vec-gensim,doc2vec-gensim,2.0,32.8,10000,13586,6.0,0.0,0.00332,0.00018,0.00114
doc2vec-gensim_dblp_v10_10000_20_300_300_cosine_0_expand_ntps_6,doc2vec-gensim,doc2vec-gensim,1.6,32.8,10000,13586,6.0,0.00088,0.00358,0.00066,0.0039
doc2vec-gensim_dblp_v10_10000_5_300_300_cosine_0_expand_ntps_6,doc2vec-gensim,doc2vec-gensim,2.2,32.8,10000,13586,6.0,0.0,0.00396,0.00034,0.002


In [22]:
from collections import defaultdict
scope_exp_name = "doc2vec-gensim_aminer_org_v1_50000_5_300_300_cosine_0_True" 
for exp_folder in exp_folders:
    exp_name = exp_folder.split(os.sep)[1]
    if exp_name == scope_exp_name:
        print(exp_name)
        fold_folders = glob.glob(os.path.join(exp_folder,'fold*'))
        for fold_folder in fold_folders:
            gt_fname = os.path.join(fold_folder, 'trec_rel_file.tmp')
            res_fname = os.path.join(fold_folder, 'trec_top_file.tmp')
            gt_d = []
            res_d = []
            with open(gt_fname, 'r') as f:
                gt_d.extend(f.readline().split(' ') for i in range(250))
            with open(res_fname, 'r') as f:
                res_d.extend(f.readline().split(' ') for i in range(250))
            break
        break
gt = defaultdict(list)
res = defaultdict(list)
for line in gt_d:
    gt[line[0]].append(line[2])
for line in res_d:
    res[line[0]].append(line[2])

doc2vec-gensim_aminer_org_v1_50000_5_300_300_cosine_0_True


## Trec Results 

### Standard


| Abbreviation  	| Description                                                               	|
|---------------	|---------------------------------------------------------------------------	|
| num_ret       	| Total number of documents retrieved over all queries                      	|
| num_rel       	| Total number of relevant documents over all queries                       	|
| num_rel_ret   	| Total number of relevant documents retrieved over all queries             	|
| map           	| Mean Average Precision (MAP)                                              	|
| gm_ap         	| Average Precision. Geometric Mean, q_score=log(MAX(map,.00001))           	|
| R-prec        	| R-Precision (Precision after R (= num-rel for topic) documents retrieved) 	|
| bpref         	| Binary Preference, top R judged nonrel                                    	|
| recip_rank    	| Reciprical rank of top relevant document                                  	|
| ircl_prn.0.00 	| Interpolated Recall - Precision Averages at 0.00 recall                   	|
| ircl_prn.0.10 	| Interpolated Recall - Precision Averages at 0.10 recall                   	|
| ircl_prn.0.20 	| Interpolated Recall - Precision Averages at 0.20 recall                   	|
| ircl_prn.0.30 	| Interpolated Recall - Precision Averages at 0.30 recall                   	|
| ircl_prn.0.40 	| Interpolated Recall - Precision Averages at 0.40 recall                   	|
| ircl_prn.0.50 	| Interpolated Recall - Precision Averages at 0.50 recall                   	|
| ircl_prn.0.60 	| Interpolated Recall - Precision Averages at 0.60 recall                   	|
| ircl_prn.0.70 	| Interpolated Recall - Precision Averages at 0.70 recall                   	|
| ircl_prn.0.80 	| Interpolated Recall - Precision Averages at 0.80 recall                   	|
| ircl_prn.0.90 	| Interpolated Recall - Precision Averages at 0.90 recall                   	|
| ircl_prn.1.00 	| Interpolated Recall - Precision Averages at 1.00 recall                   	|
| P5            	| Precision after 5 docs retrieved                                          	|
| P10           	| Precision after 10 docs retrieved                                         	|
| P15           	| Precision after 15 docs retrieved                                         	|
| P20           	| Precision after 20 docs retrieved                                         	|
| P30           	| Precision after 30 docs retrieved                                         	|
| P100          	| Precision after 100 docs retrieved                                        	|
| P200          	| Precision after 200 docs retrieved                                        	|
| P500          	| Precision after 500 docs retrieved                                        	|
| P1000         	| Precision after 1000 docs retrieved                                       	|

### All

| Abbreviation                     	| Description                                                                                                         	|
|----------------------------------	|---------------------------------------------------------------------------------------------------------------------	|
| num_nonrel_judged_ret            	| Total number of judged non-relevant documents retrieved over all queries                                            	|
| exact_prec                       	| Exact Precision over retrieved set                                                                                  	|
| exact_recall                     	| Exact Recall over retrieved set                                                                                     	|
| 11-pt_avg                        	| Average over all 11 points of recall-precision graph                                                                	|
| 3-pt_avg                         	| Average over 3 points of recall-precision graph                                                                     	|
| avg_doc_prec                     	| Rel doc precision averaged over all relevant docs (NOT over topics)                                                 	|
| exact_relative_prec              	| Exact relative precision                                                                                            	|
| avg_relative_prec                	| Average relative precision                                                                                          	|
| exact_unranked_avg_prec          	| Exact Unranked Average Precision                                                                                    	|
| exact_relative_unranked_avg_prec 	| Exact Relative Unranked Average Precision                                                                           	|
| map_at_R                         	| Average Precision over first R docs retrieved                                                                       	|
| int_map                          	| Interpolated Mean Average Precision                                                                                 	|
| exact_int_R_rcl_prec             	| Exact R-based-interpolated-Precision                                                                                	|
| int_map_at_R                     	| Average Interpolated Precision for first R docs retrieved                                                           	|
| bpref_allnonrel                  	| Binary Preference, all judged nonrel                                                                                	|
| bpref_retnonrel                  	| Binary Preference, all retrieved judged nonrel                                                                      	|
| bpref_topnonrel                  	| Binary Preference, top 100 judged nonrel                                                                            	|
| bpref_top5Rnonrel                	| Binary Preference, top 5R judged nonrel                                                                             	|
| bpref_top10Rnonrel               	| Binary Preference, top 10R judged nonrel                                                                            	|
| bpref_top10pRnonrel              	| Binary Preference, top 10 + R judged nonrel                                                                         	|
| bpref_top25pRnonrel              	| Binary Preference, top 25 + R judged nonrel                                                                         	|
| bpref_top50pRnonrel              	| Binary Preference, top 50 + R judged nonrel                                                                         	|
| bpref_top25p2Rnonrel             	| Binary Preference, top 25 + 2*R judged nonrel                                                                       	|
| bpref_retall                     	| Binary Preference, Only retrieved judged rel and nonrel                                                             	|
| bpref_5                          	| Binary Preference, top 5 rel, top 5 nonrel                                                                          	|
| bpref_10                         	| Binary Preference, top 10 rel, top 10 nonrel                                                                        	|
| bpref_num_all                    	| Binary Preference, Number not retrieved before (all judged)                                                         	|
| bpref_num_ret                    	| Binary Preference, Number retrieved after                                                                           	|
| bpref_num_correct                	| Binary Preference, Number correct preferences                                                                       	|
| bpref_num_possible               	| Binary Preference, Number possible correct_preferences                                                              	|
| old_bpref                        	| Buggy Version 7.3. Binary Preference, top R judged nonrel                                                           	|
| old_bpref_top10pRnonrel          	| Buggy Version 7.3. Binary Preference,top 10+R judged nonrel                                                         	|
| infAP                            	| Inferred AP. Calculate AP using only a judged random sample of the pool, averaging in unpooled documents as nonrel. 	|
| gm_bpref                         	| Binary Preference, top R judged nonrel, Geometric Mean, q_score=log(MAX(bpref,.00001))                              	|
| rank_first_rel                   	| Rank of top relevant document (0 if none)                                                                           	|
| recall5                          	| Recall after 5 docs retrieved                                                                                       	|
| recall10                         	| Recall after 10 docs retrieved                                                                                      	|
| recall15                         	| Recall after 15 docs retrieved                                                                                      	|
| recall20                         	| Recall after 20 docs retrieved                                                                                      	|
| recall30                         	| Recall after 30 docs retrieved                                                                                      	|
| recall100                        	| Recall after 100 docs retrieved                                                                                     	|
| recall200                        	| Recall after 200 docs retrieved                                                                                     	|
| recall500                        	| Recall after 500 docs retrieved                                                                                     	|
| recall1000                       	| Recall after 1000 docs retrieved                                                                                    	|
| 0.20R-prec                       	| R-based precision- precision after 0.20 * R docs retrieved                                                          	|
| 0.40R-prec                       	| R-based precision- precision after 0.40 * R docs retrieved                                                          	|
| 0.60R-prec                       	| R-based precision- precision after 0.60 * R docs retrieved                                                          	|
| 0.80R-prec                       	| R-based precision- precision after 0.80 * R docs retrieved                                                          	|
| 1.00R-prec                       	| R-based precision- precision after 1.00 * R docs retrieved                                                          	|
| 1.20R-prec                       	| R-based precision- precision after 1.20 * R docs retrieved                                                          	|
| 1.40R-prec                       	| R-based precision- precision after 1.40 * R docs retrieved                                                          	|
| 1.60R-prec                       	| R-based precision- precision after 1.60 * R docs retrieved                                                          	|
| 1.80R-prec                       	| R-based precision- precision after 1.80 * R docs retrieved                                                          	|
| 2.00R-prec                       	| R-based precision- precision after 2.00 * R docs retrieved                                                          	|
| relative_prec5                   	| Relative precision after 5 docs retrieved                                                                           	|
| relative_prec10                  	| Relative precision after 10 docs retrieved                                                                          	|
| relative_prec15                  	| Relative precision after 15 docs retrieved                                                                          	|
| relative_prec20                  	| Relative precision after 20 docs retrieved                                                                          	|
| relative_prec30                  	| Relative precision after 30 docs retrieved                                                                          	|
| relative_prec100                 	| Relative precision after 100 docs retrieved                                                                         	|
| relative_prec200                 	| Relative precision after 200 docs retrieved                                                                         	|
| relative_prec500                 	| Relative precision after 500 docs retrieved                                                                         	|
| relative_prec1000                	| Relative precision after 1000 docs retrieved                                                                        	|
| unranked_avg_prec5               	| Unranked Average Precision after 5 docs retrieved                                                                   	|
| unranked_avg_prec10              	| Unranked Average Precision after 10 docs retrieved                                                                  	|
| unranked_avg_prec15              	| Unranked Average Precision after 15 docs retrieved                                                                  	|
| unranked_avg_prec20              	| Unranked Average Precision after 20 docs retrieved                                                                  	|
| unranked_avg_prec30              	| Unranked Average Precision after 30 docs retrieved                                                                  	|
| unranked_avg_prec100             	| Unranked Average Precision after 100 docs retrieved                                                                 	|
| unranked_avg_prec200             	| Unranked Average Precision after 200 docs retrieved                                                                 	|
| unranked_avg_prec500             	| Unranked Average Precision after 500 docs retrieved                                                                 	|
| unranked_avg_prec1000            	| Unranked Average Precision after 1000 docs retrieved                                                                	|
| relative_unranked_avg_prec5      	| Relative Unranked Average Precision after 5 docs retrieved                                                          	|
| relative_unranked_avg_prec10     	| Relative Unranked Average Precision after 10 docs retrieved                                                         	|
| relative_unranked_avg_prec15     	| Relative Unranked Average Precision after 15 docs retrieved                                                         	|
| relative_unranked_avg_prec20     	| Relative Unranked Average Precision after 20 docs retrieved                                                         	|
| relative_unranked_avg_prec30     	| Relative Unranked Average Precision after 30 docs retrieved                                                         	|
| relative_unranked_avg_prec100    	| Relative Unranked Average Precision after 100 docs retrieved                                                        	|
| relative_unranked_avg_prec200    	| Relative Unranked Average Precision after 200 docs retrieved                                                        	|
| relative_unranked_avg_prec500    	| Relative Unranked Average Precision after 500 docs retrieved                                                        	|
| relative_unranked_avg_prec1000   	| Relative Unranked Average Precision after 1000 docs retrieved                                                       	|
| utility_1.0_-1.0_0.0_0.0         	| Utility (a,b,c,d) Coefficients 1.0_-1.0_0.0_0.0                                                                     	|
| rcl_at_142_nonrel                	| Recall averaged at X nonrel docs X= 142                                                                             	|
| fallout_recall_0                 	| Fallout - Recall Averages- recall after 0 nonrel docs retrieved                                                     	|
| fallout_recall_14                	| Fallout - Recall Averages- recall after 14 nonrel docs retrieved                                                    	|
| fallout_recall_28                	| Fallout - Recall Averages- recall after 28 nonrel docs retrieved                                                    	|
| fallout_recall_42                	| Fallout - Recall Averages- recall after 42 nonrel docs retrieved                                                    	|
| fallout_recall_56                	| Fallout - Recall Averages- recall after 56 nonrel docs retrieved                                                    	|
| fallout_recall_71                	| Fallout - Recall Averages- recall after 71 nonrel docs retrieved                                                    	|
| fallout_recall_85                	| Fallout - Recall Averages- recall after 85 nonrel docs retrieved                                                    	|
| fallout_recall_99                	| Fallout - Recall Averages- recall after 99 nonrel docs retrieved                                                    	|
| fallout_recall_113               	| Fallout - Recall Averages- recall after 113 nonrel docs retrieved                                                   	|
| fallout_recall_127               	| Fallout - Recall Averages- recall after 127 nonrel docs retrieved                                                   	|
| fallout_recall_142               	| Fallout - Recall Averages- recall after 142 nonrel docs retrieved                                                   	|
| int_0.20R-prec                   	| Interpolated R-based precision, after 0.20 * R docs retrieved                                                       	|
| int_0.40R-prec                   	| Interpolated R-based precision, after 0.40 * R docs retrieved                                                       	|
| int_0.60R-prec                   	| Interpolated R-based precision, after 0.60 * R docs retrieved                                                       	|
| int_0.80R-prec                   	| Interpolated R-based precision, after 0.80 * R docs retrieved                                                       	|
| int_1.00R-prec                   	| Interpolated R-based precision, after 1.00 * R docs retrieved                                                       	|
| int_1.20R-prec                   	| Interpolated R-based precision, after 1.20 * R docs retrieved                                                       	|
| int_1.40R-prec                   	| Interpolated R-based precision, after 1.40 * R docs retrieved                                                       	|
| int_1.60R-prec                   	| Interpolated R-based precision, after 1.60 * R docs retrieved                                                       	|
| int_1.80R-prec                   	| Interpolated R-based precision, after 1.80 * R docs retrieved                                                       	|
| int_2.00R-prec                   	| Interpolated R-based precision, after 2.00 * R docs retrieved                                                       	|
| micro_prec                       	| Total relevant retrieved documents / Total retrieved documents                                                      	|
| micro_recall                     	| Total relevant retrieved documents / Total relevant documents                                                       	|
| micro_bpref                      	| Total correct preferences / Total possible preferences                                                              	|

In [23]:
def raw_text(doc_name, limit=500):
    with open('data.tmp/aminer_org_v1/texts.txt', 'r') as f:
        for line in f:
            if line[:10].split(' ')[0] == doc_name:
                if limit > 0:
                    line = line[:limit]
                return line
doc_name = '146'
print('----- Original ----')
print(raw_text(doc_name))
print('-----    GT    ----')
for t in gt[doc_name]:
    print('>', raw_text(t))
print('-----  Found   ----')
for t in res[doc_name][:5]:
    print('>', raw_text(t))

----- Original ----
146 Finite difference approximations of the second derivative in space appearing in, parabolic, incompletely parabolic systems of, and 2nd-order hyperbolic, partial differential equations are considered. If the solution is pointwise bounded, we prove that finite difference approximations of those classes of equations can be closed with two orders less accuracy at the boundary without reducing the global order of accuracy.This result is generalised to initial-boundary value problems with an mth-o
-----    GT    ----
-----  Found   ----
> 352303 We present an efficient protocol for privacy-preserving evaluation of diagnostic programs, represented as binary decision trees or branching programs. The protocol applies a branching diagnostic program with classification labels in the leaves to the user's attribute vector. The user learns only the label assigned by the program to his vector; the diagnostic program itself remains secret. The program's owner does not learn any

In [8]:
doc_names

NameError: name 'doc_names' is not defined

In [41]:
params_df.T

Unnamed: 0,algorithm,architecture,batch_size,concat,data,dist_measure,document_size,emb_size_d,emb_size_w,embedding_size_d,...,learning_rate,loss_type,n_neg_samples,n_steps,optimize,remove_docs_without_links,sample,seed,vocabulary_size,window_size
doc2vec-gensim_aminer_org_v1_50000_5_300_300_cosine_0_True,doc2vec-gensim,doc2vec-gensim,128,True,aminer_org_v1,cosine,34104.8,300.0,300.0,300,...,1,sampled_softmax_loss,64,170524,Adagrad,True,50000.0,0.0,50000,8
random_aminer_org_v1_50000_5_300_300_cosine_0_True,,random,128,True,,,34104.8,,,300,...,1,sampled_softmax_loss,64,170524,Adagrad,,,,50000,8


In [143]:
exp_folders = glob.glob('experiments/*/')
params = {}
results = {}
for exp_folder in exp_folders:
    exp_name = exp_folder.split(os.sep)[1]
    if not os.path.exists(os.path.join(exp_folder, 'params.p')):
        continue
    with open(os.path.join(exp_folder, 'params.p'), 'rb') as params_file:
        params[exp_name] = pickle.load(params_file)
    with open(os.path.join(exp_folder, 'results.p'), 'rb') as results_file:
        results[exp_name] = pickle.load(results_file)

result_df = pd.DataFrame()
params_df = pd.DataFrame()
for exp, result in results.items():
    mean_result = pd.DataFrame.from_dict(result).mean()
    mean_result.name=exp
    result_df = result_df.append(mean_result)
result_df = pd.concat([result_df, pd.DataFrame.from_dict(params).T], axis=1)
result_df[['average_precision', 'ndcg_at_10', 'n_steps']].sort_values('n_steps')

Unnamed: 0,average_precision,ndcg_at_10,n_steps
pvdm_original_articles_1_300_300_cosine,0.203961,0.228228,1031
pvdm_original_articles_5_300_300_cosine,0.926748,0.932935,5155


In [4]:
result_df

Unnamed: 0,average_precision,ndcg_at_10,algorithm,architecture,batch_size,concat,data,dist_measure,document_size,emb_size_d,...,iterations,learning_rate,loss_type,n_neg_samples,n_steps,nsteps,optimize,prior_sample_size,vocabulary_size,window_size
doc2vec-gensim_aminer_org_v1_5_300_300_cosine,0.000702,0.001029,doc2vec-gensim,pvdm,128,True,aminer_org_v1,cosine,111000,300,...,5.0,1,sampled_softmax_loss,64,555000,,Adagrad,,50000,8
doc2vec-gensim_original_articles_100001_300_300_cosine,0.677843,0.687516,doc2vec-gensim,pvdm,128,True,original_articles,cosine,1031,300,...,,1,sampled_softmax_loss,64,100001,100001.0,Adagrad,,50000,8
doc2vec-gensim_original_articles_5_100_100_cosine,0.675223,0.680724,doc2vec-gensim,pvdm,128,True,original_articles,cosine,1031,100,...,5.0,1,sampled_softmax_loss,64,5155,,Adagrad,,50000,8
doc2vec-gensim_original_articles_5_300_300_cosine,0.67304,0.682589,doc2vec-gensim,pvdm,128,True,original_articles,cosine,1031,300,...,5.0,1,sampled_softmax_loss,64,5155,,Adagrad,,50000,8
pvdm_original_articles_10000_300_300_cosine,0.649623,0.670708,pvdm,pvdm,128,True,original_articles,cosine,1031,300,...,,1,sampled_softmax_loss,64,10000,10000.0,Adagrad,10.0,50000,8
pvdm_original_articles_1000_300_300_cosine,0.674012,0.685618,pvdm,pvdm,128,True,original_articles,cosine,1031,300,...,,1,sampled_softmax_loss,64,1000,1000.0,Adagrad,10.0,50000,8
pvdm_original_articles_100_300_300_cosine,0.679642,0.689899,pvdm,pvdm,128,True,original_articles,cosine,1031,300,...,,1,sampled_softmax_loss,64,100,100.0,Adagrad,10.0,50000,8
pvdm_original_articles_50000_300_300_cosine,0.639258,0.665104,pvdm,pvdm,128,True,original_articles,cosine,1031,300,...,,1,sampled_softmax_loss,64,50000,50000.0,Adagrad,10.0,50000,8


In [36]:
import pickle
data_folder = 'aminer_org_v1'
rel_labels_fname = 'relevance_labels_' + data_folder + '.p'
with open(rel_labels_fname, 'rb') as rel_lab_file:
    _, _, _, tokenized, _, sorted_bm25_indices = pickle.load(rel_lab_file)

In [37]:
import numpy as np

In [43]:
import nltk
import gensim
output_fname='rel_labels.p'
folder='original_articles'
source_dict = {}  # maps article filename to source
docs = []  # list with documents
doc_names = []  # doc names with same index as docs

data_path = os.path.join(os.getcwd(), 'data.tmp', folder)
for subdir, dirs, files in os.walk(data_path):
    files = [fi for fi in files if fi.endswith(".txt")]
    for file in files:
        path = os.path.join(subdir, file)
        folder_name = subdir.split(os.path.sep)[-1]
        fname = file[:-4]
        source_dict[fname] = subdir.split(os.path.sep)[-1]
        with open(path, 'r', encoding='utf8') as f:
            docs.append(f.read())
        doc_names.append(fname)

tokenized = []
for doc in docs:
    tokens = [word for sent in nltk.sent_tokenize(doc) for word in nltk.word_tokenize(sent)]
    tokenized.append(tokens)

print(len(docs))
print("Computing BM25...")
bm25 = gensim.summarization.bm25.BM25(tokenized)
print("Done computing bm25, compute average IDF...")
average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())
print("Done computing average IDF.")
bm25_scores = []
sorted_bm25_indices = []
len_tokenized = len(tokenized)

1031
Computing BM25...
Done computing bm25, compute average IDF...
Done computing average IDF.


In [55]:
doc = tokenized[500]
print(' '.join(doc))
temp_bm25_scores = bm25.get_scores(doc, average_idf)
temp_bm25_scores = temp_bm25_scores
sorted_indices = sorted(range(len(temp_bm25_scores)), key=lambda x: temp_bm25_scores[x], reverse=True)
# print(sorted_indices)

Article 517 Definition of eligible capital By 31 December 2014 the Commission shall review and report on the appropriateness of the definition of eligible capital being applied for the purposes of Title III of Part Two and Part Four and shall submit that report to the European Parliament and the Council , and , if appropriate , a legislative proposal .
867


In [101]:
import gensim
from gensim import corpora
import math

class BM25 :
    def __init__(self, fn_docs) :
        self.dictionary = corpora.Dictionary()
        self.doc_names = []
        self.source_dict = {}
        self.DF = {}
        self.DocTF = []
        self.DocIDF = {}
        self.N = 0
        self.DocAvgLen = 0
        self.fn_docs = fn_docs
        self.DocLen = []
        self.buildDictionary()
        self.TFIDF_Generator()

    def buildDictionary(self) :
        data = []
#         for line in file(self.fn_docs) :
#             data.append(line.strip().split(self.delimiter))
        data_path = os.path.join(os.getcwd(), 'data.tmp', self.fn_docs)
        for subdir, dirs, files in os.walk(data_path):
            files = [fi for fi in files if fi.endswith(".txt")]
            for file in files:
                path = os.path.join(subdir, file)
                folder_name = subdir.split(os.path.sep)[-1]
                fname = file[:-4]
                self.source_dict[fname] = subdir.split(os.path.sep)[-1]
                with open(path, 'r', encoding='utf8') as f:
                    data.append([word for sent in nltk.sent_tokenize(f.read()) for word in nltk.word_tokenize(sent)])
                self.doc_names.append(fname)
        self.dictionary.add_documents(data)

    def TFIDF_Generator(self, base=math.e) :
        docTotalLen = 0
        data_path = os.path.join(os.getcwd(), 'data.tmp', self.fn_docs)
        for subdir, dirs, files in os.walk(data_path):
            files = [fi for fi in files if fi.endswith(".txt")]
            for file in files:
                path = os.path.join(subdir, file)
                with open(path, 'r', encoding='utf8') as f:
                    doc = [word for sent in nltk.sent_tokenize(f.read()) for word in nltk.word_tokenize(sent)]
                docTotalLen += len(doc)
                self.DocLen.append(len(doc))
                bow = dict([(term, freq*1.0/len(doc)) for term, freq in self.dictionary.doc2bow(doc)])
                for term, tf in bow.items() :
                    if term not in self.DF :
                        self.DF[term] = 0
                    self.DF[term] += 1
                self.DocTF.append(bow)
                self.N = self.N + 1
        for term in self.DF:
            self.DocIDF[term] = math.log((self.N - self.DF[term] +0.5) / (self.DF[term] + 0.5), base)
        self.DocAvgLen = docTotalLen / self.N

    def BM25Score(self, Query=[], k1=1.5, b=0.75) :
        query_bow = self.dictionary.doc2bow(Query)
        scores = []
        for idx, doc in enumerate(self.DocTF) :
            commonTerms = set(dict(query_bow).keys()) & set(doc.keys())
            tmp_score = []
            doc_terms_len = self.DocLen[idx]
            for term in commonTerms :
                upper = (doc[term] * (k1+1))
                below = ((doc[term]) + k1*(1 - b + b*doc_terms_len/self.DocAvgLen))
                tmp_score.append(self.DocIDF[term] * upper / below)
            scores.append(sum(tmp_score))
        return scores

    def TFIDF(self) :
        tfidf = []
        for doc in self.DocTF :
            doc_tfidf  = [(term, tf*self.DocIDF[term]) for term, tf in doc.items()]
            doc_tfidf.sort()
            tfidf.append(doc_tfidf)
        return tfidf

    def Items(self) :
        # Return a list [(term_idx, term_desc),]
        items = self.dictionary.items()
        items.sort()
        return items

In [134]:
import nltk
import gensim
output_fname='rel_labels.p'
folder='aminer_org_v1'
source_dict = {}  # maps article filename to source
docs = []  # list with documents
doc_names = []  # doc names with same index as docs

data_path = os.path.join(os.getcwd(), 'data.tmp', folder)
for subdir, dirs, files in os.walk(data_path):
    files = [fi for fi in files if fi.endswith(".txt")]
    for file in files:
        path = os.path.join(subdir, file)
        folder_name = subdir.split(os.path.sep)[-1]
        fname = file[:-4]
        source_dict[fname] = subdir.split(os.path.sep)[-1]
        with open(path, 'r', encoding='utf8') as f:
            docs.append(f.read())
        doc_names.append(fname)

tokenized = []
for doc in docs:
    tokens = [word for sent in nltk.sent_tokenize(doc) for word in nltk.word_tokenize(sent)]
    tokenized.append(tokens)

In [133]:
fn_docs = 'aminer_org_v1'
bm25 = BM25(fn_docs)

In [138]:
def inspect(index):
    print(index)
    Query = tokenized[index]
    scores = bm25.BM25Score(Query)
    sorted_indices = sorted(range(len(scores)), key=lambda x: scores[x], reverse=True)
    print(sorted_indices[:10])
    print('\nOriginal text:\n')
    print(' '.join(tokenized[index][:300]))
    print('\nTop 5:\n')
    for i in range(5):
        print(' '.join(tokenized[sorted_indices[i]]),'\n--------------------\n')

index = np.random.randint(len(tokenized))
inspect(index)

91283
[91283, 6339, 72175, 33383, 57814, 109712, 19367, 94624, 35338, 22166]

Original text:

Dynamically managing the communication-parallelism trade-off in future clustered processors Clustered microarchitectures are an attractive alternative to large monolithic superscalar designs due to their potential for higher clock rates in the face of increasingly wire-delay-constrained process technologies . As increasing transistor counts allow an increase in the number of clusters , thereby allowing more aggressive use of instruction-level parallelism ( ILP ) , the inter-cluster communication increases as data values get spread across a wider area . As a result of the emergence of this trade-off between communication and parallelism , a subset of the total on-chip clusters is optimal for performance . To match the hardware to the application 's needs , we use a robust algorithm to dynamically tune the clustered architecture . The algorithm , which is based on program metrics gathered at per

In [68]:
fn_docs = 'original_articles'


data = []
data_path = os.path.join(os.getcwd(), 'data.tmp', fn_docs)
print(data_path)
doc_names=[]
dictionary=corpora.Dictionary()
for subdir, dirs, files in os.walk(data_path):
    files = [fi for fi in files if fi.endswith(".txt")]
    for file in files:
        path = os.path.join(subdir, file)
        folder_name = subdir.split(os.path.sep)[-1]
        fname = file[:-4]
        source_dict[fname] = subdir.split(os.path.sep)[-1]
        with open(path, 'r', encoding='utf8') as f:
            data.append([word for sent in nltk.sent_tokenize(f.read()) for word in nltk.word_tokenize(sent)])
        doc_names.append(fname)
dictionary.add_documents(data)

/Users/alexandervansomeren/Documents/Studie/Msc_AI/Thesis/regulatory tracker/doc2vec_pipeline/data.tmp/original_articles


In [77]:
dictionary.doc2bow()

TypeError: doc2bow() missing 1 required positional argument: 'document'