# Analyze extracted features

The goal of this notebook is to compute and present information from the features that were extracted by PRA/SFE.

In [None]:
import pandas as pd
import numpy as np
import os, time
from explain.helpers import parse_feature_matrix, get_dirs
from tools import train_test

In [None]:
# emb_import_paths = [
#     "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/WN11/TransE/1527008113",
#     "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/FB13/TransE/1527033688",
#     "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/NELL186/TransE/1526711822",
# ]

def extraction_results(emb_import_paths):
    """Summarize how many features were extracted for each embedding model.

    For every path in `emb_import_paths`, the sub-directories of
    `<path>/pra_explain/results` are visited (one per feature-extraction
    run). Inside each of them, every relation directory is scanned and every
    feature matrix file found there is parsed with `parse_feature_matrix`.
    The number of features of each example is pooled per feature matrix file
    name (e.g. `train.tsv`) across all relations.

    Args:
        emb_import_paths: iterable of paths to embedding model result
            directories. Each one is passed to `train_test.read_model_info`,
            whose result must provide the `'dataset_name'` and `'model_name'`
            keys.

    Returns:
        A list with one dict per (model, feature-extraction run), holding:
            - 'dataset', 'model', 'timestamp', 'extracted features':
              identification of the run ('timestamp' is the last component
              of the embedding path);
            - '<fm>': number of examples in feature matrix file `<fm>`;
            - '<fm>>1': number of those examples with at least one feature
              (NOTE: despite the '>1' in the key, the test is `>= 1`; the
              key names are kept as they are for backward compatibility);
            - 'total', 'total>1': the same two counts summed over all files;
            - 'avg #feats/example': mean number of features per example;
            - '% > 1': fraction (0..1) of examples with at least one feature.
        The two ratios are NaN when no example was found at all.
    """
    results = []

    for emb_import_path in emb_import_paths:
        model_info = train_test.read_model_info(emb_import_path)
        # normpath strips a trailing slash, which would otherwise produce an
        # empty timestamp.
        timestamp = os.path.basename(os.path.normpath(emb_import_path))
        print("Processing results for {}, {}, ({}):".format(model_info['dataset_name'], model_info['model_name'], timestamp))
        pra_results_dir = os.path.join(emb_import_path, 'pra_explain/results')

        for extracted_features_dirs_name in get_dirs(pra_results_dir):
            dpath = os.path.join(pra_results_dir, extracted_features_dirs_name)

            # Feature matrix file name -> list with the number of features of
            # every example, pooled over all relations.
            n_feats = {}
            for rel in get_dirs(dpath):
                rel_dir = os.path.join(dpath, rel)
                for fm in os.listdir(rel_dir):
                    heads, tails, labels, feat_dicts = parse_feature_matrix(os.path.join(rel_dir, fm))
                    n_feats.setdefault(fm, []).extend(len(fd) for fd in feat_dicts)

            res = {
                'dataset': model_info['dataset_name'],
                'model': model_info['model_name'],
                'timestamp': timestamp,
                'extracted features': extracted_features_dirs_name,
            }
            total = 0          # total number of examples
            total_w_feats = 0  # number of examples with one or more features
            total_n_feats = 0  # sum of the number of features of all examples
            # `.items()` (not `.iteritems()`) so that this runs on Python 3 too.
            for fm, counts in n_feats.items():
                n_examples = len(counts)
                n_w_feats = sum(1 for count in counts if count >= 1)
                res[fm] = n_examples
                res["{}>1".format(fm)] = n_w_feats
                total += n_examples
                total_w_feats += n_w_feats
                total_n_feats += sum(counts)

            # Guard against runs without any example: report NaN instead of
            # raising ZeroDivisionError.
            if total:
                res['avg #feats/example'] = float(total_n_feats) / total
                res['% > 1'] = float(total_w_feats) / total
            else:
                res['avg #feats/example'] = float('nan')
                res['% > 1'] = float('nan')
            res['total'] = total
            res["total>1"] = total_w_feats
            results.append(res)
            print("\t{} (...)".format(extracted_features_dirs_name))
    print("Done.")
    return results

# Run

In [None]:
# Embedding model result directories to analyze. These are hard-coded
# absolute paths; each one is handed to `extraction_results`, which looks for
# a `pra_explain/results` sub-directory inside it.
emb_import_paths = [
    "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/WN11/TransE/1527008113",
    "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/FB13/TransE/1527033688",
    "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/NELL186/TransE/1526711822",
]
# One summary dict per (model, feature-extraction run).
res = extraction_results(emb_import_paths)
# Build a table with one row per dict; kept as the last expression of the
# cell so that the notebook displays it.
pd.DataFrame(res)

# Debug