New models, query strategies and API changes (#149)
qubixes authored and J535D165 committed Jan 22, 2020
1 parent f3de58d commit a98e161
Showing 70 changed files with 3,030 additions and 1,873 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-workflow.yml
@@ -14,6 +14,6 @@ jobs:
       run: |
         pip install pytest
         pip install --upgrade setuptools>=41.0.0
-        pip install .
+        pip install .[all]
         pytest
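
The CI job now installs the package with its optional dependencies. Note that pip install .[all] only resolves if the package declares a matching extra; a minimal sketch of such a declaration, where the "all" contents below are placeholder names and not the project's real dependency list:

    # setup.py (sketch) -- the contents of the "all" extra are assumptions.
    from setuptools import find_packages, setup

    setup(
        name="asreview",
        packages=find_packages(),
        extras_require={
            "all": ["tensorflow", "gensim"],  # placeholders for optional deps
        },
    )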
6 changes: 3 additions & 3 deletions asreview/__init__.py
@@ -13,8 +13,9 @@
 # limitations under the License.
 
 from asreview.logging.utils import open_logger
-from asreview.models.embedding import load_embedding
-from asreview.models.embedding import sample_embedding
+from asreview.feature_extraction.embedding_lstm import load_embedding
+from asreview.feature_extraction.embedding_lstm import sample_embedding
+from asreview.feature_extraction.embedding_lstm import text_to_features
 from asreview.readers import ASReviewData
 from asreview.readers import read_csv
 from asreview.readers import read_data
@@ -28,7 +29,6 @@
 from asreview.review import review_simulate
 from asreview.review import ReviewOracle
 from asreview.review import ReviewSimulate
-from asreview.utils import text_to_features
 
 from ._version import get_versions
 __version__ = get_versions()['version']
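
For downstream code, the embedding helpers moved out of asreview.models.embedding and asreview.utils into the new asreview.feature_extraction.embedding_lstm module, while the top-level re-exports keep working. A brief before/after illustration of the import change:

    # Before this commit:
    # from asreview.models.embedding import load_embedding, sample_embedding
    # from asreview.utils import text_to_features

    # After this commit, either the new module path:
    from asreview.feature_extraction.embedding_lstm import (
        load_embedding, sample_embedding, text_to_features)

    # ...or the unchanged top-level re-exports:
    from asreview import load_embedding, sample_embedding, text_to_features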
48 changes: 44 additions & 4 deletions asreview/analysis/analysis.py
@@ -12,20 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Analysis of log files.
'''

import itertools
import json
import os

import numpy as np
from scipy import stats
from sklearn.cluster import KMeans

from asreview.logging.utils import loggers_from_dir
from asreview.analysis.statistics import _get_labeled_order
from asreview.analysis.statistics import _get_limits
from asreview.analysis.statistics import _find_inclusions
from asreview.analysis.statistics import _get_last_proba_order
from asreview.analysis.statistics import _random_ttd, _cluster_ttd, _max_ttd


class Analysis():
@@ -332,6 +332,46 @@ def limits(self, prob_allow_miss=[0.1]):
results["limits"][i_prob], np.int)
return results

def time_to_inclusion(self, X_fp):
with open(X_fp, "r") as fp:
X = np.array(json.load(fp))

n_clusters = int(len(self.labels)/150)
model = KMeans(n_clusters=n_clusters, n_init=1)
clusters = model.fit_predict(X, self.labels)
logger = self.loggers[self._first_file]
n_queries = logger.n_queries()

results = {
"x_range": [],
"ttd": {
"random": [],
"cluster": [],
"max": [],
}
}
n_train = 0
for query_i in range(n_queries):
new_random_ttd = _random_ttd(self.loggers, query_i, self.labels)
new_cluster_ttd = _cluster_ttd(self.loggers, query_i, self.labels,
clusters)
new_max_ttd = _max_ttd(self.loggers, query_i, self.labels)

results["ttd"]["random"].append(new_random_ttd)
results["ttd"]["cluster"].append(new_cluster_ttd)
results["ttd"]["max"].append(new_max_ttd)

try:
new_train_idx = logger.get("train_idx", query_i)
except KeyError:
new_train_idx = None

if new_train_idx is not None:
n_train = len(new_train_idx)
results["x_range"].append(n_train)

return results

def close(self):
"Close loggers."
for logger in self.loggers.values():
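
The new Analysis.time_to_inclusion computes, per query step, the average time to discovery ("ttd") of the remaining relevant records under three orderings: a random screening order, a cluster-informed order, and the model's probability ranking. A hypothetical usage sketch; the log directory, the feature-matrix path, and the Analysis.from_dir constructor are assumptions for illustration, not shown in this diff:

    # Hypothetical usage; paths and the from_dir constructor are assumptions.
    from asreview.analysis import Analysis

    analysis = Analysis.from_dir("logs/")  # directory with simulation logs
    # "X.json" would hold the feature matrix as a JSON array, as the method
    # expects.
    results = analysis.time_to_inclusion("X.json")
    for strategy, ttd_per_query in results["ttd"].items():
        print(strategy, ttd_per_query)
    analysis.close()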
113 changes: 113 additions & 0 deletions asreview/analysis/statistics.py
@@ -129,3 +129,116 @@ def _get_limits(loggers, query_i, labels, proba_allow_miss=[]):
if len(allow_miss) == 0:
break
return limits


def _random_ttd(loggers, query_i, labels):
all_ttd = []
for logger in loggers.values():
try:
pool_idx = logger.get("pool_idx", query_i)
except KeyError:
continue

pool_labels = labels[pool_idx]
n_included = np.sum(pool_labels)

if n_included == 0:
continue
ttd = 0
p_only_zero = 1
for i in range(len(pool_labels) - n_included):
p_first = n_included/(len(pool_labels)-i)
ttd += p_only_zero*p_first*i
p_only_zero *= (1-p_first)
all_ttd.append(ttd)

if len(all_ttd) == 0:
ttd_avg = 0
else:
ttd_avg = np.average(all_ttd)
return ttd_avg


def _max_ttd(loggers, query_i, labels):
all_ttd = []
for logger in loggers.values():
proba_order = _get_proba_order(logger, query_i)
if proba_order is None:
all_ttd.append(0)
continue
if len(proba_order) == 0:
continue

x = np.where(labels[proba_order] == 1)[0]
if len(x) == 0:
ttd = 0
else:
ttd = (len(proba_order)-1) - x[-1]
all_ttd.append(ttd)

if len(all_ttd) == 0:
ttd_avg = 0
else:
ttd_avg = np.average(all_ttd)
return ttd_avg


def _cluster_order(all_dict, power=0):
scores = []
for clust_id in all_dict:
for i in range(all_dict[clust_id]):
new_score = (i+1) * pow(all_dict[clust_id], -power)
scores.append((clust_id, new_score))
scores = sorted(scores, key=lambda x: x[1])
return [x[0] for x in scores]


def _get_clustering(all_prediction, pool_idx, labels):
pool_prediction = all_prediction[pool_idx]
one_idx = np.where(labels[pool_idx] == 1)[0]
unique, counts = np.unique(pool_prediction, return_counts=True)
all_dict = {unique[i]: counts[i] for i in range(len(unique))}
all_counts = [all_dict.get(i, 0) for i in range(np.max(unique)+1)]

prediction = pool_prediction[one_idx, ]
unique, counts = np.unique(prediction, return_counts=True)
one_dict = {unique[i]: counts[i] for i in range(len(unique))}
one_counts = [one_dict.get(i, 0) for i in range(len(all_counts))]
return all_dict, all_counts, one_dict, one_counts


def _cluster_ttd(loggers, query_i, labels, all_prediction):
all_ttd = []
for logger in loggers.values():
try:
pool_idx = logger.get("pool_idx", query_i)
except KeyError:
all_ttd.append(0)
continue

all_dict, all_counts, _one_dict, one_counts = _get_clustering(
all_prediction, pool_idx, labels)
cluster_order = _cluster_order(all_dict)

p_only_zero = 1
ttd = 0
if np.sum(one_counts) == 0:
continue
for i, i_clust in enumerate(cluster_order):
try:
p_first = one_counts[i_clust]/all_counts[i_clust]
            except IndexError:
                # Debug output for out-of-range cluster ids; skip this
                # cluster rather than falling through with a stale p_first.
                print(
                    i_clust, list(all_dict), len(all_counts), len(one_counts))
                continue
ttd += p_only_zero*p_first*i
p_only_zero *= 1-p_first
all_counts[i_clust] -= 1
if p_only_zero < 1e-6:
break
all_ttd.append(ttd)

if len(all_ttd) == 0:
ttd_avg = 0
else:
ttd_avg = np.average(all_ttd)
return ttd_avg
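
The loop in _random_ttd computes the expected number of draws before the first relevant record when the remaining pool is screened in random order: at step i, the probability that the first hit lands there is the probability that all earlier draws were irrelevant times n_included/(pool size - i). A small self-contained check of that expectation against simulation, using toy labels rather than project data:

    import numpy as np

    # Toy pool: 8 irrelevant and 2 relevant records.
    pool_labels = np.array([0] * 8 + [1] * 2)
    n_included = pool_labels.sum()

    # Analytic expectation, mirroring the loop in _random_ttd.
    ttd, p_only_zero = 0.0, 1.0
    for i in range(len(pool_labels) - n_included):
        p_first = n_included / (len(pool_labels) - i)
        ttd += p_only_zero * p_first * i
        p_only_zero *= 1 - p_first

    # Monte Carlo: index of the first relevant record in a random order.
    rng = np.random.default_rng(42)
    draws = [np.argmax(rng.permutation(pool_labels)) for _ in range(100000)]
    print(ttd, np.mean(draws))  # both close to 8/3, i.e. about 2.67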
12 changes: 5 additions & 7 deletions asreview/balance_strategies/__init__.py
@@ -12,11 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from asreview.balance_strategies.full_sampling import full_sample
-from asreview.balance_strategies.full_sampling import FullSampleTD
-from asreview.balance_strategies.triple_balance import triple_balance
-from asreview.balance_strategies.triple_balance import TripleBalanceTD
-from asreview.balance_strategies.undersampling import undersample
-from asreview.balance_strategies.undersampling import UndersampleTD
-from asreview.balance_strategies.utils import get_balance_strategy
+from asreview.balance_strategies.simple import SimpleBalance
+from asreview.balance_strategies.double import DoubleBalance
+from asreview.balance_strategies.triple import TripleBalance
+from asreview.balance_strategies.undersample import UndersampleBalance
+from asreview.balance_strategies.utils import get_balance_model
 from asreview.balance_strategies.utils import get_balance_class
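
The public getter was renamed from get_balance_strategy to get_balance_model, alongside get_balance_class. A hypothetical call pattern, assuming the getters resolve a strategy by its name string; the exact signatures are not shown in this diff:

    from asreview.balance_strategies import get_balance_class, get_balance_model

    # Assumed name-based lookup; "double" matches the DoubleBalance import
    # above.
    balance_model = get_balance_model("double")  # an instance, ready to use
    BalanceClass = get_balance_class("double")   # the class itself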
51 changes: 24 additions & 27 deletions asreview/balance_strategies/base.py
@@ -12,35 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 
-from asreview.utils import _unsafe_dict_update
+from asreview.base_model import BaseModel
 
 
-class BaseTrainData(ABC):
-    " Abstract class for balance strategies. "
-    def __init__(self, balance_kwargs):
-        self.balance_kwargs = self.default_kwargs()
-        self.balance_kwargs = _unsafe_dict_update(self.balance_kwargs,
-                                                  balance_kwargs)
-
-    def func_kwargs_descr(self):
-        " Should give back the function and arguments for balancing. "
-        return (self.__class__.function(), self.balance_kwargs,
-                self.__class__.description())
-
-    def default_kwargs(self):
-        return {}
-
-    def hyperopt_space(self):
-        return {}
-
-    @staticmethod
-    @abstractmethod
-    def function():
-        raise NotImplementedError
-
-    @staticmethod
-    @abstractmethod
-    def description():
-        raise NotImplementedError
+class BaseBalance(BaseModel):
+    "Abstract class for balance strategies."
+    name = "base-balance"
+
+    @abstractmethod
+    def sample(self, X, y, train_idx, shared):
+        """Resample the training data.
+
+        Arguments
+        ---------
+        X: np.array
+            Complete feature matrix.
+        y: np.array
+            Labels for all papers.
+        train_idx: np.array
+            Training indices, that is all papers that have been reviewed.
+        shared: dict
+            Dictionary to share data between balancing models and other
+            models.
+
+        Returns
+        -------
+        np.array, np.array:
+            X_train, y_train: the resampled matrix, labels.
+        """
+        raise NotImplementedError
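Concrete strategies now subclass BaseBalance and override sample, instead of supplying function/description pairs as under the old BaseTrainData API. A minimal sketch of a custom strategy against the new interface; the no-op IdentityBalance below is illustrative only and assumes subclasses can be instantiated without arguments:

    import numpy as np

    from asreview.balance_strategies.base import BaseBalance


    class IdentityBalance(BaseBalance):
        "Return the labeled data unchanged (no re-balancing)."
        name = "identity"

        def sample(self, X, y, train_idx, shared):
            # Select only the reviewed papers; no over- or undersampling.
            return X[train_idx], y[train_idx]


    # Toy demo with random features and hand-written labels.
    X = np.random.rand(10, 4)
    y = np.array([0, 1, 0, 0, 1, 0, 0, 0, 1, 0])
    train_idx = np.array([0, 1, 2, 3])
    X_train, y_train = IdentityBalance().sample(X, y, train_idx, shared={})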