New models, query strategies and API changes (#149)
qubixes authored and J535D165 committed Jan 22, 2020
1 parent f3de58d commit a98e161
Showing 70 changed files with 3,030 additions and 1,873 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-workflow.yml
@@ -14,6 +14,6 @@ jobs:
       run: |
         pip install pytest
         pip install --upgrade setuptools>=41.0.0
-        pip install .
+        pip install .[all]
         pytest
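
The CI job now installs the package with its optional dependencies. Note that pip install .[all] only resolves if the package declares a matching extra; a minimal sketch of such a declaration, where the "all" contents below are placeholder names and not the project's real dependency list:

    # setup.py (sketch) -- the contents of the "all" extra are assumptions.
    from setuptools import find_packages, setup

    setup(
        name="asreview",
        packages=find_packages(),
        extras_require={
            "all": ["tensorflow", "gensim"],  # placeholders for optional deps
        },
    )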
6 changes: 3 additions & 3 deletions asreview/__init__.py
@@ -13,8 +13,9 @@
 # limitations under the License.
 
 from asreview.logging.utils import open_logger
-from asreview.models.embedding import load_embedding
-from asreview.models.embedding import sample_embedding
+from asreview.feature_extraction.embedding_lstm import load_embedding
+from asreview.feature_extraction.embedding_lstm import sample_embedding
+from asreview.feature_extraction.embedding_lstm import text_to_features
 from asreview.readers import ASReviewData
 from asreview.readers import read_csv
 from asreview.readers import read_data
@@ -28,7 +29,6 @@
 from asreview.review import review_simulate
 from asreview.review import ReviewOracle
 from asreview.review import ReviewSimulate
-from asreview.utils import text_to_features
 
 from ._version import get_versions
 __version__ = get_versions()['version']
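
For downstream code, the embedding helpers moved out of asreview.models.embedding and asreview.utils into the new asreview.feature_extraction.embedding_lstm module, while the top-level re-exports keep working. A brief before/after illustration of the import change:

    # Before this commit:
    # from asreview.models.embedding import load_embedding, sample_embedding
    # from asreview.utils import text_to_features

    # After this commit, either the new module path:
    from asreview.feature_extraction.embedding_lstm import (
        load_embedding, sample_embedding, text_to_features)

    # ...or the unchanged top-level re-exports:
    from asreview import load_embedding, sample_embedding, text_to_features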
48 changes: 44 additions & 4 deletions asreview/analysis/analysis.py
@@ -12,20 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Analysis of log files.
'''

import itertools
import json
import os

import numpy as np
from scipy import stats
from sklearn.cluster import KMeans

from asreview.logging.utils import loggers_from_dir
from asreview.analysis.statistics import _get_labeled_order
from asreview.analysis.statistics import _get_limits
from asreview.analysis.statistics import _find_inclusions
from asreview.analysis.statistics import _get_last_proba_order
from asreview.analysis.statistics import _random_ttd, _cluster_ttd, _max_ttd


class Analysis():
@@ -332,6 +332,46 @@ def limits(self, prob_allow_miss=[0.1]):
results["limits"][i_prob], np.int)
return results

def time_to_inclusion(self, X_fp):
with open(X_fp, "r") as fp:
X = np.array(json.load(fp))

n_clusters = int(len(self.labels)/150)
model = KMeans(n_clusters=n_clusters, n_init=1)
clusters = model.fit_predict(X, self.labels)
logger = self.loggers[self._first_file]
n_queries = logger.n_queries()

results = {
"x_range": [],
"ttd": {
"random": [],
"cluster": [],
"max": [],
}
}
n_train = 0
for query_i in range(n_queries):
new_random_ttd = _random_ttd(self.loggers, query_i, self.labels)
new_cluster_ttd = _cluster_ttd(self.loggers, query_i, self.labels,
clusters)
new_max_ttd = _max_ttd(self.loggers, query_i, self.labels)

results["ttd"]["random"].append(new_random_ttd)
results["ttd"]["cluster"].append(new_cluster_ttd)
results["ttd"]["max"].append(new_max_ttd)

try:
new_train_idx = logger.get("train_idx", query_i)
except KeyError:
new_train_idx = None

if new_train_idx is not None:
n_train = len(new_train_idx)
results["x_range"].append(n_train)

return results

def close(self):
"Close loggers."
for logger in self.loggers.values():
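
The new Analysis.time_to_inclusion computes, per query step, the average time to discovery ("ttd") of the remaining relevant records under three orderings: a random screening order, a cluster-informed order, and the model's probability ranking. A hypothetical usage sketch; the log directory, the feature-matrix path, and the Analysis.from_dir constructor are assumptions for illustration, not shown in this diff:

    # Hypothetical usage; paths and the from_dir constructor are assumptions.
    from asreview.analysis import Analysis

    analysis = Analysis.from_dir("logs/")  # directory with simulation logs
    # "X.json" would hold the feature matrix as a JSON array, as the method
    # expects.
    results = analysis.time_to_inclusion("X.json")
    for strategy, ttd_per_query in results["ttd"].items():
        print(strategy, ttd_per_query)
    analysis.close()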
113 changes: 113 additions & 0 deletions asreview/analysis/statistics.py
@@ -129,3 +129,116 @@ def _get_limits(loggers, query_i, labels, proba_allow_miss=[]):
if len(allow_miss) == 0:
break
return limits


def _random_ttd(loggers, query_i, labels):
all_ttd = []
for logger in loggers.values():
try:
pool_idx = logger.get("pool_idx", query_i)
except KeyError:
continue

pool_labels = labels[pool_idx]
n_included = np.sum(pool_labels)

if n_included == 0:
continue
ttd = 0
p_only_zero = 1
for i in range(len(pool_labels) - n_included):
p_first = n_included/(len(pool_labels)-i)
ttd += p_only_zero*p_first*i
p_only_zero *= (1-p_first)
all_ttd.append(ttd)

if len(all_ttd) == 0:
ttd_avg = 0
else:
ttd_avg = np.average(all_ttd)
return ttd_avg


def _max_ttd(loggers, query_i, labels):
all_ttd = []
for logger in loggers.values():
proba_order = _get_proba_order(logger, query_i)
if proba_order is None:
all_ttd.append(0)
continue
if len(proba_order) == 0:
continue

x = np.where(labels[proba_order] == 1)[0]
if len(x) == 0:
ttd = 0
else:
ttd = (len(proba_order)-1) - x[-1]
all_ttd.append(ttd)

if len(all_ttd) == 0:
ttd_avg = 0
else:
ttd_avg = np.average(all_ttd)
return ttd_avg


def _cluster_order(all_dict, power=0):
scores = []
for clust_id in all_dict:
for i in range(all_dict[clust_id]):
new_score = (i+1) * pow(all_dict[clust_id], -power)
scores.append((clust_id, new_score))
scores = sorted(scores, key=lambda x: x[1])
return [x[0] for x in scores]


def _get_clustering(all_prediction, pool_idx, labels):
pool_prediction = all_prediction[pool_idx]
one_idx = np.where(labels[pool_idx] == 1)[0]
unique, counts = np.unique(pool_prediction, return_counts=True)
all_dict = {unique[i]: counts[i] for i in range(len(unique))}
all_counts = [all_dict.get(i, 0) for i in range(np.max(unique)+1)]

prediction = pool_prediction[one_idx, ]
unique, counts = np.unique(prediction, return_counts=True)
one_dict = {unique[i]: counts[i] for i in range(len(unique))}
one_counts = [one_dict.get(i, 0) for i in range(len(all_counts))]
return all_dict, all_counts, one_dict, one_counts


def _cluster_ttd(loggers, query_i, labels, all_prediction):
all_ttd = []
for logger in loggers.values():
try:
pool_idx = logger.get("pool_idx", query_i)
except KeyError:
all_ttd.append(0)
continue

all_dict, all_counts, _one_dict, one_counts = _get_clustering(
all_prediction, pool_idx, labels)
cluster_order = _cluster_order(all_dict)

p_only_zero = 1
ttd = 0
if np.sum(one_counts) == 0:
continue
for i, i_clust in enumerate(cluster_order):
try:
p_first = one_counts[i_clust]/all_counts[i_clust]
            except IndexError:
                # Debug output for out-of-range cluster ids; skip this
                # cluster rather than falling through with a stale p_first.
                print(
                    i_clust, list(all_dict), len(all_counts), len(one_counts))
                continue
ttd += p_only_zero*p_first*i
p_only_zero *= 1-p_first
all_counts[i_clust] -= 1
if p_only_zero < 1e-6:
break
all_ttd.append(ttd)

if len(all_ttd) == 0:
ttd_avg = 0
else:
ttd_avg = np.average(all_ttd)
return ttd_avg
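
The loop in _random_ttd computes the expected number of draws before the first relevant record when the remaining pool is screened in random order: at step i, the probability that the first hit lands there is the probability that all earlier draws were irrelevant times n_included/(pool size - i). A small self-contained check of that expectation against simulation, using toy labels rather than project data:

    import numpy as np

    # Toy pool: 8 irrelevant and 2 relevant records.
    pool_labels = np.array([0] * 8 + [1] * 2)
    n_included = pool_labels.sum()

    # Analytic expectation, mirroring the loop in _random_ttd.
    ttd, p_only_zero = 0.0, 1.0
    for i in range(len(pool_labels) - n_included):
        p_first = n_included / (len(pool_labels) - i)
        ttd += p_only_zero * p_first * i
        p_only_zero *= 1 - p_first

    # Monte Carlo: index of the first relevant record in a random order.
    rng = np.random.default_rng(42)
    draws = [np.argmax(rng.permutation(pool_labels)) for _ in range(100000)]
    print(ttd, np.mean(draws))  # both close to 8/3, i.e. about 2.67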
12 changes: 5 additions & 7 deletions asreview/balance_strategies/__init__.py
@@ -12,11 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from asreview.balance_strategies.full_sampling import full_sample
-from asreview.balance_strategies.full_sampling import FullSampleTD
-from asreview.balance_strategies.triple_balance import triple_balance
-from asreview.balance_strategies.triple_balance import TripleBalanceTD
-from asreview.balance_strategies.undersampling import undersample
-from asreview.balance_strategies.undersampling import UndersampleTD
-from asreview.balance_strategies.utils import get_balance_strategy
+from asreview.balance_strategies.simple import SimpleBalance
+from asreview.balance_strategies.double import DoubleBalance
+from asreview.balance_strategies.triple import TripleBalance
+from asreview.balance_strategies.undersample import UndersampleBalance
+from asreview.balance_strategies.utils import get_balance_model
 from asreview.balance_strategies.utils import get_balance_class
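
The public getter was renamed from get_balance_strategy to get_balance_model, alongside get_balance_class. A hypothetical call pattern, assuming the getters resolve a strategy by its name string; the exact signatures are not shown in this diff:

    from asreview.balance_strategies import get_balance_class, get_balance_model

    # Assumed name-based lookup; "double" matches the DoubleBalance import
    # above.
    balance_model = get_balance_model("double")  # an instance, ready to use
    BalanceClass = get_balance_class("double")   # the class itself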
51 changes: 24 additions & 27 deletions asreview/balance_strategies/base.py
@@ -12,35 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 
-from asreview.utils import _unsafe_dict_update
+from asreview.base_model import BaseModel
 
 
-class BaseTrainData(ABC):
-    " Abstract class for balance strategies. "
-    def __init__(self, balance_kwargs):
-        self.balance_kwargs = self.default_kwargs()
-        self.balance_kwargs = _unsafe_dict_update(self.balance_kwargs,
-                                                  balance_kwargs)
-
-    def func_kwargs_descr(self):
-        " Should give back the function and arguments for balancing. "
-        return (self.__class__.function(), self.balance_kwargs,
-                self.__class__.description())
-
-    def default_kwargs(self):
-        return {}
-
-    def hyperopt_space(self):
-        return {}
-
-    @staticmethod
-    @abstractmethod
-    def function():
-        raise NotImplementedError
-
-    @staticmethod
-    @abstractmethod
-    def description():
-        raise NotImplementedError
+class BaseBalance(BaseModel):
+    "Abstract class for balance strategies."
+    name = "base-balance"
+
+    @abstractmethod
+    def sample(self, X, y, train_idx, shared):
+        """Resample the training data.
+
+        Arguments
+        ---------
+        X: np.array
+            Complete feature matrix.
+        y: np.array
+            Labels for all papers.
+        train_idx: np.array
+            Training indices, that is all papers that have been reviewed.
+        shared: dict
+            Dictionary to share data between balancing models and other
+            models.
+
+        Returns
+        -------
+        np.array, np.array:
+            X_train, y_train: the resampled matrix, labels.
+        """
+        raise NotImplementedError
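Concrete strategies now subclass BaseBalance and override sample, instead of supplying function/description pairs as under the old BaseTrainData API. A minimal sketch of a custom strategy against the new interface; the no-op IdentityBalance below is illustrative only and assumes subclasses can be instantiated without arguments:

    import numpy as np

    from asreview.balance_strategies.base import BaseBalance


    class IdentityBalance(BaseBalance):
        "Return the labeled data unchanged (no re-balancing)."
        name = "identity"

        def sample(self, X, y, train_idx, shared):
            # Select only the reviewed papers; no over- or undersampling.
            return X[train_idx], y[train_idx]


    # Toy demo with random features and hand-written labels.
    X = np.random.rand(10, 4)
    y = np.array([0, 1, 0, 0, 1, 0, 0, 0, 1, 0])
    train_idx = np.array([0, 1, 2, 3])
    X_train, y_train = IdentityBalance().sample(X, y, train_idx, shared={})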