Merge tag '3.36.2' into stable

biolab · Oct 31, 2023 · 8d59ef1 · 8d59ef1
2 parents a309db8 + 61033ec
commit 8d59ef1
Show file tree

Hide file tree

Showing 61 changed files with 897 additions and 175 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,18 @@ Change Log
 [next] - TBA
 ------------
 
+
+[3.36.2] - 2023-10-31
+--------------------
+##### Enhancements
+* oweditdomain: Add variable filter ([#6603](../../pull/6603))
+* IO - Change origin attribute when not found on system ([#6555](../../pull/6555))
+* Predictions: Output errors ([#6577](../../pull/6577))
+
+##### Bugfixes
+* EmbedderCache - Handle cache persisting when no permissions ([#6611](../../pull/6611))
+
+
 [3.36.1] - 2023-09-22
 --------------------
 ##### Bugfixes
@@ -1829,7 +1841,8 @@ Change Log
 * Initial version based on Python 1.5.2 and Qt 2.3
 
 
-[next]: https://github.com/biolab/orange3/compare/3.36.1..HEAD
+[next]: https://github.com/biolab/orange3/compare/3.36.2..HEAD
+[3.36.2]: https://github.com/biolab/orange3/compare/3.36.1...3.36.2
 [3.36.1]: https://github.com/biolab/orange3/compare/3.36.0...3.36.1
 [3.36.0]: https://github.com/biolab/orange3/compare/3.35.0...3.36.0
 [3.35.0]: https://github.com/biolab/orange3/compare/3.34.1...3.35.0

diff --git a/Orange/base.py b/Orange/base.py
@@ -19,6 +19,7 @@
 from Orange.util import Reprable, OrangeDeprecationWarning, wrap_callback, \
     dummy_callback
 
+
 __all__ = ["Learner", "Model", "SklLearner", "SklModel",
            "ReprableWithPreprocessors"]
 
@@ -465,10 +466,9 @@ def fix_dim(x):
         elif prediction.ndim == 2 + multitarget:
             value, probs = None, prediction
         else:
-            raise TypeError("model returned a %i-dimensional array",
-                            prediction.ndim)
+            raise TypeError(f"model returned a {prediction.ndim}-dimensional array")
 
-        # Ensure that we have what we need to return; backmapp everything
+        # Ensure that we have what we need to return; backmap everything
         if probs is None and (ret != Model.Value or backmappers is not None):
             probs = one_hot_probs(value)
         if probs is not None:
@@ -596,7 +596,15 @@ def fit(self, X, Y, W=None):
     def supports_weights(self):
         """Indicates whether this learner supports weighted instances.
         """
-        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames
+        warnings.warn('SklLearner.supports_weights property is deprecated. All '
+                      'subclasses should redefine the supports_weights attribute. '
+                      'The property will be removed in 3.39.',
+                      OrangeDeprecationWarning)
+        varnames = self.__wraps__.fit.__code__.co_varnames
+        # scikit-learn often uses decorators on fit()
+        if hasattr(self.__wraps__.fit, "__wrapped__"):
+            varnames = varnames + self.__wraps__.fit.__wrapped__.__code__.co_varnames
+        return 'sample_weight' in varnames
 
     def __getattr__(self, item):
         try:

diff --git a/Orange/classification/gb.py b/Orange/classification/gb.py
@@ -23,6 +23,7 @@ def score(self, data: Table) -> Tuple[np.ndarray, Tuple[Variable]]:
 class GBClassifier(SklLearner, _FeatureScorerMixin):
     __wraps__ = skl_ensemble.GradientBoostingClassifier
     __returns__ = SklModel
+    supports_weights = True
 
     def __init__(self,
                  loss="log_loss",

diff --git a/Orange/classification/knn.py b/Orange/classification/knn.py
@@ -7,3 +7,4 @@
 
 class KNNLearner(KNNBase, SklLearner):
     __wraps__ = skl_neighbors.KNeighborsClassifier
+    supports_weights = False
diff --git a/Orange/classification/logistic_regression.py b/Orange/classification/logistic_regression.py
@@ -33,6 +33,7 @@ class LogisticRegressionLearner(SklLearner, _FeatureScorerMixin):
     __wraps__ = skl_linear_model.LogisticRegression
     __returns__ = LogisticRegressionClassifier
     preprocessors = SklLearner.preprocessors
+    supports_weights = True
 
     def __init__(self, penalty="l2", dual=False, tol=0.0001, C=1.0,
                  fit_intercept=True, intercept_scaling=1, class_weight=None,

diff --git a/Orange/classification/neural_network.py b/Orange/classification/neural_network.py
@@ -25,6 +25,7 @@ class MLPClassifierWCallback(skl_nn.MLPClassifier, NIterCallbackMixin):
 
 class NNClassificationLearner(NNBase, SklLearner):
     __wraps__ = MLPClassifierWCallback
+    supports_weights = False
 
     def _initialize_wrapped(self):
         clf = SklLearner._initialize_wrapped(self)

diff --git a/Orange/classification/outlier_detection.py b/Orange/classification/outlier_detection.py
@@ -89,6 +89,7 @@ class OneClassSVMLearner(_OutlierLearner):
     name = "One class SVM"
     __wraps__ = OneClassSVM
     preprocessors = SklLearner.preprocessors + [AdaptiveNormalize()]
+    supports_weights = True
 
     def __init__(self, kernel='rbf', degree=3, gamma="auto", coef0=0.0,
                  tol=0.001, nu=0.5, shrinking=True, cache_size=200,
@@ -100,6 +101,7 @@ def __init__(self, kernel='rbf', degree=3, gamma="auto", coef0=0.0,
 class LocalOutlierFactorLearner(_OutlierLearner):
     __wraps__ = LocalOutlierFactor
     name = "Local Outlier Factor"
+    supports_weights = False
 
     def __init__(self, n_neighbors=20, algorithm="auto", leaf_size=30,
                  metric="minkowski", p=2, metric_params=None,
@@ -112,6 +114,7 @@ def __init__(self, n_neighbors=20, algorithm="auto", leaf_size=30,
 class IsolationForestLearner(_OutlierLearner):
     __wraps__ = IsolationForest
     name = "Isolation Forest"
+    supports_weights = True
 
     def __init__(self, n_estimators=100, max_samples='auto',
                  contamination='auto', max_features=1.0, bootstrap=False,
@@ -156,6 +159,7 @@ class EllipticEnvelopeLearner(_OutlierLearner):
     __wraps__ = EllipticEnvelope
     __returns__ = EllipticEnvelopeClassifier
     name = "Covariance Estimator"
+    supports_weights = False
 
     def __init__(self, store_precision=True, assume_centered=False,
                  support_fraction=None, contamination=0.1,

diff --git a/Orange/classification/random_forest.py b/Orange/classification/random_forest.py
@@ -38,6 +38,7 @@ def wrap(tree, i):
 class RandomForestLearner(SklLearner, _FeatureScorerMixin):
     __wraps__ = skl_ensemble.RandomForestClassifier
     __returns__ = RandomForestClassifier
+    supports_weights = True
 
     def __init__(self,
                  n_estimators=10,

diff --git a/Orange/classification/sgd.py b/Orange/classification/sgd.py
@@ -12,6 +12,7 @@ class SGDClassificationLearner(SklLearner):
     __wraps__ = SGDClassifier
     __returns__ = LinearModel
     preprocessors = SklLearner.preprocessors + [Normalize()]
+    supports_weights = True
 
     def __init__(self, loss='hinge', penalty='l2', alpha=0.0001,
                  l1_ratio=0.15, fit_intercept=True, max_iter=5,

diff --git a/Orange/classification/tree.py b/Orange/classification/tree.py
@@ -233,6 +233,7 @@ class SklTreeLearner(SklLearner):
     __wraps__ = skl_tree.DecisionTreeClassifier
     __returns__ = SklTreeClassifier
     name = 'tree'
+    supports_weights = True
 
     def __init__(self, criterion="gini", splitter="best", max_depth=None,
                  min_samples_split=2, min_samples_leaf=1,

diff --git a/Orange/classification/xgb.py b/Orange/classification/xgb.py
@@ -25,6 +25,7 @@ def score(self, data: Table) -> Tuple[np.ndarray, Tuple[Variable]]:
 class XGBClassifier(XGBBase, Learner, _FeatureScorerMixin):
     __wraps__ = xgboost.XGBClassifier
     __returns__ = SklModel
+    supports_weights = True
 
     def __init__(self,
                  max_depth=None,
@@ -88,6 +89,7 @@ def __init__(self,
 class XGBRFClassifier(XGBBase, Learner, _FeatureScorerMixin):
     __wraps__ = xgboost.XGBRFClassifier
     __returns__ = SklModel
+    supports_weights = True
 
     def __init__(self,
                  max_depth=None,

diff --git a/Orange/data/io.py b/Orange/data/io.py
@@ -24,7 +24,7 @@
 import xlsxwriter
 import openpyxl
 
-from Orange.data import _io, Table, Domain, ContinuousVariable
+from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin
 from Orange.data import Compression, open_compressed, detect_encoding, \
     isnastr, guess_data_type, sanitize_variable
 from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL
@@ -164,21 +164,15 @@ def read(self):
                         skipinitialspace=True,
                     )
                     data = self.data_table(reader)
-
-                    # TODO: Name can be set unconditionally when/if
-                    # self.filename will always be a string with the file name.
-                    # Currently, some tests pass StringIO instead of
-                    # the file name to a reader.
-                    if isinstance(self.filename, str):
-                        data.name = path.splitext(
-                            path.split(self.filename)[-1])[0]
+                    data.name = path.splitext(path.split(self.filename)[-1])[0]
                     if error and isinstance(error, UnicodeDecodeError):
                         pos, endpos = error.args[2], error.args[3]
                         warning = ('Skipped invalid byte(s) in position '
                                    '{}{}').format(pos,
                                                   ('-' + str(endpos)) if (endpos - pos) > 1 else '')
                         warnings.warn(warning)
                     self.set_table_metadata(self.filename, data)
+                    update_origin(data, self.filename)
                     return data
                 except Exception as e:
                     error = e
@@ -215,6 +209,7 @@ def read(self):
             if not isinstance(table, Table):
                 raise TypeError("file does not contain a data table")
             else:
+                update_origin(table, self.filename)
                 return table
 
     @classmethod
@@ -264,6 +259,7 @@ def read(self):
         try:
             cells = self.get_cells()
             table = self.data_table(cells)
+            update_origin(table, self.filename)
             table.name = path.splitext(path.split(self.filename)[-1])[0]
             if self.sheet and len(self.sheets) > 1:
                 table.name = '-'.join((table.name, self.sheet))

diff --git a/Orange/data/io_base.py b/Orange/data/io_base.py
@@ -48,7 +48,7 @@ class Flags:
         ('weight', 'w'),
         ('.+?=.*?', ''),  # general key=value attributes
     ))
-    _RE_ALL = re.compile(r'^({})$'.format('|'.join(
+    RE_ALL = re.compile(r'^({})$'.format('|'.join(
         filter(None, flatten(ALL.items())))))
 
     def __init__(self, flags):
@@ -57,7 +57,7 @@ def __init__(self, flags):
         self.attributes = {}
         for flag in flags or []:
             flag = flag.strip()
-            if self._RE_ALL.match(flag):
+            if self.RE_ALL.match(flag):
                 if '=' in flag:
                     k, v = flag.split('=', 1)
                     if not Flags._RE_ATTR_UNQUOTED_STR(v):
@@ -167,8 +167,15 @@ def _header1(cls, headers: List[List[str]]) -> Tuple[List, List, List]:
           2) -||- with type and flags prepended, separated by #,
              e.g. d#sex,c#age,cC#IQ
         """
+
+        def is_flag(x):
+            return bool(Flags.RE_ALL.match(cls._type_from_flag([x])[0]) or
+                        Flags.RE_ALL.match(cls._flag_from_flag([x])[0]))
+
         flags, names = zip(*[i.split(cls.HEADER1_FLAG_SEP, 1)
-                             if cls.HEADER1_FLAG_SEP in i else ('', i)
+                             if cls.HEADER1_FLAG_SEP in i and
+                             is_flag(i.split(cls.HEADER1_FLAG_SEP)[0])
+                             else ('', i)
                              for i in headers[0]])
         names = list(names)
         return names, cls._type_from_flag(flags), cls._flag_from_flag(flags)

diff --git a/Orange/data/io_util.py b/Orange/data/io_util.py
@@ -1,17 +1,27 @@
+import os.path
 import subprocess
 from collections import defaultdict
+from typing import Tuple, Optional
 
 import numpy as np
+import pandas as pd
 from chardet.universaldetector import UniversalDetector
 
 from Orange.data import (
     is_discrete_values, MISSING_VALUES, Variable,
-    DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable,
+    DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable, Table,
 )
 from Orange.misc.collections import natural_sorted
 
-__all__ = ["Compression", "open_compressed", "detect_encoding", "isnastr",
-           "guess_data_type", "sanitize_variable"]
+__all__ = [
+    "Compression",
+    "open_compressed",
+    "detect_encoding",
+    "isnastr",
+    "guess_data_type",
+    "sanitize_variable",
+    "update_origin",
+]
 
 
 class Compression:
@@ -207,3 +217,69 @@ def mapvalues(arr):
         values = [_var.parse(i) for i in orig_values]
 
     return values, var
+
+
+def _extract_new_origin(attr: Variable, table: Table, lookup_dirs: Tuple[str]) -> Optional[str]:
+    # origin exists
+    if os.path.exists(attr.attributes["origin"]):
+        return attr.attributes["origin"]
+
+    # last dir of origin in lookup dirs
+    dir_ = os.path.basename(os.path.normpath(attr.attributes["origin"]))
+    for ld in lookup_dirs:
+        new_dir = os.path.join(ld, dir_)
+        if os.path.isdir(new_dir):
+            return new_dir
+
+    # all column paths in lookup dirs
+    for ld in lookup_dirs:
+        if all(
+            os.path.exists(os.path.join(ld, attr.str_val(v)))
+            for v in table.get_column(attr)
+            if v and not pd.isna(v)
+        ):
+            return ld
+
+    return None
+
+
+def update_origin(table: Table, file_path: str):
+    """
+    When a dataset with file paths in the column is moved to another computer,
+    the absolute path may not be correct. This function updates the path for all
+    columns with an "origin" attribute.
+
+    The process consists of two steps. First, we identify directories to search
+    for files, and in the second step, we check if paths exist.
+
+    Lookup directories:
+    1. The directory where the file from file_path is placed
+    2. The parent directory of 1. This covers the case when the user places the
+       dataset file in the directory with the files (for example, a workflow in
+       a directory with images)
+
+    Possible situations for file search:
+    1. The last directory of origin (basedir) is in one of the lookup directories
+    2. Origin doesn't exist in any lookup directories, but paths in a column can
+       be found in one of the lookup directories. This is usually a situation
+       when paths in a column are complex (e.g. a/b/c/d/file.txt).
+
+    Note: This function updates the existing table
+
+    Parameters
+    ----------
+    table
+        Orange Table to be updated if origin exists in any column
+    file_path
+        Path of the loaded dataset for reference. Only paths inside the dataset's
+        directory or its parent directory will be considered for the new origin.
+    """
+    file_dir = os.path.dirname(file_path)
+    parent_dir = os.path.dirname(file_dir)
+    # if file_dir is already the root, file_dir == parent_dir; dict keys drop the duplicate, keeping order
+    lookup_dirs = tuple({file_dir: 0, parent_dir: 0})
+    for attr in table.domain.metas:
+        if "origin" in attr.attributes and (attr.is_string or attr.is_discrete):
+            new_orig = _extract_new_origin(attr, table, lookup_dirs)
+            if new_orig:
+                attr.attributes["origin"] = new_orig
diff --git a/Orange/data/table.py b/Orange/data/table.py
@@ -1325,7 +1325,7 @@ def __repr__(self):
         return s
 
     @classmethod
-    def concatenate(cls, tables, axis=0):
+    def concatenate(cls, tables, axis=0, *, ignore_domains=None):
         """
         Concatenate tables into a new table, either vertically or horizontally.
 
@@ -1346,14 +1346,15 @@ def concatenate(cls, tables, axis=0):
         """
         if axis not in (0, 1):
             raise ValueError("invalid axis")
+        if ignore_domains is not None and axis != 0:
+            raise ValueError("'ignore_domains' is incompatible with 'axis=1'")
         if not tables:
             raise ValueError('need at least one table to concatenate')
 
         if len(tables) == 1:
             return tables[0].copy()
-
         if axis == 0:
-            conc = cls._concatenate_vertical(tables)
+            conc = cls._concatenate_vertical(tables, bool(ignore_domains))
         else:
             conc = cls._concatenate_horizontal(tables)
 
@@ -1368,7 +1369,7 @@ def concatenate(cls, tables, axis=0):
         return conc
 
     @classmethod
-    def _concatenate_vertical(cls, tables):
+    def _concatenate_vertical(cls, tables, ignore_domains=False):
         def vstack(arrs):
             return [np, sp][any(sp.issparse(arr) for arr in arrs)].vstack(arrs)
 
@@ -1387,7 +1388,8 @@ def collect(attr):
             return [getattr(arr, attr) for arr in tables]
 
         domain = tables[0].domain
-        if any(table.domain != domain for table in tables):
+        if not ignore_domains \
+                and any(table.domain != domain for table in tables):
             raise ValueError('concatenated tables must have the same domain')
 
         conc = cls.from_numpy(