Skip to content

Commit

Permalink
Merge pull request #6772 from VesnaT/pls_residual_n_plot
Browse files Browse the repository at this point in the history
PLS: Add residual normal probability, distance to the model
  • Loading branch information
lanzagar committed May 10, 2024
2 parents 384756b + 22b2e7f commit 7331d38
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 7 deletions.
62 changes: 59 additions & 3 deletions Orange/regression/pls.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np
import scipy.stats as ss
import sklearn.cross_decomposition as skl_pls
from sklearn.preprocessing import StandardScaler

from Orange.data import Table, Domain, Variable, \
ContinuousVariable, StringVariable
Expand Down Expand Up @@ -102,9 +104,8 @@ def project(self, data):
transformer = _PLSCommonTransform(self)

def trvar(i, name):
    # Create the output variable `name`; its compute_value is a
    # PLSProjector built from the enclosing-scope transformer and index i.
    return ContinuousVariable(
        name, compute_value=PLSProjector(transformer, i))

n_components = self.skl_model.x_loadings_.shape[1]

Expand Down Expand Up @@ -155,6 +156,61 @@ def coefficients_table(self):
coef_table.name = "coefficients"
return coef_table

def residuals_normal_probability(self, data: Table) -> Table:
    """
    Build the table behind a normal probability plot of the residuals.

    For every class variable of `data` the result holds two adjacent
    columns: "Sample Quantiles (<target>)" with the residuals (actual
    value minus the model's prediction) and "Theoretical Quantiles
    (<target>)" with standard-normal quantiles assigned to the rows by
    the rank of their residual.
    """
    targets = data.domain.class_vars
    n_rows, n_targets = len(data), len(targets)

    residuals = data.Y - self(data)
    if n_targets == 1:
        # give the single-target case an explicit column axis
        residuals = residuals[:, None]

    # positions k / (n + 1) for k = 1..n, mapped through the normal ppf
    positions = np.arange(1.0, n_rows + 1) / (n_rows + 1)
    normal_quantiles = ss.norm.ppf(positions)

    # per target, the k-th smallest residual receives the k-th quantile
    order = np.argsort(residuals, axis=0)
    theoretical = np.zeros((n_rows, n_targets), dtype=float)
    for column in range(n_targets):
        rows = order[:, column]
        theoretical[rows, column] = normal_quantiles

    # check names so that tables could later be merged
    labels = ("Sample Quantiles", "Theoretical Quantiles")
    proposed = [f"{label} ({var.name})"
                for var in targets for label in labels]
    unique_names = get_unique_names(data.domain, proposed)
    domain = Domain([ContinuousVariable(name) for name in unique_names])

    # interleave: residuals in even columns, quantiles in odd columns
    values = np.zeros((n_rows, n_targets * 2), dtype=float)
    values[:, 0::2] = residuals
    values[:, 1::2] = theoretical

    result = Table.from_numpy(domain, values)
    result.name = "residuals normal probability"
    return result

def dmodx(self, data: Table) -> Table:
    """
    Return a one-column table ("DModX") with a distance value per row.

    Each row's residual sum of squares (see `_residual_ssx`) is turned
    into a per-row standard deviation `s`, which is then divided by a
    pooled standard deviation `s0` computed from the summed residuals.
    """
    data = self.data_to_model_domain(data)
    skl_model = self.skl_model
    n_comp = skl_model.n_components

    resids_ssx = self._residual_ssx(data.X)

    # per-row residual standard deviation
    row_dof = skl_model.x_loadings_.shape[0] - n_comp
    s = np.sqrt(resids_ssx / row_dof)

    # pooled residual standard deviation
    # NOTE(review): the numerator sums the residuals of `data`, while the
    # row count comes from the fitted model's x_scores_; the two agree
    # only if `data` has as many rows as x_scores_ -- confirm intended.
    pooled_dof = (skl_model.x_scores_.shape[0] - n_comp - 1) * \
        (data.X.shape[1] - n_comp)
    s0 = np.sqrt(resids_ssx.sum() / pooled_dof)

    ratio = s / s0
    dist = np.sqrt(ratio ** 2)

    name = get_unique_names(data.domain, ["DModX"])[0]
    dist_table = Table.from_numpy(
        Domain([ContinuousVariable(name)]), dist[:, None])
    dist_table.name = "DMod"
    return dist_table

def _residual_ssx(self, X: np.ndarray) -> np.ndarray:
    """
    Return, for every row of `X`, the residual sum of squares in X-space.

    `X` is mapped to scores with the fitted model and back again; both
    `X` and this reconstruction are standardized with a scaler fitted on
    `X`, and the squared differences are summed along each row.
    """
    model = self.skl_model
    reconstructed = model.inverse_transform(model.transform(X))

    # the scaler is fitted on the data passed in, not on stored statistics
    scaler = StandardScaler().fit(X)
    residuals = scaler.transform(X) - scaler.transform(reconstructed)
    return np.sum(residuals ** 2, axis=1)


class PLSRegressionLearner(SklLearnerRegression, _FeatureScorerMixin):
__wraps__ = skl_pls.PLSRegression
Expand Down
13 changes: 13 additions & 0 deletions Orange/regression/tests/test_pls.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,19 @@ def test_coefficients(self):
np.testing.assert_almost_equal(scikit_model.coef_.T,
coef_table.X)

def test_residuals_normal_probability(self):
    # the result must have one row per instance and two columns per target
    datasets = [table(10, 5, 1), table(10, 5, 3)]
    for data in datasets:
        model = PLSRegressionLearner()(data)
        result = model.residuals_normal_probability(data)
        expected_shape = (len(data), 2 * len(data.domain.class_vars))
        self.assertEqual(result.X.shape, expected_shape)

def test_dmodx(self):
    # the result must be a single column with one row per instance
    for data in (table(10, 5, 1), table(10, 5, 3)):
        model = PLSRegressionLearner()(data)
        distances = model.dmodx(data)
        expected_shape = (len(data), 1)
        self.assertEqual(distances.X.shape, expected_shape)

def test_eq_hash(self):
data = Table("housing")
pls1 = PLSRegressionLearner()(data)
Expand Down
14 changes: 12 additions & 2 deletions Orange/widgets/model/owpls.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,21 @@ def update_model(self):

def _create_output_data(self) -> Table:
    """
    Return the input data extended with PLS-derived meta columns.

    The appended metas are, in order: the projection's metas and
    attributes, the residual normal-probability columns, and the DModX
    column. Attributes and class variables of the input are kept.
    """
    projection = self.model.project(self.data)
    normal_probs = self.model.residuals_normal_probability(self.data)
    dmodx = self.model.dmodx(self.data)

    data_domain = self.data.domain
    proj_domain = projection.domain
    nprobs_domain = normal_probs.domain
    dmodx_domain = dmodx.domain
    metas = proj_domain.metas + proj_domain.attributes + \
        nprobs_domain.attributes + dmodx_domain.attributes
    domain = Domain(data_domain.attributes, data_domain.class_vars, metas)
    data: Table = self.data.transform(domain)

    # The residual and DModX variables are created without compute_value,
    # so (presumably) the transform leaves them unfilled; copy the values
    # in explicitly. They occupy the trailing meta columns: two per class
    # variable, followed by the single DModX column.
    with data.unlocked(data.metas):
        data.metas[:, -2 * len(self.data.domain.class_vars) - 1: -1] = \
            normal_probs.X
        data.metas[:, -1] = dmodx.X[:, 0]
    return data

@OWBaseLearner.Inputs.data
def set_data(self, data):
Expand Down
20 changes: 18 additions & 2 deletions Orange/widgets/model/tests/test_owpls.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,30 @@ def test_output_data(self):
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(output.X.shape, (506, 13))
self.assertEqual(output.Y.shape, (506,))
self.assertEqual(output.metas.shape, (506, 5))
self.assertEqual(output.metas.shape, (506, 8))
self.assertEqual([v.name for v in self._data.domain.variables],
[v.name for v in output.domain.variables])
metas = ["PLS U1", "PLS U2", "PLS T1", "PLS T2",
"Sample Quantiles (MEDV)", "Theoretical Quantiles (MEDV)",
"DModX"]
self.assertEqual([v.name for v in self._data.domain.metas] + metas,
[v.name for v in output.domain.metas])

def test_output_data_multi_target(self):
    # With two targets the output gains nine metas: four projection
    # columns, a sample/theoretical quantile pair per target, and DModX.
    self.send_signal(self.widget.Inputs.data, self._data_multi_target)
    output = self.get_output(self.widget.Outputs.data)
    self.assertEqual(output.X.shape, (506, 12))
    self.assertEqual(output.Y.shape, (506, 2))
    self.assertEqual(output.metas.shape, (506, 10))
    orig_domain = self._data_multi_target.domain
    self.assertEqual([v.name for v in orig_domain.variables],
                     [v.name for v in output.domain.variables])
    metas = ["PLS U1", "PLS U2", "PLS T1", "PLS T2",
             "Sample Quantiles (MEDV)", "Theoretical Quantiles (MEDV)",
             "Sample Quantiles (CRIM)", "Theoretical Quantiles (CRIM)",
             "DModX"]
    self.assertEqual([v.name for v in orig_domain.metas] + metas,
                     [v.name for v in output.domain.metas])

def test_output_components(self):
self.send_signal(self.widget.Inputs.data, self._data)
Expand Down

0 comments on commit 7331d38

Please sign in to comment.