Skip to content

Commit

Permalink
Did some more work in Continuize widget.
Browse files Browse the repository at this point in the history
  • Loading branch information
mfrlin committed Jul 25, 2013
1 parent 8657b2b commit 11f7593
Show file tree
Hide file tree
Showing 3 changed files with 264 additions and 21 deletions.
40 changes: 26 additions & 14 deletions Orange/data/continuizer.py
Expand Up @@ -52,19 +52,30 @@ def transform_discrete(var):
return new_vars

def transform_continuous(var):
if not self.normalize_continuous:
if self.normalize_continuous == self.Leave:
return var
new_var = ContinuousVariable(var.name)
dma, dmi = dists[var_ptr].max(), dists[var_ptr].min()
diff = dma - dmi
if diff < 1e-15:
diff = 1
if self.zero_based:
new_var.get_value_from = Normalizer(var, dmi, 1 / diff)
else:
new_var.get_value_from = Normalizer(var, (dma + dmi) / 2,
elif self.normalize_continuous == self.NormalizeBySpan:
new_var = ContinuousVariable(var.name)
dma, dmi = dists[var_ptr].max(), dists[var_ptr].min()
diff = dma - dmi
if diff < 1e-15:
diff = 1
if self.zero_based:
new_var.get_value_from = Normalizer(var, dmi, 1 / diff)
else:
new_var.get_value_from = Normalizer(var, (dma + dmi) / 2,
2 / diff)
return new_var
elif self.normalize_continuous == self.NormalizeByVariance:
new_var = ContinuousVariable(var.name)
avg = None
variance = None
if self.zero_based:
new_var.get_value_from = Normalizer(var, avg, 1 / variance)
else:
new_var.get_value_from = Normalizer(var, (dma + dmi) / 2,
2 / diff)
return new_var
return new_var

def transform_list(s):
nonlocal var_ptr
Expand All @@ -89,7 +100,7 @@ def transform_list(s):
raise ValueError("data has multinomial attributes")
needs_discrete = (treat == self.FrequentIsBase and
domain.has_discrete_attributes(transform_class))
needs_continuous = (self.normalize_continuous and
needs_continuous = (not self.normalize_continuous == self.Leave and
domain.has_continuous_attributes(transform_class))
if needs_discrete or needs_continuous:
if isinstance(data, Domain):
Expand All @@ -105,8 +116,9 @@ def transform_list(s):
return Domain(new_attrs, new_classes, domain.metas)

# To make PyCharm happy
NValues = LowestIsBase = FrequentIsBase = Ignore = IgnoreMulti = ReportError = AsOrdinal = AsNormalizedOrdinal = 0
NValues = LowestIsBase = FrequentIsBase = Ignore = IgnoreMulti = ReportError = AsOrdinal =\
Leave = NormalizeBySpan = NormalizeByVariance = AsNormalizedOrdinal = 0

MultinomialTreatment = Enum("NValues", "LowestIsBase", "FrequentIsBase",
"Ignore", "IgnoreMulti", "ReportError", "AsOrdinal",
"AsNormalizedOrdinal").pull_up(DomainContinuizer)
"AsNormalizedOrdinal", "Leave", "NormalizeBySpan", "NormalizeByVariance").pull_up(DomainContinuizer)
14 changes: 7 additions & 7 deletions Orange/widgets/data/owcontinuize.py
Expand Up @@ -18,7 +18,7 @@ class OWContinuize(widget.OWWidget):
_keywords = ["data", "continuize"]

inputs = [("Data", Table, "setData")]
outputs = [("Data", Table), ("Preprocessor", "PreprocessedLearner")]
outputs = [("Data", Table)]

want_main_area = False

Expand All @@ -28,8 +28,8 @@ class OWContinuize(widget.OWWidget):
continuousTreatment = Setting(0)
autosend = Setting(0)

#settingsHandler = ClassValuesContextHandler("", ["targetValue"])
#contextHandlers = ContextSetting({})
settingsHandler = ClassValuesContextHandler()
targetValue = ContextSetting("")

multinomialTreats = (("Target or First value as base", DomainContinuizer.LowestIsBase),
("Most frequent value as base", DomainContinuizer.FrequentIsBase),
Expand All @@ -39,9 +39,9 @@ class OWContinuize(widget.OWWidget):
("Treat as ordinal", DomainContinuizer.AsOrdinal),
("Divide by number of values", DomainContinuizer.AsNormalizedOrdinal))

#continuousTreats = (("Leave them as they are", DomainContinuizer.Leave),
# ("Normalize by span", DomainContinuizer.NormalizeBySpan),
# ("Normalize by variance", DomainContinuizer.NormalizeByVariance))
continuousTreats = (("Leave them as they are", DomainContinuizer.Leave),
("Normalize by span", DomainContinuizer.NormalizeBySpan),
("Normalize by variance", DomainContinuizer.NormalizeByVariance))

classTreats = (("Leave it as it is", DomainContinuizer.Ignore),
("Treat as ordinal", DomainContinuizer.AsOrdinal),
Expand Down Expand Up @@ -134,7 +134,7 @@ def enableAuto(self):
def constructContinuizer(self):
conzer = DomainContinuizer()
conzer.zeroBased = self.zeroBased
conzer.continuousTreatment = self.continuousTreatment
conzer.continuousTreatment = self.continuousTreats[self.continuousTreatment][1]
conzer.multinomialTreatment = self.multinomialTreats[self.multinomialTreatment][1]
conzer.classTreatment = self.classTreats[self.classTreatment][1]
return conzer
Expand Down
231 changes: 231 additions & 0 deletions Orange/widgets/data/owpurgedomain.py
@@ -0,0 +1,231 @@
from PyQt4 import QtGui

from Orange.data.domain import Domain
from Orange.data.table import Table
from Orange.data.variable import Variable
from Orange.widgets import gui, widget
from Orange.widgets.settings import Setting

class OWPurgeDomain(widget.OWWidget):
    """Widget that purges the domain of an incoming data table.

    Depending on the check boxes it removes attributes with fewer than two
    values, removes unused attribute/class values, and sorts attribute and
    class values. The (possibly converted) table is sent on the "Data"
    output, and short statistics are shown in the control area.
    """

    _name = "Purge Domain"
    _description = "Removes redundant values and attributes, sorts values."
    _icon = "icons/PurgeDomain.svg"
    _author = "Martin Frlin"
    _category = "Data"
    _keywords = ["data", "purge", "domain"]

    inputs = [("Data", Table, "setData")]
    outputs = [("Data", Table)]

    # Widget options declared as settings; each is bound to a check box
    # created in __init__.
    removeValues = Setting(1)
    removeAttributes = Setting(1)
    removeClassAttribute = Setting(1)
    removeClasses = Setting(1)
    autoSend = Setting(1)
    sortValues = Setting(True)
    sortClasses = Setting(True)

    def __init__(self, parent=None, signalManager=None):
        """Create the widget and build its control-area GUI."""
        widget.OWWidget.__init__(self, parent, signalManager, 'PurgeDomain',
                                 wantMainArea=False)
        self.data = None

        # Values of removeValues / removeClasses remembered while the
        # corresponding parent check box is unchecked.
        self.preRemoveValues = 1
        self.preRemoveClasses = 1
        # autoSend is intentionally NOT assigned here: it is declared above as
        # Setting(1), so the default is already 1, and an assignment at this
        # point would overwrite a value supplied by the settings machinery
        # (presumably restored before this body runs -- TODO confirm).
        self.dataChanged = False

        # Placeholders shown in the statistics labels until data is processed.
        self.removedAttrs = self.reducedAttrs = self.resortedAttrs = self.classAttr = "-"

        attrBox = gui.widgetBox(self.controlArea, "Attributes", addSpace=True)
        gui.checkBox(attrBox, self, 'sortValues', 'Sort attribute values',
                     callback=self.optionsChanged)
        removeAttrsCb = gui.checkBox(
            attrBox, self, "removeAttributes",
            "Remove attributes with less than two values",
            callback=self.removeAttributesChanged)
        removeValuesCb = gui.checkBox(
            gui.indentedBox(attrBox, sep=gui.checkButtonOffsetHint(removeAttrsCb)),
            self, "removeValues", "Remove unused attribute values",
            callback=self.optionsChanged)
        # The nested check box is tied to its parent check box.
        removeAttrsCb.disables = [removeValuesCb]
        removeAttrsCb.makeConsistent()

        classBox = gui.widgetBox(self.controlArea, "Classes", addSpace=True)
        gui.checkBox(classBox, self, 'sortClasses', 'Sort classes',
                     callback=self.optionsChanged)
        removeClassCb = gui.checkBox(
            classBox, self, "removeClassAttribute",
            "Remove class attribute if there are less than two classes",
            callback=self.removeClassesChanged)
        removeClassValuesCb = gui.checkBox(
            gui.indentedBox(classBox, sep=gui.checkButtonOffsetHint(removeClassCb)),
            self, "removeClasses", "Remove unused class values",
            callback=self.optionsChanged)
        removeClassCb.disables = [removeClassValuesCb]
        removeClassCb.makeConsistent()

        statsBox = gui.widgetBox(self.controlArea, 'Statistics', addSpace=True)
        gui.label(statsBox, self, "Removed attributes: %(removedAttrs)s")
        gui.label(statsBox, self, "Reduced attributes: %(reducedAttrs)s")
        gui.label(statsBox, self, "Resorted attributes: %(resortedAttrs)s")
        gui.label(statsBox, self, "Class attribute: %(classAttr)s")

        sendBox = gui.widgetBox(self.controlArea, "Send")
        btSend = gui.button(sendBox, self, "Send data", callback=self.process,
                            default=True)
        cbAutoSend = gui.checkBox(sendBox, self, "autoSend", "Send automatically")

        # Links the send button, the auto-send check box and the dataChanged
        # flag (exact semantics are defined by gui.setStopper).
        gui.setStopper(self, btSend, cbAutoSend, "dataChanged", self.process)

        gui.rubber(self.controlArea)

    def setData(self, dataset):
        """Input handler for the "Data" signal.

        A truthy *dataset* is stored and processed immediately; otherwise the
        statistics are cleared and ``None`` is sent on the output.
        """
        # NOTE(review): truthiness is used here, so a falsy (e.g. possibly
        # empty) table is treated the same as None -- confirm this is intended.
        if dataset:
            self.data = dataset
            self.process()
        else:
            self.reducedAttrs = self.removedAttrs = self.resortedAttrs = self.classAttr = ""
            self.send("Data", None)
            self.data = None
        self.dataChanged = False

    def removeAttributesChanged(self):
        """Callback of the "removeAttributes" check box.

        Unchecking remembers the current ``removeValues`` and switches it off;
        checking restores the remembered value.
        """
        if not self.removeAttributes:
            self.preRemoveValues = self.removeValues
            self.removeValues = False
        else:
            self.removeValues = self.preRemoveValues
        self.optionsChanged()

    def removeClassesChanged(self):
        """Callback of the "removeClassAttribute" check box.

        Unchecking remembers the current ``removeClasses`` and switches it
        off; checking restores the remembered value.
        """
        if not self.removeClassAttribute:
            self.preRemoveClasses = self.removeClasses
            self.removeClasses = False
        else:
            self.removeClasses = self.preRemoveClasses
        self.optionsChanged()

    def optionsChanged(self):
        """Re-process the data now if auto-send is on, else mark it as changed."""
        if self.autoSend:
            self.process()
        else:
            self.dataChanged = True

    def sortAttrValues(self, attr, interattr=None):
        """Return a variable whose values are sorted.

        :param attr: the original variable, as used in ``self.data``.
        :param interattr: an intermediate variable derived from *attr* (for
            example with unused values removed); defaults to *attr*.
        :return: *interattr* itself when its values are already sorted,
            otherwise a new variable with the sorted values whose
            ``getValueFrom`` maps values of *attr* to the new variable.
        """
        if not interattr:
            interattr = attr

        newvalues = sorted(interattr.values)
        if newvalues == list(interattr.values):
            return interattr

        # NOTE(review): `orange` is not imported by this module's visible
        # import block; this looks like a call that still has to be ported --
        # confirm the replacement API.
        newattr = orange.EnumVariable(interattr.name, values=newvalues)
        newattr.getValueFrom = orange.ClassifierByLookupTable(newattr, attr)
        lookupTable = newattr.getValueFrom.lookupTable
        distributions = newattr.getValueFrom.distributions
        for val in interattr.values:
            # Index by the position of the value in the ORIGINAL variable.
            idx = attr.values.index(val)
            lookupTable[idx] = val
            distributions[idx][newvalues.index(val)] += 1
        return newattr

    def process(self):
        """Purge the domain of ``self.data`` and send the result.

        Updates the statistics (``removedAttrs``, ``reducedAttrs``,
        ``resortedAttrs``, ``classAttr``), builds a new domain when anything
        changed, sends the resulting table on "Data" and clears
        ``dataChanged``. Does nothing when there is no data.
        """
        if self.data is None:
            return

        self.reducedAttrs = 0
        self.removedAttrs = 0
        self.resortedAttrs = 0
        # NOTE(review): nothing in this class reads `classAttribute` (the
        # label uses `classAttr`); kept only in case external code relies on it.
        self.classAttribute = 0

        # NOTE(review): the `orange.*` helpers used below are not imported by
        # this module's visible import block -- confirm the replacement API.
        if self.removeAttributes or self.sortValues:
            newattrs = []
            for attr in self.data.domain.attributes:
                if attr.varType == Variable.VarTypes.Continuous:
                    if orange.RemoveRedundantOneValue.has_at_least_two_values(self.data, attr):
                        newattrs.append(attr)
                    else:
                        self.removedAttrs += 1
                    continue

                # Variables that are neither continuous nor discrete are
                # passed through untouched.
                if attr.varType != Variable.VarTypes.Discrete:
                    newattrs.append(attr)
                    continue

                if self.removeValues:
                    newattr = orange.RemoveUnusedValues(attr, self.data)
                    if not newattr:
                        self.removedAttrs += 1
                        continue

                    if newattr != attr:
                        self.reducedAttrs += 1
                else:
                    newattr = attr

                # NOTE(review): this is gated on removeValues whereas the
                # class branch below is gated on removeClassAttribute --
                # confirm whether removeAttributes was intended here.
                if self.removeValues and len(newattr.values) < 2:
                    self.removedAttrs += 1
                    continue

                if self.sortValues:
                    sortedattr = self.sortAttrValues(attr, newattr)
                    if sortedattr != newattr:
                        self.resortedAttrs += 1
                        newattr = sortedattr

                newattrs.append(newattr)
        else:
            newattrs = self.data.domain.attributes

        klass = self.data.domain.classVar
        if not klass:
            newclass = klass
            self.classAttr = "No class"
        elif klass.varType != Variable.VarTypes.Discrete:
            newclass = klass
            self.classAttr = "Class is not discrete"
        elif not (self.removeClassAttribute or self.sortClasses):
            newclass = klass
            self.classAttr = "Class is not checked"
        else:
            self.classAttr = ""

            if self.removeClasses:
                newclass = orange.RemoveUnusedValues(klass, self.data)
            else:
                newclass = klass

            # Parentheses only make the original operator precedence explicit.
            if not newclass or (self.removeClassAttribute and len(newclass.values) < 2):
                newclass = None
                self.classAttr = "Class is removed"
            elif len(newclass.values) != len(klass.values):
                self.classAttr = "Class is reduced"

            if newclass and self.sortClasses:
                sortedclass = self.sortAttrValues(klass, newclass)
                if sortedclass != newclass:
                    if self.classAttr:
                        self.classAttr = "Class is reduced and sorted"
                    else:
                        self.classAttr = "Class is sorted"
                    newclass = sortedclass

            if not self.classAttr:
                self.classAttr = "Class is unchanged"

        # Build a new table only when something actually changed.
        if self.reducedAttrs or self.removedAttrs or self.resortedAttrs or newclass != klass:
            newDomain = Domain(newattrs, newclass)
            newData = Table(newDomain, self.data)
        else:
            newData = self.data

        self.send("Data", newData)

        self.dataChanged = False


if __name__ == "__main__":
    # Manual launcher: show the widget on its own inside a Qt application,
    # run the event loop, then call saveSettings() on the widget.
    import sys

    app = QtGui.QApplication(sys.argv)
    purge_widget = OWPurgeDomain()
    # Example input, currently disabled:
    #data = orange.ExampleTable('..\\..\\doc\\datasets\\car.tab')
    #data.domain.attributes[3].values.append("X")
    #purge_widget.setData(data)
    purge_widget.show()
    app.exec_()
    purge_widget.saveSettings()

1 comment on commit 11f7593

@janezd
Copy link
Contributor

@janezd janezd commented on 11f7593 Aug 16, 2013

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Martin, try not to forget running unit tests after making changes (as we all do, sometimes ;). See my fixes in 39ccf57.

Also, try not to forget writing unit tests for new code (as we all hate to ;). As punishment for the above, I'm adding this as an issue.

Please sign in to comment.