In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before code is executed and edits to them are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import os.path
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.datasets.mldata import fetch_mldata
from sklearn.cross_validation import StratifiedKFold

In [3]:
from tabpar import TabDataParser
from reppar import RulesParser
from procrules import ProcRules

from rulstat import RulesStats
from rcluster import NRules
from logical import SimpleVoting

In [4]:
# Fetch the "wine" data set, using ../data as the data_home directory, and
# pass the feature matrix through sklearn's `scale`; targets are kept as-is.
data_home = os.path.join("../", "data")
wine_bunch = fetch_mldata("wine", data_home=data_home)
data = scale(wine_bunch['data'])
labels = wine_bunch['target']

In [5]:
# Two-fold stratified split over the labels; only the indices produced by the
# final fold are kept as the train/test partition.
skf = StratifiedKFold(y=labels, n_folds=2, shuffle=False, random_state=42)
train_idx, test_idx = list(skf)[-1]

In [6]:
# Paths of the .tab data files and of the HTML rule report, all under ../data.
data_dir = os.path.join("../", "data")
ftrain, ftest, fall = (
    os.path.join(data_dir, fname)
    for fname in ("wine-train.tab", "wine-test.tab", "wine-all.tab")
)

frules = os.path.join(data_dir, "wine-lrules.html")

In [7]:
# Disabled export of the train / test / full splits to the .tab files named
# above via TabDataParser.np2tab.
# NOTE(review): presumably run once to create the files and then commented
# out -- confirm the files on disk still match the current split.
# TabDataParser.np2tab(ftrain, data[train_idx, :], labels[train_idx])
# TabDataParser.np2tab(ftest, data[test_idx, :], labels[test_idx])
# TabDataParser.np2tab(fall, data, labels)

In [8]:
# Parse the rule report and the training table, then hand both to ProcRules,
# which exposes the rules in two forms: `rules` and `rulesbin`.
rp = RulesParser(frules)
tp = TabDataParser(ftrain)
processor = ProcRules(tp, rp)

rules = processor.rules
rulesbin = processor.rulesbin

In [9]:
# Baseline classifier: SimpleVoting built from the complete, uncompressed rule set.
vote_mdl = SimpleVoting(rules)

In [10]:
# Run the baseline voting model on the held-out rows; the result is compared
# element-wise with the true test labels afterwards.
test_samples = data[test_idx, :]
y = vote_mdl.fit(test_samples)

In [11]:
# Share of held-out samples for which the voted label equals the true label.
n_hits = sum(1 for predicted, actual in zip(y, labels[test_idx]) if predicted == actual)
full_correct = n_hits / len(labels[test_idx])

In [12]:
# Re-parse the training table and compute rule statistics over its data.
# NOTE(review): `stats` is not read again in the cells visible below --
# confirm whether compute_stats is needed here for a side effect.
data_train = TabDataParser(ftrain)
stats = RulesStats(rules)
stats.compute_stats(data_train.data)

In [13]:
# Compress every rule set to 2..n_clusters k-means centroids and record the
# test accuracy of simple voting over those centroids.

# Upper bound of the sweep: the smallest rule-set size over the keys of
# `rules`, so every key has at least that many rows to cluster.
n_clusters = min([len(rules[k]) for k in rules.keys()])

correct = []
for n_kept in range(2, n_clusters + 1):
    # k-means initialisation is random; without a fixed seed the accuracy
    # curve differs between runs. 42 is the seed value already used for the
    # cross-validation split.
    km = KMeans(n_clusters=n_kept, random_state=42)
    nrules = {}
    for k in rules.keys():
        km.fit(rules[k])
        # Each fit assigns a fresh centroid array, so storing the reference is safe.
        nrules[k] = km.cluster_centers_

    nvotemdl = SimpleVoting(nrules)
    y = nvotemdl.fit(data[test_idx, :])
    # Distinct names in the comparison so the sweep variable is never shadowed.
    n_hits = sum(1 for predicted, actual in zip(y, labels[test_idx]) if predicted == actual)
    correct.append(n_hits / len(labels[test_idx]))

In [14]:
# Sweep over 2..n_clusters with NRules on the `rulesbin` rule sets, restoring
# with the RulesStats.infogain criterion, and record the test accuracy of
# simple voting for each size.
igbincorrect = []
for n_kept in range(2, n_clusters + 1):
    nrules = {}
    for k in rulesbin.keys():
        km = NRules(i=k, n_clusters=n_kept)
        km.fit(rulesbin[k])
        km.restore(data[train_idx, :], labels[train_idx], RulesStats.infogain)
        nrules[k] = km.cluster_centers_

    binvotemdl = SimpleVoting(nrules)
    y = binvotemdl.fit(data[test_idx, :])
    n_hits = sum(1 for predicted, actual in zip(y, labels[test_idx]) if predicted == actual)
    igbincorrect.append(n_hits / len(labels[test_idx]))

In [15]:
# Sweep over 2..n_clusters with NRules on the `rulesbin` rule sets, restoring
# with the RulesStats.statcriterion criterion, and record the test accuracy
# of simple voting for each size.
stbincorrect = []
for n_kept in range(2, n_clusters + 1):
    nrules = {}
    for k in rulesbin.keys():
        km = NRules(i=k, n_clusters=n_kept)
        km.fit(rulesbin[k])
        km.restore(data[train_idx, :], labels[train_idx], RulesStats.statcriterion)
        nrules[k] = km.cluster_centers_

    binvotemdl = SimpleVoting(nrules)
    y = binvotemdl.fit(data[test_idx, :])
    n_hits = sum(1 for predicted, actual in zip(y, labels[test_idx]) if predicted == actual)
    stbincorrect.append(n_hits / len(labels[test_idx]))

In [42]:
# Reset matplotlib's rc settings to their defaults, then switch text rendering
# to LaTeX (presumably so the Russian figure labels can be typeset).
plt.rcdefaults()
# rc('font', **{'family': 'serif'})
plt.rc('text', usetex=True)
plt.rc('text.latex', unicode=True)
# NOTE(review): the next two calls both assign the text.latex preamble
# setting, so the babel line replaces the inputenc line instead of being
# added to it -- confirm whether both packages were meant to be loaded.
plt.rc('text.latex', preamble=r"\usepackage[utf8]{inputenc}")
plt.rc('text.latex', preamble=r"\usepackage[russian]{babel}")
plt.rcParams['font.serif'] = 'cmunst'

In [48]:
# Accuracy versus number of retained rules: the flat red line is the full
# rule-set baseline, the three marked curves are the compressed variants.
x = list(range(2, n_clusters + 1))
cutoff = [full_correct] * len(x)
markersize = 4

plt.plot(x, cutoff, '-r', linewidth=2, label='простое голосование')
for series, fmt, legend_text in (
    (correct, '-ob', r"вектор левых и правых границ"),
    (igbincorrect, '-^g', 'бинарный вектор, IGain'),
    (stbincorrect, '-sc', 'бинарный вектор, Stat'),
):
    plt.plot(x, series, fmt, label=legend_text, markersize=markersize)

plt.legend(loc=4)
plt.xlabel(u"количество логических закономерностей")
plt.ylabel(u"доля верно классифицированных объектов")
plt.savefig("../LaTeX/graphs/wine.pdf", bbox_inches="tight")
plt.show()