# Synthetic data

## Write to disk

In [14]:
def write_libsvm(filename, labels, features):
    """Write a labelled dataset to ``filename`` in LIBSVM text format.

    Every sample becomes one line of the form ``<label> <index>:<value> ...``
    that lists only its non-zero features.  Indices are the 0-based positions
    of the values inside each feature row.  A sample without any non-zero
    feature is written as the label followed by a single space.

    Args:
        filename: Path of the file to create; an existing file is overwritten.
        labels: Iterable with one label per sample, written via ``str()``.
        features: Iterable of per-sample feature rows, parallel to ``labels``.
            Iteration stops at the shorter of ``labels`` and ``features``.
    """
    with open(filename, "w") as f:
        for label, row in zip(labels, features):
            entries = []
            for index, value in enumerate(row):
                if value == 0.0:
                    continue  # sparse format: zeros are implicit
                number = float(value)
                # Integral values keep the compact "%d" spelling; fractional
                # values are written in full instead of being truncated to an
                # integer (which "%d" would silently do).
                if number.is_integer():
                    text = "%d" % value
                else:
                    text = repr(number)
                entries.append("%d:%s" % (index, text))
            f.write(str(label) + ' ' + ' '.join(entries) + '\n')


def write_pickle(filename, n, m, array=None):
    """Dump a 2-D array to ``filename`` as a raw boolean ``numpy.memmap``.

    Despite the name, nothing is pickled: the file holds the bare bytes of an
    ``(n, m + 1)`` array of ``np.bool_`` and must be read back with
    ``np.memmap`` using the same dtype and shape.

    Args:
        filename: Path of the file to create; an existing file is overwritten.
        n: Number of rows in the file.
        m: The file has ``m + 1`` columns (presumably one label column plus
            ``m`` feature columns -- TODO confirm against the intended layout).
        array: 2-D array whose values are copied into the file.  When omitted,
            the notebook-level global ``data`` is used, as the original code
            did; that raises ``NameError`` if no such global exists.
    """
    if array is None:
        # Backward-compatible fallback to the implicit global of the original.
        array = data
    fp = np.memmap(filename, dtype=np.bool_, mode='w+', shape=(n, m + 1))
    fp[:, :] = array[:, :]
    # Make sure the contents hit the disk before the mapping is dropped.
    fp.flush()

In [3]:
import numpy as np
from scipy import sparse
from random import random


# Size of the training split (n samples) and number of feature columns (m).
n = 10 ** 6
m = 100
# The test split is 1% the size of the training split.
n_test = int(n * 0.01)
# Placeholders until reset() fills in the actual arrays.
labels = features = t_labels = t_features = None

def reset():
    """Refill the module-level train/test arrays with fresh fair coin flips.

    Each label and each feature is drawn independently and is 1 when a
    uniform sample exceeds 0.5, else 0, so labels are balanced in expectation.
    Shapes: ``labels`` (n,), ``features`` (n, m), ``t_labels`` (n_test,),
    ``t_features`` (n_test, m).
    """
    global labels, features, t_labels, t_features

    def fair_bits(shape):
        # Threshold uniform noise at 0.5 to obtain 0/1 integers.
        return (np.random.random(size=shape) > 0.5).astype(int)

    labels = fair_bits((n,))
    features = fair_bits((n, m))
    t_labels = fair_bits((n_test,))
    t_features = fair_bits((n_test, m))

## One-level tree

### Deterministic

In [3]:
# Plant three columns that each determine the label exactly.
reset()
for idx in (10, 20, 30):
    # Fair coin picks the polarity: 1 -> column copies the label,
    # 0 -> column holds the inverted label.
    coin = int(random() * 2)
    if coin == 1:
        features[:, idx] = labels
        t_features[:, idx] = t_labels
    else:
        features[:, idx] = 1 - labels
        t_features[:, idx] = 1 - t_labels

In [4]:
# Persist the train and test splits of the one-level deterministic dataset.
write_libsvm(filename="deterministic.libsvm", labels=labels, features=features)
write_libsvm(filename="deterministic_test.libsvm", labels=t_labels, features=t_features)

### Probabilistic

In [5]:
SCALE = 0.3  # Edge over chance (0.5) that the most informative columns approach

reset()
for idx in range(m):
    # Fair coin picks the polarity of this column (1: copy label, 0: invert).
    coin = int(random() * 2)
    # Reliability grows linearly with the column index, starting at 0.5.
    prob = 0.5 + float(idx / m) * SCALE

    # Training split: p is 1.0 where the sample follows the column's polarity,
    # pcoin is 1.0 where the feature ends up equal to the label.
    p = (np.random.random(n) <= prob).astype(float)  # noise
    pcoin = p if coin == 1 else 1.0 - p
    features[:, idx] = np.where(pcoin == 1.0, labels, 1 - labels)

    # Test split: same polarity and reliability, independent noise.
    p = (np.random.random(n_test) <= prob).astype(float)  # noise
    pcoin = p if coin == 1 else 1.0 - p
    t_features[:, idx] = np.where(pcoin == 1.0, t_labels, 1 - t_labels)

In [6]:
# Persist the train and test splits of the one-level probabilistic dataset.
write_libsvm(filename="probabilistic.libsvm", labels=labels, features=features)
write_libsvm(filename="probabilistic_test.libsvm", labels=t_labels, features=t_features)

## Multi-level tree

### Deterministic

In [18]:
# Plant three two-level rules: column idx1 selects a branch, and column idx2
# then determines the label exactly within that branch.
reset()
for idx1, idx2, prob in [(10, 20, 0.05), (30, 40, 0.1), (50, 60, 0.2)]:
    for num, x, y in [(n, features, labels), (n_test, t_features, t_labels)]:
        positive = y == 1
        negative = y == 0
        # Positives go left with probability 0.5 + prob, negatives with
        # 0.5 - prob, so idx1 alone is only weakly correlated with the label.
        left1 = positive & (np.random.random(num) <= 0.5 + prob)
        left0 = negative & (np.random.random(num) <= 0.5 - prob)
        left = left1 | left0
        right = ~left
        right1 = right & positive
        right0 = right & negative

        # idx1 encodes the branch: 0 = left, 1 = right.
        x[:, idx1] = right
        # idx2 is the inverted label on the left branch and the label itself
        # on the right branch.
        x[left0 | right1, idx2] = 1
        x[left1 | right0, idx2] = 0

In [13]:
# Persist the train and test splits of the multi-level deterministic dataset.
write_libsvm(filename="tree-deterministic.libsvm", labels=labels, features=features)
write_libsvm(filename="tree-deterministic_test.libsvm", labels=t_labels, features=t_features)

### Probabilistic

In [28]:
SCALE1 = 0.2  # largest value reached by prob1 (branch-column edge)
SCALE2 = 0.3  # prob2 (leaf-column edge) moves between SCALE2 and SCALE1

reset()
candid = list(range(m))
for i in range(int(m / 4)):
    # Draw two distinct columns that no earlier rule has used.
    rand = int(random() * len(candid))
    idx1 = candid.pop(rand)
    rand = int(random() * len(candid))
    idx2 = candid.pop(rand)

    # Position of this rule within the sequence of rules, in (0, 1].
    frac = (i + 1) / m * 4.0
    prob1 = float(frac) * SCALE1
    prob2 = SCALE1 + float(1.0 - frac) * (SCALE2 - SCALE1)
    print("{:d}\t{:.4f}\t{:d}\t{:.4f}".format(idx1, prob1, idx2, prob2), end='')
    for num, x, y in [(n, features, labels), (n_test, t_features, t_labels)]:
        positive = y == 1
        negative = y == 0
        left1 = positive & (np.random.random(num) <= 0.5 + prob1)
        left0 = negative & (np.random.random(num) <= 0.5 - prob1)
        left = left1 | left0
        right = ~left
        right1 = right & positive
        right0 = right & negative

        # idx1 encodes the branch: 0 = left, 1 = right.
        x[:, idx1] = right
        # idx2 is a noisy leaf signal; the groups are sampled in a fixed
        # order so the sequence of random draws stays the same.
        for mask, threshold in [
            (left0, 0.5 + prob2),
            (left1, 0.5 - prob2),
            (right0, 0.5 - prob2),
            (right1, 0.5 + prob2),
        ]:
            x[mask, idx2] = np.random.random(np.sum(mask)) <= threshold

        # Disabled sanity check:
        # idx2pos = x[:, idx2] == 1
        # idx2neg = x[:, idx2] == 0
        # print("\t{:.4f}".format(np.sum(np.logical_and(left1, idx2pos)) / np.sum(left)), end='')
    print()

52	0.0080	92	0.2960
66	0.0160	4	0.2920
55	0.0240	34	0.2880
22	0.0320	35	0.2840
48	0.0400	15	0.2800
45	0.0480	70	0.2760
18	0.0560	75	0.2720
53	0.0640	79	0.2680
32	0.0720	89	0.2640
81	0.0800	28	0.2600
95	0.0880	13	0.2560
67	0.0960	49	0.2520
76	0.1040	71	0.2480
37	0.1120	93	0.2440
38	0.1200	29	0.2400
83	0.1280	23	0.2360
63	0.1360	33	0.2320
64	0.1440	82	0.2280
69	0.1520	62	0.2240
30	0.1600	91	0.2200
56	0.1680	17	0.2160
59	0.1760	77	0.2120
84	0.1840	86	0.2080
14	0.1920	42	0.2040
36	0.2000	90	0.2000


In [29]:
# Persist the train and test splits of the multi-level probabilistic dataset.
write_libsvm(filename="tree-probabilistic.libsvm", labels=labels, features=features)
write_libsvm(filename="tree-probabilistic_test.libsvm", labels=t_labels, features=t_features)