# Synthetic data

## Write to disk

In [8]:
def write_libsvm(filename, labels, features):
    """Write labelled rows to ``filename`` in a sparse ``label idx:value`` text form.

    One line is emitted per (label, row) pair taken in lockstep from
    ``labels`` and ``features``.  Column indices are the 0-based positions
    inside the row, and entries equal to zero are left out.  A row with no
    non-zero entries still produces a line: the label followed by one space.
    """
    with open(filename, "w") as out:
        for label, row in zip(labels, features):
            # Keep only the non-zero entries, remembering their column index.
            pairs = [
                "{}:{}".format(col, value)
                for col, value in enumerate(row)
                if value != 0.0
            ]
            out.write("{} {}\n".format(label, ' '.join(pairs)))


def write_pickle(filename, n, m, source=None):
    """Dump a 2-D array to ``filename`` as a raw boolean memory map.

    Despite the name, nothing is pickled: the file is created (or
    overwritten) as an ``np.memmap`` of dtype ``np.bool_`` and shape
    ``(n, m + 1)``, and the array contents are copied into it.

    Parameters
    ----------
    filename : path of the file to create.
    n, m : the on-disk array has ``n`` rows and ``m + 1`` columns.
    source : 2-D array to store.  When omitted, the function falls back to
        a module-level variable named ``data`` (the behaviour of the
        original notebook cell) and raises ``NameError`` if none exists.
    """
    if source is None:
        # Legacy path: the original cell read a notebook-global `data`.
        source = data  # noqa: F821
    fp = np.memmap(filename, dtype=np.bool_, mode='w+', shape=(n, m + 1))
    fp[:, :] = source[:, :]
    # Push the pages to disk explicitly rather than relying on the memmap
    # being garbage-collected when the function returns.
    fp.flush()

In [46]:
import numpy as np
from scipy import sparse
from random import random


# Dataset dimensions: n training rows with m binary features each.
n = 1000000
m = 100
# The test split is 1% of the training-set size.
n_test = int(n * 0.01)
# Placeholders; reset() fills these module-level arrays.
labels = features = t_labels = t_features = None

def reset():
    """Refill the global train/test arrays with fresh fair coin flips.

    Rebinds the module-level ``labels`` (shape ``(n,)``), ``features``
    (``(n, m)``), ``t_labels`` (``(n_test,)``) and ``t_features``
    (``(n_test, m)``).  Every entry is an int that is 1 when a uniform draw
    exceeds 0.5 and 0 otherwise, so labels and features start out roughly
    balanced.
    """
    global labels, features, t_labels, t_features

    def coin_flips(*shape):
        # Threshold uniform draws at 0.5 to get 0/1 integers.
        return (np.random.random(size=shape) > 0.5).astype(int)

    labels = coin_flips(n)
    features = coin_flips(n, m)
    t_labels = coin_flips(n_test)
    t_features = coin_flips(n_test, m)

## One-level tree

### Deterministic

In [59]:
# Start from fresh random data, then plant three single-feature rules.
reset()
for idx in (10, 20, 30):
    # Random feature value that triggers the rule for this column.
    coin = int(random() * 2)
    # The planted label is fixed to 1 (a random label was an earlier option).
    label = 1
    # Apply the same rule to the training split and the test split.
    for feats, labs in ((features, labels), (t_features, t_labels)):
        labs[feats[:, idx] == coin] = label

In [60]:
# Persist the training split and the test split, in that order.
for path, labs, feats in (
    ("deterministic.libsvm", labels, features),
    ("deterministic_test.libsvm", t_labels, t_features),
):
    write_libsvm(path, labs, feats)

### Probabilistic

In [53]:
SCALE = 0.3  # Weakness of the strongest weak rule

# Start from fresh random data, then plant one noisy rule per feature column.
reset()
for idx in range(m):
    # Later columns get more reliable rules: prob grows linearly from 0.5.
    prob = 0.5 + float(idx / m) * SCALE
    coin = int(random() * 2)
    # NOTE(review): `label` is drawn but never used below; the label written
    # is derived from `coin`.  Left as is so behaviour is unchanged - confirm
    # whether `label` was meant to set the rule's polarity.
    label = int(random() * 2)

    # Same rule for the training split, then the test split.
    for feats, labs in ((features, labels), (t_features, t_labels)):
        mask = feats[:, idx] == coin
        # 1.0 where the rule holds (label = coin), 0.0 where it is flipped.
        keep = (np.random.random(np.sum(mask)) <= prob).astype(float)
        labs[mask] = keep * coin + (1.0 - keep) * (1.0 - coin)

In [54]:
# Persist the training split and the test split, in that order.
for path, labs, feats in (
    ("probabilistic.libsvm", labels, features),
    ("probabilistic_test.libsvm", t_labels, t_features),
):
    write_libsvm(path, labs, feats)

## Multi-level tree

### Deterministic

In [61]:
# Start from fresh random data, then plant four two-feature conjunction rules.
reset()
for idx1, idx2 in ((10, 20), (30, 40), (50, 60), (70, 80)):
    # Random feature values that both columns must match for the rule to fire.
    coin1 = int(random() * 2)
    coin2 = int(random() * 2)
    # The planted label is fixed to 1 (a random label was an earlier option).
    label = 1
    # Apply the same rule to the training split and the test split.
    for feats, labs in ((features, labels), (t_features, t_labels)):
        both = np.logical_and(feats[:, idx1] == coin1, feats[:, idx2] == coin2)
        labs[both] = label

In [62]:
# Persist the training split and the test split, in that order.
for path, labs, feats in (
    ("tree-deterministic.libsvm", labels, features),
    ("tree-deterministic_test.libsvm", t_labels, t_features),
):
    write_libsvm(path, labs, feats)

### Probabilistic

In [57]:
SCALE = 0.3  # Rule i holds with probability 0.5 + (4 * i / m) * SCALE

# Start from fresh random data, then plant m / 4 noisy two-feature rules.
reset()
candid = list(range(m))
for i in range(int(m / 4)):
    # Draw two distinct feature columns; pop() removes each from the pool so
    # no column takes part in more than one rule.
    idx1 = candid.pop(int(random() * len(candid)))
    idx2 = candid.pop(int(random() * len(candid)))

    # Feature values that trigger the rule, and the label the rule favours.
    coin1 = int(random() * 2)
    coin2 = int(random() * 2)
    label = int(random() * 2)
    prob = 0.5 + float(i / m * 4.0) * SCALE

    # Fix: the noisy label is built from this rule's own `label`.  The
    # previous code used `coin`, which this cell never assigns (a leftover
    # global from an earlier cell), so `label` was ignored.
    index = np.logical_and(features[:, idx1] == coin1, features[:, idx2] == coin2)
    # p is 1.0 where the rule holds (-> label) and 0.0 where it is flipped.
    p = (np.random.random(np.sum(index)) <= prob).astype(float)
    labels[index] = p * label + (1.0 - p) * (1.0 - label)
    index = np.logical_and(t_features[:, idx1] == coin1, t_features[:, idx2] == coin2)
    p = (np.random.random(np.sum(index)) <= prob).astype(float)
    t_labels[index] = p * label + (1.0 - p) * (1.0 - label)

In [58]:
# Persist the training split and the test split, in that order.
for path, labs, feats in (
    ("tree-probabilistic.libsvm", labels, features),
    ("tree-probabilistic_test.libsvm", t_labels, t_features),
):
    write_libsvm(path, labs, feats)