In [None]:
import pandas as pd
import numpy as np
import scipy

In [None]:
# --- Experiment configuration ---
d = 2  # Number of dimensions
k = 6  # Number of classes
n = 10000  # Number of data points

# Seed NumPy's global generator so a fresh "Restart & Run All" regenerates
# exactly the same synthetic data (and therefore the same CSV files).
random_seed = 0
np.random.seed(random_seed)

# Barrier positions along the projection axis, one per class.
# The factor 8 is the scale for how spread apart the points are.
portions = np.array([3, 7, 10, 15, 19, 21]) * 8
assert len(portions) == k, "expected exactly one barrier per class"

# Largest barrier; projections are drawn uniformly between 0 and this value.
scaling = portions[-1]

## Generate Training Set ("Real" Data)

In [None]:
def generate_quantile_data(d, k, n):
    """Generate a labelled synthetic data set laid out along one direction.

    Every sample gets a scalar "projection" drawn uniformly between 0 and the
    module-level ``scaling``. The sample is placed at ``projection * w`` with
    ``w = (1/d, ..., 1/d)``, so its coordinates sum to the projection.
    Gaussian noise is then added after its component along ``w`` has been
    removed, which keeps the coordinate sum (and hence the quantity the labels
    are based on) unchanged. Labels come from ``get_class_labels`` applied to
    the projections and the module-level ``portions`` barriers.

    Parameters
    ----------
    d : int
        Number of feature dimensions.
    k : int
        Number of classes; forwarded to ``get_class_labels``.
    n : int
        Number of samples to generate.

    Returns
    -------
    X : np.matrix of shape (n, d)
        Feature matrix.
    y : object returned by ``get_class_labels``
        Class labels for the ``n`` samples.
    """
    ## Vector that classification will be based on
    w = np.full(d, 1.0 / d)

    ## Generate points (kept as an (n, 1) matrix column for get_class_labels)
    projection_points = np.matrix(np.random.uniform(0, scaling, n)).T
    points_on_line = projection_points * w

    ## Noise that cannot move a point along w.
    ## w is not unit length (w.w == 1/d), so the projection coefficient has to
    ## be divided by w.w; otherwise part of the noise along w would survive.
    noise = np.random.normal(0, scaling / (16 * d), n * d).reshape(n, d)
    coeff_along_w = np.dot(noise, w) / np.dot(w, w)
    noise = noise - np.outer(coeff_along_w, w)
    X = np.asmatrix(points_on_line + noise)

    ## Get class labels
    y = get_class_labels(projection_points, portions, k)

    return X, y

In [None]:
def get_class_labels(projection_points, portions, k):
    """Assign a (partly random) class label to every projection value.

    For each point the "segment" is the index of the first barrier in
    ``portions`` that is strictly greater than the point. A point in segment
    ``s`` is labelled ``s`` or ``s + 1``; the probability of ``s + 1`` grows
    linearly from 0 at the segment's lower edge (the previous barrier, or 0
    for the first segment) to 1 at its upper barrier. Points in segment
    ``k - 1``, and points at or beyond every barrier, always get ``k - 1``.

    Parameters
    ----------
    projection_points : array-like
        Projection values; any shape, read in flattened order.
    portions : array-like
        Barrier positions, expected to contain ``k`` increasing values.
    k : int
        Number of classes.

    Returns
    -------
    np.matrix of shape (n, 1)
        Float-valued class labels, one row per input point.
    """
    points = np.asarray(projection_points, dtype=float).ravel()
    barriers = np.asarray(portions, dtype=float).ravel()

    # below[i, j] is True when point i lies strictly under barrier j.
    below = points[:, None] < barriers[None, :]
    segment = below.argmax(axis=1)
    # argmax reports 0 when no barrier exceeds the point; detect that case
    # explicitly so such points are not mistaken for members of segment 0.
    beyond_all = ~below.any(axis=1)

    labels = np.empty(points.size)
    fixed = beyond_all | (segment == k - 1)
    labels[fixed] = k - 1

    # Remaining points pick between their segment and the next class up.
    random_pick = ~fixed
    seg = segment[random_pick]
    upper = barriers[seg]
    # seg - 1 wraps to the last barrier for seg == 0; np.where discards it.
    lower = np.where(seg > 0, barriers[seg - 1], 0.0)
    prob_up = 1 - (upper - points[random_pick]) / (upper - lower)
    draws = np.random.uniform(0, 1, seg.size)
    labels[random_pick] = seg + (draws < prob_up)

    return np.matrix(labels).T

In [None]:
# Draw the training set: feature matrix X and the matching class labels y.
X, y = generate_quantile_data(d=d, k=k, n=n)

In [None]:
# Persist the training features and labels as comma-separated text files.
for csv_name, values in (
    ('quantile_synthetic_features.csv', X),
    ('quantile_synthetic_labels.csv', y),
):
    np.savetxt(csv_name, values, delimiter=',')

## Generate Test Data (Computed Quantiles)

In [None]:
def generate_quantile_testdata(d, k, n):
    """Generate unlabelled synthetic features laid out along one direction.

    Every sample gets a scalar "projection" drawn uniformly between 0 and the
    module-level ``scaling``. The sample is placed at ``projection * w`` with
    ``w = (1/d, ..., 1/d)``, so its coordinates sum to the projection.
    Gaussian noise is then added after its component along ``w`` has been
    removed, so ``X.dot(w) * d`` recovers the projections.

    Parameters
    ----------
    d : int
        Number of feature dimensions.
    k : int
        Not used by this function.
    n : int
        Number of samples to generate.

    Returns
    -------
    np.matrix of shape (n, d)
        Feature matrix.
    """
    ## Vector that classification will be based on
    w = np.full(d, 1.0 / d)

    ## Generate points
    projection_points = np.random.uniform(0, scaling, n)
    points_on_line = np.outer(projection_points, w)

    ## Noise that cannot move a point along w.
    ## w is not unit length (w.w == 1/d), so the projection coefficient has to
    ## be divided by w.w; otherwise part of the noise along w would survive.
    noise = np.random.normal(0, scaling / (16 * d), n * d).reshape(n, d)
    coeff_along_w = np.dot(noise, w) / np.dot(w, w)
    noise = noise - np.outer(coeff_along_w, w)

    return np.asmatrix(points_on_line + noise)

In [None]:
# Draw the unlabelled test features (this rebinds X).
X = generate_quantile_testdata(d=d, k=k, n=n)

In [None]:
# Persist the current feature matrix X as comma-separated text.
np.savetxt(fname='quantile_synthetic_testfeatures.csv', X=X, delimiter=',')

## Understand how the data looks

In [None]:
from matplotlib import pyplot as plt

In [None]:
# `X` was rebound to the unlabelled test features above, so plotting it with
# the training labels `y` would colour each point with an unrelated label.
# Reload the saved training set so features and labels stay paired row by row.
train_X = np.loadtxt('quantile_synthetic_features.csv', delimiter=',')
train_y = np.loadtxt('quantile_synthetic_labels.csv', delimiter=',')

fig, ax = plt.subplots()
scatter = ax.scatter(train_X[:, 0], train_X[:, 1], c=train_y)
ax.set(title='Synthetic training data', xlabel='feature 0', ylabel='feature 1')
fig.colorbar(scatter, ax=ax, label='class')
plt.show()