# Random binning features

In [2]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
import pandas as pd
from scipy import stats

from time import time

In [3]:
def get_blobs(N, D, cluster_std):
  return make_blobs(n_samples=N, centers=2, n_features=D, cluster_std=cluster_std, random_state=0)

def generate_data(N, D, cluster_std, normalize=True):
  
  X, y = get_blobs(N, D, cluster_std)
  
  if normalize:
    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

  return train_test_split(X, y, train_size=0.7)



In [4]:
def sample_delta(size):
   gamma = stats.gamma(a=2, scale=1)
   return gamma.rvs(size=size)

def sample_parameter_rvs(P, d):
  delta = sample_delta((P, d))
  u = np.random.uniform(low=0, high=delta, size=(P, d))


  assert np.all(u <= delta), 'Something with the bound of u'
  assert np.all(delta > 0), 'Deltas should be positive'

  return delta, u

In [5]:
def random_binning_features_matrix(X, delta, u, P=350, TEMP_VECTOR_SIZE = 128):
    """
    X is (n, d) matrix
    delta is (n, d)
    u is (n,d)
    Returns (TEMP_VECTOR_SIZE, P) matrix
    """
    
    n, d = X.shape
    Z = np.zeros((n, P * TEMP_VECTOR_SIZE)) 


    for p in range(P):
        delta_p, u_p = delta[p], u[p]

        tmp = np.zeros((n, TEMP_VECTOR_SIZE))
        bin_index = np.ceil((X - u_p) / delta_p).astype('int')

        for i in range(n):

          idx = bin_index[i] % TEMP_VECTOR_SIZE
          tmp[i, idx] += 1

        Z[:, p * TEMP_VECTOR_SIZE: (p+1) * TEMP_VECTOR_SIZE] = tmp
    
    return 1/ np.sqrt(P) * Z


In [6]:
def evaluate_random_binning_SVM(X, y, P, iterations):
  X_norm = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
  X_train, X_test, y_train, y_test = train_test_split(X,y)

  n, d = X_train.shape


  scores = np.empty(iterations)
  times = np.empty(iterations)

  for i in range(iterations):

    start_time = time()

    delta, u = sample_parameter_rvs(P, d)
    Zx = random_binning_features_matrix(X_train, delta, u, P)
    K_train = Zx @ Zx.T
  

    clf = SVC(kernel='precomputed') # kernel should return nxn matrix
    clf.fit(K_train, y_train)

    end_time = time()

    Zy = random_binning_features_matrix(X_test, delta, u, P)
    K_test = Zy @ Zx.T
    y_pred = clf.predict(K_test)

    times[i] = end_time - start_time
    scores[i] = metrics.accuracy_score(y_test, y_pred)
    print('-' * 20)
    print(f'Iteration #{i + 1}/{iterations}')
    print(f'Time passed: {times[i]}')
    print(f'Accuracy score: {scores[i]}')

  
  print('-' * 20)
  print('FINAL AVERAGE VALUES')
  print(f'Mean time: {np.mean(times)}')
  print(f'Mean accuracy: {np.mean(scores)}')
  






In [14]:
evaluate_random_binning_SVM(*get_blobs(50000, 2000, cluster_std=2), P=30, iterations=5)

--------------------
Iteration #1/5
Time passed: 158.22926807403564
Accuracy score: 1.0
--------------------
Iteration #2/5
Time passed: 202.68652868270874
Accuracy score: 1.0
--------------------
Iteration #3/5
Time passed: 191.85163688659668
Accuracy score: 1.0
--------------------
Iteration #4/5
Time passed: 189.53769993782043
Accuracy score: 0.99992
--------------------
Iteration #5/5
Time passed: 182.57079482078552
Accuracy score: 1.0
--------------------
FINAL AVERAGE VALUES
Mean time: 184.97518568038942
Mean accuracy: 0.9999839999999999


## letter.csv


In [9]:
df_letter = pd.read_csv("letter.csv")
df_letter = df_letter.drop(df_letter.columns[0], axis=1)
print(df_letter.shape)
print(df_letter.head())

(15000, 17)
   to_predict         1         2         3         4         5         6  \
0          26 -0.733333 -0.466667 -0.466667 -0.600000 -0.733333 -0.066667   
1          16 -0.466667 -0.066667 -0.333333 -0.333333 -0.333333 -0.333333   
2          19 -0.066667  0.333333  0.066667 -0.066667 -0.466667  0.066667   
3           8 -0.466667  0.200000 -0.333333 -0.066667 -0.466667 -0.066667   
4           8 -0.200000 -0.066667  0.066667 -0.333333 -0.466667 -0.066667   

          7         8         9        10        11        12        13  \
0  0.066667 -0.733333  0.200000  0.466667 -0.066667 -0.066667 -0.866667   
1  0.200000 -0.200000 -0.466667  0.066667 -0.066667  0.200000 -0.733333   
2  0.066667 -0.333333  0.333333  0.466667 -0.733333  0.066667 -0.733333   
3 -0.066667  0.733333 -0.866667 -0.066667 -0.200000  0.066667 -0.600000   
4 -0.200000 -0.600000 -0.066667  0.333333 -0.066667  0.200000 -0.600000   

         14        15        16  
0  0.066667 -0.333333 -0.200000  
1  0.2

In [10]:
y_letter = df_letter['to_predict'].to_numpy()

X_letter = df_letter.drop("to_predict", axis=1).to_numpy()

In [13]:
evaluate_random_binning_SVM(X_letter, y_letter, P=30, iterations=5)

--------------------
Iteration #1/5
Time passed: 6.785836935043335
Accuracy score: 0.7792
--------------------
Iteration #2/5
Time passed: 6.738014221191406
Accuracy score: 0.8250666666666666
--------------------
Iteration #3/5
Time passed: 6.3219499588012695
Accuracy score: 0.7536
--------------------
Iteration #4/5
Time passed: 5.946943998336792
Accuracy score: 0.7970666666666667
--------------------
Iteration #5/5
Time passed: 7.007684230804443
Accuracy score: 0.8192
--------------------
FINAL AVERAGE VALUES
Mean time: 6.560085868835449
Mean accuracy: 0.7948266666666667


## covtype.csv

In [7]:
df_covtype = pd.read_csv("covtype_10.csv")
df_covtype = df_covtype.drop(df_covtype.columns[0], axis=1)
print(df_covtype.shape)
print(df_covtype.head())

(58101, 55)
   Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0       3137     139     20                               162   
1       3065     106      9                               234   
2       3000      69     19                               384   
3       3057     248      8                               430   
4       2276      30     15                               384   

   Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                              39                             3830   
1                              19                             3102   
2                               8                              484   
3                              66                             5850   
4                             181                              570   

   Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
0            246             229            101   
1            235             229            125   
2            236       

In [8]:
y_covtype = df_covtype['Cover_Type'].to_numpy()[:30000]

X_covtype = df_covtype.drop("Cover_Type", axis=1).to_numpy()[:30000]

In [9]:
evaluate_random_binning_SVM(X_covtype, y_covtype, P=30, iterations=5)

  X_norm = (X - np.mean(X, axis=0)) / np.std(X, axis=0)


--------------------
Iteration #1/5
Time passed: 56.123246908187866
Accuracy score: 0.6006666666666667
--------------------
Iteration #2/5
Time passed: 59.70258092880249
Accuracy score: 0.5852
--------------------
Iteration #3/5
Time passed: 58.371793270111084
Accuracy score: 0.6054666666666667
--------------------
Iteration #4/5
Time passed: 69.64421701431274
Accuracy score: 0.6050666666666666
--------------------
Iteration #5/5
Time passed: 63.346566915512085
Accuracy score: 0.606
--------------------
FINAL AVERAGE VALUES
Mean time: 61.43768100738525
Mean accuracy: 0.6004799999999999


## star classification

In [13]:
def vectorize_class(value):
  if value == "GALAXY":
    return 0
  if value == "STAR":
    return 1
  return 2

In [14]:
df_star = pd.read_csv("star_classification.csv")
df_star = df_star.drop(["rerun_ID", "run_ID", "cam_col", "field_ID", "spec_obj_ID", "plate", "MJD", "fiber_ID"], axis=1)
df_star['class'] = df_star['class'].apply(lambda row: vectorize_class(row))
print(df_star.shape)
df_star.head()

(100000, 10)


Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,class,redshift
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,0,0.634794
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,0,0.779136
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,0,0.644195
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,0,0.932346
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,0,0.116123


In [15]:
y_star = df_star['class'].to_numpy()[:50000]

X_star = df_star.drop("class", axis=1).to_numpy()[:50000]

In [16]:
evaluate_random_binning_SVM(X_star, y_star, P=30, iterations=5)

--------------------
Iteration #1/5
Time passed: 74.84564304351807
Accuracy score: 0.9292
--------------------
Iteration #2/5
Time passed: 86.2202479839325
Accuracy score: 0.96288
--------------------
Iteration #3/5
Time passed: 89.73142695426941
Accuracy score: 0.96352
--------------------
Iteration #4/5
Time passed: 89.8348479270935
Accuracy score: 0.96408
--------------------
Iteration #5/5
Time passed: 97.27880692481995
Accuracy score: 0.92432
--------------------
FINAL AVERAGE VALUES
Mean time: 87.58219456672668
Mean accuracy: 0.9488


## MNIST

In [17]:
df_mnist = pd.read_csv("mnist_test.csv")
print(df_mnist.head())

   label  1x1  1x2  1x3  1x4  1x5  1x6  1x7  1x8  1x9  ...  28x19  28x20  \
0      7    0    0    0    0    0    0    0    0    0  ...      0      0   
1      2    0    0    0    0    0    0    0    0    0  ...      0      0   
2      1    0    0    0    0    0    0    0    0    0  ...      0      0   
3      0    0    0    0    0    0    0    0    0    0  ...      0      0   
4      4    0    0    0    0    0    0    0    0    0  ...      0      0   

   28x21  28x22  28x23  28x24  28x25  28x26  28x27  28x28  
0      0      0      0      0      0      0      0      0  
1      0      0      0      0      0      0      0      0  
2      0      0      0      0      0      0      0      0  
3      0      0      0      0      0      0      0      0  
4      0      0      0      0      0      0      0      0  

[5 rows x 785 columns]


In [18]:
y_mnist = df_mnist.label.values

X_mnist = df_mnist[df_mnist.columns[1:]].values / 255

In [19]:
evaluate_random_binning_SVM(X_mnist, y_mnist, P=30, iterations=5)

  X_norm = (X - np.mean(X, axis=0)) / np.std(X, axis=0)


--------------------
Iteration #1/5
Time passed: 10.167913913726807
Accuracy score: 0.8312
--------------------
Iteration #2/5
Time passed: 9.823300123214722
Accuracy score: 0.8264
--------------------
Iteration #3/5
Time passed: 9.840606927871704
Accuracy score: 0.8252
--------------------
Iteration #4/5
Time passed: 10.486677169799805
Accuracy score: 0.8168
--------------------
Iteration #5/5
Time passed: 10.309889078140259
Accuracy score: 0.8204
--------------------
FINAL AVERAGE VALUES
Mean time: 10.125677442550659
Mean accuracy: 0.8240000000000001
