In [2]:
from sklearn import svm
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.datasets import make_blobs
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
!pip install google-cloud-storage
from google.cloud import storage
import os
import time



In [3]:
def download_public_file(bucket_name, source_blob_name, destination_file_name):
    """Downloads a public blob from the bucket."""
    # bucket_name = "your-bucket-name"
    # source_blob_name = "storage-object-name"
    # destination_file_name = "local/path/to/file"

    storage_client = storage.Client.create_anonymous_client()

    bucket = storage_client.bucket(bucket_name=bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

    print(
        "Downloaded public blob {} from bucket {} to {}.".format(
            source_blob_name, bucket.name, destination_file_name
        )
    )

# Download datasets

In [3]:
bucket_name = "mmml_project_datasets"
source_blob_names = ["bodyfat.csv", "covtype_10.csv", "letter.csv", "mnist_test.csv", "star_classification.csv"]

for source_blob_name in source_blob_names:
  destination_file_name = source_blob_name
  download_public_file(bucket_name, source_blob_name, destination_file_name)

Downloaded public blob bodyfat.csv from bucket mmml_project_datasets to bodyfat.csv.
Downloaded public blob covtype_10.csv from bucket mmml_project_datasets to covtype_10.csv.
Downloaded public blob letter.csv from bucket mmml_project_datasets to letter.csv.
Downloaded public blob mnist_test.csv from bucket mmml_project_datasets to mnist_test.csv.
Downloaded public blob star_classification.csv from bucket mmml_project_datasets to star_classification.csv.


# SVM classification

In [5]:
def generate_kernel(d, s, m=100):
    b = np.random.uniform(low=0, high=2*np.pi, size=(1,m))
    W = np.random.multivariate_normal(mean=np.zeros(d), cov=2*s*np.eye(d), size=m) # m x d
    def ker(x, y):
        z1 = np.cos(x @ W.T + b)
        z2 = np.cos(y @ W.T + b)
        return z1 @ z2.T / m
    return ker

In [6]:
def test_svm(X, y, d, D, rff_run_iterations=1):
  X = (X - np.mean(X)) / np.std(X) 
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

  rbf_scores = np.empty(rff_run_iterations)
  rbf_times = np.empty(rff_run_iterations)
  rbf_mapes = np.empty(rff_run_iterations)

  for i in range(rff_run_iterations):
    print("RBF ", i)
    
    # Test RBF kernel
    clf_covtype = svm.SVC()
    start = time.time()
    clf_covtype.fit(X_train, y_train)
    
    end = time.time()

    rbf_times[i] = end - start

    pred = clf_covtype.predict(X_test)
    result = np.isclose(pred, y_test)

    rbf_mapes[i] = mean_absolute_percentage_error(pred, y_test)
    rbf_scores[i] = np.count_nonzero(result == True) / len(y_test)

  print('RBF Time: ', np.mean(rbf_times))
  print('RBF Score: ', np.mean(rbf_scores))
  print('RBF MAPE: ', np.mean(rbf_mapes))

  rff_scores = np.empty(rff_run_iterations)
  rff_times = np.empty(rff_run_iterations)
  rff_mapes = np.empty(rff_run_iterations)

  for i in range(rff_run_iterations):
    print("RFF ", i)

    rff_clf_covtype = svm.SVC(kernel=generate_kernel(d, 1/d, D))
    start = time.time()
    rff_clf_covtype.fit(X_train, y_train)
    end = time.time()

    rff_times[i] = end - start

    pred = rff_clf_covtype.predict(X_test)
    result = np.isclose(pred, y_test)

    rff_mapes[i] = mean_absolute_percentage_error(pred, y_test)
    rff_scores[i] = np.count_nonzero(result == True) / len(y_test)
  
  print('\n---\n')
  print('RFF Time: ', np.mean(rff_times))
  print('RFF Score: ', np.mean(rff_scores))
  print('RFF MAPE: ', np.mean(rff_mapes))

## Synthetic dataset


In [9]:
num_samples = 10000
num_features = 700

X, y = X, y = make_blobs(n_samples=num_samples, centers=2, n_features=num_features, cluster_std=2, random_state=0)

#### RFF Kernel

In [10]:
test_svm(X, y, X.shape[1], 50, 5)

RBF  0
RBF  1
RBF  2
RBF  3
RBF  4
RBF Time:  0.4840841770172119
RBF Score:  1.0
RBF MAPE:  0.0
RFF  0
RFF  1
RFF  2
RFF  3
RFF  4

---

RFF Time:  0.19872946739196778
RFF Score:  1.0
RFF MAPE:  0.0


## letter.csv

In [73]:
df_letter = pd.read_csv("letter.csv")
df_letter = df_letter.drop(df_letter.columns[0], axis=1)
print(df_letter.shape)
print(df_letter.head())

(15000, 17)
   to_predict         1         2         3         4         5         6  \
0          26 -0.733333 -0.466667 -0.466667 -0.600000 -0.733333 -0.066667   
1          16 -0.466667 -0.066667 -0.333333 -0.333333 -0.333333 -0.333333   
2          19 -0.066667  0.333333  0.066667 -0.066667 -0.466667  0.066667   
3           8 -0.466667  0.200000 -0.333333 -0.066667 -0.466667 -0.066667   
4           8 -0.200000 -0.066667  0.066667 -0.333333 -0.466667 -0.066667   

          7         8         9        10        11        12        13  \
0  0.066667 -0.733333  0.200000  0.466667 -0.066667 -0.066667 -0.866667   
1  0.200000 -0.200000 -0.466667  0.066667 -0.066667  0.200000 -0.733333   
2  0.066667 -0.333333  0.333333  0.466667 -0.733333  0.066667 -0.733333   
3 -0.066667  0.733333 -0.866667 -0.066667 -0.200000  0.066667 -0.600000   
4 -0.200000 -0.600000 -0.066667  0.333333 -0.066667  0.200000 -0.600000   

         14        15        16  
0  0.066667 -0.333333 -0.200000  
1  0.2

In [74]:
y_letter = df_letter['to_predict'].to_numpy()

X_letter = df_letter.drop("to_predict", axis=1).to_numpy()

In [76]:
test_svm(X_letter, y_letter, X_letter.shape[1], 300, 5)

RBF  0
RBF  1
RBF  2
RBF  3
RBF  4
RBF Time:  2.0277395248413086
RBF Score:  0.9082828282828282
RBF MAPE:  0.11737751876380734
RFF  0
RFF  1
RFF  2
RFF  3
RFF  4

---

RFF Time:  1.667382001876831
RFF Score:  0.8663030303030304
RFF MAPE:  0.1998905702116441


## covtype.csv

In [1]:
df_covtype = pd.read_csv("covtype_10.csv")
df_covtype = df_covtype.drop(df_covtype.columns[0], axis=1)
print(df_covtype.shape)
print(df_covtype.head())

NameError: name 'pd' is not defined

In [71]:
y_covtype = df_covtype['Cover_Type'].to_numpy()[:30000]

X_covtype = df_covtype.drop("Cover_Type", axis=1).to_numpy()[:30000]

In [72]:
test_svm(X_covtype, y_covtype, X_covtype.shape[1], 500, 5)

RBF  0
RBF  1
RBF  2
RBF  3
RBF  4
RBF Time:  27.849138975143433
RBF Score:  0.6913131313131313
RBF MAPE:  0.42607744107744117
RFF  0
RFF  1
RFF  2
RFF  3
RFF  4

---

RFF Time:  9.477790832519531
RFF Score:  0.6872727272727271
RFF MAPE:  0.42908080808080806


## star classification

In [20]:
def vectorize_class(value):
  if value == "GALAXY":
    return 0
  if value == "STAR":
    return 1
  return 2

In [21]:
df_star = pd.read_csv("star_classification.csv")
df_star = df_star.drop(["rerun_ID", "run_ID", "cam_col", "field_ID", "spec_obj_ID", "plate", "MJD", "fiber_ID"], axis=1)
df_star['class'] = df_star['class'].apply(lambda row: vectorize_class(row))
print(df_star.shape)
df_star.head()

(100000, 10)


Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,class,redshift
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,0,0.634794
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,0,0.779136
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,0,0.644195
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,0,0.932346
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,0,0.116123


In [24]:
y_star = df_star['class'].to_numpy()[:50000]

X_star = df_star.drop("class", axis=1).to_numpy()[:50000]

In [25]:
test_svm(X_star, y_star, X_star.shape[1], 500, 5)

RBF  0
RBF  1
RBF  2
RBF  3
RBF  4
RBF Time:  38.15086002349854
RBF Score:  0.5921212121212122
RBF MAPE:  2697792649510908.0
RFF  0
RFF  1
RFF  2
RFF  3
RFF  4

---

RFF Time:  21.72342677116394
RFF Score:  0.5921212121212122
RFF MAPE:  2697792649510908.0


## MNIST

In [14]:
df_mnist = pd.read_csv("mnist_test.csv")
print(df_mnist.head())

   label  1x1  1x2  1x3  1x4  1x5  1x6  1x7  1x8  1x9  ...  28x19  28x20  \
0      7    0    0    0    0    0    0    0    0    0  ...      0      0   
1      2    0    0    0    0    0    0    0    0    0  ...      0      0   
2      1    0    0    0    0    0    0    0    0    0  ...      0      0   
3      0    0    0    0    0    0    0    0    0    0  ...      0      0   
4      4    0    0    0    0    0    0    0    0    0  ...      0      0   

   28x21  28x22  28x23  28x24  28x25  28x26  28x27  28x28  
0      0      0      0      0      0      0      0      0  
1      0      0      0      0      0      0      0      0  
2      0      0      0      0      0      0      0      0  
3      0      0      0      0      0      0      0      0  
4      0      0      0      0      0      0      0      0  

[5 rows x 785 columns]


In [15]:
y_mnist = df_mnist.label.values

X_mnist = df_mnist[df_mnist.columns[1:]].values / 255

In [16]:
X_mnist.shape

(10000, 784)

In [17]:
test_svm(X_mnist, y_mnist, X_mnist.shape[1], 500, 5)

RBF  0
RBF  1
RBF  2
RBF  3
RBF  4
RBF Time:  5.92200608253479
RBF Score:  0.9624242424242425
RBF MAPE:  103719264145502.38
RFF  0
RFF  1
RFF  2
RFF  3
RFF  4

---

RFF Time:  0.9517799854278565
RFF Score:  0.9377575757575757
RFF MAPE:  154214169058444.3
