In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from pca_utils import *
from sklearn.datasets import make_classification
import gc
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
from pdb import set_trace

In [4]:
full, half = torch.float32, torch.float16

In [5]:
def get_gaussian_sampler(dimensions=5, mean=0.0, variance=1.0, var_factor=0.25):
    mu = torch.tensor([mean]*dimensions).float()
    cov= torch.eye(dimensions).float() * (variance**2) * var_factor
    sampler = torch.distributions.MultivariateNormal(mu, cov)
    return sampler

In [6]:
sampler = get_gaussian_sampler()

In [7]:
sampler.sample((2,)).type(half)

tensor([[ 0.3345, -0.4514,  0.0264,  0.4294,  0.0201],
        [ 0.0602, -0.2874,  0.0221,  0.0773,  0.4463]], dtype=torch.float16)

In [8]:
def get_classification_data(n, features, classes):
    n_samples = n
    n_features= features
    n_informative= int(np.ceil(n_features * 0.75))
    n_redundant = 0 # (n_features - n_informative) // 2
    n_classes = classes
    data = make_classification(n_samples, n_features, n_informative, n_redundant, n_classes=n_classes)
    return torch.tensor(data[0]), torch.tensor(data[1])

%time
x, y = get_classification_data(10000, 400, 2)
x.shape, y.shape

Need: 
1. Time for 32 and 16 bit for both scaled and non-scaled
2. Relative and absolute accuracy of 16bit/32bit for both scaled and non-scaled for both reduced data and eigen vectors
3. So, a total of 12 items (6+6) per dataset

In [9]:
def scale_to_01(x):
    return x.sub(x.min()).div(x.max())

In [10]:
eps = 1e-7
def absolute_deviation(preds, targets):
    return (preds - targets).mean().abs()

def relative_deviation(preds, targets):
    return ((preds - targets) / (targets + eps)).mean().abs()

In [11]:
def process_results(x32, eigs32, time32, x16, eigs16, time16):
    dict1 = {}
    dict1['abs_deviation'], dict1['rel_deviation'], dict1['time'] = {}, {}, {}
    dict1['abs_deviation']['data'] = absolute_deviation(x16, x32).item() if type(x16) != int else 0
    dict1['rel_deviation']['data'] = relative_deviation(x16, x32).item() if type(x16) != int else 0
    dict1['abs_deviation']['eigv'] = absolute_deviation(eigs16, eigs32).item() if type(eigs16) != int else 0
    dict1['rel_deviation']['eigv'] = relative_deviation(eigs16, eigs32).item() if type(eigs16) != int else 0
    dict1['time'][16] = time16
    dict1['time'][32] = time32
    return dict1

In [12]:
def get_results(x, iterations=10, scaled_only=True, debug=False):
    res = {}
    time = torch.zeros((2, 50))
    if x.shape[0] >= 1000000:
        if x.shape[0] >= 5000000: iterations = 2
        else: iterations = 5
    if not scaled_only:
        if debug: print("Processing non-scaled dataset")
        for i in range(iterations):
            (x32, eigs32), time[0, i] = torchPCA(x, k=3, fp16=False)
            (x16, eigs16), time[1, i] = torchPCA(x, k=3, fp16=True)
        time32, time16 = time[0,:].mean().item(), time[1,:].mean().item()
    else:
        x32, eigs32, time32, x16, eigs16, time16 = 0, 0, 0, 0, 0, 0        
    
    if debug: set_trace()
    res['non_scaled'] = process_results(x32, eigs32, time32, x16, eigs16, time16)
    
    x = scale_to_01(x)
    
    time = torch.zeros((2, 50))
    if debug: print("Processing scaled (to 0-1 range) dataset")
    for i in range(iterations):
        (x32, eigs32), time[0, i] = torchPCA(x, k=3, fp16=False)
        (x16, eigs16), time[1, i] = torchPCA(x, k=3, fp16=True)
    time32, time16 = time[0,:].mean().item(), time[1,:].mean().item()
    
    if debug: set_trace()
    res['scaled'] = process_results(x32, eigs32, time32, x16, eigs16, time16)
    return res

In [13]:
sampler = get_gaussian_sampler(200, 0, 1)
# x = scale_to_01(sampler.sample((1000000,)))
x = sampler.sample((100000,))

# x = x.type(torch.float16).cuda()

cov = torchCov(x, debug=False)

cov.min(), cov.max()

In [14]:
res = get_results(x, scaled_only=False); res

{'non_scaled': {'abs_deviation': {'data': 9.76211504166713e-06,
   'eigv': 0.02815176732838154},
  'rel_deviation': {'data': 5.141624927520752, 'eigv': 0.9940662980079651},
  'time': {16: 0.003105799900367856, 32: 0.00905982032418251}},
 'scaled': {'abs_deviation': {'data': 0.0001512068120064214,
   'eigv': 0.028739726170897484},
  'rel_deviation': {'data': 0.1260276585817337, 'eigv': 1.0468394756317139},
  'time': {16: 0.003294499823823571, 32: 0.006751899607479572}}}

In [15]:
def write_result(i:int, n:int, dimensions:int, k:int, mean:float, var:float, res:dict, df:pd.DataFrame, output:str="results.csv"):
    scaled, nscaled = res['scaled'], res['non_scaled']
    absd, reld = 'abs_deviation', 'rel_deviation'
    res1 = [scaled[absd]['data'], scaled[absd]['eigv'], scaled[reld]['data'], scaled[reld]['eigv']]
    res1+= [scaled['time'][16], scaled['time'][32]]
    res2 = [nscaled[absd]['data'], nscaled[absd]['eigv'], nscaled[reld]['data'], nscaled[reld]['eigv']]
    res2+= [nscaled['time'][16], nscaled['time'][32]]
    res1 = [True, n, dimensions, k, mean, var] + res1
    res2 = [False, n, dimensions,k, mean, var] + res2
    df.iloc[i]   = res1
    df.iloc[i+1] = res2
    if i % 5 == 0: 
        print("Writing to CSV...")
        df.to_csv(output)

In [16]:
mean_vars = [[0, 1], [0, 4], [0, 16], [0, 64], [0, 128], [0, 512], [0, 2048], [0, 8192], [0, 32768], [0, 65519]]
mean_vars+= [[1.5, 0.5], [1.5, 3], [3, 1]]
mean_vars+= [[6, 2], [6, 12], [12, 4]]
mean_vars+= [[24, 8], [24, 48], [48, 16]]
mean_vars+= [[96, 32], [96, 192], [192, 64]]
mean_vars+= [[384, 128], [384, 768], [768, 256]]
mean_vars+= [[1536, 512], [1536, 3072], [3072, 1024]]
mean_vars+= [[6144, 2048], [6144, 12288], [12288, 4096]]
mean_vars+= [[24576, 8192], [24576, 32768], [49152, 16384]]

n_values  = [100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000, 2000000
            ,5000000, 10000000]
dimensions= [10, 50, 100, 200, 400]
dimensions= [10, ]

total = len(n_values) * len(mean_vars) * len(dimensions) * 2
k = 3
max_retries = 8
print(total)

df = pd.DataFrame(np.zeros((total, 12)), columns=['Scaled', 'N', 'Dimensions', 'K', 'Mean', 'Variance',
                                                'abs_deviation_data', 'abs_deviation_eigv',
                                                'rel_deviation_data', 'rel_deviation_eigv',
                                                'time_16', 'time_32'])

df.Scaled = df.Scaled.astype(bool)
df.N = df.N.astype(int)
df.Dimensions = df.Dimensions.astype(int)
df.K = df.K.astype(int)

df.dtypes

1088


Scaled                   bool
N                       int64
Dimensions              int64
K                       int64
Mean                  float64
Variance              float64
abs_deviation_data    float64
abs_deviation_eigv    float64
rel_deviation_data    float64
rel_deviation_eigv    float64
time_16               float64
time_32               float64
dtype: object

In [None]:
skipped = []
pos = 0
debug = True
for dimension in dimensions:
    print(f"Number of columns: {dimension}")
    for i, (mean, var) in tqdm(list(enumerate(mean_vars))):
        if debug and i == 2: break
        sampler = get_gaussian_sampler(dimension, mean, var)
        df.to_csv("results.csv")
        for n in tqdm(n_values):
            if dimension == 400 and n == 10000000: continue
            still_doing = max_retries
            scaled_only=False
            while still_doing > 0:
                try:
                    data = sampler.sample((n,))
                    res = get_results(data, scaled_only=scaled_only)
                    still_doing = 0
                    write_result(pos, n, dimension, k, mean, var, res, df)
                    pos += 2
                except:
                    still_doing -= 1
                    if still_doing == max_retries // 2:
                        scaled_only = True
                    if still_doing == 0:
                        print(f"Skipping N={n}\t Mean={mean}\t Var={var}")
                        skipped.append([n, mean, var])
                        
df.to_csv("results.csv")

Number of columns: 10


HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Writing to CSV...
Writing to CSV...
Writing to CSV...



HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

In [17]:
%run run_exp

  0%|          | 0/34 [00:00<?, ?it/s]
  0%|          | 0/16 [00:00<?, ?it/s][A

1088
Number of columns: 10



  6%|▋         | 1/16 [00:00<00:02,  5.24it/s][A
 12%|█▎        | 2/16 [00:00<00:02,  5.78it/s][A

Writing to CSV: done!



 19%|█▉        | 3/16 [00:00<00:02,  6.22it/s][A
 25%|██▌       | 4/16 [00:00<00:01,  6.56it/s][A
 31%|███▏      | 5/16 [00:00<00:01,  6.80it/s][A
 38%|███▊      | 6/16 [00:00<00:01,  6.68it/s][A
 44%|████▍     | 7/16 [00:01<00:01,  6.82it/s][A

Writing to CSV: done!



 50%|█████     | 8/16 [00:01<00:01,  6.87it/s][A
 56%|█████▋    | 9/16 [00:01<00:01,  6.64it/s][A
 62%|██████▎   | 10/16 [00:01<00:00,  6.11it/s][A
 69%|██████▉   | 11/16 [00:01<00:01,  4.79it/s][A

Writing to CSV: done!



 75%|███████▌  | 12/16 [00:02<00:01,  3.29it/s][A
 81%|████████▏ | 13/16 [00:02<00:01,  2.72it/s][A
 88%|████████▊ | 14/16 [00:03<00:01,  1.80it/s][A
 94%|█████████▍| 15/16 [00:05<00:00,  1.25it/s][A
100%|██████████| 16/16 [00:07<00:00,  2.02it/s][A
  3%|▎         | 1/34 [00:07<04:22,  7.95s/it]
  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:00<00:02,  7.36it/s][A

Writing to CSV: done!



 12%|█▎        | 2/16 [00:00<00:01,  7.55it/s][A
 19%|█▉        | 3/16 [00:00<00:01,  7.58it/s][A
 25%|██▌       | 4/16 [00:00<00:01,  7.59it/s][A
 31%|███▏      | 5/16 [00:00<00:01,  7.29it/s][A
 38%|███▊      | 6/16 [00:00<00:01,  7.06it/s][A

Writing to CSV: done!



 44%|████▍     | 7/16 [00:00<00:01,  6.91it/s][A
 50%|█████     | 8/16 [00:01<00:01,  6.56it/s][A
 56%|█████▋    | 9/16 [00:01<00:01,  5.92it/s][A
 62%|██████▎   | 10/16 [00:01<00:01,  5.08it/s][A

Writing to CSV: done!



 69%|██████▉   | 11/16 [00:01<00:01,  4.23it/s][A
 75%|███████▌  | 12/16 [00:02<00:01,  3.04it/s][A
 81%|████████▏ | 13/16 [00:03<00:01,  2.56it/s][A
 88%|████████▊ | 14/16 [00:04<00:01,  1.73it/s][A
 94%|█████████▍| 15/16 [00:05<00:00,  1.22it/s][A

Writing to CSV: done!



100%|██████████| 16/16 [00:08<00:00,  1.97it/s][A
  6%|▌         | 2/34 [00:16<04:17,  8.05s/it]


In [None]:
df

In [25]:
str(skipped)

'[[1, 2], [3, 4], [5, 6]]'

In [27]:
skip_f = open("skipped.txt", "w")

In [28]:
skip_f.write(str(skipped) + "\n")
skip_f.close()

In [25]:
skipped

[[1000, 0, 32768],
 [2000, 0, 32768],
 [5000, 0, 32768],
 [10000, 0, 32768],
 [20000, 0, 32768],
 [50000, 0, 32768],
 [100, 0, 65519],
 [200, 0, 65519],
 [500, 0, 65519],
 [1000, 0, 65519],
 [2000, 0, 65519],
 [5000, 0, 65519],
 [10000, 0, 65519],
 [20000, 0, 65519],
 [50000, 0, 65519],
 [100, 24576, 32768],
 [200, 24576, 32768],
 [500, 24576, 32768],
 [1000, 24576, 32768],
 [2000, 24576, 32768],
 [5000, 24576, 32768],
 [10000, 24576, 32768],
 [20000, 24576, 32768],
 [50000, 24576, 32768],
 [100, 49152, 16384],
 [200, 49152, 16384],
 [500, 49152, 16384],
 [1000, 49152, 16384],
 [2000, 49152, 16384],
 [5000, 49152, 16384],
 [10000, 49152, 16384],
 [20000, 49152, 16384],
 [50000, 49152, 16384]]

In [26]:
df

Unnamed: 0,Scaled,N,Dimensions,K,Mean,Variance,abs_deviation_data,abs_deviation_eigv,rel_deviation_data,rel_deviation_eigv,time_16,time_32
0,True,100,100,3,0.0,1.0,2.133828e-05,0.000015,0.000547,0.015347,0.003768,0.003700
1,False,100,100,3,0.0,1.0,3.905979e-05,0.000002,0.000699,0.007553,0.003940,0.004084
2,True,200,100,3,0.0,1.0,7.224416e-05,0.000040,0.074825,0.002692,0.003808,0.003843
3,False,200,100,3,0.0,1.0,9.220442e-06,0.016511,1.383038,1.332179,0.003750,0.003757
4,True,500,100,3,0.0,1.0,1.104968e-04,0.003833,1.346836,1.353642,0.003776,0.003791
5,False,500,100,3,0.0,1.0,2.278328e-06,0.003493,2.000345,2.001043,0.004035,0.003965
6,True,1000,100,3,0.0,1.0,5.381965e-05,0.015442,2.005904,1.773590,0.004369,0.004352
7,False,1000,100,3,0.0,1.0,1.916567e-05,0.015419,2.005364,1.982639,0.003820,0.003846
8,True,2000,100,3,0.0,1.0,7.880147e-05,0.008761,2.001355,1.997978,0.003938,0.004101
9,False,2000,100,3,0.0,1.0,1.157888e-05,0.006496,1.999981,1.997767,0.003965,0.003888


In [121]:
df.to_csv("results.csv")

In [30]:
%run run_exp

  0%|          | 0/34 [00:00<?, ?it/s]
  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:00<00:02,  6.01it/s][A

5440
Number of columns: 10



 12%|█▎        | 2/16 [00:00<00:02,  6.01it/s][A
 19%|█▉        | 3/16 [00:00<00:02,  6.03it/s][A
 25%|██▌       | 4/16 [00:00<00:01,  6.39it/s][A
 31%|███▏      | 5/16 [00:00<00:01,  6.97it/s][A
 38%|███▊      | 6/16 [00:00<00:01,  7.42it/s][A
 44%|████▍     | 7/16 [00:00<00:01,  7.74it/s][A
 50%|█████     | 8/16 [00:01<00:01,  7.91it/s][A
 56%|█████▋    | 9/16 [00:01<00:00,  7.71it/s][A
 62%|██████▎   | 10/16 [00:01<00:00,  7.46it/s][A
 69%|██████▉   | 11/16 [00:01<00:00,  6.85it/s][A
 75%|███████▌  | 12/16 [00:01<00:00,  5.48it/s][A
 81%|████████▏ | 13/16 [00:02<00:00,  4.66it/s][A
 88%|████████▊ | 14/16 [00:02<00:00,  3.07it/s][A
 94%|█████████▍| 15/16 [00:03<00:00,  1.85it/s][A
100%|██████████| 16/16 [00:05<00:00,  2.76it/s][A
  3%|▎         | 1/34 [00:05<03:11,  5.80s/it]
  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:00<00:01,  9.19it/s][A
 12%|█▎        | 2/16 [00:00<00:01,  9.12it/s][A
 19%|█▉        | 3/16 [00:00<00:01,  9.06it/s][A
 25

 75%|███████▌  | 12/16 [00:01<00:00,  9.31it/s][A
 81%|████████▏ | 13/16 [00:01<00:00,  6.36it/s][A
 88%|████████▊ | 14/16 [00:01<00:00,  3.56it/s][A
 94%|█████████▍| 15/16 [00:03<00:00,  1.97it/s][A
100%|██████████| 16/16 [00:05<00:00,  3.16it/s][A
 38%|███▊      | 13/34 [01:26<02:21,  6.75s/it]
  0%|          | 0/16 [00:00<?, ?it/s][A
 12%|█▎        | 2/16 [00:00<00:00, 14.16it/s][A
 25%|██▌       | 4/16 [00:00<00:00, 14.04it/s][A
 38%|███▊      | 6/16 [00:00<00:00, 13.89it/s][A
 50%|█████     | 8/16 [00:00<00:00, 13.53it/s][A
 62%|██████▎   | 10/16 [00:00<00:00, 12.57it/s][A
 75%|███████▌  | 12/16 [00:01<00:00,  9.32it/s][A
 81%|████████▏ | 13/16 [00:01<00:00,  6.28it/s][A
 88%|████████▊ | 14/16 [00:01<00:00,  3.57it/s][A
 94%|█████████▍| 15/16 [00:03<00:00,  1.97it/s][A
100%|██████████| 16/16 [00:05<00:00,  3.16it/s][A
 41%|████      | 14/34 [01:31<02:04,  6.25s/it]
  0%|          | 0/16 [00:00<?, ?it/s][A
 12%|█▎        | 2/16 [00:00<00:00, 14.13it/s][A
 25%|██▌ 

KeyboardInterrupt: 