In [21]:
import os
import shutil

import numpy as np
import pandas as pd

In [46]:
def generate_data(n, d, sigma=1.0):
    """
    Simulate data from the linear model Y = X @ beta + eps.

    X and beta are drawn uniformly from [0, 1). eps is standard normal noise
    multiplied by ``sigma``, i.e. Gaussian noise with standard deviation
    ``sigma``. beta is generated internally and is not returned.

    Parameters
    ----------
    n : int
        Number of rows (samples).
    d : int
        Number of feature columns.
    sigma : float, optional
        Scale applied to the standard normal noise. Defaults to 1.0, which
        yields plain standard normal noise.

    Returns
    -------
    X : numpy.ndarray of shape (n, d)
    Y : numpy.ndarray of shape (n, 1)
    """
    # The draw order (X, then beta, then eps) is kept fixed so that runs
    # seeded through np.random.seed remain reproducible.
    X = np.random.random(size=(n, d))
    beta = np.random.random(size=(d, 1))
    eps = np.random.standard_normal(size=(n, 1)) * sigma

    Y = X @ beta + eps

    return X, Y

In [47]:
def make_dir(name, remove_existing=True):
    """
    Ensure that the directory ``name`` exists.

    Parameters
    ----------
    name : str
        Path of the directory to create. Missing parent directories are
        created as well.
    remove_existing : bool, optional
        If True (default) and ``name`` already exists, it is deleted together
        with all of its contents before being recreated empty. If False, an
        existing path is left untouched.
    """
    if remove_existing and os.path.exists(name):
        # Destructive: removes the whole tree under `name`.
        shutil.rmtree(name)
    if not os.path.exists(name):
        # makedirs (rather than mkdir) so nested paths such as "a/b" work.
        os.makedirs(name)

def create_dataset(n, d, sigma, dataset_name):
    """
    Generate a synthetic dataset and save it as a CSV file under ``gendata/``.

    The data comes from ``generate_data(n, d, sigma)``. It is written as a
    table whose first ``d`` columns hold X and whose last column, named 'Y',
    holds the labels.

    Parameters
    ----------
    n : int
        Number of rows.
    d : int
        Number of feature columns.
    sigma : float
        Noise scale forwarded to ``generate_data``.
    dataset_name : str
        File name (without extension) of the CSV inside ``gendata/``.

    Returns
    -------
    X, Y : the arrays returned by ``generate_data``, unchanged.
    """
    X, Y = generate_data(n, d, sigma)
    X_df = pd.DataFrame(X)
    # Y is a single-column 2-D array; flatten it so the column assignment
    # receives one value per row.
    X_df['Y'] = Y.ravel()

    path = 'gendata/{}.csv'.format(dataset_name)
    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print("Writing to csv at {}".format(path))
    X_df.to_csv(path, index=False)

    return X, Y

In [48]:
# Generate 100 example datasets, each with a randomly drawn size and noise
# scale, and write them out through create_dataset.
# NOTE(review): no random seed is set in the visible cells, so every run
# produces different datasets.
for i in range(100):
    # Row count: random integer from 50 up to (but not including) 100.
    n = np.random.randint(50, 100)
    # Feature count: random integer from 2 up to (but not including) 10.
    d = np.random.randint(2, 10)
    # Noise scale: random float in [0.0, 1.0).
    sigma = np.random.random()
    # Dataset names are 1-based: example_dataset1 ... example_dataset100.
    create_dataset(n, d, sigma, 'example_dataset' + str(i + 1))

Writing to csv at gendata/example_dataset1.csv
Writing to csv at gendata/example_dataset2.csv
Writing to csv at gendata/example_dataset3.csv
Writing to csv at gendata/example_dataset4.csv
Writing to csv at gendata/example_dataset5.csv
Writing to csv at gendata/example_dataset6.csv
Writing to csv at gendata/example_dataset7.csv
Writing to csv at gendata/example_dataset8.csv
Writing to csv at gendata/example_dataset9.csv
Writing to csv at gendata/example_dataset10.csv
Writing to csv at gendata/example_dataset11.csv
Writing to csv at gendata/example_dataset12.csv
Writing to csv at gendata/example_dataset13.csv
Writing to csv at gendata/example_dataset14.csv
Writing to csv at gendata/example_dataset15.csv
Writing to csv at gendata/example_dataset16.csv
Writing to csv at gendata/example_dataset17.csv
Writing to csv at gendata/example_dataset18.csv
Writing to csv at gendata/example_dataset19.csv
Writing to csv at gendata/example_dataset20.csv
Writing to csv at gendata/example_dataset21.csv
W

In [36]:
# Reload a stored dataset and split it back into features and labels.
X_df = pd.read_csv('gendata/erase_me_dataset.csv')

# Every column except 'Y' is a feature column.
X_1 = np.array(X_df.drop(columns='Y'))
# Labels as a single-column 2-D array (one row per sample).
Y_1 = np.array(X_df['Y']).reshape(-1, 1)

array([[-1.11022302e-16, -3.46944695e-18,  0.00000000e+00],
       [ 0.00000000e+00,  5.55111512e-17,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  1.11022302e-16,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.11022302e-16],
       [-2.77555756e-17,  0.00000000e+00,  0.00000000e+00],
       [-5.55111512e-17,  0.00000000e+00,  5.55111512e-17],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  5.55111512e-17,  0.00000000e+00],
       [ 0.00000000e+00, -2.77555756e-17,  0.00000000e+00],
       [ 1.11022302e-16, -1.11022302e-16,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  1.38777878e-17,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00, -1.11022302e-16],
       [ 1.11022302e-16,  0.00000000e+00

In [30]:
# Small example draw to inspect the shape and scale of the generated data.
n, d = 10, 5
# generate_data takes the noise scale as its third argument; 1.0 leaves the
# standard normal noise unscaled.
X, Y = generate_data(n, d, sigma=1.0)
X, Y

(array([[0.34109845, 0.62115684, 0.88151336, 0.7296778 , 0.72633312],
        [0.02493885, 0.55719115, 0.31301837, 0.44702444, 0.20512388],
        [0.44025946, 0.84602163, 0.18412939, 0.75361602, 0.22314563],
        [0.30733253, 0.17496758, 0.97077532, 0.87751588, 0.03719081],
        [0.45783826, 0.33000581, 0.97598169, 0.09626396, 0.80891325],
        [0.90573312, 0.0948442 , 0.24743599, 0.32722026, 0.12684259],
        [0.1840283 , 0.50639223, 0.08824148, 0.80773569, 0.92304995],
        [0.59503357, 0.78875127, 0.40278723, 0.97114368, 0.02174225],
        [0.77562355, 0.92254702, 0.10751566, 0.80620005, 0.79584472],
        [0.56342556, 0.71701902, 0.08093584, 0.11833047, 0.57173621]]),
 array([[ 0.0946776 ],
        [-0.90367554],
        [ 1.4670011 ],
        [ 1.21300312],
        [-0.64912593],
        [ 1.64838266],
        [ 0.24215126],
        [ 0.57126584],
        [ 2.80949248],
        [-0.84963675]]))

AttributeError: module 'numpy' has no attribute 'loadz'