In [1]:
# add by qinglin, for jupyter notebook
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from asa.dataset import Dataset

# construct dataset

In [3]:
x = np.random.normal(size=1000)
y = x**2 + np.random.normal(size=1000)
z = np.log(np.abs(x + y)) + np.random.normal(size=1000)

In [4]:
data = np.array([x, y, z]).T

In [5]:
Dataset(data)

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x0' 'x1' 'x2']
  Labels: ['x0' 'x1' 'x2']

In [6]:
Dataset(data, names=['x', 'y', 'z'])

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']

In [7]:
Dataset(data, names=['x', 'y', 'z'], labels=['x_label', 'y_label', 'z_label'])

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x_label' 'y_label' 'z_label']

In [9]:
data_df = pd.DataFrame(data, columns=['x', 'y', 'z'])

In [10]:
Dataset(data_df)

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']

In [14]:
data

array([[-0.51793212, -0.71270663, -0.92131538],
       [-1.02197153,  2.1776497 , -0.27653095],
       [-0.88197904,  1.33627553, -1.39967087],
       ...,
       [ 2.76257881,  6.74269627,  2.0512898 ],
       [-0.8594992 ,  0.44117064,  0.61781804],
       [-0.40038085,  0.3037769 , -4.38660254]])

In [12]:
Dataset(data_df).data

array([[-0.51793212, -0.71270663, -0.92131538],
       [-1.02197153,  2.1776497 , -0.27653095],
       [-0.88197904,  1.33627553, -1.39967087],
       ...,
       [ 2.76257881,  6.74269627,  2.0512898 ],
       [-0.8594992 ,  0.44117064,  0.61781804],
       [-0.40038085,  0.3037769 , -4.38660254]])

# get data by name

In [32]:
x = np.random.normal(size=1000)
y = x**2 + np.random.normal(size=1000)
z = np.log(np.abs(x + y)) + np.random.normal(size=1000)

In [5]:
data = np.array([x, y, z]).T

In [7]:
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [8]:
np.array_equal(dataset['x'], x)

True

In [9]:
np.array_equal(dataset[['x', 'y']], np.array([x, y]).T)

True

You can also use index

In [10]:
np.array_equal(dataset[:, 0], x)

True

In [11]:
np.array_equal(dataset[3:5, 'x'], x[3:5])

True

# summary

In [12]:
x = np.random.normal(size=1000)
y = x**2 + np.random.normal(size=1000)
z = np.log(np.abs(x + y)) + np.random.normal(size=1000)

In [13]:
data = np.array([x, y, z]).T

In [14]:
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [15]:
dataset

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']

In [16]:
print(dataset)

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']



In [17]:
dataset.summary()

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']



# add col

In [3]:
x = np.random.normal(size=1000)
y = x**2 + np.random.normal(size=1000)
z = np.log(np.abs(x + y)) + np.random.normal(size=1000)

In [4]:
data = np.array([x, y, z]).T

In [5]:
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [6]:
x2 = np.random.normal(size=1000)
y2 = 5 * x2

data2 = np.array([x2, y2]).T

In [7]:
dataset.data

array([[-0.1401337 , -0.40810026, -0.48702198],
       [-1.81610905,  4.3922999 ,  0.19602357],
       [ 0.9464675 ,  0.37467793,  0.18620735],
       ...,
       [ 0.75332711,  0.38082139,  0.77085572],
       [ 0.27339081,  0.30781424,  1.01091056],
       [ 0.58375604,  2.31084876,  0.23651232]])

In [8]:
dataset.add_col(data2, ['x2', 'y2'], ['x2', 'y2'])

In [9]:
dataset.data

array([[-0.1401337 , -0.40810026, -0.48702198, -0.1367507 , -0.6837535 ],
       [-1.81610905,  4.3922999 ,  0.19602357, -1.16223903, -5.81119517],
       [ 0.9464675 ,  0.37467793,  0.18620735,  0.91330371,  4.56651857],
       ...,
       [ 0.75332711,  0.38082139,  0.77085572,  0.23403476,  1.1701738 ],
       [ 0.27339081,  0.30781424,  1.01091056, -0.08156473, -0.40782364],
       [ 0.58375604,  2.31084876,  0.23651232,  0.23405227,  1.17026135]])

In [10]:
dataset.labels

array(['x', 'y', 'z', 'x2', 'y2'], dtype='<U2')

In [11]:
dataset.names

array(['x', 'y', 'z', 'x2', 'y2'], dtype='<U2')

In [12]:
dataset.add_col(x2, 'x3', 'x3')

In [13]:
dataset.data

array([[-0.1401337 , -0.40810026, -0.48702198, -0.1367507 , -0.6837535 ,
        -0.1367507 ],
       [-1.81610905,  4.3922999 ,  0.19602357, -1.16223903, -5.81119517,
        -1.16223903],
       [ 0.9464675 ,  0.37467793,  0.18620735,  0.91330371,  4.56651857,
         0.91330371],
       ...,
       [ 0.75332711,  0.38082139,  0.77085572,  0.23403476,  1.1701738 ,
         0.23403476],
       [ 0.27339081,  0.30781424,  1.01091056, -0.08156473, -0.40782364,
        -0.08156473],
       [ 0.58375604,  2.31084876,  0.23651232,  0.23405227,  1.17026135,
         0.23405227]])

In [14]:
dataset.names

array(['x', 'y', 'z', 'x2', 'y2', 'x3'], dtype='<U2')

In [15]:
dataset['x4'] = x

In [16]:
dataset.names

array(['x', 'y', 'z', 'x2', 'y2', 'x3', 'x4'], dtype='<U2')

In [18]:
np.array_equal(dataset['x4'], dataset['x'])

True

# add row

In [30]:
x = np.random.normal(size=2)
y = x**2 + np.random.normal(size=2)
z = np.log(np.abs(x + y)) + np.random.normal(size=2)

data = np.array([x, y, z]).T

In [31]:
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [32]:
dataset.data

array([[ 1.06480947,  1.86882949, -0.04833452],
       [-0.2536447 , -0.65262798,  0.28142592]])

In [33]:
dataset.add_row(data)

In [34]:
dataset.data

array([[ 1.06480947,  1.86882949, -0.04833452],
       [-0.2536447 , -0.65262798,  0.28142592],
       [ 1.06480947,  1.86882949, -0.04833452],
       [-0.2536447 , -0.65262798,  0.28142592]])

# delete col

In [50]:
x = np.random.normal(size=2)
y = x**2 + np.random.normal(size=2)
z = np.log(np.abs(x + y)) + np.random.normal(size=2)

data = np.array([x, y, z]).T

In [51]:
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [52]:
dataset.data

array([[ 0.44195621,  1.41335667,  0.91624984],
       [-0.01707149,  1.20553685,  0.89957627]])

In [53]:
dataset.del_col(1)

In [54]:
dataset.data

array([[ 0.44195621,  0.91624984],
       [-0.01707149,  0.89957627]])

In [55]:
dataset.summary()

Dataset summary:
  Data shape: (2, 2)
  Names: ['x' 'z']
  Labels: ['x' 'z']



In [56]:
dataset.del_col('x')

In [57]:
dataset.data

array([[0.91624984],
       [0.89957627]])

In [58]:
dataset.summary()

Dataset summary:
  Data shape: (2, 1)
  Names: ['z']
  Labels: ['z']



# delete row

In [36]:
x = np.random.normal(size=2)
y = x**2 + np.random.normal(size=2)
z = np.log(np.abs(x + y)) + np.random.normal(size=2)

data = np.array([x, y, z]).T

In [37]:
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [38]:
dataset.data

array([[-1.07477946,  1.27267112, -1.42611577],
       [ 0.39861649, -1.77036889, -0.05582315]])

In [41]:
dataset.del_row(1)

In [42]:
dataset.data

array([[-1.07477946,  1.27267112, -1.42611577]])