<h1>Numpy Structured Arrays</h1>

In [1]:
# NumPy's structured arrays and record arrays provide efficient storage for compound, heterogeneous data.
import numpy as np

In [2]:
# Three parallel lists describing the same four people: one list per
# attribute (name, age, weight), with entries matched up only by position.
name, age, weight = (
    ["Alice", "Bob", "Cathy", "Doug"],
    [25, 45, 37, 19],
    [55.0, 85.5, 68.0, 61.5],
)

# Nothing in these plain lists records that name[i], age[i] and weight[i]
# belong to the same person; the relationship is implicit in the index.

In [3]:
# For comparison, a plain (non-structured) array: every element shares one
# simple dtype. Structured arrays generalize this by using a compound dtype.
x = np.zeros(shape=4, dtype=int)

In [5]:
# Allocate four zero-initialised records whose compound dtype has three named
# fields: "name" (unicode, up to 10 characters), "age" (4-byte integer) and
# "weight" (8-byte float).
data = np.zeros(
    shape=4,
    dtype=dict(
        names=("name", "age", "weight"),
        formats=("U10", "i4", "f8"),
    ),
)
print(data.dtype)

[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')]


In [6]:
# Populate each field of the structured array from its matching list
# (one value per record, in order).
data["name"], data["age"], data["weight"] = name, age, weight
print(data)

[('Alice', 25, 55. ) ('Bob', 45, 85.5) ('Cathy', 37, 68. )
 ('Doug', 19, 61.5)]


In [7]:
# Get all names: indexing with a field name returns that field for every record
data["name"]

array(['Alice', 'Bob', 'Cathy', 'Doug'], dtype='<U10')

In [8]:
# Get the first row of data: an integer index returns one whole record
data[0]

('Alice', 25, 55.)

In [9]:
# Get the name from the last row: index the record first, then the field
data[-1]["name"]

'Doug'

In [10]:
# Boolean masking allows more sophisticated operations, such as filtering on age.
# Get the names of everyone whose age is under 30
data[data["age"] < 30]["name"]

array(['Alice', 'Doug'], dtype='<U10')

<h3>Creating Structured Arrays</h3>

In [11]:
# Dictionary form: parallel "names" and "formats" sequences, one entry per field
np.dtype(
    {
        "names": ["name", "age", "weight"],
        "formats": ["U10", "i4", "f8"],
    }
)

dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

In [12]:
# Field formats may be Python built-in types or NumPy scalar types instead of
# format strings; (np.str_, 10) is a 10-character unicode string. The builtin
# `int` maps to NumPy's default integer type, which is why "age" is reported
# as i8 in the output below rather than i4.
np.dtype(
    {
        "names": ["name", "age", "weight"],
        "formats": [(np.str_, 10), int, np.float32],
    }
)

dtype([('name', '<U10'), ('age', '<i8'), ('weight', '<f4')])

In [13]:
# List-of-tuples form: one (field name, format) pair per field
np.dtype(
    [
        ("name", "U10"),
        ("age", "i4"),
        ("weight", "f8"),
    ]
)

dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

In [14]:
# If the field names do not matter, the formats alone can be given as a
# comma-separated string; the fields are then named f0, f1, f2 automatically.
# Note that "S10" is a 10-byte byte string, not the "U10" unicode type used above.
np.dtype("S10,i4,f8")

dtype([('f0', 'S10'), ('f1', '<i4'), ('f2', '<f8')])

In [15]:
# A field may itself be a fixed-shape sub-array: each record below holds an
# 8-byte integer "id" together with a 3x3 block of 8-byte floats "mat".
tp = np.dtype([("id", np.int64), ("mat", np.float64, (3, 3))])
X = np.zeros(shape=1, dtype=tp)
print(X[0])         # the single record: (id, mat)
print(X["mat"][0])  # just the 3x3 matrix stored in that record

(0, [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]])
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [16]:
# In record arrays (np.recarray), fields can be accessed as attributes rather
# than as dictionary keys. For comparison, this is the key-based access on the
# plain structured array:
data["age"]

array([25, 45, 37, 19], dtype=int32)

In [17]:
# View the structured array as a record array, then read the "age" field as an attribute
data_rec = data.view(np.recarray)
data_rec.age

array([25, 45, 37, 19], dtype=int32)

In [18]:
# Time three ways of reading the "age" field: key access on the structured
# array, key access on the record array, and attribute access on the record
# array. In the timings recorded below, both record-array accesses are slower
# than plain structured-array key access.
%timeit data["age"]
%timeit data_rec["age"]
%timeit data_rec.age

55.9 ns ± 0.602 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
816 ns ± 30.5 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
1.35 µs ± 69.6 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
