In [1]:
import numpy as np

## NumPy Set Operations

What is a Set

A set in mathematics is a collection of unique elements.

Sets are used for operations involving frequent intersection, union and difference operations.

Create Sets in NumPy

We can use NumPy's unique() method to find unique elements from any array. E.g. create a set array, but remember that the set arrays should only be 1-D arrays

In [2]:
arr = np.array([1, 1, 1, 2, 3, 4, 5, 5, 6, 7])

x = np.unique(arr)

x

array([1, 2, 3, 4, 5, 6, 7])

In [3]:
a = np.array([[1, 1], [2, 3]])

np.unique(a)

array([1, 2, 3])

In [4]:
a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]])

np.unique(a, axis=0)

array([[1, 0, 0],
       [2, 3, 4]])

In [5]:
a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]])

np.unique(a, axis=1)

array([[0, 0, 1],
       [0, 0, 1],
       [3, 4, 2]])

In [6]:
a = np.array(['a', 'b', 'b', 'c', 'a'])

u, indices = np.unique(a, return_index=True)

print(u)
print(indices)
print(a[indices])

['a' 'b' 'c']
[0 1 3]
['a' 'b' 'c']


In [7]:
a = np.array([1, 2, 6, 4, 2, 3, 2])

u, indices = np.unique(a, return_inverse=True)

print(u)
print(indices)
print(a[indices])

[1 2 3 4 6]
[0 1 4 3 1 2 1]
[1 2 2 4 2 6 2]


In [8]:
a = np.array([1, 2, 6, 4, 2, 3, 2])

values, counts = np.unique(a, return_counts=True)

print(values)
print(counts)
print(np.repeat(values, counts)) # original order not preserved

[1 2 3 4 6]
[1 3 1 1 1]
[1 2 2 2 3 4 6]


#### Finding Union

To find the unique values of two arrays, use the union1d() method.

In [9]:
arr1 = np.array([1, 2, 3, 4])
arr2 = np.array([3, 4, 5, 6])

newarr = np.union1d(arr1, arr2)

newarr

array([1, 2, 3, 4, 5, 6])

#### Finding Intersection

To find only the values that are present in both arrays, use the intersect1d() method.

In [10]:
arr1 = np.array([1, 2, 3, 4])
arr2 = np.array([3, 4, 5, 6])

newarr = np.intersect1d(arr1, arr2, assume_unique=True)

newarr

# Note: the intersect1d() method takes an optional argument assume_unique, which if set to True can speed up computation.
#       It should always be set to True when dealing with sets.

array([3, 4])

In [11]:
x = np.array([1, 1, 2, 3, 4])

y = np.array([2, 1, 4, 6])

xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)

x_ind, y_ind

(array([0, 2, 4], dtype=int64), array([1, 0, 2], dtype=int64))

In [12]:
xy

array([1, 2, 4])

In [13]:
xy, x[x_ind], y[y_ind]

(array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))

#### Finding Difference

To find only the values in the first set that is NOT present in the seconds set, use the setdiff1d() method.

In [14]:
set1 = np.array([1, 2, 3, 4])
set2 = np.array([3, 4, 5, 6])

newarr = np.setdiff1d(set1, set2, assume_unique=True)

newarr

# Note: the setdiff1d() method takes an optional argument assume_unique, which if set to True can speed up computation. 
#       It should always be set to True when dealing with sets.

array([1, 2])

#### Finding Symmetric Difference

To find only the values that are NOT present in BOTH sets, use the setxor1d() method.

In [15]:
set1 = np.array([1, 2, 3, 4])
set2 = np.array([3, 4, 5, 6])

newarr = np.setxor1d(set1, set2, assume_unique=True)

newarr

# Note: the setxor1d() method takes an optional argument assume_unique, which if set to True can speed up computation. 
#       It should always be set to True when dealing with sets.

array([1, 2, 5, 6])

### in1d()

In [16]:
test = np.array([0, 1, 2, 5, 0])

states = [0, 2]

mask = np.in1d(test, states)

mask

array([ True, False,  True, False,  True])

In [17]:
test[mask]

array([0, 2, 0])

In [18]:
mask = np.in1d(test, states, invert=True)

print(mask)

[False  True False  True False]


In [19]:
test[mask]

array([1, 5])

### isin()

In [20]:
element = 2 * np.arange(4).reshape(2,2)

element

array([[0, 2],
       [4, 6]])

In [21]:
test_elements = np.array([1,2,4,8])

mask = np.isin(element, test_elements)

mask

array([[False,  True],
       [ True, False]])

In [22]:
element[mask]

array([2, 4])

In [23]:
mask2 = np.isin(element, test_elements, invert=True)
mask2

array([[ True, False],
       [False,  True]])

In [24]:
element[mask2]

array([0, 6])