In [297]:
import numpy as np
import pandas as pd

Axis-labeling information in pandas objects serves several purposes:
- Identify data using known indicators
- Enable automatic and explicit data alignment
- Allow intuitive getting and setting of subsets of the data set

# Different choices for indexing

- `.loc`
  - Use with labels and Boolean arrays
  - Raises `KeyError` when items are not found
  - `.loc[a:b]` selects items in range `[a, b]`
- `.iloc`
  - Use with integer positions and Boolean arrays
  - Raises `IndexError` if indexer is out-of-bounds (Note that slicing allows out-of-bounds indexing)
  - `.iloc[a:b]` selects items in range `[a, b)`

In [298]:
# Create Series
s = pd.Series(range(5), index=list('abcde'))
s

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [299]:
# Select Series elements by index labels
s.loc[['a', 'c', 'e']]

a    0
c    2
e    4
dtype: int64

In [300]:
# Build a 5x5 DataFrame holding the integers 0..24,
# with letter labels on both the row axis and the column axis
df = pd.DataFrame(
    data=np.arange(25).reshape(5, 5),
    index=[*'abcde'],
    columns=[*'vwxyz'],
)

df

Unnamed: 0,v,w,x,y,z
a,0,1,2,3,4
b,5,6,7,8,9
c,10,11,12,13,14
d,15,16,17,18,19
e,20,21,22,23,24


In [301]:
# Select DataFrame elements by index labels and column labels
df.loc[['a', 'c', 'e'], ['v', 'x', 'z']]

Unnamed: 0,v,x,z
a,0,2,4
c,10,12,14
e,20,22,24


# Basics

`[]` indexing returns a lower-dimensional slice:
- `df[col]` returns a `Series` object
- `s[index]` returns a scalar value

In [302]:
# Create time series data
dates = pd.date_range('11/23/2024', periods=8)

df = pd.DataFrame(
    np.random.randn(8, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D']
)

df

Unnamed: 0,A,B,C,D
2024-11-23,0.053506,-0.329728,-0.166678,0.651803
2024-11-24,-0.786621,0.489935,-0.711328,-1.89258
2024-11-25,0.396326,0.990433,-0.962693,0.909248
2024-11-26,-1.64019,-0.270767,1.757624,-0.020537
2024-11-27,-0.81674,0.037052,1.152154,-0.933777
2024-11-28,-1.076413,0.3549,1.203339,0.199647
2024-11-29,-0.802166,0.158411,-0.139283,1.431162
2024-11-30,1.837115,0.45301,0.447304,0.341906


In [303]:
# Select column from DataFrame
df['A']

2024-11-23    0.053506
2024-11-24   -0.786621
2024-11-25    0.396326
2024-11-26   -1.640190
2024-11-27   -0.816740
2024-11-28   -1.076413
2024-11-29   -0.802166
2024-11-30    1.837115
Freq: D, Name: A, dtype: float64

In [304]:
# Select columns from DataFrame
df[['B', 'A']]

Unnamed: 0,B,A
2024-11-23,-0.329728,0.053506
2024-11-24,0.489935,-0.786621
2024-11-25,0.990433,0.396326
2024-11-26,-0.270767,-1.64019
2024-11-27,0.037052,-0.81674
2024-11-28,0.3549,-1.076413
2024-11-29,0.158411,-0.802166
2024-11-30,0.45301,1.837115


# Attribute access
`Series` index labels and `DataFrame` column labels can be accessed as attributes as long as:
- The label is a valid Python identifier
- The label is not an existing attribute/method of a `Series` or `DataFrame`

In [305]:
# Create Series and DataFrame
dfa = df.copy()
sa = pd.Series([1, 2, 3], index=list('abc'))

sa

a    1
b    2
c    3
dtype: int64

In [306]:
# Access Series element with index label 'a' using attribute syntax
sa.a

1

In [307]:
# Access DataFrame column 'A' using attribute syntax
dfa.A

2024-11-23    0.053506
2024-11-24   -0.786621
2024-11-25    0.396326
2024-11-26   -1.640190
2024-11-27   -0.816740
2024-11-28   -1.076413
2024-11-29   -0.802166
2024-11-30    1.837115
Freq: D, Name: A, dtype: float64

# Slicing ranges

In [308]:
# Create Series
s = df['A']
s

2024-11-23    0.053506
2024-11-24   -0.786621
2024-11-25    0.396326
2024-11-26   -1.640190
2024-11-27   -0.816740
2024-11-28   -1.076413
2024-11-29   -0.802166
2024-11-30    1.837115
Freq: D, Name: A, dtype: float64

In [309]:
# Select first 5 elements of Series
s[:5]

2024-11-23    0.053506
2024-11-24   -0.786621
2024-11-25    0.396326
2024-11-26   -1.640190
2024-11-27   -0.816740
Freq: D, Name: A, dtype: float64

In [310]:
# Select every 2nd element of Series, from beginning
s[::2]

2024-11-23    0.053506
2024-11-25    0.396326
2024-11-27   -0.816740
2024-11-29   -0.802166
Freq: 2D, Name: A, dtype: float64

In [311]:
# Select all elements of Series in reverse order
s[::-1]

2024-11-30    1.837115
2024-11-29   -0.802166
2024-11-28   -1.076413
2024-11-27   -0.816740
2024-11-26   -1.640190
2024-11-25    0.396326
2024-11-24   -0.786621
2024-11-23    0.053506
Freq: -1D, Name: A, dtype: float64

In [312]:
# Set first 5 elements of Series to 0
s2 = s.copy()
s2[:5] = 0

s2

2024-11-23    0.000000
2024-11-24    0.000000
2024-11-25    0.000000
2024-11-26    0.000000
2024-11-27    0.000000
2024-11-28   -1.076413
2024-11-29   -0.802166
2024-11-30    1.837115
Freq: D, Name: A, dtype: float64

In [313]:
# Select first 3 rows of DataFrame
df[:3]

Unnamed: 0,A,B,C,D
2024-11-23,0.053506,-0.329728,-0.166678,0.651803
2024-11-24,-0.786621,0.489935,-0.711328,-1.89258
2024-11-25,0.396326,0.990433,-0.962693,0.909248


In [314]:
# Select all rows of DataFrame in reverse order
df[::-1]

Unnamed: 0,A,B,C,D
2024-11-30,1.837115,0.45301,0.447304,0.341906
2024-11-29,-0.802166,0.158411,-0.139283,1.431162
2024-11-28,-1.076413,0.3549,1.203339,0.199647
2024-11-27,-0.81674,0.037052,1.152154,-0.933777
2024-11-26,-1.64019,-0.270767,1.757624,-0.020537
2024-11-25,0.396326,0.990433,-0.962693,0.909248
2024-11-24,-0.786621,0.489935,-0.711328,-1.89258
2024-11-23,0.053506,-0.329728,-0.166678,0.651803


# Selection by label

In [315]:
# Create a Series
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))
s1

a    1.033015
b   -0.025466
c   -0.608378
d    1.053678
e   -0.083990
f   -1.167331
dtype: float64

In [316]:
# Access Series elements from index label 'c' to end
s1['c':]

c   -0.608378
d    1.053678
e   -0.083990
f   -1.167331
dtype: float64

In [317]:
# Access single Series element at index label 'b'
s1['b']

-0.02546578049366599

In [318]:
# Set Series elements from index label 'c' to end to 0
s1['c':] = 0
s1

a    1.033015
b   -0.025466
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

In [319]:
# Create DataFrame
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    index=list('abcdef'),
    columns=list('ABCD')
)

df1

Unnamed: 0,A,B,C,D
a,0.144774,1.318531,0.444937,0.799683
b,0.734821,-0.43108,2.29454,-0.180177
c,0.113966,-0.561439,0.028414,-1.679239
d,1.012484,-0.460037,-0.544399,-0.433653
e,0.366445,0.240408,1.163467,0.102139
f,-0.031556,1.069094,2.438397,0.193856


In [320]:
# Select DataFrame row with index label 'd', columns from labels 'A' to 'C'
df1.loc['d', 'A':'C']

A    1.012484
B   -0.460037
C   -0.544399
Name: d, dtype: float64

In [321]:
# Select DataFrame row with index label 'a'
df1.loc['a']

A    0.144774
B    1.318531
C    0.444937
D    0.799683
Name: a, dtype: float64

In [322]:
# Select DataFrame columns where the column value in row 'a' is greater than 0
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,A,B,C,D
a,0.144774,1.318531,0.444937,0.799683
b,0.734821,-0.43108,2.29454,-0.180177
c,0.113966,-0.561439,0.028414,-1.679239
d,1.012484,-0.460037,-0.544399,-0.433653
e,0.366445,0.240408,1.163467,0.102139
f,-0.031556,1.069094,2.438397,0.193856


In [323]:
# Create Boolean mask with missing value
# The missing value (pd.NA) is treated as False when the mask is used for indexing
mask = pd.array([True, False, True, False, pd.NA, False], dtype='boolean')
mask

<BooleanArray>
[True, False, True, False, <NA>, False]
Length: 6, dtype: boolean

In [324]:
# Apply mask to DataFrame
df1[mask]

Unnamed: 0,A,B,C,D
a,0.144774,1.318531,0.444937,0.799683
c,0.113966,-0.561439,0.028414,-1.679239


In [325]:
# Select the DataFrame element in row 'a' and column 'A'
df1.loc['a', 'A']

0.144774480500891

In [326]:
# Select the DataFrame element in row 'a' and column 'A'
df1.at['a', 'A']

0.144774480500891

## Slicing with labels

In [327]:
# Create Series with numerically unordered index
s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])
s

0    a
3    b
2    c
5    d
4    e
dtype: object

In [328]:
# Select Series elements from index label 3 to 5, inclusive
s.loc[3:5]

3    b
2    c
5    d
dtype: object

In [329]:
# Attempt to select Series elements from index label 1 to 6, inclusive
try:
    s.loc[1:6]
except KeyError as e:
    print("KeyError:", e)

KeyError: 1


In [330]:
# Sort index, allowing for selection of any Series elements whose index labels are within the range,
# even if all values in the range are not present as index labels
s.sort_index().loc[1:6]

2    c
3    b
4    e
5    d
dtype: object

# Selection by position

In [331]:
# Create a Series
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
s1

0   -0.420978
2   -1.216998
4   -1.670143
6   -0.320437
8   -2.026765
dtype: float64

In [332]:
# Select first 3 elements of Series by position
# (.iloc makes the positional intent explicit, because the index labels are also integers)
s1.iloc[:3]

0   -0.420978
2   -1.216998
4   -1.670143
dtype: float64

In [333]:
# Select 4th element of Series
s1.iloc[3]

-0.3204368440576932

In [334]:
# Set first 3 elements of Series to 0
s1.iloc[:3] = 0
s1

0    0.000000
2    0.000000
4    0.000000
6   -0.320437
8   -2.026765
dtype: float64

In [335]:
# Create DataFrame
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    index=list(range(0, 12, 2)),
    columns=list(range(0, 8, 2))
)

df1

Unnamed: 0,0,2,4,6
0,0.210982,-2.066739,0.885642,0.342696
2,-1.610304,-0.454421,0.292959,0.353867
4,-0.926627,0.610492,-1.902717,0.390242
6,0.747152,-0.934537,1.772228,-0.030893
8,-0.331032,1.491119,-0.310732,-0.848662
10,0.067723,1.895417,0.204153,0.582477


In [336]:
# Select first 3 rows of DataFrame
df1.iloc[:3]

Unnamed: 0,0,2,4,6
0,0.210982,-2.066739,0.885642,0.342696
2,-1.610304,-0.454421,0.292959,0.353867
4,-0.926627,0.610492,-1.902717,0.390242


In [337]:
# Select 2nd through 5th rows, 3rd through 4th columns of DataFrame
df1.iloc[1:5, 2:4]

Unnamed: 0,4,6
2,0.292959,0.353867
4,-1.902717,0.390242
6,1.772228,-0.030893
8,-0.310732,-0.848662


In [338]:
# Select 2nd, 4th, and 6th rows and 2nd and 4th columns of DataFrame
df1.iloc[[1, 3, 5], [1, 3]]

Unnamed: 0,2,6
2,-0.454421,0.353867
6,-0.934537,-0.030893
10,1.895417,0.582477


In [339]:
# Select 2nd through 3rd rows and all columns of DataFrame
df1.iloc[1:3, :]

Unnamed: 0,0,2,4,6
2,-1.610304,-0.454421,0.292959,0.353867
4,-0.926627,0.610492,-1.902717,0.390242


In [340]:
# Select all rows and 2nd through 3rd columns of DataFrame
df1.iloc[:, 1:3]

Unnamed: 0,2,4
0,-2.066739,0.885642
2,-0.454421,0.292959
4,0.610492,-1.902717
6,-0.934537,1.772228
8,1.491119,-0.310732
10,1.895417,0.204153


In [341]:
# Select element at 2nd row and 2nd column of DataFrame
df1.iloc[1, 1]

-0.4544213186439711

In [342]:
# Select element at 2nd row and 2nd column of DataFrame
df1.iat[1, 1]

-0.4544213186439711

In [343]:
# Select 2nd row of DataFrame
df1.iloc[1]

0   -1.610304
2   -0.454421
4    0.292959
6    0.353867
Name: 2, dtype: float64

In [344]:
# Attempt to select rows that are out of bounds from DataFrame
try:
    df1.iloc[[4, 5, 6]]
except IndexError as e:
    print("IndexError:", e)

IndexError: positional indexers are out-of-bounds


In [345]:
# Attempt to select column that is out of bounds from DataFrame
try:
    df1.iloc[:, 4]
except IndexError as e:
    print("IndexError:", e)

IndexError: single positional indexer is out-of-bounds


# Selection by callable

In [346]:
# Create DataFrame
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    index=list('abcdef'),
    columns=list('ABCD')
)

df1

Unnamed: 0,A,B,C,D
a,-0.662271,-1.739316,-0.082109,-0.294172
b,-0.436993,0.495054,-0.518555,1.206954
c,-1.360792,-0.564977,-0.005773,-1.434206
d,-1.388775,-1.273431,-1.032523,0.252298
e,-0.731756,1.16349,1.330073,-2.196376
f,-0.195956,-0.955559,1.403637,0.937393


In [347]:
# Select rows from DataFrame where value in column 'A' is greater than 0
# Select all columns from DataFrame
df1.loc[lambda df: df['A'] > 0, :]

Unnamed: 0,A,B,C,D


In [348]:
# Select all rows from DataFrame
# Select columns 'A' and 'B' from DataFrame
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,-0.662271,-1.739316
b,-0.436993,0.495054
c,-1.360792,-0.564977
d,-1.388775,-1.273431
e,-0.731756,1.16349
f,-0.195956,-0.955559


In [349]:
# Select all rows from DataFrame
# Select 1st and 2nd columns from DataFrame
df1.iloc[:, lambda df: [0, 1]]

Unnamed: 0,A,B
a,-0.662271,-1.739316
b,-0.436993,0.495054
c,-1.360792,-0.564977
d,-1.388775,-1.273431
e,-0.731756,1.16349
f,-0.195956,-0.955559


In [350]:
# Select 1st column from DataFrame
df1[lambda df: df.columns[0]]

a   -0.662271
b   -0.436993
c   -1.360792
d   -1.388775
e   -0.731756
f   -0.195956
Name: A, dtype: float64

In [351]:
# Select elements from column 'A' of DataFrame
# where element value is greater than 0
df1['A'].loc[lambda s: s > 0]

Series([], Name: A, dtype: float64)

# Combining positional and label-based indexing

In [352]:
# Build a small DataFrame with integer columns 'A' and 'B' and row labels 'a' through 'c'
dfd = pd.DataFrame(
    data=dict(A=[1, 2, 3], B=[4, 5, 6]),
    index=['a', 'b', 'c'],
)

dfd

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [353]:
# Select 1st and 3rd rows, column 'A' of DataFrame
dfd.loc[dfd.index[[0, 2]], 'A']

a    1
c    3
Name: A, dtype: int64

In [354]:
# Select 1st and 3rd rows, column 'A' of DataFrame
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]

a    1
c    3
Name: A, dtype: int64

In [355]:
# Select 1st and 3rd rows, columns 'A' and 'B' of DataFrame
dfd.iloc[[0, 2], dfd.columns.get_indexer(['A', 'B'])]

Unnamed: 0,A,B
a,1,4
c,3,6


## Reindexing

In [356]:
# Create Series
s = pd.Series([1, 2, 3])
s

0    1
1    2
2    3
dtype: int64

In [357]:
# Reindex Series
s.reindex([1, 2, 3])

1    2.0
2    3.0
3    NaN
dtype: float64

In [358]:
# Given a set of possible index labels,
# select elements from Series that are in the set
labels = [1, 2, 3]
s.loc[s.index.intersection(labels)]

1    2
2    3
dtype: int64

In [359]:
# Create a Series with duplicated index labels
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
s

a    0
a    1
b    2
c    3
dtype: int64

In [360]:
# Attempt to reindex Series with duplicated index labels
labels = ['c', 'd']
try:
    s.reindex(labels)
except ValueError as e:
    print("ValueError:", e)

ValueError: cannot reindex on an axis with duplicate labels


In [361]:
# Reindex Series with duplicated index labels
# by first selecting elements that are in the new set of index labels,
# and are not duplicated
s.loc[s.index.intersection(labels)].reindex(labels)

c    3.0
d    NaN
dtype: float64

In [362]:
# Attempt to reindex Series with duplicated index labels
# by first selecting elements that are in the new set of index labels,
# and are duplicated
labels = ['a', 'd']

try:
    s.loc[s.index.intersection(labels)].reindex(labels)
except ValueError as e:
    print("ValueError:", e)

ValueError: cannot reindex on an axis with duplicate labels


# Selecting random samples

In [363]:
# Create Series
s = pd.Series([0, 1, 2, 3, 4, 5])
s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [364]:
# Select random sample of 3 elements from Series
s.sample(n=3)

4    4
3    3
2    2
dtype: int64

In [365]:
# Select random sample of 50% of elements from Series
s.sample(frac=0.5)

3    3
4    4
0    0
dtype: int64

In [366]:
# Select random sample of 6 elements from Series without replacement
s.sample(n=6, replace=False)


4    4
5    5
0    0
3    3
2    2
1    1
dtype: int64

In [367]:
# Select random sample of 6 elements from Series with replacement
s.sample(n=6, replace=True)

4    4
1    1
0    0
3    3
0    0
5    5
dtype: int64

In [368]:
# Select random sample of 3 elements from Series,
# where each element is assigned a sampling weight
weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=weights)

5    5
3    3
2    2
dtype: int64

In [369]:
# Select random sample of 1 element from Series,
# where each element is assigned a sampling weight
# and the sample weights do not sum to 1 -- they are automatically normalized when sampling
weights = [0.5, 0, 0, 0, 0, 0]
s.sample(n=1, weights=weights)

0    0
dtype: int64

In [370]:
# Create DataFrame
df2 = pd.DataFrame({
    'col1': [9, 8, 7, 6],
    'weight_column': [0.1, 0.2, 0.3, 0.4]
})

df2

Unnamed: 0,col1,weight_column
0,9,0.1
1,8,0.2
2,7,0.3
3,6,0.4


In [371]:
# Select random sample of 3 rows from DataFrame,
# using values from another column as sampling weights
df2.sample(n=3, weights='weight_column')

Unnamed: 0,col1,weight_column
3,6,0.4
1,8,0.2
2,7,0.3


In [372]:
# Create DataFrame
df3 = pd.DataFrame({
    'col1': [1, 2, 3],
    'col2': [2, 3, 4]
})

df3

Unnamed: 0,col1,col2
0,1,2
1,2,3
2,3,4


In [373]:
# Select random sample of 1 column from DataFrame
df3.sample(axis=1)

Unnamed: 0,col2
0,2
1,3
2,4


In [374]:
# Create DataFrame
df4 = pd.DataFrame({
    'col1': [1, 2, 3],
    'col2': [2, 3, 4]
})

df4

Unnamed: 0,col1,col2
0,1,2
1,2,3
2,3,4


In [375]:
# Select random sample of 2 rows from DataFrame,
# using a random_state seed for reproducibility
df4.sample(n=2, random_state=1)

Unnamed: 0,col1,col2
0,1,2
2,3,4


## Setting with enlargement

In [376]:
# Create Series
se = pd.Series([1, 2, 3])
se

0    1
1    2
2    3
dtype: int64

In [377]:
# Select non-existent index label from Series in order to add it
se.loc[5] = 5.0
se

0    1.0
1    2.0
2    3.0
5    5.0
dtype: float64

In [378]:
# Create DataFrame
dfi = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['A', 'B'])
dfi

Unnamed: 0,A,B
0,0,1
1,2,3
2,4,5


In [379]:
# Select non-existent column label from DataFrame in order to add it
dfi.loc[:, 'C'] = dfi.loc[:, 'A']
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4


In [380]:
# Select nonexistent row label from DataFrame in order to add it
dfi.loc[3] = 5.0
dfi

Unnamed: 0,A,B,C
0,0.0,1.0,0.0
1,2.0,3.0,2.0
2,4.0,5.0,4.0
3,5.0,5.0,5.0


# Fast scalar value getting and setting

In [381]:
# Select 6th element of Series
s.iat[5]

5

In [382]:
# Select DataFrame element at 6th index label, column 'A'
df.at[dates[5], 'A']

-1.0764127281499365

In [383]:
# Select DataFrame element at 4th row, 1st column
df.iat[3, 0]

-1.6401899268645432

In [384]:
# Set value of DataFrame element at 6th index label, non-existent column 'E'
# Set value of DataFrame element at 4th row, 1st column
df.at[dates[5], 'E'] = 7
df.iat[3, 0] = 7
df

Unnamed: 0,A,B,C,D,E
2024-11-23,0.053506,-0.329728,-0.166678,0.651803,
2024-11-24,-0.786621,0.489935,-0.711328,-1.89258,
2024-11-25,0.396326,0.990433,-0.962693,0.909248,
2024-11-26,7.0,-0.270767,1.757624,-0.020537,
2024-11-27,-0.81674,0.037052,1.152154,-0.933777,
2024-11-28,-1.076413,0.3549,1.203339,0.199647,7.0
2024-11-29,-0.802166,0.158411,-0.139283,1.431162,
2024-11-30,1.837115,0.45301,0.447304,0.341906,


In [385]:
# Set value of DataFrame element at non-existent index label and non-existent column label,
# enlarging the DataFrame along both axes
df.at[dates[-1] + pd.Timedelta('1 day'), 0] = 7
df

Unnamed: 0,A,B,C,D,E,0
2024-11-23,0.053506,-0.329728,-0.166678,0.651803,,
2024-11-24,-0.786621,0.489935,-0.711328,-1.89258,,
2024-11-25,0.396326,0.990433,-0.962693,0.909248,,
2024-11-26,7.0,-0.270767,1.757624,-0.020537,,
2024-11-27,-0.81674,0.037052,1.152154,-0.933777,,
2024-11-28,-1.076413,0.3549,1.203339,0.199647,7.0,
2024-11-29,-0.802166,0.158411,-0.139283,1.431162,,
2024-11-30,1.837115,0.45301,0.447304,0.341906,,
2024-12-01,,,,,,7.0


# Boolean indexing

In [386]:
# Create Series
s = pd.Series(range(-3, 4))
s

0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64

In [387]:
# Select Series elements that are greater than 0
s[s > 0]

4    1
5    2
6    3
dtype: int64

In [388]:
# Select Series elements that are less than -1 or greater than 0.5
s[(s < -1) | (s > 0.5)]

0   -3
1   -2
4    1
5    2
6    3
dtype: int64

In [389]:
# Select Series elements that are not less than 0
s[~(s < 0)]

3    0
4    1
5    2
6    3
dtype: int64

In [390]:
# Select DataFrame rows where elements in column 'A' are greater than 0
df[df['A'] > 0]

Unnamed: 0,A,B,C,D,E,0
2024-11-23,0.053506,-0.329728,-0.166678,0.651803,,
2024-11-25,0.396326,0.990433,-0.962693,0.909248,,
2024-11-26,7.0,-0.270767,1.757624,-0.020537,,
2024-11-30,1.837115,0.45301,0.447304,0.341906,,


In [391]:
# Create DataFrame
df2 = pd.DataFrame({
    'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
    'c': np.random.randn(7)
})

df2

Unnamed: 0,a,b,c
0,one,x,0.607626
1,one,y,0.212661
2,two,y,-0.763687
3,three,x,-0.720774
4,two,y,1.243662
5,one,x,1.631732
6,six,x,0.560591


In [392]:
# Create mask to select DataFrame rows where the elements in column 'a' start with the letter 't'
mask = df2['a'].map(lambda x: x.startswith('t'))
mask

0    False
1    False
2     True
3     True
4     True
5    False
6    False
Name: a, dtype: bool

In [393]:
# Select DataFrame rows where the elements in column 'a' start with the letter 't'
df2[mask]

Unnamed: 0,a,b,c
2,two,y,-0.763687
3,three,x,-0.720774
4,two,y,1.243662


In [394]:
# Select DataFrame rows where the elements in column 'a' start with the letter 't'
df2[[x.startswith('t') for x in df2['a']]]

Unnamed: 0,a,b,c
2,two,y,-0.763687
3,three,x,-0.720774
4,two,y,1.243662


In [395]:
# Select DataFrame rows where the elements in column 'a' start with the letter 't',
# and the elements in column 'b' are equal to 'x'
df2[mask & (df2['b'] == 'x')]

Unnamed: 0,a,b,c
3,three,x,-0.720774


In [396]:

# Select DataFrame rows:
#   Elements in column 'a' start with the letter 't'
#   Elements in column 'b' are equal to 'x'
# Select DataFrame columns 'b' through 'c'
df2.loc[mask & (df2['b'] == 'x'), 'b':'c']

Unnamed: 0,b,c
3,x,-0.720774


In [397]:
# Create DataFrame
df = pd.DataFrame(
    [
        [1, 2],
        [3, 4],
        [5, 6]
    ],
    index=list('abc'),
    columns=['A', 'B']
)

df

Unnamed: 0,A,B
a,1,2
b,3,4
c,5,6


In [398]:
# Create Boolean Series of where elements in DataFrame column 'A' are greater than 2
s = (df['A'] > 2)
s

a    False
b     True
c     True
Name: A, dtype: bool

In [399]:
# Select DataFrame rows where elements in column 'A' are greater than 2
# Select DataFrame column 'B'
df.loc[s, 'B']

b    4
c    6
Name: B, dtype: int64

In [400]:
# Select DataFrame rows where elements in column 'A' are greater than 2
# Select DataFrame column 'B'
df.iloc[s.values, 1]

b    4
c    6
Name: B, dtype: int64

In [401]:
# Attempt to select DataFrame rows where elements in column 'A' are greater than 2
# Attempt to select DataFrame column 'B'
# iloc works only with Boolean arrays, not Boolean Series
try:
    df.iloc[s, 1]
except ValueError as e:
    print("ValueError:", e)

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types


# Indexing with `isin`

In [402]:
# Create Series
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [403]:
# Find Series elements that are in [2, 4, 6]
s.isin([2, 4, 6])

4    False
3    False
2     True
1    False
0     True
dtype: bool

In [404]:
# Select Series elements that are in [2, 4, 6]
s[s.isin([2, 4, 6])]

2    2
0    4
dtype: int64

In [405]:
# Select Series elements whose index labels are in [2, 4, 6]
s[s.index.isin([2, 4, 6])]

4    0
2    2
dtype: int64

In [406]:
# Build a Series whose MultiIndex is the Cartesian product of [0, 1] and ['a', 'b', 'c']
s_mi = pd.Series(
    data=np.arange(6),
    index=pd.MultiIndex.from_product([[0, 1], list('abc')]),
)

s_mi

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int64

In [407]:
# Select Series elements where index label is in [(1, 'a'), (2, 'b'), (0, 'c')]
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]

0  c    2
1  a    3
dtype: int64

In [408]:
# Select Series elements where lower level index label is in ['a', 'c', 'e']
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

0  a    0
   c    2
1  a    3
   c    5
dtype: int64

In [409]:
# Create DataFrame
df = pd.DataFrame({
    'vals': [1, 2, 3, 4],
    'ids': ['a', 'b', 'f', 'n'],
    'ids2': ['a', 'n', 'c', 'n']
})

df

Unnamed: 0,vals,ids,ids2
0,1,a,a
1,2,b,n
2,3,f,c
3,4,n,n


In [410]:
# Find where DataFrame elements are in ['a', 'b', 1, 3]
df.isin(['a', 'b', 1, 3])

Unnamed: 0,vals,ids,ids2
0,True,True,True
1,False,True,False
2,True,False,False
3,False,False,False


In [411]:
# Find where DataFrame elements are
#   In column 'ids', in ['a', 'b']
#   In column 'vals', in [1, 3]
df.isin({'ids': ['a', 'b'], 'vals': [1, 3]})

Unnamed: 0,vals,ids,ids2
0,True,True,False
1,False,True,False
2,True,False,False
3,False,False,False


In [412]:
# Select rows of DataFrame that meet all of the following criteria:
#  Elements in column 'ids' are in ['a', 'b']
#  Elements in column 'ids2' are in ['a', 'c']
#  Elements in column 'vals' are in [1, 3]

values = {
    'ids': ['a', 'b'],
    'ids2': ['a', 'c'],
    'vals': [1, 3]
}

# Reduce across the columns (axis=1) so a row is kept only if every column matches
# The axis is passed by keyword: it is self-documenting and does not rely on
# positional arguments to the reduction method
df[df.isin(values).all(axis=1)]

Unnamed: 0,vals,ids,ids2
0,1,a,a


# The `where()` method and masking

In [413]:
# Select elements of Series that are greater than 0
s[s > 0]

3    1
2    2
1    3
0    4
dtype: int64

In [414]:
# Select elements of Series that are greater than 0,
# returning a Series with the same shape as the original
s.where(s > 0)

4    NaN
3    1.0
2    2.0
1    3.0
0    4.0
dtype: float64

In [415]:
# Create time series DataFrame with random values
dates = pd.date_range('11/23/2024', periods=8)
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D']
)

df

Unnamed: 0,A,B,C,D
2024-11-23,1.59427,-1.461777,1.291376,0.502514
2024-11-24,-1.892424,0.992312,0.665586,-0.031587
2024-11-25,1.257704,0.507553,-0.015015,-0.885718
2024-11-26,0.838313,-0.805702,0.790627,-1.348403
2024-11-27,0.301127,-0.309424,-1.674621,-1.735592
2024-11-28,3.09364,0.280027,0.272086,-0.129587
2024-11-29,1.043595,-1.362946,0.79956,-1.491483
2024-11-30,-0.84265,0.107161,-0.058458,-0.586281


In [416]:
# Select elements of DataFrame that are less than 0,
# returning a DataFrame with the same shape as the original
df[df < 0]

Unnamed: 0,A,B,C,D
2024-11-23,,-1.461777,,
2024-11-24,-1.892424,,,-0.031587
2024-11-25,,,-0.015015,-0.885718
2024-11-26,,-0.805702,,-1.348403
2024-11-27,,-0.309424,-1.674621,-1.735592
2024-11-28,,,,-0.129587
2024-11-29,,-1.362946,,-1.491483
2024-11-30,-0.84265,,-0.058458,-0.586281


In [417]:
# Select elements of DataFrame that are less than 0
# Replace elements that are not less than 0 with their negatives
df.where(df < 0, -df)

Unnamed: 0,A,B,C,D
2024-11-23,-1.59427,-1.461777,-1.291376,-0.502514
2024-11-24,-1.892424,-0.992312,-0.665586,-0.031587
2024-11-25,-1.257704,-0.507553,-0.015015,-0.885718
2024-11-26,-0.838313,-0.805702,-0.790627,-1.348403
2024-11-27,-0.301127,-0.309424,-1.674621,-1.735592
2024-11-28,-3.09364,-0.280027,-0.272086,-0.129587
2024-11-29,-1.043595,-1.362946,-0.79956,-1.491483
2024-11-30,-0.84265,-0.107161,-0.058458,-0.586281


In [418]:
# Set elements of Series that are less than 0 to 0
s2 = s.copy()
s2[s2 < 0] = 0
s2

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [419]:
# Set elements of DataFrame that are less than 0 to 0
df2 = df.copy()
df2[df2 < 0] = 0
df2

Unnamed: 0,A,B,C,D
2024-11-23,1.59427,0.0,1.291376,0.502514
2024-11-24,0.0,0.992312,0.665586,0.0
2024-11-25,1.257704,0.507553,0.0,0.0
2024-11-26,0.838313,0.0,0.790627,0.0
2024-11-27,0.301127,0.0,0.0,0.0
2024-11-28,3.09364,0.280027,0.272086,0.0
2024-11-29,1.043595,0.0,0.79956,0.0
2024-11-30,0.0,0.107161,0.0,0.0


In [420]:
# Set elements in the 2nd through 4th rows of the DataFrame that are less than 0 to 3
df2 = df.copy()
df2[df2[1:4] < 0] = 3
df2

Unnamed: 0,A,B,C,D
2024-11-23,1.59427,-1.461777,1.291376,0.502514
2024-11-24,3.0,0.992312,0.665586,3.0
2024-11-25,1.257704,0.507553,3.0,3.0
2024-11-26,0.838313,3.0,0.790627,3.0
2024-11-27,0.301127,-0.309424,-1.674621,-1.735592
2024-11-28,3.09364,0.280027,0.272086,-0.129587
2024-11-29,1.043595,-1.362946,0.79956,-1.491483
2024-11-30,-0.84265,0.107161,-0.058458,-0.586281


In [421]:
# Keep elements of DataFrame that are greater than 0
# Replace all other elements with the element in the same row from column 'A'
df2 = df.copy()
df2.where(df2 > 0, df2['A'], axis='index')

Unnamed: 0,A,B,C,D
2024-11-23,1.59427,1.59427,1.291376,0.502514
2024-11-24,-1.892424,0.992312,0.665586,-1.892424
2024-11-25,1.257704,0.507553,1.257704,1.257704
2024-11-26,0.838313,0.838313,0.790627,0.838313
2024-11-27,0.301127,0.301127,0.301127,0.301127
2024-11-28,3.09364,0.280027,0.272086,3.09364
2024-11-29,1.043595,1.043595,0.79956,1.043595
2024-11-30,-0.84265,0.107161,-0.84265,-0.84265


In [422]:
# Keep elements of DataFrame that are greater than 0
# Replace all other elements with the element in the same row from column 'A'
# (column-by-column equivalent of DataFrame.where with axis='index')
df2 = df.copy()
# Operate on the copy that was just created, rather than on the original df
df2.apply(lambda x, y: x.where(x > 0, y), y=df2['A'])

Unnamed: 0,A,B,C,D
2024-11-23,1.59427,1.59427,1.291376,0.502514
2024-11-24,-1.892424,0.992312,0.665586,-1.892424
2024-11-25,1.257704,0.507553,1.257704,1.257704
2024-11-26,0.838313,0.838313,0.790627,0.838313
2024-11-27,0.301127,0.301127,0.301127,0.301127
2024-11-28,3.09364,0.280027,0.272086,3.09364
2024-11-29,1.043595,1.043595,0.79956,1.043595
2024-11-30,-0.84265,0.107161,-0.84265,-0.84265


In [423]:
# Create DataFrame
df3 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
})

df3

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [424]:
# Select elements of DataFrame that are greater than 4
# If element is not greater than 4, add 10 to it
df3.where(lambda x: x > 4, lambda x: x + 10)

Unnamed: 0,A,B,C
0,11,14,7
1,12,5,8
2,13,6,9


# Mask

In [425]:
# Mask out elements of Series that are greater than or equal to 0
s.mask(s >= 0)

4   NaN
3   NaN
2   NaN
1   NaN
0   NaN
dtype: float64

In [426]:
# Mask out elements of DataFrame that are greater than or equal to 0
df.mask(df >= 0)

Unnamed: 0,A,B,C,D
2024-11-23,,-1.461777,,
2024-11-24,-1.892424,,,-0.031587
2024-11-25,,,-0.015015,-0.885718
2024-11-26,,-0.805702,,-1.348403
2024-11-27,,-0.309424,-1.674621,-1.735592
2024-11-28,,,,-0.129587
2024-11-29,,-1.362946,,-1.491483
2024-11-30,-0.84265,,-0.058458,-0.586281


# Setting with enlargement conditionally using `numpy`

In [427]:
# Create DataFrame
df = pd.DataFrame({
    'col1': list('ABBC'),
    'col2': list('ZZXY')
})

df

Unnamed: 0,col1,col2
0,A,Z
1,B,Z
2,B,X
3,C,Y


In [428]:
# Add column 'color' to DataFrame
#  'color' is 'green' where 'col2' is 'Z'
#  'color' is 'red' where 'col2' is not 'Z'
df['color'] = np.where(df['col2'] == 'Z', 'green', 'red')
df

Unnamed: 0,col1,col2,color
0,A,Z,green
1,B,Z,green
2,B,X,red
3,C,Y,red


In [429]:
# Add column 'color' to DataFrame
#  'color' is 'yellow' where 'col2' is 'Z' and 'col1' is 'A'
#  'color' is 'blue' where 'col2' is 'Z' and 'col1' is 'B'
#  'color' is 'purple' where 'col2' is not 'Z' and 'col1' is 'B'
#  'color' is 'black' otherwise
# np.select uses the first matching condition for each row, so the third condition
# only needs to test 'col1': rows where 'col2' is 'Z' were already matched above
conditions = [
    (df['col2'] == 'Z') & (df['col1'] == 'A'),
    (df['col2'] == 'Z') & (df['col1'] == 'B'),
    (df['col1'] == 'B')
]

# One color per condition, in the same order as the conditions
colors = ['yellow', 'blue', 'purple']

df['color'] = np.select(conditions, colors, default='black')
df

Unnamed: 0,col1,col2,color
0,A,Z,yellow
1,B,Z,blue
2,B,X,purple
3,C,Y,black


# The `query()` method

In [430]:
# Create DataFrame
df = pd.DataFrame(np.random.rand(10, 3), columns=list('abc'))
df

Unnamed: 0,a,b,c
0,0.338541,0.561988,0.746268
1,0.193515,0.777228,0.070122
2,0.945325,0.292075,0.524396
3,0.67225,0.533822,0.672933
4,0.777947,0.048854,0.234949
5,0.165195,0.313384,0.600554
6,0.394224,0.096861,0.651177
7,0.530717,0.32582,0.960789
8,0.585595,0.586352,0.988608
9,0.471784,0.220477,0.724462


In [431]:
# Select rows of DataFrame where:
#  Element in column 'a' is less than element in column 'b'
#  Element in column 'b' is less than element in column 'c'
df[(df['a'] < df['b']) & (df['b'] < df['c'])]

Unnamed: 0,a,b,c
0,0.338541,0.561988,0.746268
5,0.165195,0.313384,0.600554
8,0.585595,0.586352,0.988608


In [432]:
# Select rows of DataFrame where:
#  Element in column 'a' is less than element in column 'b'
#  Element in column 'b' is less than element in column 'c'
df.query('(a < b) & (b < c)')

Unnamed: 0,a,b,c
0,0.338541,0.561988,0.746268
5,0.165195,0.313384,0.600554
8,0.585595,0.586352,0.988608


In [433]:
# Time first method
%timeit df[(df['a'] < df['b']) & (df['b'] < df['c'])]

207 μs ± 4.66 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [434]:
# Time second method
%timeit df.query('(a < b) & (b < c)')

1.09 ms ± 17.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [435]:
# Create DataFrame
df = pd.DataFrame(np.random.randint(5, size=(10, 2)), columns=list('bc'))
df.index.name = 'a'
df

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
1,3,2
2,0,4
3,2,4
4,2,0
5,4,2
6,0,1
7,3,3
8,4,1
9,3,3


In [436]:
# Select rows of DataFrame where:
#  Index label is less than element in column 'b'
#   (there is no column 'a', so 'a' resolves to the index named 'a';
#    a column named 'a' would take precedence over the index name)
#  Element in column 'b' is less than element in column 'c'
df.query('a < b and b < c')

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1


In [437]:
# Select rows of DataFrame where:
#  Index label is less than element in column 'b'
#   ('index' is the special identifier query() provides for the DataFrame index)
#  Element in column 'b' is less than element in column 'c'
df.query('index < b < c')

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1


## `MultiIndex` `query()` syntax

In [438]:
# Create MultiIndex DataFrame
colors = np.random.choice(['red', 'green'], size=10)
foods = np.random.choice(['eggs', 'ham'], size=10)

index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
df = pd.DataFrame(np.random.randn(10, 2), index=index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
color,food,Unnamed: 2_level_1,Unnamed: 3_level_1
red,ham,-0.215649,0.32871
green,ham,-1.255477,0.87489
green,eggs,0.392549,0.584647
red,ham,-0.429773,-2.009907
red,ham,-0.046501,0.619881
red,ham,0.005684,0.345676
green,eggs,-0.028411,1.429687
green,eggs,1.289009,1.013079
green,eggs,0.790932,0.11456
red,ham,-0.824333,1.097696


In [439]:
# Select rows in DataFrame where color index label is 'red'
df.query('color == "red"')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
color,food,Unnamed: 2_level_1,Unnamed: 3_level_1
red,ham,-0.215649,0.32871
red,ham,-0.429773,-2.009907
red,ham,-0.046501,0.619881
red,ham,0.005684,0.345676
red,ham,-0.824333,1.097696


In [440]:
# Unname MultiIndex
df.index.names = [None, None]
df

Unnamed: 0,Unnamed: 1,0,1
red,ham,-0.215649,0.32871
green,ham,-1.255477,0.87489
green,eggs,0.392549,0.584647
red,ham,-0.429773,-2.009907
red,ham,-0.046501,0.619881
red,ham,0.005684,0.345676
green,eggs,-0.028411,1.429687
green,eggs,1.289009,1.013079
green,eggs,0.790932,0.11456
red,ham,-0.824333,1.097696


In [441]:
# Select rows in DataFrame where first level index label is 'red'
df.query('ilevel_0 == "red"')

Unnamed: 0,Unnamed: 1,0,1
red,ham,-0.215649,0.32871
red,ham,-0.429773,-2.009907
red,ham,-0.046501,0.619881
red,ham,0.005684,0.345676
red,ham,-0.824333,1.097696


## `query()` use cases

In [442]:
# Create DataFrame
df = pd.DataFrame(np.random.rand(10, 3), columns=list('abc'))
df

Unnamed: 0,a,b,c
0,0.01839,0.779809,0.3216
1,0.081392,0.004852,0.804148
2,0.441538,0.899199,0.300364
3,0.161822,0.810387,0.141125
4,0.516705,0.858766,0.392036
5,0.682823,0.98369,0.587742
6,0.394142,0.158287,0.293924
7,0.619541,0.062707,0.048213
8,0.273109,0.876349,0.335721
9,0.364634,0.911841,0.026718


In [443]:
# Create DataFrame
df2 = pd.DataFrame(np.random.rand(12, 3), columns=list('abc'))
df2

Unnamed: 0,a,b,c
0,0.690252,0.351333,0.209075
1,0.162583,0.342591,0.319274
2,0.319167,0.057344,0.274487
3,0.836412,0.742702,0.770984
4,0.861461,0.695888,0.247938
5,0.694172,0.212969,0.292064
6,0.248094,0.025402,0.515271
7,0.26724,0.292328,0.031347
8,0.481838,0.523315,0.554651
9,0.235157,0.498505,0.270159


In [444]:
# Run one query expression against several DataFrames that share the
# column labels it refers to ('a' and 'b')
expr = 'a > b'
for frame in (df, df2):
    result = frame.query(expr)
    print(result)

          a         b         c
1  0.081392  0.004852  0.804148
6  0.394142  0.158287  0.293924
7  0.619541  0.062707  0.048213
           a         b         c
0   0.690252  0.351333  0.209075
2   0.319167  0.057344  0.274487
3   0.836412  0.742702  0.770984
4   0.861461  0.695888  0.247938
5   0.694172  0.212969  0.292064
6   0.248094  0.025402  0.515271
11  0.678008  0.567196  0.165122


## `query()` Python vs pandas syntax comparison

In [445]:
# Create DataFrame
df = pd.DataFrame(np.random.randint(10, size=(10, 3)), columns=list('abc'))
df

Unnamed: 0,a,b,c
0,5,9,3
1,9,8,9
2,3,3,9
3,5,4,6
4,6,4,8
5,1,0,3
6,8,6,1
7,7,6,5
8,2,6,1
9,8,5,7


In [446]:
# Perform query on DataFrame using Python syntax
# Query: Select rows where element in column 'a' is less than element in column 'b',
#        and element in column 'b' is less than element in column 'c'
df[(df['a'] < df['b']) & (df['b'] < df['c'])]

Unnamed: 0,a,b,c


In [447]:
# Perform query on DataFrame using query method
# Query: Select rows where element in column 'a' is less than element in column 'b',
#        and element in column 'b' is less than element in column 'c'
df.query('(a < b) & (b < c)')

Unnamed: 0,a,b,c


In [448]:
# Perform query on DataFrame using query method
# Query: Select rows where element in column 'a' is less than element in column 'b',
#        and element in column 'b' is less than element in column 'c'
# Inside query(), comparison operators bind tighter than '&', so no parentheses are needed
df.query('a < b & b < c')

Unnamed: 0,a,b,c


In [449]:
# Perform query on DataFrame using query method
# Query: Select rows where element in column 'a' is less than element in column 'b',
#        and element in column 'b' is less than element in column 'c'
# Inside query(), the keyword 'and' can be used in place of '&'
df.query('a < b and b < c')

Unnamed: 0,a,b,c


In [450]:
# Perform query on DataFrame using query method
# Query: Select rows where element in column 'a' is less than element in column 'b',
#        and element in column 'b' is less than element in column 'c'
# Inside query(), the two comparisons can be written as one chained comparison
df.query('a < b < c')

Unnamed: 0,a,b,c


## The `in` and `not in` operators

In [451]:
# Create DataFrame
df = pd.DataFrame({
    'a': list('aabbccddeeff'),
    'b': list('aaaabbbbcccc'),
    'c': np.random.randint(5, size=12),
    'd': np.random.randint(9, size=12)
})

df

Unnamed: 0,a,b,c,d
0,a,a,1,1
1,a,a,3,1
2,b,a,4,6
3,b,a,1,3
4,c,b,0,1
5,c,b,1,5
6,d,b,1,1
7,d,b,0,3
8,e,c,2,3
9,e,c,4,8


In [452]:
# Select rows where the element in column 'a' takes a value present in column 'b'
df[df['a'].isin(df['b'])]

Unnamed: 0,a,b,c,d
0,a,a,1,1
1,a,a,3,1
2,b,a,4,6
3,b,a,1,3
4,c,b,0,1
5,c,b,1,5


In [453]:
# Select rows where the element in column 'a' takes a value present in column 'b'
df.query('a in b')

Unnamed: 0,a,b,c,d
0,a,a,1,1
1,a,a,3,1
2,b,a,4,6
3,b,a,1,3
4,c,b,0,1
5,c,b,1,5


In [454]:
# Select rows where the element in column 'a' takes a value not present in column 'b'
df[~df['a'].isin(df['b'])]

Unnamed: 0,a,b,c,d
6,d,b,1,1
7,d,b,0,3
8,e,c,2,3
9,e,c,4,8
10,f,c,0,0
11,f,c,4,3


In [455]:
# Select rows where the element in column 'a' takes a value not present in column 'b'
df.query('a not in b')

Unnamed: 0,a,b,c,d
6,d,b,1,1
7,d,b,0,3
8,e,c,2,3
9,e,c,4,8
10,f,c,0,0
11,f,c,4,3


In [456]:
# Select rows where:
#  Element in column 'a' takes value that is present in column 'b'
#  Element in column 'c' is less than element in column 'd'
df[df['a'].isin(df['b']) & (df['c'] < df['d'])]

Unnamed: 0,a,b,c,d
2,b,a,4,6
3,b,a,1,3
4,c,b,0,1
5,c,b,1,5


In [457]:
# Select rows where:
#  Element in column 'a' takes value that is present in column 'b'
#  Element in column 'c' is less than element in column 'd'
df.query('a in b and c < d')

Unnamed: 0,a,b,c,d
2,b,a,4,6
3,b,a,1,3
4,c,b,0,1
5,c,b,1,5


## Special use of the `==` operator with `list` objects

In [458]:
# Return rows where the element in column 'b' is one of ['a', 'b']
df[df['b'].isin(['a', 'b'])]

Unnamed: 0,a,b,c,d
0,a,a,1,1
1,a,a,3,1
2,b,a,4,6
3,b,a,1,3
4,c,b,0,1
5,c,b,1,5
6,d,b,1,1
7,d,b,0,3


In [459]:
# Return rows where the element in column 'b' is one of ['a', 'b']
df.query('b in ["a", "b"]')

Unnamed: 0,a,b,c,d
0,a,a,1,1
1,a,a,3,1
2,b,a,4,6
3,b,a,1,3
4,c,b,0,1
5,c,b,1,5
6,d,b,1,1
7,d,b,0,3


In [460]:
# Return rows where the element in column 'b' is one of ['a', 'b']
df.query('b == ["a", "b"]')

Unnamed: 0,a,b,c,d
0,a,a,1,1
1,a,a,3,1
2,b,a,4,6
3,b,a,1,3
4,c,b,0,1
5,c,b,1,5
6,d,b,1,1
7,d,b,0,3


## Boolean operators

In [461]:
# Create DataFrame
df = pd.DataFrame(np.random.rand(10, 3), columns=list('abc'))
df

Unnamed: 0,a,b,c
0,0.744798,0.092875,0.641281
1,0.697577,0.887885,0.543088
2,0.274256,0.978938,0.021216
3,0.519259,0.181899,0.266028
4,0.594308,0.40138,0.559753
5,0.696207,0.533394,0.429081
6,0.624338,0.790852,0.566208
7,0.169219,0.720016,0.001964
8,0.492682,0.727642,0.319239
9,0.098193,0.934664,0.303873


In [462]:
# Add column of random Boolean values to DataFrame
df['bools'] = np.random.choice([True, False], 10)
df

Unnamed: 0,a,b,c,bools
0,0.744798,0.092875,0.641281,False
1,0.697577,0.887885,0.543088,True
2,0.274256,0.978938,0.021216,False
3,0.519259,0.181899,0.266028,False
4,0.594308,0.40138,0.559753,False
5,0.696207,0.533394,0.429081,True
6,0.624338,0.790852,0.566208,True
7,0.169219,0.720016,0.001964,False
8,0.492682,0.727642,0.319239,True
9,0.098193,0.934664,0.303873,True


In [463]:
# Select rows where the element in 'bools' is False
df[~df['bools']]

Unnamed: 0,a,b,c,bools
0,0.744798,0.092875,0.641281,False
2,0.274256,0.978938,0.021216,False
3,0.519259,0.181899,0.266028,False
4,0.594308,0.40138,0.559753,False
7,0.169219,0.720016,0.001964,False


In [464]:
# Select rows where the element in 'bools' is False
df.query('~bools')

Unnamed: 0,a,b,c,bools
0,0.744798,0.092875,0.641281,False
2,0.274256,0.978938,0.021216,False
3,0.519259,0.181899,0.266028,False
4,0.594308,0.40138,0.559753,False
7,0.169219,0.720016,0.001964,False


In [465]:
# Select rows where the element in 'bools' is False
df.query('not bools')

Unnamed: 0,a,b,c,bools
0,0.744798,0.092875,0.641281,False
2,0.274256,0.978938,0.021216,False
3,0.519259,0.181899,0.266028,False
4,0.594308,0.40138,0.559753,False
7,0.169219,0.720016,0.001964,False


## Performance of `query()`

- For DataFrames with fewer than ~100,000 rows, it is faster to write the selection as a plain Python boolean-indexing expression
- For DataFrames with ~100,000 rows or more, it is faster to use `query()` with the `numexpr` engine

# Duplicate data

In [466]:
# Create DataFrame with duplicate rows
df2 = pd.DataFrame({
    'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
    'c': np.random.randn(7)
})

df2

Unnamed: 0,a,b,c
0,one,x,-0.249987
1,one,y,0.256973
2,two,x,1.353291
3,two,y,0.063633
4,two,x,-0.87014
5,three,x,0.828169
6,four,x,-0.168135


In [None]:
# Find duplicate rows in column 'a' of DataFrame
# By default, the first occurrence of each set of duplicate rows is not considered a duplicate
df2.duplicated('a')

0    False
1     True
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [469]:
# Find duplicate rows in column 'a' of DataFrame
# Mark the last occurrence of each set of duplicate rows as not a duplicate
df2.duplicated('a', keep='last')

0     True
1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool

In [470]:
# Find duplicate rows in column 'a' of DataFrame
# Mark all occurrences of each set of duplicate rows as a duplicate
df2.duplicated('a', keep=False)

0     True
1     True
2     True
3     True
4     True
5    False
6    False
dtype: bool

In [472]:
# Remove rows where value of column 'a' is a duplicate
# Keep first occurrence of each set of duplicate rows
df2.drop_duplicates('a')

Unnamed: 0,a,b,c
0,one,x,-0.249987
2,two,x,1.353291
5,three,x,0.828169
6,four,x,-0.168135


In [474]:
# Remove rows where value of column 'a' is a duplicate
# Keep last occurrence of each set of duplicate rows
df2.drop_duplicates('a', keep='last')

Unnamed: 0,a,b,c
1,one,y,0.256973
4,two,x,-0.87014
5,three,x,0.828169
6,four,x,-0.168135


In [475]:
# Remove rows where value of column 'a' is a duplicate
# Remove all occurrences of each set of duplicate rows
df2.drop_duplicates('a', keep=False)

Unnamed: 0,a,b,c
5,three,x,0.828169
6,four,x,-0.168135


In [476]:
# Find rows where the elements in columns 'a' and 'b' are duplicates
df2.duplicated(['a', 'b'])

0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [477]:
# Remove rows where the elements in columns 'a' and 'b' are duplicates
# Keep first occurrence of each set of duplicate rows
df2.drop_duplicates(['a', 'b'])

Unnamed: 0,a,b,c
0,one,x,-0.249987
1,one,y,0.256973
2,two,x,1.353291
3,two,y,0.063633
5,three,x,0.828169
6,four,x,-0.168135


In [479]:
# Create DataFrame with duplicated index values
df3 = pd.DataFrame(
    {
        'a': np.arange(6),
        'b': np.random.randn(6),
    },
    index=['a', 'a', 'b', 'c', 'b', 'a']
)

df3

Unnamed: 0,a,b
a,0,-1.974084
a,1,-0.567026
b,2,-1.347897
c,3,2.030258
b,4,1.048312
a,5,0.858396


In [480]:
# Find duplicate index values in DataFrame
# The first occurrence of each value is not considered a duplicate
df3.index.duplicated()

array([False,  True, False, False,  True,  True])

In [482]:
# Select rows where index is not duplicated
# Treat the first occurrence of each value as not a duplicate
df3[~df3.index.duplicated()]

Unnamed: 0,a,b
a,0,-1.974084
b,2,-1.347897
c,3,2.030258


In [484]:
# Select rows where index is not duplicated
# Treat the last occurrence of each value as not a duplicate
df3[~df3.index.duplicated(keep='last')]

Unnamed: 0,a,b
c,3,2.030258
b,4,1.048312
a,5,0.858396


In [485]:
# Select rows where index is not duplicated
# Treat all occurrences of each value as a duplicate
df3[~df3.index.duplicated(keep=False)]

Unnamed: 0,a,b
c,3,2.030258


# Dictionary-like `get()` method

In [486]:
# Create Series
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])

In [487]:
# Get element at index 'a'
s.get('a')

1

In [488]:
# Get element at index 'x'
# If index 'x' does not exist, return -1
s.get('x', default=-1)

-1

# Looking up values by index/column labels

In [489]:
# Create DataFrame
df = pd.DataFrame({
    'col': ['A', 'A', 'B', 'B'],
    'A': [80, 23, np.nan, 22],
    'B': [80, 55, 76, 67]
})

df

Unnamed: 0,col,A,B
0,A,80.0,80
1,A,23.0,55
2,B,,76
3,B,22.0,67


In [492]:
# Encode values of column 'col'
idx, col = pd.factorize(df['col'])
idx, col

(array([0, 0, 1, 1]), Index(['A', 'B'], dtype='object'))

In [496]:
# Where 'col' is 'A', select element from column 'A'
# Where 'col' is 'B', select element from column 'B'
# reindex orders the columns to match the factorized labels in `col`, so the
# integer codes in `idx` are valid column positions; pairing them with the row
# positions np.arange(len(df)) picks exactly one element per row
df.reindex(col, axis=1).to_numpy()[np.arange(len(df)), idx]

array([80., 23., 76., 67.])

# `Index` objects

In [502]:
# Create Index
index = pd.Index(['e', 'd', 'a', 'b'])
index

Index(['e', 'd', 'a', 'b'], dtype='object')

In [503]:
# Find whether label 'd' is in index
'd' in index

True

In [504]:
# Create Index
index = pd.Index([1, 5, 12])
index

Index([1, 5, 12], dtype='int64')

In [505]:
# Find whether label 5 is in index
5 in index

True

In [506]:
# Create string Index
index = pd.Index(['e', 'd', 'a', 'b'], dtype='string')
index

Index(['e', 'd', 'a', 'b'], dtype='string')

In [507]:
# Create 8-bit integer Index
index = pd.Index([1, 5, 12], dtype='int8')
index

Index([1, 5, 12], dtype='int8')

In [508]:
# Create 32-bit float Index
index = pd.Index([1, 5, 12], dtype='float32')
index

Index([1.0, 5.0, 12.0], dtype='float32')

In [509]:
# Create Index with name
index = pd.Index(['e', 'd', 'a', 'b'], name='index')
index

Index(['e', 'd', 'a', 'b'], dtype='object', name='index')

In [510]:
# Create DataFrame with named row and column Index
index = pd.Index(list(range(5)), name='rows')
columns = pd.Index(['A', 'B', 'C'], name='cols')

df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns)
df

cols,A,B,C
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.523614,1.066223,-0.663542
1,0.606209,-2.284479,0.129079
2,1.710622,1.668589,2.007636
3,0.845767,-1.241559,0.155215
4,0.208757,-0.080638,0.483421


## Setting metadata

In [511]:
# Create Index without name
ind = pd.Index([1, 2, 3])
ind

Index([1, 2, 3], dtype='int64')

In [512]:
# Rename index
ind = ind.rename('apple')
ind

Index([1, 2, 3], dtype='int64', name='apple')

In [514]:
# Rename index
ind = ind.set_names(['banana'])
ind

Index([1, 2, 3], dtype='int64', name='banana')

In [516]:
# Rename index
ind.name = 'coconut'
ind

Index([1, 2, 3], dtype='int64', name='coconut')

In [517]:
# Create named MultiIndex
index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
index

MultiIndex([(0, 'one'),
            (0, 'two'),
            (1, 'one'),
            (1, 'two'),
            (2, 'one'),
            (2, 'two')],
           names=['first', 'second'])

In [519]:
# Get the labels of the second level of the MultiIndex (level position 1, named 'second')
index.levels[1]

Index(['one', 'two'], dtype='object', name='second')

In [520]:
# Replace the labels of the second level (level position 1) with ['a', 'b']
# set_levels returns a new MultiIndex; the result is only displayed here, not assigned back to `index`
index.set_levels(['a', 'b'], level=1)

MultiIndex([(0, 'a'),
            (0, 'b'),
            (1, 'a'),
            (1, 'b'),
            (2, 'a'),
            (2, 'b')],
           names=['first', 'second'])

## Set operations on `Index` objects

In [526]:
# Take difference of Index objects
a = pd.Index(['c', 'b', 'a'])
b = pd.Index(['c', 'e', 'd'])
a.difference(b)

Index(['a', 'b'], dtype='object')

In [527]:
# Take symmetric difference of Index objects
idx1 = pd.Index([1, 2, 3, 4])
idx2 = pd.Index([2, 3, 4, 5])
idx1.symmetric_difference(idx2)

Index([1, 5], dtype='int64')

In [528]:
# Take union of Index objects
idx1 = pd.Index([0, 1, 2])
idx2 = pd.Index([0.5, 1.5])
idx1.union(idx2)

Index([0.0, 0.5, 1.0, 1.5, 2.0], dtype='float64')

## Missing values

In [529]:
# Create Index with missing values
idx1 = pd.Index([1, np.nan, 3, 4])
idx1

Index([1.0, nan, 3.0, 4.0], dtype='float64')

In [530]:
# Fill missing Index values with 2
idx1.fillna(2)

Index([1.0, 2.0, 3.0, 4.0], dtype='float64')

In [532]:
# Create Index with missing values
idx2 = pd.DatetimeIndex([
    pd.Timestamp('2024-11-23'),
    pd.NaT,
    pd.Timestamp('2024-11-25'),
])

idx2

DatetimeIndex(['2024-11-23', 'NaT', '2024-11-25'], dtype='datetime64[ns]', freq=None)

In [533]:
# Fill missing Index values with Timestamp('2024-11-24')
idx2.fillna(pd.Timestamp('2024-11-24'))

DatetimeIndex(['2024-11-23', '2024-11-24', '2024-11-25'], dtype='datetime64[ns]', freq=None)

# Set/reset index

## Set an index

In [534]:
# Create DataFrame with default Index
data = pd.DataFrame({
    'a': ['bar', 'bar', 'foo', 'foo'],
    'b': ['one', 'two', 'one', 'two'],
    'c': ['z', 'y', 'x', 'w'],
    'd': [1., 2., 3., 4.]
})

data

Unnamed: 0,a,b,c,d
0,bar,one,z,1.0
1,bar,two,y,2.0
2,foo,one,x,3.0
3,foo,two,w,4.0


In [535]:
# Set column 'c' as index
data.set_index('c')

Unnamed: 0_level_0,a,b,d
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
z,bar,one,1.0
y,bar,two,2.0
x,foo,one,3.0
w,foo,two,4.0


In [537]:
# Set columns 'a' and 'b' as multi-level index
data.set_index(['a', 'b'])

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,z,1.0
bar,two,y,2.0
foo,one,x,3.0
foo,two,w,4.0


In [540]:
# Set column 'c' as index, keeping column 'c' in DataFrame
# Append columns 'a' and 'b' as multi-level index, dropping columns 'a', 'b' from DataFrame
data.set_index('c', drop=False).set_index(['a', 'b'], append=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,c,d
c,a,b,Unnamed: 3_level_1,Unnamed: 4_level_1
z,bar,one,z,1.0
y,bar,two,y,2.0
x,foo,one,x,3.0
w,foo,two,w,4.0


## Reset the index

In [542]:
# Add DataFrame column using current Index name and values
# Create new integer Index
data.reset_index()

Unnamed: 0,index,a,b,c,d
0,0,bar,one,z,1.0
1,1,bar,two,y,2.0
2,2,foo,one,x,3.0
3,3,foo,two,w,4.0


In [546]:
# Create MultiIndex DataFrame
dmi = data.set_index('c', drop=False).set_index(['a', 'b'], append=True)
dmi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,c,d
c,a,b,Unnamed: 3_level_1,Unnamed: 4_level_1
z,bar,one,z,1.0
y,bar,two,y,2.0
x,foo,one,x,3.0
w,foo,two,w,4.0


In [548]:
# Reset middle level of MultiIndex
dmi.reset_index(level=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,c,d
c,b,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
z,one,bar,z,1.0
y,two,bar,y,2.0
x,one,foo,x,3.0
w,two,foo,w,4.0


## Adding an ad hoc index

In [549]:
# Create DataFrame with default Index
df_idx = pd.DataFrame(range(4))
df_idx

Unnamed: 0,0
0,0
1,1
2,2
3,3


In [550]:
# Set DataFrame Index to new Index
df_idx.index = pd.Index([10, 20, 30, 40], name='a')
df_idx

Unnamed: 0_level_0,0
a,Unnamed: 1_level_1
10,0
20,1
30,2
40,3


# Returning a view vs a copy

In [552]:
# Create MultiIndex column DataFrame
dfmi = pd.DataFrame(
    [
        list('abcd'),
        list('efgh'),
        list('ijkl'),
        list('mnop')
    ],
    columns=pd.MultiIndex.from_product([
        ['one', 'two'],
        ['first', 'second']
    ])
)

dfmi

Unnamed: 0_level_0,one,one,two,two
Unnamed: 0_level_1,first,second,first,second
0,a,b,c,d
1,e,f,g,h
2,i,j,k,l
3,m,n,o,p


In [553]:
# Select column ('one', 'second') using chained indexing
# pandas treats this as two separate operations: dfmi['one'] runs first,
# then ['second'] is applied to whatever that returned
# Whether the first operation returns a view or a copy is not guaranteed,
# so assigning to the result of chained indexing may or may not modify dfmi
# (with Copy-on-Write enabled it never does)
dfmi['one']['second']

0    b
1    f
2    j
3    n
Name: second, dtype: object

In [554]:
# Select column ('one', 'second') using loc
# pandas treats this as a single operation: one indexing call with the key (slice, tuple)
# Reading through .loc may still return a view or a copy; what is guaranteed is that
# an assignment of the form dfmi.loc[:, ('one', 'second')] = value operates on dfmi itself
dfmi.loc[:, ('one', 'second')]

0    b
1    f
2    j
3    n
Name: (one, second), dtype: object