# Selecting & Filtering

In [61]:
# Show the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to display several frames/Series from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [62]:
import numpy as np
import pandas as pd

In [63]:
# Seeded generator: the random demo frame is now identical on every
# Restart & Run All instead of changing with each execution.
rng = np.random.default_rng(42)

# df: six consecutive daily timestamps starting 2021-06-29 as the index,
# four columns (A-D) of standard-normal samples.
dates = pd.date_range('20210629', periods=6)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))

# df2: small mixed-dtype frame; the scalar entries are broadcast to the
# four rows defined by the Series/array/Categorical entries.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df

Unnamed: 0,A,B,C,D
2021-06-29,-0.838911,1.24496,0.069035,0.085307
2021-06-30,-1.009835,-0.450579,0.34634,-0.557755
2021-07-01,-1.245852,0.73998,1.47417,1.936949
2021-07-02,-0.557376,-0.65449,-0.839403,0.461118
2021-07-03,0.175406,-0.923196,-0.379162,0.852363
2021-07-04,-0.67735,-0.204282,-0.98674,-0.738049


### Selecting

**Selecting Column by Name**

In [64]:
# Bracket indexing with a column label returns that single column
# (the result below carries Name: A).
df["A"]

2021-06-29   -0.838911
2021-06-30   -1.009835
2021-07-01   -1.245852
2021-07-02   -0.557376
2021-07-03    0.175406
2021-07-04   -0.677350
Freq: D, Name: A, dtype: float64

**Selecting Column by Position (integer index)**

In [65]:
# iloc is position-based: all rows (:), column at position 0 - column A in this frame.
df.iloc[:,0]

2021-06-29   -0.838911
2021-06-30   -1.009835
2021-07-01   -1.245852
2021-07-02   -0.557376
2021-07-03    0.175406
2021-07-04   -0.677350
Freq: D, Name: A, dtype: float64

**Selecting Row by Label**

In [67]:
# loc is label-based: the date string is matched against the datetime index,
# returning the 2021-06-29 row with its A-D values.
df.loc["20210629"]

A   -0.838911
B    1.244960
C    0.069035
D    0.085307
Name: 2021-06-29 00:00:00, dtype: float64

**Selecting Row by Position (integer index)**


In [68]:
# Position 0 is the first row - the same 2021-06-29 row selected by label above.
df.iloc[0]

A   -0.838911
B    1.244960
C    0.069035
D    0.085307
Name: 2021-06-29 00:00:00, dtype: float64

### Slicing

**Slice Rows**

In [69]:
# Slicing with date strings selects rows by label; both endpoints are
# included (the result runs from 2021-07-01 through 2021-07-03).
df["20210701" : "20210703"]

Unnamed: 0,A,B,C,D
2021-07-01,-1.245852,0.73998,1.47417,1.936949
2021-07-02,-0.557376,-0.65449,-0.839403,0.461118
2021-07-03,0.175406,-0.923196,-0.379162,0.852363


In [70]:
# Positional slice: rows at positions 2, 3 and 4 (the end position 5 is excluded),
# which are the same three rows as the label slice above.
df.iloc[2:5]

Unnamed: 0,A,B,C,D
2021-07-01,-1.245852,0.73998,1.47417,1.936949
2021-07-02,-0.557376,-0.65449,-0.839403,0.461118
2021-07-03,0.175406,-0.923196,-0.379162,0.852363


**Slice Columns**

In [71]:
# Label slice on the column axis: all rows, columns A through C with C included.
df.loc[:,"A":"C"]

Unnamed: 0,A,B,C
2021-06-29,-0.838911,1.24496,0.069035
2021-06-30,-1.009835,-0.450579,0.34634
2021-07-01,-1.245852,0.73998,1.47417
2021-07-02,-0.557376,-0.65449,-0.839403
2021-07-03,0.175406,-0.923196,-0.379162
2021-07-04,-0.67735,-0.204282,-0.98674


In [72]:
# Positional slice on the column axis: positions 0, 1 and 2 (3 is excluded),
# i.e. the same A-C columns as the label slice above.
df.iloc[:,0:3]

Unnamed: 0,A,B,C
2021-06-29,-0.838911,1.24496,0.069035
2021-06-30,-1.009835,-0.450579,0.34634
2021-07-01,-1.245852,0.73998,1.47417
2021-07-02,-0.557376,-0.65449,-0.839403
2021-07-03,0.175406,-0.923196,-0.379162
2021-07-04,-0.67735,-0.204282,-0.98674


**Slice Rows & Columns**

In [73]:
# Label slices on both axes at once: rows 2021-06-30..2021-07-02 and
# columns B..D, with both ends of each slice included.
df.loc["2021-06-30":"2021-07-02", "B":"D"]

Unnamed: 0,B,C,D
2021-06-30,-0.450579,0.34634,-0.557755
2021-07-01,0.73998,1.47417,1.936949
2021-07-02,-0.65449,-0.839403,0.461118


You can also pass lists instead of slices to select rows/columns that are not adjacent, e.g. `df.loc[:, ["A", "D"]]` or `df.iloc[[0, 2], [1, 3]]`.

### Select by dtypes

In [74]:
# NOTE: this rebinds `df`; from here on `df` is the mixed-dtype demo frame,
# not the random date-indexed frame used in the sections above.
# One column per dtype family so that select_dtypes has something to pick from.
df = pd.DataFrame({
    'string': list('abc'),
    'int64': list(range(1, 4)),
    'uint8': np.arange(3, 6).astype('u1'),
    'float64': np.arange(4.0, 7.0),
    'bool1': [True, False, True],
    'bool2': [False, True, False],
    # Fixed start timestamp instead of 'now': the column is identical on
    # every run and matches the values shown in the recorded output.
    'dates': pd.date_range('2021-06-29 16:45:30.357184', periods=3),
    'category': pd.Series(list("ABC")).astype('category'),
})
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2021-06-29 16:45:30.357184,A
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C


In [75]:
# Keep only the columns whose dtype is bool (bool1 and bool2 here).
df.select_dtypes(["bool"])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


In [76]:
# Several dtypes can be requested at once. Note that the uint8 column is not
# part of the result below: only the int64 and float64 columns are returned.
df.select_dtypes(["int","float"])

Unnamed: 0,int64,float64
0,1,4.0
1,2,5.0
2,3,6.0


### Boolean indexing (i.e. filter arrays)

In [77]:
# Show the full frame again as the reference point for the filters below.
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2021-06-29 16:45:30.357184,A
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C


In [78]:
# The comparison yields one boolean per row; indexing with it keeps only the
# rows where it is True (rows 1 and 2, whose float64 values are 5.0 and 6.0).
df[df['float64'] >= 5]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C


In [79]:
# Build df2 as a new frame: df's columns plus a string column E appended at
# the end. assign() returns a new object, so df itself is left untouched.
df2 = df.assign(E=['one', 'two', 'three'])
df2

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2021-06-29 16:45:30.357184,A,one
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B,two
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C,three


In [80]:
# All three expressions in this cell are displayed, in order.
# 1) The unfiltered frame.
df2

# 2) The boolean mask: True where E is one of the listed values.
df2["E"].isin(["one", "three"])

# 3) The mask applied as a row filter (rows 0 and 2 remain).
df2[df2["E"].isin(["one", "three"])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2021-06-29 16:45:30.357184,A,one
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B,two
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C,three


0     True
1    False
2     True
Name: E, dtype: bool

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2021-06-29 16:45:30.357184,A,one
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C,three


### Setting Values

In [83]:
# The assignments in the following cells target df3, a copy of df.
df3 = df.copy()

In [87]:
# Display df for comparison with the modified df3 shown in the next cells.
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2021-06-29 16:45:30.357184,A
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C


In [88]:
# Set one cell by integer position: row 0, column position 1 (the int64 column).
# iat addresses a single scalar.
df3.iat[0, 1] = -1
df3

# The same cell addressed through iloc; the second display shows -2 in its place.
df3.iloc[0, 1] = -2
df3

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,-1,3,4.0,True,False,2021-06-29 16:45:30.357184,A
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C


Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,-2,3,4.0,True,False,2021-06-29 16:45:30.357184,A
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C


In [89]:
# Set one cell by label: row label 0, column 'float64'. at addresses a single scalar.
df3.at[0, 'float64'] = -10
df3
# The same cell addressed through loc; the second display shows -20.0 in its place.
df3.loc[0, 'float64'] = -20
df3

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,-2,3,-10.0,True,False,2021-06-29 16:45:30.357184,A
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C


Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,-2,3,-20.0,True,False,2021-06-29 16:45:30.357184,A
1,b,2,4,5.0,False,True,2021-06-30 16:45:30.357184,B
2,c,3,5,6.0,True,False,2021-07-01 16:45:30.357184,C


In [94]:
# Overwrite a whole column at once: every row of 'uint8' becomes 50.
# The array length is taken from df3 - the frame being assigned to - so the
# cell stays correct even if df3 and df ever differ in length.
df3.loc[:, 'uint8'] = np.array([50] * len(df3))
df3

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,-2,50,-20.0,True,False,2021-06-29 16:45:30.357184,A
1,b,2,50,5.0,False,True,2021-06-30 16:45:30.357184,B
2,c,3,50,6.0,True,False,2021-07-01 16:45:30.357184,C
