In [2]:
import numpy as np
import pandas as pd

# Indexing and selecting data

## Different choices for indexing
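- `.iloc` — selection by integer position
- `.loc` — selection by label
- `[]` — basic indexing (selects columns of a DataFrame by name; a slice selects rows)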

## Basics

In [3]:
# Eight consecutive daily timestamps starting 2000-01-01, used as the row index.
dates = pd.date_range(start='1/1/2000', periods=8)
# One row per date, four columns of standard-normal draws. The NumPy RNG is
# not seeded in this cell, so the numbers differ from run to run.
df = pd.DataFrame(
    data=np.random.randn(len(dates), 4),
    index=dates,
    columns=list('ABCD'),
)

In [4]:
# Show the frame: 8 daily rows (2000-01-01 .. 2000-01-08) by columns A-D.
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.092021,-0.093106,-1.202401,-0.256
2000-01-02,1.75512,0.038471,0.37468,-1.605079
2000-01-03,0.47544,-0.802453,0.007725,-0.164143
2000-01-04,0.405963,-0.662958,0.33898,-1.882461
2000-01-05,0.196291,2.910929,1.683268,-0.830867
2000-01-06,-0.87186,0.580512,0.331308,-0.928876
2000-01-07,-0.028979,-0.978789,0.347033,1.967454
2000-01-08,0.727421,-0.095678,-0.893186,0.276228


In [21]:
# [] with a column name returns that column as a Series.
s = df['A']
# [] on the Series with an index label (the sixth date) returns a scalar.
s[dates[5]]

-0.871860099850522

In [22]:
# Show df again as the reference point for the column assignments that follow.
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.092021,-0.093106,-1.202401,-0.256
2000-01-02,1.75512,0.038471,0.37468,-1.605079
2000-01-03,0.47544,-0.802453,0.007725,-0.164143
2000-01-04,0.405963,-0.662958,0.33898,-1.882461
2000-01-05,0.196291,2.910929,1.683268,-0.830867
2000-01-06,-0.87186,0.580512,0.331308,-0.928876
2000-01-07,-0.028979,-0.978789,0.347033,1.967454
2000-01-08,0.727421,-0.095678,-0.893186,0.276228


In [5]:
# Swap the contents of columns A and B: the right-hand side is read as
# (A, B) and written back under the labels (B, A).
# Note: this mutates df in place, so running the cell twice swaps them back.
df[['B', 'A']] = df[['A', 'B']]

In [6]:
# A and B now hold each other's previous values; C and D are untouched.
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.093106,-0.092021,-1.202401,-0.256
2000-01-02,0.038471,1.75512,0.37468,-1.605079
2000-01-03,-0.802453,0.47544,0.007725,-0.164143
2000-01-04,-0.662958,0.405963,0.33898,-1.882461
2000-01-05,2.910929,0.196291,1.683268,-0.830867
2000-01-06,0.580512,-0.87186,0.331308,-0.928876
2000-01-07,-0.978789,-0.028979,0.347033,1.967454
2000-01-08,-0.095678,0.727421,-0.893186,0.276228


In [7]:
# Same swap attempted through .loc. When the right-hand side is a DataFrame,
# pandas aligns it on column labels before assigning, so A receives A and
# B receives B: df is left unchanged.
df.loc[:, ['B', 'A']] = df[['A', 'B']]

In [8]:
# No change: the .loc assignment in the previous cell aligned on column
# labels, so A and B kept the values they already had.
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.093106,-0.092021,-1.202401,-0.256
2000-01-02,0.038471,1.75512,0.37468,-1.605079
2000-01-03,-0.802453,0.47544,0.007725,-0.164143
2000-01-04,-0.662958,0.405963,0.33898,-1.882461
2000-01-05,2.910929,0.196291,1.683268,-0.830867
2000-01-06,0.580512,-0.87186,0.331308,-0.928876
2000-01-07,-0.978789,-0.028979,0.347033,1.967454
2000-01-08,-0.095678,0.727421,-0.893186,0.276228


In [9]:
# Passing the raw values (.to_numpy()) drops the column labels, so there is
# nothing to align on: values are assigned by position and A/B are swapped.
df.loc[:, ['B', 'A']] = df[['A', 'B']].to_numpy()

In [10]:
# Changed: columns A and B have traded values compared with the previous
# display, because the right-hand side was a plain array without labels.
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.092021,-0.093106,-1.202401,-0.256
2000-01-02,1.75512,0.038471,0.37468,-1.605079
2000-01-03,0.47544,-0.802453,0.007725,-0.164143
2000-01-04,0.405963,-0.662958,0.33898,-1.882461
2000-01-05,0.196291,2.910929,1.683268,-0.830867
2000-01-06,-0.87186,0.580512,0.331308,-0.928876
2000-01-07,-0.028979,-0.978789,0.347033,1.967454
2000-01-08,0.727421,-0.095678,-0.893186,0.276228


## Attribute access

In [11]:
# Small Series whose index labels are valid Python identifiers.
sa = pd.Series(data=[1, 2, 3], index=['a', 'b', 'c'])
# Independent copy of df, so the attribute-assignment demos leave df alone.
dfa = df.copy()

In [12]:
# Attribute access: read the element labelled 'b' as if it were an attribute.
sa.b

2

In [14]:
# The same works for DataFrame columns: dfa.A is column 'A' as a Series.
dfa.A

2000-01-01   -0.092021
2000-01-02    1.755120
2000-01-03    0.475440
2000-01-04    0.405963
2000-01-05    0.196291
2000-01-06   -0.871860
2000-01-07   -0.028979
2000-01-08    0.727421
Freq: D, Name: A, dtype: float64

In [15]:
# Attribute assignment on an existing index label: sets the element 'a' to 5.
sa.a = 5
sa

a    5
b    2
c    3
dtype: int64

In [17]:
# Attribute assignment on an existing column: overwrite A with 0..len-1.
dfa.A = list(range(len(dfa.index)))
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,-0.093106,-1.202401,-0.256
2000-01-02,1,0.038471,0.37468,-1.605079
2000-01-03,2,-0.802453,0.007725,-0.164143
2000-01-04,3,-0.662958,0.33898,-1.882461
2000-01-05,4,2.910929,1.683268,-0.830867
2000-01-06,5,0.580512,0.331308,-0.928876
2000-01-07,6,-0.978789,0.347033,1.967454
2000-01-08,7,-0.095678,-0.893186,0.276228


In [19]:
# Item-style spelling of the same assignment; the result is identical here.
# NOTE(review): per the pandas docs this is the form to use when the column
# may not exist yet; attribute assignment would not create a new column.
dfa['A'] = list(range(len(dfa.index)))
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,-0.093106,-1.202401,-0.256
2000-01-02,1,0.038471,0.37468,-1.605079
2000-01-03,2,-0.802453,0.007725,-0.164143
2000-01-04,3,-0.662958,0.33898,-1.882461
2000-01-05,4,2.910929,1.683268,-0.830867
2000-01-06,5,0.580512,0.331308,-0.928876
2000-01-07,6,-0.978789,0.347033,1.967454
2000-01-08,7,-0.095678,-0.893186,0.276228


In [20]:
# Two integer columns on the default 0..2 index.
x = pd.DataFrame(data=dict(x=[1, 2, 3], y=[3, 4, 5]))
# A dict can be assigned to a whole row selected by position.
x.iloc[1] = {'x': 9, 'y': 99}
x

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


## Slicing ranges

In [24]:
# s is column 'A' of df, bound by the earlier `s = df['A']` cell.
s

2000-01-01   -0.092021
2000-01-02    1.755120
2000-01-03    0.475440
2000-01-04    0.405963
2000-01-05    0.196291
2000-01-06   -0.871860
2000-01-07   -0.028979
2000-01-08    0.727421
Freq: D, Name: A, dtype: float64

In [25]:
# Positional slice through []: the first five elements.
s[:5]

2000-01-01   -0.092021
2000-01-02    1.755120
2000-01-03    0.475440
2000-01-04    0.405963
2000-01-05    0.196291
Freq: D, Name: A, dtype: float64

In [28]:
# Every second element, starting with the first.
s[::2]

2000-01-01   -0.092021
2000-01-03    0.475440
2000-01-05    0.196291
2000-01-07   -0.028979
Freq: 2D, Name: A, dtype: float64

In [29]:
# Step of -1: all elements in reverse order.
s[::-1]

2000-01-08    0.727421
2000-01-07   -0.028979
2000-01-06   -0.871860
2000-01-05    0.196291
2000-01-04    0.405963
2000-01-03    0.475440
2000-01-02    1.755120
2000-01-01   -0.092021
Freq: -1D, Name: A, dtype: float64

In [30]:
# Work on a copy so the slice assignment in the next cell does not touch s.
s2 = s.copy()

In [32]:
# Slices also work for assignment: set the first five elements to 0.
s2[:5] = 0
s2

2000-01-01    0.000000
2000-01-02    0.000000
2000-01-03    0.000000
2000-01-04    0.000000
2000-01-05    0.000000
2000-01-06   -0.871860
2000-01-07   -0.028979
2000-01-08    0.727421
Freq: D, Name: A, dtype: float64

In [33]:
# On a DataFrame, a slice inside [] selects rows: here the first three.
df[:3]

Unnamed: 0,A,B,C,D
2000-01-01,-0.092021,-0.093106,-1.202401,-0.256
2000-01-02,1.75512,0.038471,0.37468,-1.605079
2000-01-03,0.47544,-0.802453,0.007725,-0.164143


In [34]:
# All rows in reverse order.
df[::-1]

Unnamed: 0,A,B,C,D
2000-01-08,0.727421,-0.095678,-0.893186,0.276228
2000-01-07,-0.028979,-0.978789,0.347033,1.967454
2000-01-06,-0.87186,0.580512,0.331308,-0.928876
2000-01-05,0.196291,2.910929,1.683268,-0.830867
2000-01-04,0.405963,-0.662958,0.33898,-1.882461
2000-01-03,0.47544,-0.802453,0.007725,-0.164143
2000-01-02,1.75512,0.038471,0.37468,-1.605079
2000-01-01,-0.092021,-0.093106,-1.202401,-0.256


## Selection by label

In [38]:
# 5 x 4 frame of random normals on a daily DatetimeIndex starting 2013-01-01.
# The values come from NumPy's RNG with no seed set in this cell.
dfl = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=pd.date_range(start='20130101', periods=5),
    columns=['A', 'B', 'C', 'D'],
)
dfl

Unnamed: 0,A,B,C,D
2013-01-01,-0.479617,-0.175157,-0.078779,0.036839
2013-01-02,-0.132072,-1.346288,0.258151,0.248275
2013-01-03,-0.033322,0.138139,-0.461086,-0.338935
2013-01-04,-1.090504,-2.032162,-0.024861,-2.524302
2013-01-05,1.132762,0.293526,-0.206966,1.066141


In [45]:
# Deliberately left commented out: .loc is label-based and dfl has a
# DatetimeIndex, so the integers 2 and 4 are not valid labels. Per the
# pandas indexing docs this slice is expected to raise TypeError.
# dfl.loc[2:4]

In [46]:
# Label slice using date strings; both endpoints are included in the result.
dfl.loc['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.132072,-1.346288,0.258151,0.248275
2013-01-03,-0.033322,0.138139,-0.461086,-0.338935
2013-01-04,-1.090504,-2.032162,-0.024861,-2.524302


In [47]:
# Six random normals labelled 'A'..'F' (RNG not seeded in this cell).
s1 = pd.Series(data=np.random.randn(6), index=['A', 'B', 'C', 'D', 'E', 'F'])
s1

A   -1.120054
B   -0.315927
C   -0.099595
D   -0.549080
E   -0.739006
F    1.577703
dtype: float64

In [48]:
# A single label returns the scalar stored at that label.
s1.loc['C']

-0.09959536248974958

In [49]:
# Label slice: unlike a positional slice, the stop label 'C' is included.
s1.loc['A':'C']

A   -1.120054
B   -0.315927
C   -0.099595
dtype: float64

In [50]:
# .loc also works for assignment: overwrite the value at label 'D'.
s1.loc['D'] = 0
s1

A   -1.120054
B   -0.315927
C   -0.099595
D    0.000000
E   -0.739006
F    1.577703
dtype: float64

In [52]:
# 6 x 4 frame of random normals with string row labels 'a'..'f' and
# columns 'A'..'D' (RNG not seeded in this cell).
df1 = pd.DataFrame(
    data=np.random.randn(6, 4),
    columns=['A', 'B', 'C', 'D'],
    index=['a', 'b', 'c', 'd', 'e', 'f'],
)
df1

Unnamed: 0,A,B,C,D
a,0.024426,0.74876,0.702267,0.351428
b,0.387529,-0.746759,0.245507,-0.289084
c,-1.891651,1.157953,-0.466411,-0.493194
d,-1.157115,1.080764,-0.205128,-0.559386
e,0.052117,-0.790026,-0.760879,-0.001344
f,-0.043156,-0.347248,-0.614077,0.006933


In [53]:
# Rows chosen by an explicit list of labels (here all six), all columns.
df1.loc[["a", "b", "c", "d", "e", "f"], :]

Unnamed: 0,A,B,C,D
a,0.024426,0.74876,0.702267,0.351428
b,0.387529,-0.746759,0.245507,-0.289084
c,-1.891651,1.157953,-0.466411,-0.493194
d,-1.157115,1.080764,-0.205128,-0.559386
e,0.052117,-0.790026,-0.760879,-0.001344
f,-0.043156,-0.347248,-0.614077,0.006933


In [54]:
# Only the rows labelled a, b and d; all columns.
df1.loc[["a", "b", "d"], :]

Unnamed: 0,A,B,C,D
a,0.024426,0.74876,0.702267,0.351428
b,0.387529,-0.746759,0.245507,-0.289084
d,-1.157115,1.080764,-0.205128,-0.559386


In [55]:
# Slice both axes by label: rows from 'd' to the end, columns 'A' through
# 'C' with 'C' included.
df1.loc["d":, "A":"C"]

Unnamed: 0,A,B,C
d,-1.157115,1.080764,-0.205128
e,0.052117,-0.790026,-0.760879
f,-0.043156,-0.347248,-0.614077


In [56]:
# A single row label returns that row as a Series indexed by column name.
df1.loc['a']

A    0.024426
B    0.748760
C    0.702267
D    0.351428
Name: a, dtype: float64

In [58]:
# Confirm that the single-row selection is a Series, not a DataFrame.
type(df1.loc['a'])

pandas.core.series.Series

In [59]:
# Element-wise comparison yields a boolean Series indexed by column name.
df1.loc['a'] > 0

A    True
B    True
C    True
D    True
Name: a, dtype: bool

In [61]:
# Use that boolean Series as the column selector: keep every column whose
# value in row 'a' is positive (all four in the recorded run).
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,A,B,C,D
a,0.024426,0.74876,0.702267,0.351428
b,0.387529,-0.746759,0.245507,-0.289084
c,-1.891651,1.157953,-0.466411,-0.493194
d,-1.157115,1.080764,-0.205128,-0.559386
e,0.052117,-0.790026,-0.760879,-0.001344
f,-0.043156,-0.347248,-0.614077,0.006933


In [62]:
# Same column mask, restricted to row 'a'; the result is a Series.
df1.loc['a', df1.loc['a'] > 0]

A    0.024426
B    0.748760
C    0.702267
D    0.351428
Name: a, dtype: float64

In [66]:
# One row label plus one column label returns a single scalar.
df1.loc['a', 'A']

0.024426051596202095

### Slicing with labels

In [67]:
# Letters on a deliberately unsorted integer index.
# Note: this rebinds `s`, which earlier in the notebook referred to df['A'].
s = pd.Series(['a', 'b', 'c', 'd', 'e'], index=[0, 3, 2, 5, 4])
s

0    a
3    b
2    c
5    d
4    e
dtype: object

In [68]:
# Both 3 and 5 are present in the (unsorted) index, so the slice returns
# everything located between them, endpoints included.
s.loc[3:5]

3    b
2    c
5    d
dtype: object

In [69]:
# The same data ordered by index label.
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [70]:
# Neither 1 nor 6 is in the index. On a sorted index, .loc slices by label
# order and returns the labels that fall between the two bounds.
s.sort_index().loc[1:6]

2    c
3    b
4    e
5    d
dtype: object