# Coding Practice Session 4
## Selecting Data in Pandas

In [1]:
import pandas as pd
import numpy as np

### Accessing Elements in a Series

In [2]:
s = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [3]:
s["a"]

np.int64(1)

In [4]:
print(s["d"])

4


In [5]:
s[0]  # deprecated: an integer key on a label-indexed Series is treated as a position; prefer s.iloc[0]

  s[0]  # will be deprecated, not recommended


np.int64(1)

In [6]:
s["b":"d"]

b    2
c    3
d    4
dtype: int64

In [7]:
s["a":"c"]  # right side included

a    1
b    2
c    3
dtype: int64

In [8]:
s[1:3]  # right side excluded

b    2
c    3
dtype: int64

In [9]:
s[0:4]

a    1
b    2
c    3
d    4
dtype: int64

### Selecting Columns in a DataFrame

In [10]:
df = pd.DataFrame(
    np.arange(20, 32).reshape(4, 3), columns=["A", "B", "C"], index=["a", "b", "c", "d"]
)
df

Unnamed: 0,A,B,C
a,20,21,22
b,23,24,25
c,26,27,28
d,29,30,31


In [11]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [12]:
df["B"]

a    21
b    24
c    27
d    30
Name: B, dtype: int64

In [13]:
df.C  # not recommended

a    22
b    25
c    28
d    31
Name: C, dtype: int64

In [14]:
df[["A", "C"]]

Unnamed: 0,A,C
a,20,22
b,23,25
c,26,28
d,29,31


In [15]:
df["A":"C"]

Unnamed: 0,A,B,C


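**Note:** Slicing with `[]` does not select columns of a `DataFrame`. `df["A":"C"]` slices the *rows* by index label, and because no row label falls between `"A"` and `"C"`, the result is empty. To slice columns by label, use `df.loc[:, "A":"C"]`.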

**Note:** With `[]`, columns in a `DataFrame` are only accessible through the column's label (a single label or a list of labels). Positional access to columns requires `iloc`.

By default, using `[]` gives access to the columns in a `DataFrame`

### Selecting Rows in a DataFrame

In [16]:
df

Unnamed: 0,A,B,C
a,20,21,22
b,23,24,25
c,26,27,28
d,29,30,31


In [17]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [18]:
df.loc["a"]

A    20
B    21
C    22
Name: a, dtype: int64

In [19]:
df.iloc[0]

A    20
B    21
C    22
Name: a, dtype: int64

In [20]:
df.iloc[-1]

A    29
B    30
C    31
Name: d, dtype: int64

In [21]:
df.loc["d"]

A    29
B    30
C    31
Name: d, dtype: int64

In [22]:
df.loc[["a", "d"]]

Unnamed: 0,A,B,C
a,20,21,22
d,29,30,31


In [23]:
df.iloc[[1, 3]]

Unnamed: 0,A,B,C
b,23,24,25
d,29,30,31


### Slicing Rows and Columns

In [24]:
df

Unnamed: 0,A,B,C
a,20,21,22
b,23,24,25
c,26,27,28
d,29,30,31


In [25]:
df.loc["a":"c"]

Unnamed: 0,A,B,C
a,20,21,22
b,23,24,25
c,26,27,28


In [26]:
df.loc["b":"d"]

Unnamed: 0,A,B,C
b,23,24,25
c,26,27,28
d,29,30,31


In [27]:
df.loc[["a", "d"]]

Unnamed: 0,A,B,C
a,20,21,22
d,29,30,31


In [28]:
df.loc[:, "A"]

a    20
b    23
c    26
d    29
Name: A, dtype: int64

In [29]:
df["A"]

a    20
b    23
c    26
d    29
Name: A, dtype: int64

In [30]:
df.loc["a", :]

A    20
B    21
C    22
Name: a, dtype: int64

In [31]:
df.loc["a"]

A    20
B    21
C    22
Name: a, dtype: int64

In [32]:
df.loc[["a", "c"], ["A", "B"]]

Unnamed: 0,A,B
a,20,21
c,26,27


In [33]:
df.iloc[0]

A    20
B    21
C    22
Name: a, dtype: int64

In [34]:
df.iloc[1:3, 0]

b    23
c    26
Name: A, dtype: int64

In [35]:
df.iloc[0:2, 1:]

Unnamed: 0,B,C
a,21,22
b,24,25


In [36]:
df.loc[:, "B":]

Unnamed: 0,B,C
a,21,22
b,24,25
c,27,28
d,30,31


In [37]:
df.iloc[[0, 2, 3], -1]

a    22
c    28
d    31
Name: C, dtype: int64

In [38]:
s = pd.Series(range(5))
s

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [39]:
s.loc[1]

np.int64(1)

In [40]:
s.iloc[1]

np.int64(1)

In [41]:
s.loc[0] = 1000

In [42]:
s

0    1000
1       1
2       2
3       3
4       4
dtype: int64

In [43]:
df

Unnamed: 0,A,B,C
a,20,21,22
b,23,24,25
c,26,27,28
d,29,30,31


In [44]:
df.loc["a", ["A", "B"]] = (1000, 2000)

In [45]:
df

Unnamed: 0,A,B,C
a,1000,2000,22
b,23,24,25
c,26,27,28
d,29,30,31


### Using Callable Functions for Indexing

In [46]:
# 5x4 frame of random integers, used below to demonstrate callable indexers.
# No seed is set in the visible cells, so the values (and the outputs shown
# under the following cells) change on every run.
df = pd.DataFrame(
    np.random.randint(-10, 10, size=(5, 4)),
    columns=["A", "B", "C", "D"],
    index=list("abcde"),
)

df

Unnamed: 0,A,B,C,D
a,8,-10,5,-1
b,7,-9,9,-6
c,3,-2,-3,4
d,3,-8,-6,2
e,8,-3,-1,-5


In [47]:
def select_positive(df):
    """Return a boolean mask that is True where ``df`` is greater than zero.

    Meant to be used as a callable indexer, e.g. ``df[select_positive]``,
    in which case the object being indexed is passed in as ``df``.
    """
    is_positive = df > 0
    return is_positive


def select_even(df):
    """Return a boolean mask that is True where ``df`` is divisible by two.

    Meant to be used as a callable indexer, e.g. ``df[select_even]``,
    in which case the object being indexed is passed in as ``df``.
    """
    remainder = df % 2
    return remainder == 0

In [48]:
df[select_positive]

Unnamed: 0,A,B,C,D
a,8,,5.0,
b,7,,9.0,
c,3,,,4.0
d,3,,,2.0
e,8,,,


In [49]:
df[select_even]

Unnamed: 0,A,B,C,D
a,8.0,-10.0,,
b,,,,-6.0
c,,-2.0,,4.0
d,,-8.0,-6.0,2.0
e,8.0,,,


### Selecting Random Samples

In [50]:
# 6x8 frame of random integers, used below as the population for `sample()`.
# No seed is set in the visible cells, so the values (and the outputs shown
# under the following cells) change on every run.
df = pd.DataFrame(
    np.random.randint(1, 20, size=(6, 8)),
    columns=list("ABCDEFGH"),
    index=list("abcdef"),
)

df

Unnamed: 0,A,B,C,D,E,F,G,H
a,3,8,17,18,7,18,10,14
b,8,8,19,4,3,18,17,15
c,19,5,12,11,1,3,13,15
d,2,5,9,13,6,19,18,10
e,17,16,4,10,15,13,1,18
f,19,6,8,5,7,6,3,13


In [51]:
df.sample() # returns a single record

Unnamed: 0,A,B,C,D,E,F,G,H
e,17,16,4,10,15,13,1,18


In [52]:
df.sample(n=3)

Unnamed: 0,A,B,C,D,E,F,G,H
c,19,5,12,11,1,3,13,15
f,19,6,8,5,7,6,3,13
a,3,8,17,18,7,18,10,14


In [53]:
df.sample(frac=0.33)

Unnamed: 0,A,B,C,D,E,F,G,H
d,2,5,9,13,6,19,18,10
b,8,8,19,4,3,18,17,15


In [54]:
df.sample(frac=0.5, replace=True)

Unnamed: 0,A,B,C,D,E,F,G,H
a,3,8,17,18,7,18,10,14
a,3,8,17,18,7,18,10,14
d,2,5,9,13,6,19,18,10


In [55]:
weights = [0.1, 0.1, 0.15, 0.05, 0.4, 0.2]
df.sample(n=2, weights=weights)

Unnamed: 0,A,B,C,D,E,F,G,H
f,19,6,8,5,7,6,3,13
a,3,8,17,18,7,18,10,14


In [56]:
df.sample(n=4, random_state=21) # makes it reproducible

Unnamed: 0,A,B,C,D,E,F,G,H
f,19,6,8,5,7,6,3,13
c,19,5,12,11,1,3,13,15
d,2,5,9,13,6,19,18,10
e,17,16,4,10,15,13,1,18


In [57]:
s = pd.Series(np.arange(1, 10), index=list("abcdefghi"))
s

a    1
b    2
c    3
d    4
e    5
f    6
g    7
h    8
i    9
dtype: int64

In [58]:
s.sample(n=4, replace=True)

e    5
a    1
d    4
g    7
dtype: int64

In [59]:
np.shares_memory(s, s.sample(n=4, replace=True))

False

### Fast Scalar Value Getting and Setting

In [60]:
# Small frame for the scalar accessors `at` / `iat`:
# column A holds 0..4, column B holds 10, 12, ..., 18; rows are labelled "a".."e".
# `list(range(...))` replaces the identity comprehensions `[num for num in range(...)]`.
df = pd.DataFrame(
    {"A": list(range(5)), "B": list(range(10, 20, 2))},
    index=list("abcde"),
)

df

Unnamed: 0,A,B
a,0,10
b,1,12
c,2,14
d,3,16
e,4,18


In [61]:
df.at["c", "B"]

np.int64(14)

In [62]:
df.at["e", "A"]

np.int64(4)

In [63]:
df.at["a", "A"] = 700
df

Unnamed: 0,A,B
a,700,10
b,1,12
c,2,14
d,3,16
e,4,18


In [64]:
df.at["d", "A"] = 1500
df

Unnamed: 0,A,B
a,700,10
b,1,12
c,2,14
d,1500,16
e,4,18


In [65]:
df.at["c", "B"] = 1864

In [66]:
df

Unnamed: 0,A,B
a,700,10
b,1,12
c,2,1864
d,1500,16
e,4,18


In [67]:
df.iat[1, 1]

np.int64(12)

In [68]:
print(df.iat[-1, -1])

18


In [69]:
df.iat[1, 0] = 88
df

Unnamed: 0,A,B
a,700,10
b,88,12
c,2,1864
d,1500,16
e,4,18


In [70]:
data = np.random.randint(0, 100, size=(1_000_000, 3), dtype="u1")
df = pd.DataFrame(data, columns=["A", "B", "C"])

In [71]:
df.head()

Unnamed: 0,A,B,C
0,44,62,88
1,2,34,75
2,10,0,33
3,25,39,73
4,72,87,67


In [72]:
df.dtypes

A    uint8
B    uint8
C    uint8
dtype: object

In [73]:
%timeit df.at[500_000, "C"]

1.29 μs ± 41.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [74]:
%timeit df.loc[500_000, "C"]

2.94 μs ± 95.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [75]:
%timeit df.iat[500_000, 2]

6.07 μs ± 87.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [76]:
%timeit df.iloc[500_000, 2]

7.79 μs ± 184 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


As we can see, using the `at` accessor is faster than `loc`. Likewise, `iat` performs faster than `iloc`.

However, it must be noted that the `at` and `iat` accessors are efficient only for getting and setting a single scalar value. For accessing and modifying multiple values in a DataFrame, `loc` and `iloc` are more appropriate.