In [1]:
def my_function():
    """Placeholder function: takes no arguments, does nothing, returns None."""
    return None

In [2]:
def my_sq(x):
    """Return ``x`` raised to the power of two.

    Works for any operand that supports ``**`` with an ``int`` exponent;
    operands that do not (for example ``str``) raise ``TypeError``.
    """
    return pow(x, 2)

In [3]:
my_sq(16)

256

In [4]:
# Intentional failure: str does not support ``**``, so this call raises TypeError.
my_sq("hello")

TypeError: unsupported operand type(s) for ** or pow(): 'str' and 'int'

In [5]:
assert my_sq(4) == 16

In [6]:
type(4)

int

In [7]:
def my_sq(x: int|float) -> int|float:
    return x ** 2

In [8]:
import pandas as pd

In [9]:
# Small two-column integer frame used by the examples below.
df = pd.DataFrame(
    {
        "a": [10, 20, 30],
        "b": [20, 30, 40],
    }
)

In [10]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [11]:
df["a"] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [12]:
df.a ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [13]:
my_sq(df.a)

0    100
1    400
2    900
Name: a, dtype: int64

In [14]:
type(df.a)

pandas.core.series.Series

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       3 non-null      int64
 1   b       3 non-null      int64
dtypes: int64(2)
memory usage: 180.0 bytes


In [16]:
df.a.apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [17]:
my_sq

<function __main__.my_sq(x: int | float) -> int | float>

In [18]:
def my_exp(x, e):
    """Return ``x`` raised to the power ``e``."""
    return pow(x, e)

In [19]:
my_exp(4, 2)

16

In [20]:
my_exp(2, 10)

1024

In [21]:
# Intentional failure: apply passes each element as the only argument, so the
# required second parameter ``e`` is never supplied and TypeError is raised.
df.a.apply(my_exp)

TypeError: my_exp() missing 1 required positional argument: 'e'

In [22]:
# Extra keyword arguments given to apply are forwarded to my_exp, so every
# element of column ``a`` is raised to the 10th power.
df.a.apply(my_exp, e=10)

0        10000000000
1     10240000000000
2    590490000000000
Name: a, dtype: int64

In [23]:
def my_exp_flipped(e, x):
    """Call ``my_exp`` with the two arguments swapped.

    Taking the exponent first lets ``Series.apply`` supply each element as
    ``e`` while the base ``x`` is fixed by keyword.
    """
    return my_exp(x=x, e=e)

In [24]:
df.a.apply(my_exp_flipped, x=2)

0          1024
1       1048576
2    1073741824
Name: a, dtype: int64

In [25]:
# Inline alternative to my_exp_flipped: the lambda fixes the base at 2 and
# uses each element of column ``a`` as the exponent.
df.a.apply(lambda val: my_exp(2, val))

0          1024
1       1048576
2    1073741824
Name: a, dtype: int64

In [26]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [27]:
def print_me(x):
    """Write ``x`` to standard output; the return value is ``None``."""
    print(x)
    return None

In [28]:
x = print_me("hello")

hello


In [29]:
x

In [30]:
df.a.apply(print_me)

10
20
30


0    None
1    None
2    None
Name: a, dtype: object

In [31]:
df.apply(print_me)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [32]:
def avg_3(x, y, z):
    """Return the arithmetic mean of the three arguments."""
    total = x + y + z
    return total / 3

In [33]:
# Built-in sum() iterates over the element-wise total of the two columns;
# for this NaN-free data, (df.a + df.b).sum() gives the same total vectorized.
sum(df.a + df.b)

150

In [34]:
assert avg_3(0,1,2) == 1

In [35]:
# Intentional failure: DataFrame.apply passes one whole column as the single
# argument, so ``y`` and ``z`` are missing and TypeError is raised.
df.apply(avg_3)

TypeError: avg_3() missing 2 required positional arguments: 'y' and 'z'

In [36]:
# Same TypeError reproduced directly: one positional argument, two missing.
avg_3(10)

TypeError: avg_3() missing 2 required positional arguments: 'y' and 'z'

In [37]:
def avg_3_apply(col):
    """Return the mean of ``col`` using its own ``mean()`` method.

    Written for ``DataFrame.apply``, which hands over one column at a time.
    """
    column_mean = col.mean()
    return column_mean

In [38]:
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [39]:
def avg_3_apply(col):
    """Average the first three values of ``col``, selected by position.

    Args:
        col: A pandas Series (as passed by ``DataFrame.apply``) holding at
            least three numeric values. Plain sequences indexed from 0
            (lists, tuples, arrays) are accepted as well.

    Returns:
        The arithmetic mean of the values at positions 0, 1 and 2.

    Raises:
        IndexError: If ``col`` has fewer than three elements.
    """
    # ``col[0]`` on a Series is a *label* lookup; it only worked because the
    # index labels happened to be 0, 1, 2. ``.iloc`` is always positional.
    # Objects without ``.iloc`` (lists, arrays) are indexed directly.
    by_position = getattr(col, "iloc", col)
    x = by_position[0]
    y = by_position[1]
    z = by_position[2]
    return (x + y + z) / 3

In [40]:
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [41]:
df.apply(print_me, axis='columns')

a    10
b    20
Name: 0, dtype: int64
a    20
b    30
Name: 1, dtype: int64
a    30
b    40
Name: 2, dtype: int64


0    None
1    None
2    None
dtype: object

In [42]:
df.apply(print_me, axis=1)

a    10
b    20
Name: 0, dtype: int64
a    20
b    30
Name: 1, dtype: int64
a    30
b    40
Name: 2, dtype: int64


0    None
1    None
2    None
dtype: object

In [43]:
df.a + df.b

0    30
1    50
2    70
dtype: int64

In [44]:
(df.a + df.b)/2

0    15.0
1    25.0
2    35.0
dtype: float64

In [45]:
def avg_2(x, y):
    """Return the mean of ``x`` and ``y`` (element-wise when given Series)."""
    pair_sum = x + y
    return pair_sum / 2

In [46]:
avg_2(df.a, df.b)

0    15.0
1    25.0
2    35.0
dtype: float64

In [48]:
import numpy as np

def avg_2_mod(x, y):
    """Mean of ``x`` and ``y``, except that ``x == 20`` yields ``np.nan``.

    The ``if`` needs a single truth value, so this version is for scalars;
    passing a Series as ``x`` raises ``ValueError``.
    """
    if x == 20:
        return np.nan
    return (x + y) / 2

In [49]:
avg_2_mod(20, 10)

nan

In [50]:
avg_2_mod(0, 10)

5.0

In [51]:
assert avg_2_mod(0, 10) == 5

In [52]:
# Intentional failure: ``x == 20`` on a Series yields a boolean Series, which
# ``if`` cannot reduce to a single True/False, hence the ValueError.
avg_2_mod(df.a, df.b)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [53]:
df.a == 20

0    False
1     True
2    False
Name: a, dtype: bool

In [55]:
(df.a == 20).any()

True

In [56]:
(df.a == 20).all()

False

In [57]:
# Wrap the scalar function so it is applied element by element to array-like
# inputs; the wrapped call returns a NumPy array rather than a Series.
avg_2_mod_np = np.vectorize(avg_2_mod)

In [58]:
avg_2_mod_np(df.a, df.b)

array([15., nan, 35.])

In [59]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [60]:
@np.vectorize
def avg_2_mod(x, y):
    """Element-wise mean of ``x`` and ``y``; ``x == 20`` gives ``np.nan``.

    ``np.vectorize`` applies this scalar logic across array-like inputs.
    """
    return np.nan if x == 20 else (x + y) / 2

In [61]:
avg_2_mod(df.a, df.b)

array([15., nan, 35.])

In [62]:
import numba

In [63]:
@numba.vectorize
def avg_2_mod_numba(x, y):
    """Numba-vectorized mean of ``x`` and ``y``; ``x == 20`` gives ``np.nan``."""
    if x == 20:
        return np.nan
    return (x + y) / 2

In [64]:
avg_2_mod_numba(10, 20)

15.0

In [65]:
avg_2_mod_numba(0, 10)

5.0

In [66]:
avg_2_mod_numba(df.a, df.b)

0    15.0
1     NaN
2    35.0
dtype: float64

In [67]:
%%timeit
avg_2(df.a, df.b)

156 µs ± 6.03 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [68]:
%%timeit
(df.a + df.b) / 2

145 µs ± 4.61 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [69]:
%%timeit
avg_2_mod_np(df.a, df.b)

46.2 µs ± 2.9 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [70]:
%%timeit
avg_2_mod_numba(df.a, df.b)

140 µs ± 9.99 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [71]:
%%timeit
# Pass the underlying NumPy arrays (.values) instead of the Series objects;
# in the recorded runs this is roughly 10x faster than the Series call.
avg_2_mod_numba(df.a.values, df.b.values)

14.8 µs ± 836 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
