# Overview

`apply` is a method on `DataFrame` and `Series` objects that executes a function on the elements of the container.

In [None]:
import pandas as pd

`apply` on a DataFrame will iterate through each of the columns, passing each one to the function as a `Series`.

In [None]:
def iterate(series):
    """Print the type of the object handed over by ``apply`` and return it as-is.

    Because the input is returned unchanged, the only visible effect is the
    printed type, which reveals what ``apply`` passes to the callback.
    """
    received_type = type(series)
    print(received_type)
    return series

# Build a small demo frame and apply `iterate` with the default axis:
# the callback runs once per column, so the type is printed twice.
df = pd.DataFrame({"ages": [1, 2, 3], "names": ["Griffin", "Brian", "Stewie"]})
df.apply(iterate)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


Unnamed: 0,ages,names
0,1,Griffin
1,2,Brian
2,3,Stewie


`apply(axis=1)` on a DataFrame will go over each row, passing each one to the function as a `Series`.

In [None]:
def make_id(row: pd.Series):
    """Build an ``"<age>-<name>"`` identifier from a single row.

    The row must expose the ``"names"`` and ``"ages"`` labels.
    """
    name, age = row["names"], row["ages"]
    return f"{age}-{name}"

# Apply `make_id` row by row (axis=1) and store the results in a new "ids" column.
df = pd.DataFrame({"ages": [1, 2, 3], "names": ["Griffin", "Brian", "Stewie"]})
df["ids"] = df.apply(make_id, axis=1)
# Bare expression so the notebook renders the resulting frame.
df

Unnamed: 0,ages,names,ids
0,1,Griffin,1-Griffin
1,2,Brian,2-Brian
2,3,Stewie,3-Stewie


`apply` on a Series (a.k.a. a column) will go over each element in the column.

In [None]:
def uppercase(name: str) -> str:
    """Return ``name`` converted to upper case."""
    shouted = name.upper()
    return shouted

# Replace the "names" column with the result of calling `uppercase` on each element.
df = pd.DataFrame({"ages": [1, 2, 3], "names": ["Griffin", "Brian", "Stewie"]})
df["names"] = df["names"].apply(uppercase)
# Bare expression so the notebook renders the resulting frame.
df

Unnamed: 0,ages,names
0,1,GRIFFIN
1,2,BRIAN
2,3,STEWIE


# Benchmark

In [None]:
import time
from IPython.display import display

In [None]:
def create_benchmark_df(n_rows: int = 100_000) -> pd.DataFrame:
    """Build the single-column frame used by the benchmarks.

    Args:
        n_rows: Number of rows to generate. Defaults to 100000, so calling
            the function without arguments behaves as before.

    Returns:
        A ``DataFrame`` with one ``"values"`` column holding the integers
        ``0`` through ``n_rows - 1``.
    """
    return pd.DataFrame({"values": range(n_rows)})

# Preview the benchmark frame; the notebook renders the returned DataFrame.
create_benchmark_df()

Unnamed: 0,values
0,0
1,1
2,2
3,3
4,4
...,...
99995,99995
99996,99996
99997,99997
99998,99998


In [None]:
def benchmark_iterrows():
    """Time adding 2 to every value by looping over ``DataFrame.iterrows``.

    Displays the updated frame, then the elapsed time in seconds.
    """
    df = create_benchmark_df()
    # perf_counter is monotonic and high-resolution, which suits interval
    # timing better than the wall clock returned by time.time.
    start = time.perf_counter()

    # The rows yielded by iterrows may be copies of the underlying data, so
    # assigning into `row` is not guaranteed to reach `df`. Collect the new
    # values during the loop and write the column back explicitly instead.
    df["values"] = [row["values"] + 2 for _label, row in df.iterrows()]

    end = time.perf_counter()

    display(df)
    display(f"duration = {end - start}")
    

# Run the row-by-row (`iterrows`) benchmark; it displays the frame and the duration.
benchmark_iterrows()

Unnamed: 0,values
0,2
1,3
2,4
3,5
4,6
...,...
99995,99997
99996,99998
99997,99999
99998,100000


'duration = 1.5768036842346191'

In [None]:
def benchmark_apply():
    """Time adding 2 to every value with ``Series.apply``.

    Displays the updated frame, then the elapsed time in seconds.
    """
    df = create_benchmark_df()
    # perf_counter is monotonic and high-resolution; time.time can be too
    # coarse to measure an interval this short reliably.
    start = time.perf_counter()
    df["values"] = df["values"].apply(lambda x: x + 2)
    end = time.perf_counter()

    display(df)
    display(f"duration = {end - start}")
    

# Run the `Series.apply` benchmark; it displays the frame and the duration.
benchmark_apply()

Unnamed: 0,values
0,2
1,3
2,4
3,5
4,6
...,...
99995,99997
99996,99998
99997,99999
99998,100000


'duration = 0.021382808685302734'