## Import libraries

In [1]:
import numpy as np 
import pandas as pd

## Table application - pipe()

`pipe()` passes the Series/DataFrame as the first argument to a function, so your own functions can be used as steps in a method chain.

In [2]:
# One-row demo frame holding a single "City, ST" string for the pipe() examples.
df_p = pd.DataFrame(data={"city_and_code": ["Chicago, IL"]})

In [3]:
df_p

Unnamed: 0,city_and_code
0,"Chicago, IL"


In [4]:
def extract_city_name(df):
    """
    Add a ``city_name`` column holding the text before the first comma of
    ``city_and_code`` (e.g. "Chicago, IL" -> "Chicago").

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``city_and_code``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``city_name`` column. ``df`` itself is not
        modified, so the function can be re-run or reused in ``pipe`` chains
        without changing frames that were displayed earlier.
    """
    city_name = df["city_and_code"].str.split(",").str.get(0)
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(city_name=city_name)


def add_country_name(df, country_name=None):
    """
    Add a ``city_and_country`` column: ``city_name`` with ``country_name``
    appended directly, without a separator (e.g. "Chicago" + "US" -> "ChicagoUS").

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already has a string ``city_name`` column.
    country_name : str
        Text to append to every city name. Effectively required: the ``None``
        default is kept only so the signature stays unchanged, and is rejected.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``city_and_country`` column. ``df`` itself
        is not modified.

    Raises
    ------
    TypeError
        If ``country_name`` is ``None``.
    """
    # Fail fast with a clear message rather than letting the string
    # concatenation below choke on (or silently propagate) a missing value.
    if country_name is None:
        raise TypeError("country_name must be a string, got None")
    col = "city_name"
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(city_and_country=df[col] + country_name)

In [5]:
# Method 1: nested function calls (basic) -- the innermost call runs first,
# so the steps read inside-out.
add_country_name(
    extract_city_name(df_p),
    country_name="US",
)

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [6]:
# Method 2: method chaining with pipe() (the style pandas encourages) --
# one step per line, executed top to bottom.
(
    df_p
    .pipe(extract_city_name)
    .pipe(add_country_name, country_name="US")
)

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


## Row or Column wise application - apply()

`apply()` applies a function along an axis of the DataFrame: to each column (`axis=0`, the default) or to each row (`axis=1`).

In [7]:
# Seeded generator so the random demo values are identical on every run
# (a fresh kernel + Run All yields the same frame each time).
df_rng = np.random.default_rng(42)

# The three columns deliberately carry different index labels; building the
# frame aligns them on the union of labels (a-d) and fills the gaps with NaN.
df = pd.DataFrame(
    {
        "one": pd.Series(df_rng.standard_normal(3), index=["a", "b", "c"]),
        "two": pd.Series(df_rng.standard_normal(4), index=["a", "b", "c", "d"]),
        "three": pd.Series(df_rng.standard_normal(3), index=["b", "c", "d"]),
    }
)

In [8]:
df

Unnamed: 0,one,two,three
a,-0.532119,-1.925059,
b,1.684989,-0.780922,1.536296
c,-0.541863,1.423139,-0.155283
d,,1.232769,0.54825


In [9]:
# Column-wise (default axis): spread between the largest and smallest value of
# each column; missing entries are skipped by max()/min().
df.apply(lambda col: col.max() - col.min())

one      2.226852
two      3.348198
three    1.691579
dtype: float64

In [14]:
# Row-wise mean (axis=1), with the reducer given as a callable ...
row_mean_from_callable = df.apply(np.mean, axis=1)

# ... or given by name as a string.
row_mean_from_name = df.apply("mean", axis=1)

# Only a cell's last expression is displayed, so keep both results in variables
# and check that the two spellings agree instead of discarding the first one.
assert np.allclose(row_mean_from_callable, row_mean_from_name, equal_nan=True)
row_mean_from_name

a   -1.228589
b    0.813454
c    0.241998
d    0.890510
dtype: float64

## Aggregation API - agg() and transform()

The aggregation API lets you express one or more aggregation operations in a single, concise call: `agg()` reduces the data, while `transform()` returns a result with the same shape as its input.

In [15]:
# Seeded generator so the random demo values are identical on every run.
tsdf_rng = np.random.default_rng(42)

# Ten daily rows starting 2000-01-01, three columns of standard-normal noise.
tsdf = pd.DataFrame(
    tsdf_rng.standard_normal((10, 3)),
    columns=["A", "B", "C"],
    index=pd.date_range("1/1/2000", periods=10),
)

In [16]:
# Blank out positions 3-6 (2000-01-04 .. 2000-01-07) in every column so the
# later examples have a run of missing values to deal with.
tsdf.iloc[3:7, :] = np.nan
tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.739378,-0.337812,-1.504897
2000-01-02,1.540682,-0.350753,-0.860269
2000-01-03,-0.025358,2.458698,-0.059512
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.343513,-0.340702,0.708854
2000-01-09,-0.434854,0.200975,-0.517259
2000-01-10,-1.349501,-0.774803,0.462019


In [19]:
# Several aggregations at once: a list of names gives one output row per aggregation.
tsdf.agg(func=["sum", "mean"])

Unnamed: 0,A,B,C
sum,0.335104,0.855602,-1.771066
mean,0.055851,0.1426,-0.295178


In [20]:
# Transform the whole frame: the result keeps the input's shape and index.
tsdf.transform(func=np.abs)

Unnamed: 0,A,B,C
2000-01-01,0.739378,0.337812,1.504897
2000-01-02,1.540682,0.350753,0.860269
2000-01-03,0.025358,2.458698,0.059512
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.343513,0.340702,0.708854
2000-01-09,0.434854,0.200975,0.517259
2000-01-10,1.349501,0.774803,0.462019


In [22]:
# Several transforms on one Series: a list of functions gives a DataFrame with
# one column per function.
tsdf["A"].transform(func=[np.abs, lambda value: value + 1])

Unnamed: 0,absolute,<lambda>
2000-01-01,0.739378,0.260622
2000-01-02,1.540682,2.540682
2000-01-03,0.025358,0.974642
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,1.343513,2.343513
2000-01-09,0.434854,0.565146
2000-01-10,1.349501,-0.349501


## Applying elementwise functions - applymap()

`applymap()` on a DataFrame (renamed to `DataFrame.map()` in pandas 2.1, where `applymap()` is deprecated) and, analogously, `map()` on a Series accept any Python function that takes a single value and returns a single value.

In [23]:
df

Unnamed: 0,one,two,three
a,-0.532119,-1.925059,
b,1.684989,-0.780922,1.536296
c,-0.541863,1.423139,-0.155283
d,,1.232769,0.54825


In [24]:
def f(x):
    """Return the number of characters in the string form of ``x``."""
    text = str(x)
    return len(text)

In [26]:
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated; use the new name when the installed pandas provides it
# and fall back to applymap on older versions.
apply_elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
apply_elementwise(f)

Unnamed: 0,one,two,three
a,19,19,3
b,18,19,18
c,18,18,19
d,3,18,18
