In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
iris = sns.load_dataset('iris')

In [4]:
"""
Table-wise Function Application: pipe()
Row or Column-wise Application: apply()
Aggregation API: agg() and transform()
Apply Element-wise Functions: applymap()
"""
''

''

In [5]:
df = pd.DataFrame({
    'city_and_code': ['Chicago, IL']
})
df

Unnamed: 0,city_and_code
0,"Chicago, IL"


In [7]:
df['city_name'] = df['city_and_code'].str.split(',').str.get(0)
df

Unnamed: 0,city_and_code,city_name
0,"Chicago, IL",Chicago


In [8]:
df['city_and_country'] = df['city_name'] + ' - ' + 'U.S.'
df

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,Chicago - U.S.


In [12]:
def extract_city_name(df):
    df['city_name'] = df['city_and_code'].str.split(',').str.get(0)
    return df
def add_country_name(df, country_name):
    df['city_and_country'] = df['city_name'] + ' - ' + country_name
    return df

In [13]:
df.pipe(extract_city_name).pipe(add_country_name, 'UK')

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,Chicago - UK


In [14]:
pd.Series().pipe

<bound method NDFrame.pipe of Series([], dtype: object)>

In [15]:
# pipe can be on Series

In [16]:
# Row or columns wise functions

In [19]:
df = pd.DataFrame(np.random.rand(3, 3))
df

Unnamed: 0,0,1,2
0,0.658808,0.353102,0.284829
1,0.462695,0.848543,0.921378
2,0.026714,0.454836,0.270032


In [20]:
df.apply(np.mean)

0    0.382739
1    0.552160
2    0.492080
dtype: float64

In [21]:
df.apply(np.mean, axis=0)

0    0.382739
1    0.552160
2    0.492080
dtype: float64

In [22]:
df.apply(np.mean, axis=1)

0    0.432247
1    0.744205
2    0.250527
dtype: float64

In [23]:
df.apply(lambda x: x.mean())

0    0.382739
1    0.552160
2    0.492080
dtype: float64

In [24]:
df.apply(np.exp)

Unnamed: 0,0,1,2
0,1.932487,1.423477,1.329535
1,1.588349,2.33624,2.51275
2,1.027074,1.575915,1.310006


In [25]:
np.mean(df[0])

0.3827389867696948

In [26]:
np.exp(df[0])

0    1.932487
1    1.588349
2    1.027074
Name: 0, dtype: float64

In [28]:
df.apply(np.cumsum)

Unnamed: 0,0,1,2
0,0.658808,0.353102,0.284829
1,1.121503,1.201645,1.206207
2,1.148217,1.656481,1.476239


In [29]:
df.apply('exp')

Unnamed: 0,0,1,2
0,1.932487,1.423477,1.329535
1,1.588349,2.33624,2.51275
2,1.027074,1.575915,1.310006


In [31]:
df.apply('sin')

Unnamed: 0,0,1,2
0,0.612175,0.345811,0.280994
1,0.446361,0.750318,0.796435
2,0.026711,0.439315,0.266762


In [32]:
df.apply('tanh')

Unnamed: 0,0,1,2
0,0.577569,0.339124,0.277369
1,0.432278,0.690307,0.726549
2,0.026708,0.425866,0.263654


In [33]:
tsdf = pd.DataFrame(
    np.random.rand(1000, 3),
    columns=['A', 'B', 'C'],
    index=pd.date_range('1/1/2000', periods=1000)
)
tsdf

Unnamed: 0,A,B,C
2000-01-01,0.721154,0.402964,0.151049
2000-01-02,0.268027,0.320901,0.339031
2000-01-03,0.812794,0.423699,0.299881
2000-01-04,0.363282,0.130061,0.869946
2000-01-05,0.425134,0.146924,0.078900
...,...,...,...
2002-09-22,0.406767,0.050435,0.932587
2002-09-23,0.901614,0.007647,0.935639
2002-09-24,0.120580,0.244727,0.826220
2002-09-25,0.657925,0.550796,0.566510


In [34]:
tsdf.apply(lambda x: x.idxmax())

A   2000-11-19
B   2000-12-09
C   2002-04-18
dtype: datetime64[ns]

In [35]:
tsdf.loc['2000-11-19']

A    0.999056
B    0.666820
C    0.526181
Name: 2000-11-19 00:00:00, dtype: float64

In [38]:
df.apply(lambda x, y: x-y, args=(5,))

Unnamed: 0,0,1,2
0,-4.341192,-4.646898,-4.715171
1,-4.537305,-4.151457,-4.078622
2,-4.973286,-4.545164,-4.729968


In [39]:
df.apply(lambda x, y: x-y, y=45)

Unnamed: 0,0,1,2
0,-44.341192,-44.646898,-44.715171
1,-44.537305,-44.151457,-44.078622
2,-44.973286,-44.545164,-44.729968


In [40]:
# .apply is for cells

In [41]:
pd.Series.interpolate

<function pandas.core.generic.NDFrame.interpolate(self, method: 'InterpolateOptions' = 'linear', *, axis: 'Axis' = 0, limit: 'int | None' = None, inplace: 'bool_t' = False, limit_direction: "Literal['forward', 'backward', 'both'] | None" = None, limit_area: "Literal['inside', 'outside'] | None" = None, downcast: "Literal['infer'] | None | lib.NoDefault" = <no_default>, **kwargs) -> 'Self | None'>

In [42]:
tsdf.iloc[3:5] = np.nan

In [43]:
tsdf

Unnamed: 0,A,B,C
2000-01-01,0.721154,0.402964,0.151049
2000-01-02,0.268027,0.320901,0.339031
2000-01-03,0.812794,0.423699,0.299881
2000-01-04,,,
2000-01-05,,,
...,...,...,...
2002-09-22,0.406767,0.050435,0.932587
2002-09-23,0.901614,0.007647,0.935639
2002-09-24,0.120580,0.244727,0.826220
2002-09-25,0.657925,0.550796,0.566510


In [45]:
# interpolate = daron yabi
tsdf.apply(pd.Series.interpolate, method='linear') # fills none data with a appropriate interpolation data

Unnamed: 0,A,B,C
2000-01-01,0.721154,0.402964,0.151049
2000-01-02,0.268027,0.320901,0.339031
2000-01-03,0.812794,0.423699,0.299881
2000-01-04,0.546781,0.364539,0.501908
2000-01-05,0.280767,0.305379,0.703936
...,...,...,...
2002-09-22,0.406767,0.050435,0.932587
2002-09-23,0.901614,0.007647,0.935639
2002-09-24,0.120580,0.244727,0.826220
2002-09-25,0.657925,0.550796,0.566510


In [47]:
tsdf.apply(np.exp, raw=True) # will send raw data (numpy array) instead of columns series like df['column]

Unnamed: 0,A,B,C
2000-01-01,2.056805,1.496254,1.163054
2000-01-02,1.307382,1.378369,1.403587
2000-01-03,2.254197,1.527601,1.349698
2000-01-04,,,
2000-01-05,,,
...,...,...,...
2002-09-22,1.501954,1.051729,2.541075
2002-09-23,2.463575,1.007676,2.548842
2002-09-24,1.128151,1.277272,2.284666
2002-09-25,1.930781,1.734633,1.762106


In [48]:
# Aggregate

In [49]:
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [50]:
iris.species.value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [52]:
iris.groupby('species').mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [53]:
iris.groupby('species').max()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.8,4.4,1.9,0.6
versicolor,7.0,3.4,5.1,1.8
virginica,7.9,3.8,6.9,2.5


In [54]:
iris.groupby('species').max() - iris.groupby('species').min()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,1.5,2.1,0.9,0.5
versicolor,2.1,1.4,2.1,0.8
virginica,3.0,1.6,2.4,1.1


In [56]:
iris.species.value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [72]:
t = sns.load_dataset('titanic')

In [73]:
t.sex.value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [80]:
# t.groupby('sex').agg('mean')
# t.groupby('sex').mean()

In [84]:
t.groupby('sex').mean(['survived'])

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,0.742038,2.159236,27.915709,0.694268,0.649682,44.479818,0.0,0.401274
male,0.188908,2.389948,30.726645,0.429809,0.235702,25.523893,0.930676,0.712305


In [93]:
t.groupby(by='sex').mean(['age'])

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,0.742038,2.159236,27.915709,0.694268,0.649682,44.479818,0.0,0.401274
male,0.188908,2.389948,30.726645,0.429809,0.235702,25.523893,0.930676,0.712305


In [94]:
t[['age', 'survived']].mean()

age         29.699118
survived     0.383838
dtype: float64

In [97]:
t[['age', 'survived']].aggregate(['sum', 'mean'])

Unnamed: 0,age,survived
sum,21205.17,342.0
mean,29.699118,0.383838


In [100]:
tsdf.iloc[0].agg([lambda x: x])

  tsdf.iloc[0].agg([lambda x: x])


Unnamed: 0,<lambda>
A,0.721154
B,0.402964
C,0.151049


In [104]:
t[['age', 'survived']].agg({
    'age': ['sum', 'mean'],
    'survived': 'sum'
})

Unnamed: 0,age,survived
sum,21205.17,342.0
mean,29.699118,


In [105]:
t.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [108]:
t.groupby('sex')['survived'].mean()

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [117]:
t.agg({
    'sex': 'count',
    'age': ['mean', 'sum'],
    'survived': ['mean', 'sum']
})

Unnamed: 0,sex,age,survived
count,891.0,,
mean,,29.699118,0.383838
sum,,21205.17,342.0


In [121]:
from functools import partial
# partial = is a decorator function that takes a function and some arguments and returns that function which already fixed with some already initialized arguments
q_75 = partial(pd.Series.quantile, q=0.75)
q_75.__name__ = '75%'

In [124]:
titanic[['fare', 'survived']].agg(['mean', 'sum', q_75])

Unnamed: 0,fare,survived
mean,32.204208,0.383838
sum,28693.9493,342.0
75%,31.0,1.0


In [125]:
tsdf

Unnamed: 0,A,B,C
2000-01-01,0.721154,0.402964,0.151049
2000-01-02,0.268027,0.320901,0.339031
2000-01-03,0.812794,0.423699,0.299881
2000-01-04,,,
2000-01-05,,,
...,...,...,...
2002-09-22,0.406767,0.050435,0.932587
2002-09-23,0.901614,0.007647,0.935639
2002-09-24,0.120580,0.244727,0.826220
2002-09-25,0.657925,0.550796,0.566510


In [126]:
# tsdf.transform = transform is like aggregate and apply

In [127]:
tsdf.transform(np.abs)

Unnamed: 0,A,B,C
2000-01-01,0.721154,0.402964,0.151049
2000-01-02,0.268027,0.320901,0.339031
2000-01-03,0.812794,0.423699,0.299881
2000-01-04,,,
2000-01-05,,,
...,...,...,...
2002-09-22,0.406767,0.050435,0.932587
2002-09-23,0.901614,0.007647,0.935639
2002-09-24,0.120580,0.244727,0.826220
2002-09-25,0.657925,0.550796,0.566510


In [128]:
tsdf.transform(np.exp)

Unnamed: 0,A,B,C
2000-01-01,2.056805,1.496254,1.163054
2000-01-02,1.307382,1.378369,1.403587
2000-01-03,2.254197,1.527601,1.349698
2000-01-04,,,
2000-01-05,,,
...,...,...,...
2002-09-22,1.501954,1.051729,2.541075
2002-09-23,2.463575,1.007676,2.548842
2002-09-24,1.128151,1.277272,2.284666
2002-09-25,1.930781,1.734633,1.762106


In [129]:
tsdf.transform(lambda x: x+8)

Unnamed: 0,A,B,C
2000-01-01,8.721154,8.402964,8.151049
2000-01-02,8.268027,8.320901,8.339031
2000-01-03,8.812794,8.423699,8.299881
2000-01-04,,,
2000-01-05,,,
...,...,...,...
2002-09-22,8.406767,8.050435,8.932587
2002-09-23,8.901614,8.007647,8.935639
2002-09-24,8.120580,8.244727,8.826220
2002-09-25,8.657925,8.550796,8.566510


In [132]:
tsdf.transform({
    'A': np.abs,
    'B': ['sqrt', lambda x: float((x+1)/(x-1))],
})

Unnamed: 0_level_0,A,B,B
Unnamed: 0_level_1,absolute,sqrt,<lambda>
2000-01-01,0.721154,0.634795,-2.349884
2000-01-02,0.268027,0.566481,-1.945080
2000-01-03,0.812794,0.650921,-2.470406
2000-01-04,,,
2000-01-05,,,
...,...,...,...
2002-09-22,0.406767,0.224577,-1.106228
2002-09-23,0.901614,0.087448,-1.015412
2002-09-24,0.120580,0.494699,-1.648049
2002-09-25,0.657925,0.742156,-3.452317


In [133]:
# in transform the output must be the same shape

In [136]:
# apply to individual elements or cells
# with map and applymap
# map for Series and DataFrame
# applymap for DataFrame

In [137]:
t.applymap(lambda x: str(x))

  t.applymap(lambda x: str(x))


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True


In [138]:
t.map(lambda x: str(x))

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True


In [139]:
t.map(lambda x: str(x)).map(lambda x: x.strip())

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True


In [142]:
t.sex.map({
    'max': 0,
    'female': 1
})
# replaceing values

0      NaN
1      1.0
2      1.0
3      1.0
4      NaN
      ... 
886    NaN
887    1.0
888    1.0
889    NaN
890    NaN
Name: sex, Length: 891, dtype: float64

In [143]:
t.sex.map({
    'max': 0
})

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
886   NaN
887   NaN
888   NaN
889   NaN
890   NaN
Name: sex, Length: 891, dtype: float64

In [144]:
t.sex.map({
    'female': 1
})

0      NaN
1      1.0
2      1.0
3      1.0
4      NaN
      ... 
886    NaN
887    1.0
888    1.0
889    NaN
890    NaN
Name: sex, Length: 891, dtype: float64

In [146]:
t.deck

0      NaN
1        C
2      NaN
3        C
4      NaN
      ... 
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: deck, Length: 891, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [148]:
print(t.deck.__doc__)


    One-dimensional ndarray with axis labels (including time series).

    Labels need not be unique but must be a hashable type. The object
    supports both integer- and label-based indexing and provides a host of
    methods for performing operations involving the index. Statistical
    methods from ndarray have been overridden to automatically exclude
    missing data (currently represented as NaN).

    Operations between Series (+, -, /, \*, \*\*) align values based on their
    associated index values-- they need not be the same length. The result
    index will be the sorted union of the two indexes.

    Parameters
    ----------
    data : array-like, Iterable, dict, or scalar value
        Contains data stored in Series. If data is a dict, argument order is
        maintained.
    index : array-like or Index (1d)
        Values must be hashable and have the same length as `data`.
        Non-unique index values are allowed. Will default to
        RangeIndex (0, 1, 2, ..., 