# Pandas: apply 
#### Topics:
- apply + lambda


## apply

#### apply
- Used with: **Series or DataFrame**
- Apply a function to rows or columns (if DataFrame) or elements (if Series)
- **More powerful than map**


In [42]:
import pandas as pd

import warnings
# Suppress every warning for the rest of the session (presumably to keep the
# printed cell output uncluttered -- note this also hides deprecation notices).
warnings.filterwarnings(action='ignore')

# CSV file holding the sample data used throughout this notebook.
data_file = 'data.csv'

In [32]:
# Load the sample data through the data_file constant instead of repeating
# the literal path, so the file location is defined in exactly one place.
df = pd.read_csv(data_file)
print(df)

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80


In [43]:
# I want to create a column 'age_group' that has values Young, Senior
# depending on Age and experience.
# if age < 30 and experience < 4 then Young
# else Senior

# 1) Row-wise custom function (CF)
def categ_person(row):
    """Label one record as 'Young' or 'Senior'.

    row: object indexable by 'Age' and 'Exp' (a DataFrame row when this is
         passed to df.apply(..., axis=1)).
    Returns 'Young' when Age < 30 and Exp < 4, otherwise 'Senior'.
    """
    # 'Exp' is only looked up when the age test already passed.
    is_young = row['Age'] < 30 and row['Exp'] < 4
    return 'Young' if is_young else 'Senior'

# axis=1 hands categ_person one row at a time; the returned labels become a new column.
age_labels = df.apply(categ_person, axis=1)
df['age_group'] = age_labels
preview_cols = ["Name", "Age", "Exp", "age_group"]
print(df[preview_cols])

       Name  Age  Exp age_group
0     Alice   25    2     Young
1       Bob   30    5    Senior
2   Charlie   35    7    Senior
3     David   40   10    Senior
4       Eva   22    1     Young
5     Frank   28    3     Young
6     Grace   32    6    Senior
7     Helen   26    2     Young
8     Helen   26    2     Young
9     Helen   26    2     Young
10    Jerry   23    6    Senior


In [34]:
## 2) Column-wise custom function:
# I want to create another column salary_clean with "USD " prefixed to each Salary value

def add_usd(s):
    """Prefix a salary value with "USD ".

    s: a single (scalar) salary value.
    Returns the string "USD <s>"; missing values (None / NaN) are returned
    unchanged so they stay missing in the resulting column.
    """
    if not pd.notnull(s):
        return s  # Keep NaN as is
    return f"USD {s}"
    
# Series.apply calls add_usd on each Salary value in turn.
salary_labels = df['Salary'].apply(add_usd)
df['salary_clean'] = salary_labels
print(df[["Name", "Age", "Salary", "salary_clean"]])

       Name  Age   Salary salary_clean
0     Alice   25  70000.0  USD 70000.0
1       Bob   30  80000.0  USD 80000.0
2   Charlie   35      NaN          NaN
3     David   40  90000.0  USD 90000.0
4       Eva   22  48000.0  USD 48000.0
5     Frank   28  72000.0  USD 72000.0
6     Grace   32  85000.0  USD 85000.0
7     Helen   26  62000.0  USD 62000.0
8     Helen   26  62000.0  USD 62000.0
9     Helen   26  62000.0  USD 62000.0
10    Jerry   23  78000.0  USD 78000.0


### apply + lambda

In [41]:
# Now let's use lambda
# Re-read the file so this section starts from the raw columns only.
# Uses the data_file constant instead of repeating the literal path.
df = pd.read_csv(data_file)
print(df, "\n++++\n")

# step1: create a new column, tax1, which is 10% of the salary
# Using apply() on a Series (column-wise operation)
def add_tax(salary):
    """Return the tax amount for a salary, computed as 10% of it.

    salary: a numeric value; NaN propagates through the multiplication.
    """
    tax_rate = 0.1
    return salary * tax_rate

# Compute the tax for every Salary value, then store it as column tax1.
tax_values = df['Salary'].apply(add_tax)
df['tax1'] = tax_values

print(df[["Name", "Age", "Salary", "tax1"]])

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80 
++++

       Name  Age   Salary    tax1
0     Alice   25  70000.0  7000.0
1       Bob   30  80000.0  8000.0
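2   Charlie   35      NaN     NaN
3     David   40  90000.0  9000.0
4       Eva   22  48000.0  4800.0
5     Frank   28  72000.0  7200.0
6     Grace   32  85000.0  8500.0
7     Helen   26  62000.0  6200.0
8     Helen   26  62000.0  6200.0
9     Helen   26  62000.0  6200.0
10    Jerry   23  78000.0  7800.0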

In [36]:
# step2: Redo above using lambda
# Same 10% computation as add_tax, written inline as an anonymous function.
df['tax2'] = df['Salary'].apply(lambda salary: salary * 0.1)

tax_cols = ["Name", "Age", "Salary", "tax1", "tax2"]
print(df[tax_cols])

       Name  Age   Salary    tax1    tax2
0     Alice   25  70000.0  7000.0  7000.0
1       Bob   30  80000.0  8000.0  8000.0
2   Charlie   35      NaN     NaN     NaN
3     David   40  90000.0  9000.0  9000.0
4       Eva   22  48000.0  4800.0  4800.0
5     Frank   28  72000.0  7200.0  7200.0
6     Grace   32  85000.0  8500.0  8500.0
7     Helen   26  62000.0  6200.0  6200.0
8     Helen   26  62000.0  6200.0  6200.0
9     Helen   26  62000.0  6200.0  6200.0
10    Jerry   23  78000.0  7800.0  7800.0


In [37]:
# Using apply+lambda across rows (axis=1)
# step1: without lambda function
def format_info(row):
    """Build the sentence "<Name> is <Age> years old" for one record.

    row: object indexable by 'Name' and 'Age' (a DataFrame row when this is
         passed to df.apply(..., axis=1)).
    """
    name, age = row['Name'], row['Age']
    return f"{name} is {age} years old"

# axis=1 passes each row to format_info; the sentences are stored in info1.
info_sentences = df.apply(format_info, axis=1)
df['info1'] = info_sentences
print(df[["Name", "Age", "info1"]])

       Name  Age                    info1
0     Alice   25    Alice is 25 years old
1       Bob   30      Bob is 30 years old
2   Charlie   35  Charlie is 35 years old
3     David   40    David is 40 years old
4       Eva   22      Eva is 22 years old
5     Frank   28    Frank is 28 years old
6     Grace   32    Grace is 32 years old
7     Helen   26    Helen is 26 years old
8     Helen   26    Helen is 26 years old
9     Helen   26    Helen is 26 years old
10    Jerry   23    Jerry is 23 years old


In [38]:
# step2: with lambda function
# df['info'] = df.apply(lambda row: "XXX", axis=1 ) #step1
# Row-wise lambda producing the same sentence as format_info.
df['info2'] = df.apply(
    lambda rec: f"{rec['Name']} is {rec['Age']} years old",
    axis=1,
)  # step2
print(df[["Name", "Age", "info1", "info2"]])

       Name  Age                    info1                    info2
0     Alice   25    Alice is 25 years old    Alice is 25 years old
1       Bob   30      Bob is 30 years old      Bob is 30 years old
2   Charlie   35  Charlie is 35 years old  Charlie is 35 years old
3     David   40    David is 40 years old    David is 40 years old
4       Eva   22      Eva is 22 years old      Eva is 22 years old
5     Frank   28    Frank is 28 years old    Frank is 28 years old
6     Grace   32    Grace is 32 years old    Grace is 32 years old
7     Helen   26    Helen is 26 years old    Helen is 26 years old
8     Helen   26    Helen is 26 years old    Helen is 26 years old
9     Helen   26    Helen is 26 years old    Helen is 26 years old
10    Jerry   23    Jerry is 23 years old    Jerry is 23 years old


In [44]:
# if age < 30 and experience < 4 then Young
# else Senior
# Store the labels under 'age_group' -- the exact column name the print below
# selects. Writing to 'Age_group' instead leaves that selection without a
# matching column once df has been reloaded, which raises KeyError.
df['age_group'] = df.apply(
    lambda row: 'Young' if row['Age'] < 30 and row['Exp'] < 4 else 'Senior',
    axis=1,
)
print(df[["Name", "Age", "Exp", "age_group"]])

       Name  Age  Exp age_group
0     Alice   25    2     Young
1       Bob   30    5    Senior
2   Charlie   35    7    Senior
3     David   40   10    Senior
4       Eva   22    1     Young
5     Frank   28    3     Young
6     Grace   32    6    Senior
7     Helen   26    2     Young
8     Helen   26    2     Young
9     Helen   26    2     Young
10    Jerry   23    6    Senior


# STOP

In [40]:
# # SKIP
# # (OPTIONAL) I want to see all the rows/columns
# # step1
# df = pd.read_csv('breast_cancer_modified.csv') # i cannot see all rows

# # step2
# pd.set_option('display.max_rows', None) # to see all the rows
# # pd.set_option('display.max_columns', None) # to see all the columns

# df = pd.read_csv('breast_cancer_modified.csv')
# print(df)

# # step3 when everything is done then reset back.
# pd.reset_option('display.max_rows') 

# map (IGNORE), applymap (IGNORE)

#### (IGNORE) map
- Used with: **Series only (1D)**
- Purpose: Apply a function element-wise

#### (IGNORE) applymap
- Used with: **DataFrame only (2D)**
- Purpose: Apply a function to every single element
- Like: Nested .apply() for all cells