# Coding Practice Session 6
## Applying Functions to Series and DataFrames

In [176]:
import re

import numpy as np
import pandas as pd

### Applying Functions to Series

In [177]:
# Five random integers drawn from 1-9 (the upper bound 10 is exclusive).
# No seed is set, so re-running this cell gives different values than the output shown.
s = pd.Series(np.random.randint(1, 10, size=5))
s

0    9
1    8
2    4
3    4
4    2
dtype: int64

In [178]:
# using built-in functions
s.sum()

np.int64(27)

In [179]:
s.mean()

np.float64(5.4)

In [180]:
s.max()

9

In [181]:
s.min()

2

In [182]:
np.log10(s)

0    0.954243
1    0.903090
2    0.602060
3    0.602060
4    0.301030
dtype: float64

In [183]:
np.exp(s)

0    8103.083928
1    2980.957987
2      54.598150
3      54.598150
4       7.389056
dtype: float64

In [184]:
np.tan(s)

0   -0.452316
1   -6.799711
2    1.157821
3    1.157821
4   -2.185040
dtype: float64

In [185]:
np.cos(s)

0   -0.911130
1   -0.145500
2   -0.653644
3   -0.653644
4   -0.416147
dtype: float64

In [186]:
def decrement(x):
    """Return ``x`` reduced by one."""
    one_less = x - 1
    return one_less

In [187]:
# apply custom functions
s.apply(decrement)

0    8
1    7
2    3
3    3
4    1
dtype: int64

In [188]:
s - 1

0    8
1    7
2    3
3    3
4    1
dtype: int64

In [189]:
s.apply(lambda x: x + 1)

0    10
1     9
2     5
3     5
4     3
dtype: int64

In [190]:
def categorize(x):
    """Bucket a number: "Low" below 3, "Medium" below 5, "High" for everything else."""
    # Upper bounds are exclusive and tried in ascending order; the first hit wins.
    for upper_bound, label in ((3, "Low"), (5, "Medium")):
        if x < upper_bound:
            return label
    return "High"

In [191]:
s

0    9
1    8
2    4
3    4
4    2
dtype: int64

In [192]:
s.apply(categorize)

0      High
1      High
2    Medium
3    Medium
4       Low
dtype: object

In [193]:
s = pd.Series(["cat", "dog", None, "rabbit"])
s

0       cat
1       dog
2      None
3    rabbit
dtype: object

In [194]:
s.map("This is a {}".format)

0       This is a cat
1       This is a dog
2      This is a None
3    This is a rabbit
dtype: object

In [195]:
s.apply("This is a {}".format)

0       This is a cat
1       This is a dog
2      This is a None
3    This is a rabbit
dtype: object

In [196]:
s.map("This is a {}".format, na_action="ignore")

0       This is a cat
1       This is a dog
2                None
3    This is a rabbit
dtype: object

In [197]:
fruits = pd.Series(["apple", "cherry", "blueberry", "kiwi", "banana"])
fruits_colors = {
    "apple": "red",
    "banana": "yellow",
    "kiwi": "brown",
    "blueberry": "blue",
}

In [198]:
fruits.map(fruits_colors)

0       red
1       NaN
2      blue
3     brown
4    yellow
dtype: object

### Applying Functions to DataFrames

In [199]:
data = {
    "A": np.random.randint(1, 10, size=5),
    "B": np.random.randint(50, 60, size=5),
    "C": np.random.randint(-10, 0, size=5),
}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C
0,4,55,-8
1,1,50,-6
2,9,56,-4
3,2,50,-5
4,3,51,-4


In [200]:
df.sum()

A     19
B    262
C    -27
dtype: int64

In [201]:
df.abs()

Unnamed: 0,A,B,C
0,4,55,8
1,1,50,6
2,9,56,4
3,2,50,5
4,3,51,4


In [202]:
df.abs().sum()

A     19
B    262
C     27
dtype: int64

In [203]:
df.mean()

A     3.8
B    52.4
C    -5.4
dtype: float64

In [204]:
df.max()

A     9
B    56
C    -4
dtype: int64

In [205]:
df.min()

A     1
B    50
C    -8
dtype: int64

In [206]:
df.describe()

Unnamed: 0,A,B,C
count,5.0,5.0,5.0
mean,3.8,52.4,-5.4
std,3.114482,2.880972,1.67332
min,1.0,50.0,-8.0
25%,2.0,50.0,-6.0
50%,3.0,51.0,-5.0
75%,4.0,55.0,-4.0
max,9.0,56.0,-4.0


In [207]:
# apply custom functions
def column_stats(col):
    """Summarise ``col`` as a Series holding its min, max, mean and median.

    The returned Series is indexed by the statistic names, in that order.
    """
    stat_names = ("min", "max", "mean", "median")
    # Each name is also the name of the reducing method to call on ``col``.
    summary = {stat: getattr(col, stat)() for stat in stat_names}
    return pd.Series(summary)

In [208]:
df.apply(column_stats)

Unnamed: 0,A,B,C
min,1.0,50.0,-8.0
max,9.0,56.0,-4.0
mean,3.8,52.4,-5.4
median,3.0,51.0,-5.0


In [209]:
def row_stats(row):
    """Summarise ``row`` as a Series holding its min, max, mean and median.

    The returned Series is indexed by the statistic names, in that order.
    """
    lowest = row.min()
    highest = row.max()
    average = row.mean()
    middle = row.median()
    return pd.Series(
        {"min": lowest, "max": highest, "mean": average, "median": middle}
    )

In [210]:
df.apply(row_stats, axis=1)

Unnamed: 0,min,max,mean,median
0,-8.0,55.0,17.0,4.0
1,-6.0,50.0,15.0,1.0
2,-4.0,56.0,20.333333,9.0
3,-5.0,50.0,15.666667,2.0
4,-4.0,51.0,16.666667,3.0


In [211]:
df

Unnamed: 0,A,B,C
0,4,55,-8
1,1,50,-6
2,9,56,-4
3,2,50,-5
4,3,51,-4


In [None]:
df.sum(axis=1) # row-wise

0    51
1    45
2    61
3    47
4    50
dtype: int64

In [213]:
def categorize_row(row):
    """Label a row "Low" when its values sum to less than 50, otherwise "High"."""
    return "Low" if row.sum() < 50 else "High"

In [214]:
df.apply(categorize_row, axis=1)

0    High
1     Low
2    High
3     Low
4    High
dtype: object

In [215]:
# map a function to every element
df

Unnamed: 0,A,B,C
0,4,55,-8
1,1,50,-6
2,9,56,-4
3,2,50,-5
4,3,51,-4


In [216]:
# Cube every element. The result is assigned back to `df`, so re-running this cell
# cubes the already-cubed values again.
df = df.map(lambda x: x**3)
df

Unnamed: 0,A,B,C
0,64,166375,-512
1,1,125000,-216
2,729,175616,-64
3,8,125000,-125
4,27,132651,-64


In [217]:
# Shared bin edges and the label given to each of the three resulting intervals.
bins = [-np.inf, 0, 300, np.inf]
labels = ["Low", "Medium", "High"]

# Bin every column with the same edges/labels instead of repeating the call per column.
pd.DataFrame(
    {
        column: pd.cut(df[column], bins=bins, labels=labels)
        for column in ("A", "B", "C")
    }
)

Unnamed: 0,A,B,C
0,Medium,High,Low
1,Medium,High,Low
2,High,High,Low
3,Medium,High,Low
4,Medium,High,Low


### Advanced Function Application

In [218]:
df = pd.DataFrame({"A": np.random.randn(100_000), "B": np.random.randn(100_000)})

df

Unnamed: 0,A,B
0,-1.004556,-0.512651
1,1.120927,-0.746464
2,0.512264,0.718478
3,-0.684788,0.560994
4,-0.365474,0.373866
...,...,...
99995,-1.039370,-0.606378
99996,-0.061082,-0.413805
99997,0.740099,-1.973292
99998,0.256219,0.999598


In [219]:
%%timeit
# vectorized operations
df["C"] = df["A"] + df["B"]

341 μs ± 42.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [220]:
%%timeit
# non-vectorized
# using apply method is like using a for loop which is slow
df["D"] = df.apply(lambda row: row["A"] + row["B"], axis=1)

453 ms ± 46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [250]:
# 453e-3 s and 341e-6 s are the mean timings reported by the two %%timeit cells above
# (row-wise apply and vectorized addition); they are typed in by hand, not measured here.
print(f"Vectorized Operation is {453e-3 / 341e-6:.2f} times faster.")

Vectorized Operation is 1328.45 times faster.


In [222]:
df.head()

Unnamed: 0,A,B,C,D
0,-1.004556,-0.512651,-1.517207,-1.517207
1,1.120927,-0.746464,0.374463,0.374463
2,0.512264,0.718478,1.230741,1.230741
3,-0.684788,0.560994,-0.123794,-0.123794
4,-0.365474,0.373866,0.008393,0.008393


In [223]:
df["E"] = np.sin(df["A"])
df["F"] = np.where(df["B"] > 0.5, "High", "Low")

df

Unnamed: 0,A,B,C,D,E,F
0,-1.004556,-0.512651,-1.517207,-1.517207,-0.843924,Low
1,1.120927,-0.746464,0.374463,0.374463,0.900504,Low
2,0.512264,0.718478,1.230741,1.230741,0.490152,High
3,-0.684788,0.560994,-0.123794,-0.123794,-0.632509,High
4,-0.365474,0.373866,0.008393,0.008393,-0.357392,Low
...,...,...,...,...,...,...
99995,-1.039370,-0.606378,-1.645748,-1.645748,-0.862085,Low
99996,-0.061082,-0.413805,-0.474887,-0.474887,-0.061044,Low
99997,0.740099,-1.973292,-1.233193,-1.233193,0.674361,Low
99998,0.256219,0.999598,1.255817,1.255817,0.253425,High


In [224]:
s = pd.Series([1, 2, 3, 4, 5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [225]:
s.apply(lambda x: x**2 if x % 2 == 0 else x**3)

0      1
1      4
2     27
3     16
4    125
dtype: int64

In [226]:
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50]
})

df

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


In [227]:
df["C"] = df["A"].apply(lambda x: "Even" if x % 2 == 0 else "Odd")
df

Unnamed: 0,A,B,C
0,1,10,Odd
1,2,20,Even
2,3,30,Odd
3,4,40,Even
4,5,50,Odd


In [228]:
df["D"] = df.apply(lambda row: row["A"] * row["B"], axis=1)
df

Unnamed: 0,A,B,C,D
0,1,10,Odd,10
1,2,20,Even,40
2,3,30,Odd,90
3,4,40,Even,160
4,5,50,Odd,250


In [229]:
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': [100, 200, 300, 400, 500],
    'D': ['a', 'b', 'c', 'd', 'e']
})
df

Unnamed: 0,A,B,C,D
0,1,10,100,a
1,2,20,200,b
2,3,30,300,c
3,4,40,400,d
4,5,50,500,e


In [230]:
df[["A", "C"]].apply(lambda x: x**2)

Unnamed: 0,A,C
0,1,10000
1,4,40000
2,9,90000
3,16,160000
4,25,250000


In [231]:
df[["A", "B", "C"]].map(lambda x: f"{x:.2f}")

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
1,2.0,20.0,200.0
2,3.0,30.0,300.0
3,4.0,40.0,400.0
4,5.0,50.0,500.0


In [232]:
# np.power is the canonical ufunc; the np.pow alias only exists in NumPy 2.0+.
np.power(df[["A", "B"]], 2)

Unnamed: 0,A,B
0,1,100
1,4,400
2,9,900
3,16,1600
4,25,2500


In [233]:
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': [100, 200, 300, 400, 500]
})

df

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [234]:
def cal_range(col):
    """Return the spread of ``col``: its largest value minus its smallest."""
    largest = col.max()
    smallest = col.min()
    return largest - smallest

df.agg(["sum", "mean", cal_range])

Unnamed: 0,A,B,C
sum,15.0,150.0,1500.0
mean,3.0,30.0,300.0
cal_range,4.0,40.0,400.0


In [235]:
df.agg(["max", "min", "median", "mean"], axis=1)

Unnamed: 0,max,min,median,mean
0,100.0,1.0,10.0,37.0
1,200.0,2.0,20.0,74.0
2,300.0,3.0,30.0,111.0
3,400.0,4.0,40.0,148.0
4,500.0,5.0,50.0,185.0


### Exercise

In [251]:
# 1
df = pd.DataFrame({
    'Name': ['JOHN smith', 'JAnE Doe', 'Bob Johnson', 'alice bRown'],
    'Age': [28, 35, 42, 31],
    'Income': ['$45,000', '$60,000', '$75,000', '$55,000'],
    'City': ['NEW YORK', 'los angeles', 'Chicago', 'HOUston']
})

df

Unnamed: 0,Name,Age,Income,City
0,JOHN smith,28,"$45,000",NEW YORK
1,JAnE Doe,35,"$60,000",los angeles
2,Bob Johnson,42,"$75,000",Chicago
3,alice bRown,31,"$55,000",HOUston


In [252]:
# Title-case with the vectorized .str accessor; unlike a per-element lambda it
# passes missing values through instead of raising.
df["Name"] = df["Name"].str.title()
df

Unnamed: 0,Name,Age,Income,City
0,John Smith,28,"$45,000",NEW YORK
1,Jane Doe,35,"$60,000",los angeles
2,Bob Johnson,42,"$75,000",Chicago
3,Alice Brown,31,"$55,000",HOUston


In [261]:
df["City"].str.split()

0       [NEW, YORK]
1    [los, angeles]
2         [Chicago]
3         [HOUston]
Name: City, dtype: object

In [264]:
any(df["City"].str.split().str.len() > 1)

True

In [265]:
# Title-case with the vectorized .str accessor; unlike a per-element lambda it
# passes missing values through instead of raising.
df["City"] = df["City"].str.title()

In [266]:
df

Unnamed: 0,Name,Age,Income,City
0,John Smith,28,"$45,000",New York
1,Jane Doe,35,"$60,000",Los Angeles
2,Bob Johnson,42,"$75,000",Chicago
3,Alice Brown,31,"$55,000",Houston


In [267]:
# Strip the currency formatting, then cast to int. regex=False keeps "$" a literal
# character (it is the end-of-string anchor in a regex) whatever the pandas default is.
df["Income"] = (
    df["Income"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)

In [268]:
df

Unnamed: 0,Name,Age,Income,City
0,John Smith,28,45000,New York
1,Jane Doe,35,60000,Los Angeles
2,Bob Johnson,42,75000,Chicago
3,Alice Brown,31,55000,Houston


In [269]:
df["Income Bracket"] = pd.cut(
    df["Income"],
    bins=[0, 50_000, 70_000, np.inf],
    labels=["Low", "Medium", "High"]
)

df

Unnamed: 0,Name,Age,Income,City,Income Bracket
0,John Smith,28,45000,New York,Low
1,Jane Doe,35,60000,Los Angeles,Medium
2,Bob Johnson,42,75000,Chicago,High
3,Alice Brown,31,55000,Houston,Medium


In [270]:
# 2
df = pd.DataFrame({
    'Review': [
        "Great product, highly recommended!",
        "Disappointing quality, wouldn't buy again.",
        "Average product, nothing special.",
        "Excellent service and fast delivery!",
        "Terrible customer support, avoid this company.",
        "It was Awesome!!!"
    ]
})

df

Unnamed: 0,Review
0,"Great product, highly recommended!"
1,"Disappointing quality, wouldn't buy again."
2,"Average product, nothing special."
3,Excellent service and fast delivery!
4,"Terrible customer support, avoid this company."
5,It was Awesome!!!


In [271]:
def review_len(text):
    """Count the whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)


def detect_sentiment(text):
    """Classify ``text`` as "Positive", "Negative" or "Neutral" by keyword lookup.

    Matching is case-insensitive and done on whole words, so words that merely
    contain a keyword ("goods", "badge") do not count as a hit. Positive
    keywords are checked first, so a text containing both kinds is labelled
    "Positive". A text with no keyword at all is "Neutral".
    """
    positive_words = {"good", "excellent", "great", "best", "amazing"}
    negative_words = {"bad", "terrible", "worst", "disappointing"}

    # Lower-case, then keep only runs of letters so punctuation never sticks to a word.
    words = set(re.findall(r"[a-z]+", text.lower()))

    if words & positive_words:
        return "Positive"
    if words & negative_words:
        return "Negative"
    return "Neutral"

In [272]:
df["Review Length"] = df["Review"].apply(review_len)

In [273]:
df["Sentiment"] = df["Review"].apply(detect_sentiment)

In [274]:
df["Sentiment"].value_counts()

Sentiment
Positive    2
Negative    2
Neutral     2
Name: count, dtype: int64

In [275]:
df["Review Length"].median()

4.5

In [276]:
df

Unnamed: 0,Review,Review Length,Sentiment
0,"Great product, highly recommended!",4,Positive
1,"Disappointing quality, wouldn't buy again.",5,Negative
2,"Average product, nothing special.",4,Neutral
3,Excellent service and fast delivery!,5,Positive
4,"Terrible customer support, avoid this company.",6,Negative
5,It was Awesome!!!,3,Neutral
