In [1]:
import numpy as np
import pandas as pd

## 1 - Create a seris from a numeric column that has the value of 'high' if it is equal to or above the mean and 'low' if it is below the mearn using .apply 

In [2]:
numeric_column = pd.Series(np.random.randn(5))

In [3]:
numeric_column

0    0.215490
1    0.532703
2    0.881968
3    0.414785
4    0.530502
5    0.728530
6    0.671015
7    0.643173
8    0.928301
9    0.893165
dtype: float64

In [6]:
numeric_column.agg(['mean','count'])

mean      0.643963
count    10.000000
dtype: float64

In [9]:
def high_low(val):
    if val > numeric_column.mean():
        return 'High'
    else:
        return 'Low'

numeric_column.apply(high_low)


0     Low
1     Low
2    High
3     Low
4     Low
5    High
6    High
7     Low
8    High
9    High
dtype: object

# 2 - Same exercise but using .case when 

In [13]:
numeric_column.case_when(caselist=[(numeric_column>numeric_column.mean(),'High'),
                                   (numeric_column<numeric_column.mean(),'Low'),
                                   (numeric_column==numeric_column.mean(),'Mean')])

0     Low
1     Low
2    High
3     Low
4     Low
5    High
6    High
7     Low
8    High
9    High
dtype: object

# 3 - Time the previous exercises to see which is faster

In [16]:
%%timeit
def high_low(val):
    if val > numeric_column.mean():
        return 'High'
    else:
        return 'Low'

numeric_column.apply(high_low)

140 µs ± 37.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [18]:
%%timeit
numeric_column.case_when(caselist=[(numeric_column>numeric_column.mean(),'High'),
                                   (numeric_column<numeric_column.mean(),'Low'),
                                   (numeric_column==numeric_column.mean(),'Mean')])

900 µs ± 241 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# 4- Replace the values of  a numeric series with the median value

In [19]:
# Create a list of numbers from 0 to 99 and include NaN
values = list(range(100)) + [np.nan]

# Define probabilities: 100 values + 1 NaN
probabilities = [0.008] * 100 + [0.20]  # 0.008 * 100 + 0.20 = 1.0

# Create a Series with random numbers (0-99) and NaNs (~20% probability)
num_series = pd.Series(np.random.choice(values, size=10, p=probabilities))

print(num_series)

0    50.0
1    23.0
2    79.0
3    68.0
4    90.0
5    40.0
6     9.0
7    25.0
8     NaN
9    37.0
dtype: float64


In [20]:
num_series.median()

40.0

In [22]:
num_series.fillna((num_series.median()))

0    50.0
1    23.0
2    79.0
3    68.0
4    90.0
5    40.0
6     9.0
7    25.0
8    40.0
9    37.0
dtype: float64

# 5 - Clip the values of a numeric series between to 10th and 90th percentiles

In [29]:
clipped_values = numeric_column.clip(lower=numeric_column.quantile(0.1), upper=numeric_column.quantile(0.9))

In [30]:
clipped_values

0    0.394856
1    0.532703
2    0.881968
3    0.414785
4    0.530502
5    0.728530
6    0.671015
7    0.643173
8    0.896678
9    0.893165
dtype: float64

In [31]:
print(f"Lower bound (10th percentile): {numeric_column.quantile(0.1)}")
print(f"Upper bound (90th percentile): {numeric_column.quantile(0.9)}")

Lower bound (10th percentile): 0.3948559129916856
Upper bound (90th percentile): 0.8966784120828478
