In [2]:
import numpy as np
import pandas as pd

## 1 - Create a series from a numeric column that has the value 'high' if it is equal to or above the mean and 'low' if it is below the mean, using .apply

In [3]:
# Draw 100 integers in [0, 100) from a seeded generator so that a fresh
# "Restart & Run All" reproduces exactly the same sample.
numeric_column = pd.Series(np.random.default_rng(seed=42).integers(0, 100, size=100))

In [4]:
numeric_column

0     94
1      7
2      5
3     45
4     96
      ..
95    69
96    22
97    50
98    71
99    93
Length: 100, dtype: int64

In [5]:
numeric_column.agg(['mean','count'])

mean      45.87
count    100.00
dtype: float64

In [6]:
def high_low(val, threshold=None):
    """Label a value 'High' or 'Low' relative to a threshold.

    Parameters
    ----------
    val : number
        Value to classify.
    threshold : number, optional
        Cut-off to compare against. When omitted, the mean of the global
        ``numeric_column`` is used. Passing it explicitly, e.g.
        ``numeric_column.apply(high_low, args=(numeric_column.mean(),))``,
        avoids recomputing the mean for every element.

    Returns
    -------
    str
        'High' when ``val`` is equal to or above the threshold, otherwise 'Low'.
    """
    if threshold is None:
        threshold = numeric_column.mean()
    # The exercise asks for "equal to or above the mean", hence >= and not >.
    if val >= threshold:
        return 'High'
    else:
        return 'Low'

numeric_column.apply(high_low)


0     High
1      Low
2      Low
3      Low
4     High
      ... 
95    High
96     Low
97    High
98    High
99    High
Length: 100, dtype: object

# 2 - Same exercise, but using .case_when

In [7]:
# Compute the mean once. The exercise defines only two labels, and a value
# equal to the mean counts as 'High' (the same rule as exercise 1).
mean_value = numeric_column.mean()
numeric_column.case_when(caselist=[(numeric_column >= mean_value, 'High'),
                                   (numeric_column < mean_value, 'Low')])

0     High
1      Low
2      Low
3      Low
4     High
      ... 
95    High
96     Low
97    High
98    High
99    High
Length: 100, dtype: object

# 3 - Time the previous exercises to see which is faster

In [8]:
%%timeit
def high_low(val, threshold=None):
    """Return 'High' if ``val`` is at or above the threshold, else 'Low'.

    ``threshold`` is optional; when it is omitted the mean of the global
    ``numeric_column`` is computed on each call.
    """
    if threshold is None:
        threshold = numeric_column.mean()
    # "Equal to or above the mean" is 'High', so the comparison is >=.
    if val >= threshold:
        return 'High'
    else:
        return 'Low'

numeric_column.apply(high_low)

5.26 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
%%timeit
# Two labels only, with values equal to the mean counted as 'High', so the
# timed code produces the same labels the exercise specifies for .apply.
mean_value = numeric_column.mean()
numeric_column.case_when(caselist=[(numeric_column >= mean_value, 'High'),
                                   (numeric_column < mean_value, 'Low')])

2.27 ms ± 884 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# 4 - Replace the missing values (NaN) of a numeric series with the median value

In [10]:
# Candidate outcomes: the integers 0-99 plus NaN to stand in for missing data.
values = list(range(100)) + [np.nan]

# NaN is drawn with probability 0.20; the remaining 0.80 is shared equally by
# the 100 integers (0.008 each), so the weights sum to 1.0.
probabilities = [0.008] * 100 + [0.20]

# Sample from a seeded generator so the series (and hence its median) is
# identical on every fresh run of the notebook.
num_series = pd.Series(np.random.default_rng(seed=43).choice(values, size=100, p=probabilities))

print(num_series)

0     25.0
1     41.0
2     14.0
3      NaN
4     71.0
      ... 
95     NaN
96    91.0
97    78.0
98     NaN
99     3.0
Length: 100, dtype: float64


In [11]:
num_series.median()

49.5

In [12]:
# Fill each missing entry with the median of the series.
num_series.fillna(value=num_series.median())

0     25.0
1     41.0
2     14.0
3     49.5
4     71.0
      ... 
95    49.5
96    91.0
97    78.0
98    49.5
99     3.0
Length: 100, dtype: float64

# 5 - Clip the values of a numeric series between the 10th and 90th percentiles

In [13]:
# Values below the 10th percentile are raised to it and values above the
# 90th percentile are lowered to it; everything in between is unchanged.
clipped_values = numeric_column.clip(
    lower=numeric_column.quantile(0.1),
    upper=numeric_column.quantile(0.9),
)

In [14]:
clipped_values

0     82.4
1     10.8
2     10.8
3     45.0
4     82.4
      ... 
95    69.0
96    22.0
97    50.0
98    71.0
99    82.4
Length: 100, dtype: float64

In [15]:
print(f"Lower bound (10th percentile): {numeric_column.quantile(0.1)}")
print(f"Upper bound (90th percentile): {numeric_column.quantile(0.9)}")

Lower bound (10th percentile): 10.8
Upper bound (90th percentile): 82.40000000000003


# 6 & 7 - Using a categorical column, replace any value that is not among the top 10 most frequent values (top 5 for exercise 6) with 'Other'

In [16]:
# Rank the distinct values by frequency once, then slice both cut-offs
# from the same ranking.
value_frequencies = numeric_column.value_counts()
top_10 = value_frequencies.index[:10]
top_5 = value_frequencies.index[:5]

In [17]:
# Keep the values that appear in top_10; every other entry becomes the string 'Other'.
numeric_column.where(numeric_column.isin(top_10),other='Other')

0        94
1     Other
2     Other
3     Other
4     Other
      ...  
95    Other
96    Other
97    Other
98    Other
99    Other
Length: 100, dtype: object

In [18]:
# Keep the values that appear in top_5; every other entry becomes the string 'Other'.
numeric_column.where(numeric_column.isin(top_5),other='Other')

0        94
1     Other
2     Other
3     Other
4     Other
      ...  
95    Other
96    Other
97    Other
98    Other
99    Other
Length: 100, dtype: object

# 8 - Make a function that takes a categorical series and a number (n) and returns a new series in which any value not in the top n most frequent values is replaced with 'Other'

In [19]:
def change_not_topn(column, n):
    """Replace every value outside the ``n`` most frequent ones with 'Other'.

    Parameters
    ----------
    column : pandas.Series
        Series whose values are ranked by frequency.
    n : int
        How many of the most frequent values to keep unchanged.

    Returns
    -------
    pandas.Series
        A new series; ``column`` itself is left untouched. Missing values are
        not counted by ``value_counts`` and therefore also become 'Other'.
    """
    top_n = column.value_counts().index[:n]
    keep = column.isin(top_n)
    # A categorical series only accepts labels that are declared categories,
    # so register 'Other' first; otherwise ``where`` cannot insert it.
    if isinstance(column.dtype, pd.CategoricalDtype) and 'Other' not in column.cat.categories:
        column = column.cat.add_categories('Other')
    return column.where(keep, other='Other')


# 9 - Using a numeric column, bin it into 10 groups of equal width

In [20]:
# Split the observed range into 10 intervals of equal width.
pd.cut(numeric_column, bins=10)

0      (89.2, 99.0]
1     (0.902, 10.8]
2     (0.902, 10.8]
3      (40.2, 50.0]
4      (89.2, 99.0]
          ...      
95     (59.8, 69.6]
96     (20.6, 30.4]
97     (40.2, 50.0]
98     (69.6, 79.4]
99     (89.2, 99.0]
Length: 100, dtype: category
Categories (10, interval[float64, right]): [(0.902, 10.8] < (10.8, 20.6] < (20.6, 30.4] < (30.4, 40.2] ... (59.8, 69.6] < (69.6, 79.4] < (79.4, 89.2] < (89.2, 99.0]]

# 10 - Using a numeric column, bin it into 10 groups that each contain roughly the same number of values

In [21]:
# qcut places the bin edges at quantiles, so with q=10 each bin is a decile of the data.
binned_column = pd.qcut(numeric_column, q=10)

In [22]:
binned_column

0      (82.4, 99.0]
1     (0.999, 10.8]
2     (0.999, 10.8]
3      (44.5, 53.4]
4      (82.4, 99.0]
          ...      
95     (64.3, 74.4]
96     (18.8, 25.4]
97     (44.5, 53.4]
98     (64.3, 74.4]
99     (82.4, 99.0]
Length: 100, dtype: category
Categories (10, interval[float64, right]): [(0.999, 10.8] < (10.8, 18.8] < (18.8, 25.4] < (25.4, 33.0] ... (53.4, 64.3] < (64.3, 74.4] < (74.4, 82.4] < (82.4, 99.0]]