In [6]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG so the synthetic sample below is reproducible.
np.random.seed(42)

# Baseline sample: 50 draws from a normal distribution (mean 50, std dev 5).
data = np.random.normal(50, 5, 50)

# Four hand-picked extreme values, far above the baseline sample.
outliers = np.array([100, 110, 120, 130])

# Baseline first, injected extremes last (the ints are upcast to float64).
data_with_outliers = np.hstack((data, outliers))

# Single-column frame; "values" must be accessed as df["values"], because
# the attribute form df.values is the DataFrame's underlying array.
df = pd.DataFrame(dict(values=data_with_outliers))

df.describe()


Unnamed: 0,values
count,54.0
mean,53.770954
std,18.307407
min,40.201649
25%,46.548451
50%,49.089898
75%,52.655493
max,130.0


In [7]:
# Classical z-score on the raw scale. Location and spread are the plain mean
# and the sample standard deviation (pandas' default, ddof=1) of the whole
# column.
mean_val, std_val = df["values"].mean(), df["values"].std()

df["zscore"] = df["values"].sub(mean_val).div(std_val)

# Two-sided 3-sigma rule: flag rows with |z| strictly greater than 3.
# NOTE(review): mean_val and std_val are computed from the same column that
# contains the extreme values, so they are pulled towards those values. With
# this data the value 100 scores only about 2.53 and is not flagged. Confirm
# whether that is intended or whether a robust (median/MAD) score is wanted.
df["is_outlier"] = df["zscore"].abs().gt(3)

df.head()


Unnamed: 0,values,zscore,is_outlier
0,52.483571,-0.07032,False
1,49.308678,-0.243742,False
2,53.238443,-0.029087,False
3,57.615149,0.20998,False
4,48.829233,-0.26993,False


In [8]:
# Natural-log transform of the raw values, which compresses the right tail.
# Every value in this dataset is strictly positive, so the result is finite;
# np.log would give -inf or NaN for zero or negative inputs.
df["log_values"] = np.log(df["values"].to_numpy())

df.loc[:, ["values", "log_values"]].head()


Unnamed: 0,values,log_values
0,52.483571,3.9605
1,49.308678,3.8981
2,53.238443,3.974781
3,57.615149,4.053786
4,48.829233,3.888329


In [9]:
# Same z-score recipe as on the raw scale, applied to the log-transformed
# column: centre on the mean, scale by the sample standard deviation (ddof=1).
log_mean, log_std = df["log_values"].mean(), df["log_values"].std()

df["zscore_log"] = df["log_values"].sub(log_mean).div(log_std)

# Two-sided 3-sigma rule on the log scale (strict inequality, NaN -> False).
df["is_outlier_log"] = df["zscore_log"].abs().gt(3)

# Show raw value, log value, log-scale score and flag for the first ten rows.
df.loc[:, ["values", "log_values", "zscore_log", "is_outlier_log"]].head(10)


Unnamed: 0,values,log_values,zscore_log,is_outlier_log
0,52.483571,3.9605,0.050479,False
1,49.308678,3.8981,-0.203548,False
2,53.238443,3.974781,0.108615,False
3,57.615149,4.053786,0.430239,False
4,48.829233,3.888329,-0.243325,False
5,48.829315,3.888331,-0.243319,False
6,57.896064,4.058649,0.45004,False
7,53.837174,3.985964,0.154142,False
8,47.652628,3.863938,-0.342621,False
9,52.7128,3.964858,0.068221,False


In [10]:
# Keep only the rows NOT flagged by the raw-scale rule, as an independent copy
# with a fresh 0..n-1 index (the old index is discarded).
# Note: the filter uses `is_outlier`, not `is_outlier_log`.
df_no_outliers = (
    df.loc[~df["is_outlier"]]
    .copy()
    .reset_index(drop=True)
)

# describe() summarises the numeric columns only; the boolean flag columns
# are left out of the table.
df_no_outliers.describe()


Unnamed: 0,values,zscore,log_values,zscore_log
count,51.0,51.0,51.0,51.0
mean,49.875128,-0.212801,3.898861,-0.200451
std,8.521304,0.465457,0.138447,0.563612
min,40.201649,-0.741192,3.693908,-1.034804
25%,45.930329,-0.428276,3.827073,-0.492695
50%,48.829315,-0.269926,3.888331,-0.243319
75%,51.798291,-0.107752,3.947356,-0.00303
max,100.0,2.525155,4.60517,2.674898
