In [1]:
import pandas as pd
import seaborn as sn

In [2]:
# Read the height dataset from the CSV next to the notebook and display it.
df = pd.read_csv(filepath_or_buffer="height.csv")
df

Unnamed: 0,Gender,Height
0,Male,73.847017
1,Male,68.781904
2,Male,74.110105
3,Male,71.730978
4,Male,69.881796
...,...,...
9995,Female,66.172652
9996,Female,67.067155
9997,Female,63.867992
9998,Female,69.034243


In [3]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Gender,Height
0,Male,73.847017
1,Male,68.781904
2,Male,74.110105
3,Male,71.730978
4,Male,69.881796


## (1) Outlier detection and removal using Standard Deviation:

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the Height column.
df["Height"].describe()

count    10000.000000
mean        66.367560
std          3.847528
min         54.263133
25%         63.505620
50%         66.318070
75%         69.174262
max         78.998742
Name: Height, dtype: float64

In [5]:
# (rows, columns) of the loaded frame, before any outlier filtering.
df.shape

(10000, 2)

In [6]:
# sn.histplot(df.Height, kde = True)

In [7]:
# Arithmetic mean of the Height column; reused below to build the cutoffs.
mean = df["Height"].mean()
mean

66.3675597548656

In [8]:
# Standard deviation of the Height column. pandas uses the sample
# estimator (ddof=1) by default.
std_dev = df["Height"].std()
std_dev

3.847528120795573

In [9]:
# Lower cutoff: three standard deviations below the mean.
lower_limit = mean - 3 * std_dev
lower_limit

54.824975392478876

In [10]:
# Upper cutoff: three standard deviations above the mean.
upper_limit = mean + 3 * std_dev
upper_limit

77.91014411725232

In [11]:
# Rows more than three standard deviations below the mean. The cutoff is
# computed from `mean` and `std_dev` instead of being typed in as a rounded
# literal, so it stays exact and follows the data if the CSV changes.
df[df.Height < mean - 3 * std_dev]

Unnamed: 0,Gender,Height
6624,Female,54.616858
9285,Female,54.263133


In [12]:
# Rows more than three standard deviations above the mean. The cutoff is
# computed from `mean` and `std_dev` instead of being typed in as a rounded
# literal, so it stays exact and follows the data if the CSV changes.
df[df.Height > mean + 3 * std_dev]

Unnamed: 0,Gender,Height
994,Male,78.095867
1317,Male,78.462053
2014,Male,78.998742
3285,Male,78.52821
3757,Male,78.621374


In [13]:
# Every row outside mean ± 3 * std_dev, with both cutoffs computed from the
# data instead of typed in as rounded literals.
df[(df.Height < mean - 3 * std_dev) | (df.Height > mean + 3 * std_dev)]  # These 7 rows are outliers in this dataset

Unnamed: 0,Gender,Height
994,Male,78.095867
1317,Male,78.462053
2014,Male,78.998742
3285,Male,78.52821
3757,Male,78.621374
6624,Female,54.616858
9285,Female,54.263133


In [18]:
# Remove the outliers from the dataset using the general 3 * std_dev rule.
# The cutoffs are computed from `mean` and `std_dev` rather than hard-coded,
# and the bounds are inclusive so that a value sitting exactly on a cutoff
# (which is not an outlier under the strict </> test) is kept.
lower_limit = mean - 3 * std_dev
upper_limit = mean + 3 * std_dev
df1_no_outliers = df[df.Height.between(lower_limit, upper_limit)]
df1_no_outliers

Unnamed: 0,Gender,Height
0,Male,73.847017
1,Male,68.781904
2,Male,74.110105
3,Male,71.730978
4,Male,69.881796
...,...,...
9995,Female,66.172652
9996,Female,67.067155
9997,Female,63.867992
9998,Female,69.034243


In [19]:
df1_no_outliers.shape

(9993, 2)

## (2) Outlier detection and removal using Z Score:

In [20]:
# Z score is a way to achieve same thing that we did above in part (1)

# Z score indicates how many standard deviations away from the mean a data point is.

# For example, in our case the mean is 66.37 and the standard deviation is 3.85.

# If the value of a data point is 77.91 then its Z score is 3 because it is 3 standard deviations above the mean (77.91 ≈ 66.37 + 3 * 3.8475)

# Calculate the Z Score

# Let's add a new column in our dataframe for this Z score


In [21]:
# Z score = (value - mean) / standard deviation, stored as a new column on df.
height_mean = df["Height"].mean()
height_std = df["Height"].std()
df["zscore"] = (df["Height"] - height_mean) / height_std
df

Unnamed: 0,Gender,Height,zscore
0,Male,73.847017,1.943964
1,Male,68.781904,0.627505
2,Male,74.110105,2.012343
3,Male,71.730978,1.393991
4,Male,69.881796,0.913375
...,...,...,...
9995,Female,66.172652,-0.050658
9996,Female,67.067155,0.181830
9997,Female,63.867992,-0.649655
9998,Female,69.034243,0.693090


In [22]:
# First five rows, now including the zscore column.
df.head()

Unnamed: 0,Gender,Height,zscore
0,Male,73.847017,1.943964
1,Male,68.781904,0.627505
2,Male,74.110105,2.012343
3,Male,71.730978,1.393991
4,Male,69.881796,0.913375


In [28]:
# Rows whose Z score is above +3.
above_three_sigma = df.zscore > 3
df[above_three_sigma]

Unnamed: 0,Gender,Height,zscore
994,Male,78.095867,3.048271
1317,Male,78.462053,3.143445
2014,Male,78.998742,3.282934
3285,Male,78.52821,3.16064
3757,Male,78.621374,3.184854


In [29]:
# Rows whose Z score is below -3.
below_three_sigma = df.zscore < -3
df[below_three_sigma]

Unnamed: 0,Gender,Height,zscore
6624,Female,54.616858,-3.054091
9285,Female,54.263133,-3.146027


In [40]:
# Outliers on either side: the magnitude of the Z score exceeds 3.
df_new_outlier = df[df["zscore"].abs() > 3]
df_new_outlier

Unnamed: 0,Gender,Height,zscore
994,Male,78.095867,3.048271
1317,Male,78.462053,3.143445
2014,Male,78.998742,3.282934
3285,Male,78.52821,3.16064
3757,Male,78.621374,3.184854
6624,Female,54.616858,-3.054091
9285,Female,54.263133,-3.146027


In [42]:
# Keep every row that is not an outlier. Outliers are rows with |zscore| > 3,
# so the retained set is |zscore| <= 3; using strict < 3 here would drop a row
# whose Z score is exactly ±3 even though it is not classed as an outlier.
df_without_outliers = df[df["zscore"].abs() <= 3]
df_without_outliers

Unnamed: 0,Gender,Height,zscore
0,Male,73.847017,1.943964
1,Male,68.781904,0.627505
2,Male,74.110105,2.012343
3,Male,71.730978,1.393991
4,Male,69.881796,0.913375
...,...,...,...
9995,Female,66.172652,-0.050658
9996,Female,67.067155,0.181830
9997,Female,63.867992,-0.649655
9998,Female,69.034243,0.693090
