# Outlier 
## An outlier is a data point in a data set that is distant from all other observations. A data point that lies outside the overall distribution of the data set 

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset=[0, 2, 5, 6, 9, 12, 35,28, 26, 29, 30, 81, 32, 37,16, 14, 3, 12, 15, 17, 22, 15, 52]

# Detecting outlier using z score

In [3]:
outliers = []
def detect_outliers(data):
    threshold = 3
    mean = np.mean(data)
    std = np.std(data)
    
    for i in data:
        z_score= (i-mean)/std
        if np.abs(z_score)>threshold:
            outliers.append(i)
    return outliers

In [6]:
outlier_point = detect_outliers(dataset)

In [8]:
outlier_point

[81, 81]

# Detecting outliers using Interquantile Range
## 75%-25% values in a dataset
# Steps
1. Arrange the data in increasing order
2. Calculate first(q1) and third quartile(q3)
3. Find interquartile range(q3-q1)
4. Find lower bound q1*1.5
5. Find upper bound q3*1.5

Result:
Anything that lies outside of lower and upper bound is an outlier

In [10]:
# Detecting using interquantile range using python
sorted(dataset)

[0,
 2,
 3,
 5,
 6,
 9,
 12,
 12,
 14,
 15,
 15,
 16,
 17,
 22,
 26,
 28,
 29,
 30,
 32,
 35,
 37,
 52,
 81]

In [11]:
quantile1,quantile3 = np.percentile(dataset,[25,75])# value between 25 and 75 percentile

In [14]:
print(quantile1,quantile3) # Quantile1 - lower bound & Quantile3 - upper bound

10.5 29.5


In [15]:
# Finding the IQR
IQR_Value= quantile3-quantile1
print(IQR_Value)

19.0


In [16]:
# Finding the lower bound value 
lower_bound_value = quantile1-(1.5*IQR_Value)

In [17]:
# Finding the upper bound value
upper_bound_value = quantile3 +(1.5*IQR_Value)

In [19]:
print(lower_bound_value,upper_bound_value)

-18.0 58.0
