## `Single Column Outlier`

### Data Loading Phase

In [8]:
import pandas as pd

# Toy salary sample: seven evenly spaced values and one extreme value at the end.
data = dict(
    salary=[20000, 22000, 24000, 26000, 28000, 30000, 32000, 1500000],
)

# Build the frame from the mapping and preview its first three rows.
df = pd.DataFrame(data)
df.head(n=3)

Unnamed: 0,salary
0,20000
1,22000
2,24000


### 01. Detecting Normal Value Range

- Any value that falls outside the range from the lower limit (`Q1 - 1.5 * IQR`) to the upper limit (`Q3 + 1.5 * IQR`) is treated as an outlier

In [9]:
# Quartiles of the salary column (Q2 is the median).
salary_col = df['salary']
Q1 = salary_col.quantile(q=0.25)
Q2 = salary_col.median()
Q3 = salary_col.quantile(q=0.75)

# Interquartile range and the 1.5 * IQR limits derived from it.
IQR = Q3 - Q1
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Report every statistic on its own line.
print(
    f"Q1 : {Q1}",
    f"Q2 : {Q2}",
    f"Q3 : {Q3}",
    f"IQR : {IQR}",
    f"Lower Limit : {lower_limit}",
    f"Upper Limit : {upper_limit}",
    sep="\n",
)

Q1 : 23500.0
Q2 : 27000.0
Q3 : 30500.0
IQR : 7000.0
Lower Limit : 13000.0
Upper Limit : 41000.0


### 02. Detecting Outliers

In [10]:
# Rows whose salary is strictly below the lower limit or strictly above the upper limit.
outliers = df.loc[df['salary'].lt(lower_limit) | df['salary'].gt(upper_limit)]

# Relabel the column so the displayed table makes clear these are the outliers.
outliers_cols = outliers.rename(columns=dict(salary='outlier_salary'))
outliers_cols

Unnamed: 0,outlier_salary
7,1500000


### 03. Getting Clean Values

- Removing Outliers

In [11]:
# Keep rows whose salary lies within [lower_limit, upper_limit], both ends included.
df_clean = df.loc[df['salary'].between(lower_limit, upper_limit)]
df_clean.head(n=3)

Unnamed: 0,salary
0,20000
1,22000
2,24000


---
---

## `Multiple Columns Outlier`

### 01. Initial Phase

In [12]:
import pandas as pd

# Two-column sample; the last entry of each column is the intended outlier.
data = dict(
    Age=[18, 19, 20, 21, 22, 23, 24, 100],  # 100 is outlier
    Salary=[20000, 22000, 24000, 26000, 28000, 30000, 32000, 150000],  # 150000 is outlier
)

# Build the frame from the mapping and preview its first three rows.
df = pd.DataFrame(data)
df.head(n=3)

Unnamed: 0,Age,Salary
0,18,20000
1,19,22000
2,20,24000


### 02. Function to remove outliers from numerical columns automatically

In [13]:
def remove_outliers_iqr(dataframe: pd.DataFrame, col_name: str, iqr_multiplier: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``dataframe`` whose ``col_name`` value lies inside the IQR limits.

    The limits are ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the column and ``IQR = Q3 - Q1``.
    Both limits are inclusive. The column name and both limits are printed as a
    side effect. The input frame is not modified.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to filter.
    col_name : str
        Name of the numeric column the limits are computed from.
    iqr_multiplier : float, default 1.5
        How many IQRs beyond Q1/Q3 a value may lie before it is dropped.

    Returns
    -------
    pd.DataFrame
        The subset of rows (original index labels kept) within the limits.
        Rows whose value is missing fail the range check and are not returned.
    """
    column = dataframe[col_name]

    # Calculate Q1 & Q3
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)

    # Calculate IQR
    iqr = q3 - q1

    # Calculate lower limit and upper limit
    lower_limit = q1 - (iqr_multiplier * iqr)
    upper_limit = q3 + (iqr_multiplier * iqr)

    print(f"\nColumn : {col_name}")
    print(f"Upper Limit : {upper_limit}")
    print(f"Lower Limit : {lower_limit}")

    # Remove outliers: `between` is inclusive on both ends by default.
    return dataframe[column.between(lower_limit, upper_limit)]

### 03. Apply the function on desired columns

In [14]:
# Apply on specific columns.
# Each column is screened against limits computed from the original `df`, and a
# row is kept only if it passes every screen. Re-filtering `df` in a second call
# and assigning it to the same name would silently discard the first filter.
age_filtered = remove_outliers_iqr(df, 'Age')
salary_filtered = remove_outliers_iqr(df, 'Salary')

# Keep rows whose index label survived both filters (relies on index labels
# identifying rows, as with the default RangeIndex used here).
df_cleaned = df[df.index.isin(age_filtered.index) & df.index.isin(salary_filtered.index)]


Column : Age
Upper Limit : 28.5
Lower Limit : 14.5

Column : Salary
Upper Limit : 41000.0
Lower Limit : 13000.0


### 04. Display the cleaned dataframe

In [15]:
# Show the frame assigned to `df_cleaned` in the previous cell.
df_cleaned

Unnamed: 0,Age,Salary
0,18,20000
1,19,22000
2,20,24000
3,21,26000
4,22,28000
5,23,30000
6,24,32000


### END