In [8]:
# Detect & Remove Outliers using IQR Method

# Objective: Learn to identify and remove outliers from a dataset using the Interquartile Range (IQR) method.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment. You can use pandas to read the CSV file.
#     2. Calculate IQR: Calculate the first quartile (Q1), third quartile (Q3), and the IQR for the specified column.
#     3. Identify Outliers: Flag data points that fall below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR as outliers.
#     4. Remove Outliers: Remove the outliers from the dataset.
#     5. Verify: Ensure the outliers are removed by checking the size or summary statistics of the dataset before and after the removal.
    
    
    

# Task:
#     Dataset: sales_data.csv (obtain or create it yourself; it must include a Monthly_Sales column)
#     Column to analyze: Monthly_Sales
#     Steps:
#         1. Load sales_data.csv.
#         2. Calculate Q1, Q3, and IQR for Monthly_Sales.
#         3. Identify outliers.
#         4. Remove the outliers.
#         5. Check the number of rows removed.

# Sample monthly sales figures; the row ID is simply the 1-based position.
# The values 1000 and 2050 are deliberately far from the rest so the IQR
# method has something to detect.
monthly_sales = [
    200, 220, 210, 215, 205,
    230, 1000, 225, 215, 2050,
    210, 220, 215, 205, 230,
]

# Assemble the CSV text: a header line followed by one "ID,value" line per sale,
# each terminated by a newline.
header_line = "ID,Monthly_Sales\n"
data_lines = [f"{row_id},{amount}\n" for row_id, amount in enumerate(monthly_sales, start=1)]
csv_content = header_line + "".join(data_lines)

# Write the text to sales_data.csv in the current working directory,
# overwriting any existing file of that name.
with open("sales_data.csv", "w") as csv_file:
    csv_file.write(csv_content)

print("sales_data.csv file created successfully.")






sales_data.csv file created successfully.


In [9]:
import pandas as pd

# Detect and remove outliers in 'Monthly_Sales' using the IQR method.
# Produces: Q1, Q3, IQR, lower_bound, upper_bound, outliers (flagged rows),
# df_cleaned (remaining rows) and rows_removed (count of dropped rows).

# Load dataset
df = pd.read_csv("sales_data.csv")
print("Original DataFrame shape:", df.shape)

# How many IQRs beyond the quartiles a value may lie before it is treated as
# an outlier; 1.5 is the conventional choice for this method.
IQR_MULTIPLIER = 1.5

sales = df['Monthly_Sales']

# Calculate Q1, Q3 and IQR for 'Monthly_Sales'
Q1 = sales.quantile(0.25)
Q3 = sales.quantile(0.75)
IQR = Q3 - Q1

print(f"Q1: {Q1}")
print(f"Q3: {Q3}")
print(f"IQR: {IQR}")

# Define bounds for outliers
lower_bound = Q1 - IQR_MULTIPLIER * IQR
upper_bound = Q3 + IQR_MULTIPLIER * IQR

print(f"Lower bound: {lower_bound}")
print(f"Upper bound: {upper_bound}")

# Detect outliers.
# The mask is computed once and reused below so that "detected" and "removed"
# can never disagree. A missing Monthly_Sales value compares False on both
# tests, so such a row is not flagged and stays in df_cleaned; handle missing
# values in a separate, explicit step if they need to be dropped.
is_outlier = (sales < lower_bound) | (sales > upper_bound)
outliers = df[is_outlier]
print("\nOutliers detected:")
print(outliers)

# Remove outliers: keep exactly the rows that were not flagged.
df_cleaned = df[~is_outlier]
print("\nDataFrame shape after removing outliers:", df_cleaned.shape)

# Sanity check: every original row is either an outlier or kept, never both.
assert len(outliers) + len(df_cleaned) == len(df)

# Number of rows removed
rows_removed = df.shape[0] - df_cleaned.shape[0]
print(f"Number of rows removed: {rows_removed}")


Original DataFrame shape: (15, 2)
Q1: 210.0
Q3: 227.5
IQR: 17.5
Lower bound: 183.75
Upper bound: 253.75

Outliers detected:
   ID  Monthly_Sales
6   7           1000
9  10           2050

DataFrame shape after removing outliers: (13, 2)
Number of rows removed: 2
