In [1]:
# Detect & Remove Outliers using IQR Method

# Objective: Learn to identify and remove outliers from a dataset using the Interquartile Range (IQR) method.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment. You can use pandas to read the CSV file.
#     2. Calculate IQR: Calculate the first quartile (Q1), third quartile (Q3), and the IQR for the specified column.
#     3. Identify Outliers: Determine which data points are considered outliers.
#     4. Remove Outliers: Remove the outliers from the dataset.
#     5. Verify: Ensure the outliers are removed by checking the size or summary statistics of the dataset before and after the removal.
    
    
    

# Task:
#     Dataset: sales_data.csv(get it by your own it includes the column of Monthly_Sales)
#     Column to analyze: Monthly_Sales
#     Steps:
#         1. Load sales_data.csv .
#         2. Calculate Q1, Q3, and IQR for Monthly_Sales .
#         3. Identify outliers.
#         4. Remove the outliers.
#         5. Check the number of rows removed.
# Import necessary libraries
import pandas as pd
import numpy as np

# --- Step 1: Load the Dataset ---
# IMPORTANT: Replace this section with your actual code to load sales_data.csv
# Example: df = pd.read_csv('sales_data.csv')

# Creating a sample DataFrame that simulates sales data with outliers
# This is for demonstration purposes as I cannot access local files.
data = {
    'Month': pd.to_datetime(pd.date_range(start='2023-01-01', periods=100, freq='M')),
    'Monthly_Sales': np.random.normal(loc=50000, scale=15000, size=100) # Simulate normal sales
}
df = pd.DataFrame(data)

# Introduce some artificial outliers in 'Monthly_Sales'
outlier_sales = np.array([150000, 180000, 5000, 2000, 250000, -1000]) # High and low outliers
# Randomly pick some indices to replace with outliers
outlier_indices = np.random.choice(df.index, size=len(outlier_sales), replace=False)
df.loc[outlier_indices, 'Monthly_Sales'] = outlier_sales

print("Original Data (first 5 rows):")
print(df.head())
print("\nOriginal Data Description:")
print(df.describe())
print(f"\nOriginal DataFrame shape: {df.shape}")

# --- Step 2: Calculate Q1, Q3, and IQR for Monthly_Sales ---

# Calculate the first quartile (Q1)
Q1 = df['Monthly_Sales'].quantile(0.25)

# Calculate the third quartile (Q3)
Q3 = df['Monthly_Sales'].quantile(0.75)

# Calculate the Interquartile Range (IQR)
IQR = Q3 - Q1

print(f"\nCalculated Q1: {Q1:.2f}")
print(f"Calculated Q3: {Q3:.2f}")
print(f"Calculated IQR: {IQR:.2f}")

# --- Step 3: Identify Outliers ---

# Define the lower and upper bounds for outlier detection using the 1.5 * IQR rule
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"\nLower Bound for Outliers (Q1 - 1.5*IQR): {lower_bound:.2f}")
print(f"Upper Bound for Outliers (Q3 + 1.5*IQR): {upper_bound:.2f}")

# Identify outliers: values below the lower bound or above the upper bound
outliers = df[(df['Monthly_Sales'] < lower_bound) | (df['Monthly_Sales'] > upper_bound)]

print("\nIdentified Outliers in 'Monthly_Sales':")
print(outliers)

# --- Step 4: Remove Outliers ---

# Remove outliers: keep only the rows where 'Monthly_Sales' is within the bounds
df_cleaned = df[(df['Monthly_Sales'] >= lower_bound) & (df['Monthly_Sales'] <= upper_bound)].copy() # Use .copy() to avoid SettingWithCopyWarning

print("\nCleaned Data (first 5 rows after outlier removal):")
print(df_cleaned.head())
print("\nCleaned Data Description:")
print(df_cleaned.describe())

# --- Step 5: Verify ---

# Check the shape of the DataFrame after removing outliers
print(f"\nCleaned DataFrame shape: {df_cleaned.shape}")

# Calculate the number of rows removed
rows_removed = df.shape[0] - df_cleaned.shape[0]

print(f"\nNumber of rows removed (outliers): {rows_removed}")

# You can further verify by checking if the outlier values are no longer present
# in the cleaned DataFrame.







Original Data (first 5 rows):
       Month  Monthly_Sales
0 2023-01-31   28963.254775
1 2023-02-28   81635.114708
2 2023-03-31   -1000.000000
3 2023-04-30   74697.100298
4 2023-05-31   59769.536448

Original Data Description:
                     Month  Monthly_Sales
count                  100     100.000000
mean   2027-03-16 12:00:00   53259.891195
min    2023-01-31 00:00:00   -1000.000000
25%    2025-02-21 00:00:00   40139.625722
50%    2027-03-15 12:00:00   50220.336922
75%    2029-04-07 12:00:00   61679.101804
max    2031-04-30 00:00:00  250000.000000
std                    NaN   30557.309599

Original DataFrame shape: (100, 2)

Calculated Q1: 40139.63
Calculated Q3: 61679.10
Calculated IQR: 21539.48

Lower Bound for Outliers (Q1 - 1.5*IQR): 7830.41
Upper Bound for Outliers (Q3 + 1.5*IQR): 93988.32

Identified Outliers in 'Monthly_Sales':
        Month  Monthly_Sales
2  2023-03-31        -1000.0
5  2023-06-30       150000.0
12 2024-01-31       250000.0
56 2027-09-30       180000.0


  'Month': pd.to_datetime(pd.date_range(start='2023-01-01', periods=100, freq='M')),
