In [1]:
    
import pandas as pd

# Sample dataset: ten stores with their monthly sales figures.
# Store 5 carries a deliberately extreme value (40000).
data = {
    'Store': list(range(1, 11)),
    'Monthly_Sales': [
        1500, 2000, 1800, 3000, 40000,
        2500, 2200, 2100, 1900, 2300,
    ],
}

# Build the DataFrame and persist it; index=False keeps the row index
# out of the CSV so the file holds only the two data columns.
df = pd.DataFrame(data)
df.to_csv('sales_data.csv', index=False)

print("CSV file 'sales_data.csv' created successfully.")

    

# Task:
#     Dataset: sales_data.csv (obtain or create it yourself; it must include a Monthly_Sales column)
#     Column to analyze: Monthly_Sales
#     Steps:
#         1. Load sales_data.csv.
#         2. Calculate Q1, Q3, and IQR for Monthly_Sales.
#         3. Identify outliers.
#         4. Remove the outliers.
#         5. Check the number of rows removed.



import pandas as pd

def remove_outliers_iqr(file_path, column='Monthly_Sales', multiplier=1.5):
    """Load a CSV file and drop the rows whose ``column`` value is an IQR outlier.

    A row is kept when its value lies within
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` (bounds inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``column`` and
    ``IQR = Q3 - Q1``. Rows whose value is missing (NaN) fail the range
    test, so they are dropped as well and are included in the printed
    "rows removed" count.

    Status and error messages are printed instead of being raised.

    Args:
        file_path: Path of the CSV file to read.
        column: Name of the numeric column to analyse.
        multiplier: How many IQRs beyond the quartiles a value may lie
            before it is treated as an outlier.

    Returns:
        The filtered DataFrame, or ``None`` when the file is missing or
        empty, the column is absent or not numeric, or any other error
        occurs while processing.
    """
    try:
        # Step 1: Load the dataset
        df = pd.read_csv(file_path)

        # A header-only file parses fine but leaves nothing to analyse
        if df.empty:
            print("The dataset is empty. Exiting...")
            return None

        # Ensure the dataset has the requested column
        if column not in df.columns:
            print(f"'{column}' column is missing from the dataset. Exiting...")
            return None

        values = df[column]

        # Quantiles are only meaningful for numeric data; report this
        # explicitly rather than letting it surface as a generic error.
        if not pd.api.types.is_numeric_dtype(values):
            print(f"'{column}' column is not numeric. Exiting...")
            return None

        # Step 2: Calculate Q1, Q3, and IQR
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1

        # Step 3: Identify outliers using the IQR method
        lower_bound = Q1 - multiplier * IQR
        upper_bound = Q3 + multiplier * IQR

        # Step 4: Keep only the rows inside the (inclusive) bounds
        df_cleaned = df[values.between(lower_bound, upper_bound)]

        # Step 5: Verify the number of rows removed
        rows_removed = len(df) - len(df_cleaned)
        print(f"\nNumber of rows removed: {rows_removed}")

        # Display a preview of the cleaned dataset
        print("\nCleaned DataFrame (first 5 rows):")
        print(df_cleaned.head())

        return df_cleaned

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return None
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{file_path}' is empty.")
        return None
    except Exception as e:
        # Script-level boundary: report anything unforeseen and signal
        # failure with None so callers never see an exception.
        print(f"An unexpected error occurred: {e}")
        return None


# Example usage: clean the sample file produced earlier in this notebook
file_path = 'sales_data.csv'
cleaned_data = remove_outliers_iqr(file_path)

# remove_outliers_iqr returns None on failure, so write the cleaned CSV
# only when a DataFrame actually came back.
if not (cleaned_data is None):
    cleaned_data.to_csv('cleaned_sales_data.csv', index=False)

CSV file 'sales_data.csv' created successfully.

Number of rows removed: 1

Cleaned DataFrame (first 5 rows):
   Store  Monthly_Sales
0      1           1500
1      2           2000
2      3           1800
3      4           3000
5      6           2500
