
# Outlier Detection and Removal

This notebook demonstrates how to detect and remove outliers using the **Z-Score** and **IQR** (interquartile range) methods.

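Dataset: 100 simulated crypto prices drawn from a normal distribution (mean 50,000, standard deviation 5,000), plus three intentionally injected extreme outliers (150,000, 200,000 and 250,000).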


In [None]:

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Create sample data with outliers: 100 normally distributed prices
# followed by three hand-picked extreme values.
np.random.seed(42)  # fixed seed so the simulated prices are reproducible
prices = np.random.normal(loc=50000, scale=5000, size=100)
prices = np.append(prices, [150000, 200000, 250000])  # extreme outliers
df = pd.DataFrame({'Crypto_Price': prices})

# Save to CSV for inspection. The target directory is created first because
# to_csv does not create missing parent directories and would raise OSError
# on a fresh checkout where ../data does not exist yet.
DATA_DIR = Path('../data')
DATA_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(DATA_DIR / 'crypto_prices_with_outliers.csv', index=False)
df.head()


In [None]:

# Boxplot of the raw prices, before any outlier removal.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=df['Crypto_Price'], color='skyblue', ax=ax)
ax.set_title("Original Data with Outliers")
ax.grid(True)
plt.show()


In [None]:

# Z-Score Method: keep only rows whose absolute z-score is below 3.
from scipy.stats import zscore

price_z = zscore(df['Crypto_Price'])
is_inlier = np.abs(price_z) < 3
df_z = df[is_inlier]
print(f"Original: {len(df)}, After Z-Score Removal: {len(df_z)}")


In [None]:

# IQR Method: keep rows lying within 1.5 * IQR of the first and third quartiles.
price = df['Crypto_Price']
Q1 = price.quantile(0.25)
Q3 = price.quantile(0.75)
IQR = Q3 - Q1

# Fences are inclusive: values exactly on a fence are kept.
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

df_iqr = df[price.between(lower, upper)]
print(f"Original: {len(df)}, After IQR Removal: {len(df_iqr)}")


In [None]:

# Compare boxplots: raw data next to the two cleaned variants, one panel each.
panels = [
    ("Original", df, 'skyblue'),
    ("Z-Score Cleaned", df_z, 'orange'),
    ("IQR Cleaned", df_iqr, 'lightgreen'),
]

fig, axs = plt.subplots(1, 3, figsize=(18, 5))

for panel_ax, (panel_title, frame, colour) in zip(axs, panels):
    sns.boxplot(ax=panel_ax, x=frame['Crypto_Price'], color=colour)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()
