In [1]:
# Question: Winsorization of Data
# Description: Apply Winsorization to a given dataset to handle outliers.
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import winsorize
import matplotlib.pyplot as plt
import seaborn as sns

# --- Create a sample dataset with outliers ---
# Let's create a dataset with some values that are clearly outside the typical range
data = {'Values': [10, 12, 15, 18, 20, 22, 25, 28, 30, 100, -5, 32, 35, 40, 45, 50]}
df = pd.DataFrame(data)

print("Original Data:")
print(df)
print("\nOriginal Data Description:")
print(df.describe())

# --- Visualize the original data using a boxplot to see outliers ---
# Boxplots are a great way to visually identify potential outliers.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['Values'])
plt.title('Boxplot of Original Data')
plt.xlabel('Values')
plt.show()

# --- Apply Winsorization ---
# We will winsorize at the 10th percentile from the lower tail
# and the 90th percentile from the upper tail.
# The 'limits' parameter is a tuple (lower_limit, upper_limit).
# These limits are expressed as fractions of the data size.
# So, 0.1 means 10% from the lower end, and 0.1 means 10% from the upper end.
# This is equivalent to capping at the 10th and 90th percentiles.
winsorized_values = winsorize(df['Values'], limits=[0.10, 0.10])

# Create a new DataFrame or add a new column for the winsorized data
# We assign the winsorized numpy array back to a new column in the DataFrame.
df['Values_Winsorized'] = winsorized_values

print("\nData after Winsorization:")
print(df)
print("\nWinsorized Data Description:")
# Describe the new winsorized column to see the effect on statistics.
print(df['Values_Winsorized'].describe())

# --- Visualize the winsorized data using a boxplot ---
# Plot the boxplot of the winsorized data to see how the outliers have been handled.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['Values_Winsorized'])
plt.title('Boxplot of Winsorized Data')
plt.xlabel('Values (Winsorized)')
plt.show()

# --- Interpretation ---
# Compare the boxplots and the describe() outputs for the original and winsorized data.
# You should observe that the extreme values (100 and -5 in this example)
# have been replaced by values closer to the main body of the data (the 10th and 90th percentiles).
# The whiskers in the winsorized boxplot will likely be shorter,
# and the extreme points will no longer appear as individual markers outside the whiskers,
# indicating that the outliers have been capped.



ImportError: cannot import name 'winsorize' from 'scipy.stats' (/workspaces/AI_DATA_ANALYSIS_/.venv/lib/python3.10/site-packages/scipy/stats/__init__.py)