In [1]:
import pandas as pd
import numpy as np

# --- Introduction ---
# This script demonstrates setting up basic automated data validation
# using pure Python and the pandas library.
# It covers checks for missing values, data types, and value ranges.
#
# We will:
# 1. Create a sample DataFrame with some data quality issues.
# 2. Define validation functions for different types of checks.
# 3. Apply these validation functions to the DataFrame.
# 4. Report the validation results.

# --- 1. Create a Sample DataFrame ---
print("--- Creating Sample DataFrame ---")
data = {
    'ProductID': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    'ProductName': ['Laptop', 'Keyboard', 'Mouse', 'Monitor', 'Webcam', 'Printer', 'Speaker', 'Headphones', 'Microphone', 'Router'],
    'Category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Audio', np.nan, 'Audio', 'Network'], # Missing value
    'Price': [1200.50, 75.00, 25.99, 300.00, -50.00, 250.00, 150.00, 99.50, 70.00, 80.00], # Negative price
    'StockQuantity': [10, 50, 0, 15, 25, 5, 12, 30, 8, 20],
    'ReleaseDate': ['2023-01-10', '2023-01-11', '2023-01-11', '2023-01-12', '2023-01-12', '2023-01-13', '2023-01-13', '2023-01-14', '2023-01-14', 'InvalidDate'], # Invalid date format
    'IsActive': [True, False, True, True, False, True, True, False, True, True]
}
df = pd.DataFrame(data)

print(df)
print("\n")

# --- 2. Define Validation Functions ---

def check_missing_values(df, column_name, is_required=True):
    """Checks for missing values in a specified column."""
    if is_required and df[column_name].isnull().any():
        null_count = df[column_name].isnull().sum()
        return f"Column '{column_name}': Contains {null_count} missing values, but is required."
    elif df[column_name].isnull().any() and not is_required:
         # Optional columns can have missing values, maybe just report count
         # null_count = df[column_name].isnull().sum()
         # return f"Column '{column_name}': Contains {null_count} missing values (optional column)."
         pass # No violation if not required
    return None

def check_data_type(df, column_name, expected_dtype):
    """Checks if a column's data type matches the expected type."""
    # Pandas dtypes can be tricky, especially with NaNs.
    # This is a basic check. More robust checks might involve type casting.
    if df[column_name].dtype != expected_dtype:
        # Allow numeric types to be checked against compatible numeric dtypes
        if not (pd.api.types.is_numeric_dtype(df[column_name].dtype) and pd.api.types.is_numeric_dtype(expected_dtype)):
             return f"Column '{column_name}': Incorrect data type. Expected '{expected_dtype}', got '{df[column_name].dtype}'."
    return None

def check_value_range(df, column_name, min_value=None, max_value=None):
    """Checks if values in a numerical column are within a specified range."""
    if not pd.api.types.is_numeric_dtype(df[column_name].dtype):
        return f"Column '{column_name}': Cannot perform range check on non-numeric type '{df[column_name].dtype}'."

    violations = []
    if min_value is not None:
        # Check for values below the minimum, ignoring NaNs
        if (df[column_name].dropna() < min_value).any():
            invalid_count = (df[column_name].dropna() < min_value).sum()
            violations.append(f"Contains {invalid_count} values below the minimum ({min_value}).")

    if max_value is not None:
         # Check for values above the maximum, ignoring NaNs
         if (df[column_name].dropna() > max_value).any():
             invalid_count = (df[column_name].dropna() > max_value).sum()
             violations.append(f"Contains {invalid_count} values above the maximum ({max_value}).")

    if violations:
        return f"Column '{column_name}': Value range violations: {', '.join(violations)}"
    return None

def check_allowed_values(df, column_name, allowed_list):
    """Checks if values in a column are within a list of allowed values."""
    # Check if non-null unique values are in the allowed list
    invalid_values = df[column_name].dropna()[~df[column_name].dropna().isin(allowed_list)].unique()
    if invalid_values.size > 0:
        return f"Column '{column_name}': Contains invalid values: {list(invalid_values)}"
    return None

# --- 3. Apply Validation ---
print("--- Applying Validation Checks ---")

validation_issues = []

# Define checks to apply
# Format: (check_function, column_name, *args)
checks_to_run = [
    (check_missing_values, 'ProductID', True), # ProductID required
    (check_data_type, 'ProductID', 'int64'),
    (check_missing_values, 'ProductName', True), # ProductName required
    (check_data_type, 'ProductName', 'object'),
    (check_missing_values, 'Category', True), # Category required
    (check_data_type, 'Category', 'object'),
    (check_missing_values, 'Price', True), # Price required
    (check_data_type, 'Price', 'float64'),
    (check_value_range, 'Price', 0.0, None), # Price must be >= 0
    (check_missing_values, 'StockQuantity', True), # StockQuantity required
    (check_data_type, 'StockQuantity', 'int64'),
    (check_value_range, 'StockQuantity', 0, None), # StockQuantity must be >= 0
    (check_missing_values, 'ReleaseDate', True), # ReleaseDate required
    (check_data_type, 'ReleaseDate', 'object'), # Basic type check, format check below
    # Note: Date format validation requires more specific logic (e.g., using pd.to_datetime with format)
    # Example (conceptual):
    # (check_date_format, 'ReleaseDate', '%Y-%m-%d'),
    (check_missing_values, 'IsActive', True), # IsActive required
    (check_data_type, 'IsActive', 'bool'),
    (check_allowed_values, 'Category', ['Electronics', 'Audio', 'Accessories', 'Software', 'Network']), # Allowed categories
]

# Run checks
for check_func, col_name, *args in checks_to_run:
    # Ensure column exists before checking
    if col_name not in df.columns:
        if args and args[0] is True: # Check if it was a required column check
             validation_issues.append(f"Required Column Missing: Column '{col_name}' is defined in checks but not found in DataFrame.")
        continue # Skip checks for non-existent columns

    issue = check_func(df, col_name, *args)
    if issue:
        validation_issues.append(issue)

# --- 4. Report Validation Results ---
print("--- Data Validation Report ---")

if not validation_issues:
    print("Validation Successful: No issues found based on defined checks.")
else:
    print("Validation Failed: The following data quality issues were detected:")
    for issue in validation_issues:
        print(f"- {issue}")

# --- Conclusion ---
# This script provides a basic framework for automated data validation
# using pandas. You can extend this by adding more validation functions
# and defining comprehensive checks_to_run for your specific dataset.
# For more complex scenarios, dedicated data validation libraries are recommended.



--- Creating Sample DataFrame ---
   ProductID ProductName     Category    Price  StockQuantity  ReleaseDate  \
0        101      Laptop  Electronics  1200.50             10   2023-01-10   
1        102    Keyboard  Electronics    75.00             50   2023-01-11   
2        103       Mouse  Electronics    25.99              0   2023-01-11   
3        104     Monitor  Electronics   300.00             15   2023-01-12   
4        105      Webcam  Electronics   -50.00             25   2023-01-12   
5        106     Printer  Electronics   250.00              5   2023-01-13   
6        107     Speaker        Audio   150.00             12   2023-01-13   
7        108  Headphones          NaN    99.50             30   2023-01-14   
8        109  Microphone        Audio    70.00              8   2023-01-14   
9        110      Router      Network    80.00             20  InvalidDate   

   IsActive  
0      True  
1     False  
2      True  
3      True  
4     False  
5      True  
6      Tr

In [2]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.
import pandas as pd
import numpy as np
from scipy.stats import zscore # SciPy's zscore function is convenient

# --- Introduction ---
# This script demonstrates how to identify and handle outliers in a
# numerical column of a pandas DataFrame using the z-score method.
# Outliers are typically defined as values with a z-score above a certain threshold.
#
# We will:
# 1. Create a sample DataFrame with a numerical column containing outliers.
# 2. Calculate the z-score for each value in the column.
# 3. Define a z-score threshold.
# 4. Identify values that exceed the threshold (outliers).
# 5. Handle the identified outliers (e.g., replace with NaN).
# 6. Show the DataFrame before and after handling outliers.

# --- 1. Create a Sample DataFrame ---
print("--- Creating Sample DataFrame ---")
# Create a column with mostly values around 100, but a few much larger values (outliers)
data = {
    'DataPointID': range(1, 16),
    'Measurement': [
        101, 105, 98, 103, 99, 110, 95, 102, 108, 97,
        550,  # Outlier 1
        104, 106, 96,
        600   # Outlier 2
    ]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print("\n")

# --- 2. Calculate Z-scores ---
# Calculate the z-score for each value in the 'Measurement' column.
# The z-score measures how many standard deviations away from the mean a data point is.
print("--- Calculating Z-scores ---")

# Calculate z-scores using pandas and numpy:
# mean = df['Measurement'].mean()
# std_dev = df['Measurement'].std()
# df['Z_Score'] = (df['Measurement'] - mean) / std_dev

# Or using SciPy's zscore function (handles NaNs if present, but our data has none initially)
df['Z_Score'] = zscore(df['Measurement'])

print("DataFrame with Z-scores:")
print(df)
print("\n")

# --- 3. Define Z-score Threshold ---
# A common threshold for identifying outliers is an absolute z-score of 2 or 3.
# Values with |Z_Score| > threshold are considered outliers.
z_score_threshold = 3

print(f"--- Defining Z-score Threshold: {z_score_threshold} ---")

# --- 4. Identify Outliers ---
# Identify rows where the absolute Z_Score is greater than the threshold.
outliers = df[abs(df['Z_Score']) > z_score_threshold]

print(f"Identified Outliers (where |Z_Score| > {z_score_threshold}):")
print(outliers)
print("\n")

# --- 5. Handle Outliers ---
# There are several ways to handle outliers:
# A) Remove the rows containing outliers.
# B) Replace the outlier values with a specific value (e.g., NaN, mean, median).
# C) Cap the outlier values at a certain limit.

# Option B: Replace outliers with NaN (a common approach)
print("--- Handling Outliers (Replacing with NaN) ---")

# Create a copy to avoid modifying the original DataFrame directly if you need it later
df_cleaned = df.copy()

# Replace values in 'Measurement' column where the absolute Z_Score is > threshold with NaN
df_cleaned.loc[abs(df_cleaned['Z_Score']) > z_score_threshold, 'Measurement'] = np.nan

# Drop the 'Z_Score' column from the cleaned DataFrame as it's temporary
df_cleaned = df_cleaned.drop(columns=['Z_Score'])

print("DataFrame after replacing outliers with NaN:")
print(df_cleaned)
print("\n")

# Option A: Remove rows with outliers (alternative handling)
# print("--- Alternative Handling (Removing Rows with Outliers) ---")
# df_rows_removed = df[abs(df['Z_Score']) <= z_score_threshold].drop(columns=['Z_Score'])
# print("DataFrame after removing rows with outliers:")
# print(df_rows_removed)
# print("\n")


# --- Conclusion ---
# The script successfully identified values with high z-scores as outliers
# and demonstrated how to handle them by replacing them with NaN.
# The choice of z-score threshold depends on the data and the specific problem.
# Z-score assumes the data is roughly normally distributed; for non-normal data,
# other methods like IQR (Interquartile Range) might be more appropriate.




--- Creating Sample DataFrame ---
Original DataFrame:
    DataPointID  Measurement
0             1          101
1             2          105
2             3           98
3             4          103
4             5           99
5             6          110
6             7           95
7             8          102
8             9          108
9            10           97
10           11          550
11           12          104
12           13          106
13           14           96
14           15          600


--- Calculating Z-scores ---
DataFrame with Z-scores:
    DataPointID  Measurement   Z_Score
0             1          101 -0.396719
1             2          105 -0.371898
2             3           98 -0.415335
3             4          103 -0.384309
4             5           99 -0.409129
5             6          110 -0.340872
6             7           95 -0.433950
7             8          102 -0.390514
8             9          108 -0.353283
9            10           97 -0.4215

In [3]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.
import pandas as pd
import numpy as np

# --- Introduction ---
# This script demonstrates how to handle missing values in a numerical column
# (specifically 'Age') and then convert the column to an integer data type
# using the pandas library.
#
# We will:
# 1. Create a sample DataFrame with an 'Age' column containing missing values.
# 2. Identify and fill the missing values.
# 3. Convert the 'Age' column to an integer type.
# 4. Show the DataFrame before and after these operations.

# --- 1. Create a Sample DataFrame ---
print("--- Creating Sample DataFrame ---")
data = {
    'PersonID': range(1, 11),
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Heidi', 'Ivan', 'Judy'],
    'Age': [25, 32, np.nan, 41, 29, 35, np.nan, 27, 38, 30], # Introduce missing values (NaN)
    'City': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'New York', 'London', 'Paris', 'Tokyo', 'Sydney']
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print("\n")

print("Data types before conversion:")
print(df.dtypes)
print("\n")

# --- 2. Identify and Fill Missing Values in 'Age' ---
print("--- Filling Missing Values in 'Age' Column ---")

# Check for missing values
print("Missing values in 'Age' before filling:", df['Age'].isnull().sum())

# Option A: Fill missing values with the mean age
# mean_age = df['Age'].mean()
# df['Age'].fillna(mean_age, inplace=True)

# Option B: Fill missing values with the median age (often more robust to outliers)
median_age = df['Age'].median()
df['Age'].fillna(median_age, inplace=True)

print("Missing values in 'Age' after filling:", df['Age'].isnull().sum())
print("DataFrame after filling missing 'Age' values (using median):")
print(df)
print("\n")

# --- 3. Convert the 'Age' Column to Integer Type ---
print("--- Converting 'Age' Column to Integer Type ---")

# Convert the column to integer.
# Use .astype(int) or .astype('int64') or .astype('int32').
# Note: This step requires that there are no NaN values left,
# as pandas integer types by default do not support NaNs.
# If you might have NaNs you want to preserve, use nullable integer types like 'Int64'.
try:
    df['Age'] = df['Age'].astype(int)
    print("Successfully converted 'Age' to integer.")
except ValueError as e:
    print(f"Error converting 'Age' to integer: {e}")
    print("This might happen if there are still non-integer values or NaNs.")
    print("Consider using a nullable integer type like 'Int64' if NaNs should be preserved.")
    # Example using nullable integer type:
    # df['Age'] = df['Age'].astype('Int64')


print("\nData types after conversion:")
print(df.dtypes)
print("\n")

print("DataFrame after converting 'Age' to integer:")
print(df)
print("\n")


# --- Conclusion ---
# The script demonstrated filling missing values in the 'Age' column
# and then converting it to an integer data type.
# Ensure missing values are handled appropriately before converting to
# non-nullable integer types.




--- Creating Sample DataFrame ---
Original DataFrame:
   PersonID     Name   Age      City
0         1    Alice  25.0  New York
1         2      Bob  32.0    London
2         3  Charlie   NaN     Paris
3         4    David  41.0     Tokyo
4         5      Eve  29.0    Sydney
5         6    Frank  35.0  New York
6         7    Grace   NaN    London
7         8    Heidi  27.0     Paris
8         9     Ivan  38.0     Tokyo
9        10     Judy  30.0    Sydney


Data types before conversion:
PersonID      int64
Name         object
Age         float64
City         object
dtype: object


--- Filling Missing Values in 'Age' Column ---
Missing values in 'Age' before filling: 2
Missing values in 'Age' after filling: 0
DataFrame after filling missing 'Age' values (using median):
   PersonID     Name   Age      City
0         1    Alice  25.0  New York
1         2      Bob  32.0    London
2         3  Charlie  31.0     Paris
3         4    David  41.0     Tokyo
4         5      Eve  29.0    Sydne

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(median_age, inplace=True)


In [4]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.
import pandas as pd
import numpy as np

# --- Introduction ---
# This script defines a Python function to automate common data cleaning tasks:
# 1. Standardizing column names (e.g., to lowercase with underscores).
# 2. Filling missing values using a specified strategy.
# 3. Removing duplicate rows.
#
# We will:
# 1. Create a sample DataFrame with data quality issues.
# 2. Define the automated cleaning function.
# 3. Apply the function to the sample DataFrame.
# 4. Show the DataFrame before and after cleaning.

# --- 1. Create a Sample DataFrame ---
print("--- Creating Sample DataFrame with Issues ---")
data = {
    'Employee ID': [101, 102, 103, 104, 105, 101, 106, 107, 108, 109], # Duplicate ID
    'Employee Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Alice', 'Frank', 'Grace', 'Heidi', 'Ivan'],
    'Department': ['Sales', 'IT', 'IT', 'Marketing', np.nan, 'Sales', 'HR', 'IT', 'Marketing', 'Sales'], # Missing value
    'Salary (USD)': [60000, 75000, 70000, 65000, 80000, 60000, 72000, np.nan, 68000, 62000], # Missing value
    'Hire_Date': ['2022-01-15', '2021-08-20', '2022-03-10', '2023-01-01', '2022-06-18', '2022-01-15', '2023-02-01', '2021-11-11', '2022-09-25', '2023-03-10']
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print("\n")

print("Original Column Names:", df.columns.tolist())
print("\n")

# --- 2. Define the Automated Cleaning Function ---

def clean_dataframe(df, fill_strategy='median', fill_value=None, subset_fill=None, remove_duplicates=True, standardize_cols=True):
    """
    Automates common data cleaning tasks on a pandas DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame to clean.
        fill_strategy (str): Strategy for filling missing values ('mean', 'median', 'mode', 'constant', None).
                             If None, missing values are not filled.
        fill_value: The constant value to use if fill_strategy is 'constant'.
        subset_fill (list): List of column names to apply the fill strategy to.
                            If None, applies to all columns of appropriate type.
        remove_duplicates (bool): Whether to remove duplicate rows.
        standardize_cols (bool): Whether to standardize column names (lowercase, underscores).

    Returns:
        pd.DataFrame: The cleaned DataFrame.
    """
    cleaned_df = df.copy() # Work on a copy to avoid modifying the original DataFrame

    # 1. Standardize Column Names
    if standardize_cols:
        print("Standardizing column names...")
        # Convert to lowercase and replace spaces/special characters with underscores
        cleaned_df.columns = cleaned_df.columns.str.lower().str.replace('[^a-z0-9]+', '_', regex=True).str.strip('_')
        print("Column names standardized.")

    # 2. Fill Missing Values
    if fill_strategy is not None:
        print(f"Filling missing values using strategy: '{fill_strategy}'...")
        cols_to_fill = subset_fill if subset_fill is not None else cleaned_df.columns

        for col in cols_to_fill:
            if col in cleaned_df.columns and cleaned_df[col].isnull().any():
                if fill_strategy == 'mean' and pd.api.types.is_numeric_dtype(cleaned_df[col].dtype):
                    fill_val = cleaned_df[col].mean()
                    cleaned_df[col].fillna(fill_val, inplace=True)
                    print(f"  Filled '{col}' with mean ({fill_val:.2f}).")
                elif fill_strategy == 'median' and pd.api.types.is_numeric_dtype(cleaned_df[col].dtype):
                    fill_val = cleaned_df[col].median()
                    cleaned_df[col].fillna(fill_val, inplace=True)
                    print(f"  Filled '{col}' with median ({fill_val:.2f}).")
                elif fill_strategy == 'mode':
                     # Mode might return multiple values; take the first
                     mode_val = cleaned_df[col].mode()
                     if not mode_val.empty:
                         fill_val = mode_val[0]
                         cleaned_df[col].fillna(fill_val, inplace=True)
                         print(f"  Filled '{col}' with mode ({fill_val}).")
                     else:
                         print(f"  Could not find a mode for '{col}' to fill NaNs.")
                elif fill_strategy == 'constant' and fill_value is not None:
                    cleaned_df[col].fillna(fill_value, inplace=True)
                    print(f"  Filled '{col}' with constant value ({fill_value}).")
                elif fill_strategy == 'constant' and fill_value is None:
                    print(f"  Warning: fill_strategy is 'constant' for '{col}', but fill_value is None. Skipping fill for this column.")
                else:
                    # Handle cases where strategy doesn't match dtype or is unsupported
                    if fill_strategy in ['mean', 'median'] and not pd.api.types.is_numeric_dtype(cleaned_df[col].dtype):
                         print(f"  Skipping fill for '{col}': Strategy '{fill_strategy}' requires numeric data, but column type is '{cleaned_df[col].dtype}'.")
                    elif fill_strategy not in ['mean', 'median', 'mode', 'constant']:
                         print(f"  Skipping fill for '{col}': Unsupported fill_strategy '{fill_strategy}'.")
                    # If subset_fill was used, and the column wasn't in the original df, this would also skip
                    # but the initial check 'if col in cleaned_df.columns' handles that.

        print("Missing value filling complete.")

    # 3. Remove Duplicate Rows
    if remove_duplicates:
        print("Removing duplicate rows...")
        initial_rows = len(cleaned_df)
        cleaned_df.drop_duplicates(inplace=True)
        rows_after_removal = len(cleaned_df)
        duplicates_removed = initial_rows - rows_after_removal
        print(f"Removed {duplicates_removed} duplicate rows.")

    print("\nData cleaning process finished.")
    return cleaned_df

# --- 3. Apply the Cleaning Function ---
print("--- Applying the Automated Cleaning Function ---")

# Example usage:
# - Fill missing values with median for numeric columns
# - Remove duplicates
# - Standardize column names
cleaned_df = clean_dataframe(
    df,
    fill_strategy='median',
    subset_fill=['Department', 'Salary (USD)'], # Specify columns to attempt filling
    remove_duplicates=True,
    standardize_cols=True
)

print("\n--- Cleaned DataFrame ---")
print(cleaned_df)
print("\n")

print("Cleaned Column Names:", cleaned_df.columns.tolist())
print("\n")

# --- Conclusion ---
# The `clean_dataframe` function successfully standardized column names,
# filled missing values in the specified columns using the median strategy,
# and removed the duplicate row.
# You can customize the parameters to suit different cleaning needs.



--- Creating Sample DataFrame with Issues ---
Original DataFrame:
   Employee ID Employee Name Department  Salary (USD)   Hire_Date
0          101         Alice      Sales       60000.0  2022-01-15
1          102           Bob         IT       75000.0  2021-08-20
2          103       Charlie         IT       70000.0  2022-03-10
3          104         David  Marketing       65000.0  2023-01-01
4          105           Eve        NaN       80000.0  2022-06-18
5          101         Alice      Sales       60000.0  2022-01-15
6          106         Frank         HR       72000.0  2023-02-01
7          107         Grace         IT           NaN  2021-11-11
8          108         Heidi  Marketing       68000.0  2022-09-25
9          109          Ivan      Sales       62000.0  2023-03-10


Original Column Names: ['Employee ID', 'Employee Name', 'Department', 'Salary (USD)', 'Hire_Date']


--- Applying the Automated Cleaning Function ---
Standardizing column names...
Column names standardized.

In [5]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.
import pandas as pd
import numpy as np

# --- Introduction ---
# This script demonstrates how to perform min-max scaling (normalization)
# on a numerical column in a pandas DataFrame.
# Min-max scaling transforms data to a specified range, typically [0, 1].
# The formula is: X_normalized = (X - X_min) / (X_max - X_min)
# To scale to a different range [a, b]: X_scaled = a + (X - X_min) * (b - a) / (X_max - X_min)
#
# We will:
# 1. Create a sample DataFrame with a numerical column.
# 2. Define the target range for normalization (e.g., [0, 1]).
# 3. Calculate the minimum and maximum values of the original column.
# 4. Apply the min-max scaling formula to normalize the column.
# 5. Show the DataFrame before and after normalization.

# --- 1. Create a Sample DataFrame ---
print("--- Creating Sample DataFrame ---")
data = {
    'DataPointID': range(1, 11),
    'Feature_Value': [10, 25, 5, 40, 15, 30, 8, 35, 20, 45]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print("\n")

# --- 2. Define the Target Range ---
# We will normalize the 'Feature_Value' column to the range [0, 1].
target_min = 0
target_max = 1

print(f"--- Normalizing 'Feature_Value' to the range [{target_min}, {target_max}] ---")

# --- 3. Calculate Original Min and Max ---
original_min = df['Feature_Value'].min()
original_max = df['Feature_Value'].max()

print(f"Original Min: {original_min}")
print(f"Original Max: {original_max}\n")

# Handle the case where original_max and original_min are the same
# (i.e., all values in the column are identical) to avoid division by zero.
if original_max == original_min:
    print("Warning: All values in the column are identical. Normalization will result in a constant value.")
    df['Feature_Value_Normalized'] = target_min # Or any value within the target range
else:
    # --- 4. Apply Min-Max Scaling Formula ---
    # X_scaled = a + (X - X_min) * (b - a) / (X_max - X_min)
    # Here, a = target_min, b = target_max, X = df['Feature_Value']
    df['Feature_Value_Normalized'] = target_min + (df['Feature_Value'] - original_min) * (target_max - target_min) / (original_max - original_min)

# --- 5. Show the DataFrame After Normalization ---
print("DataFrame after Min-Max Normalization:")
print(df)
print("\n")

# Verify the min and max of the normalized column
print("Min of Normalized Feature:", df['Feature_Value_Normalized'].min())
print("Max of Normalized Feature:", df['Feature_Value_Normalized'].max())

# --- Conclusion ---
# The 'Feature_Value' column has been successfully scaled to the range [0, 1].
# Min-max scaling is sensitive to outliers, as they will affect the calculated
# min and max values, potentially compressing the range of the majority of data points.
# Consider outlier handling before applying min-max scaling if necessary.



--- Creating Sample DataFrame ---
Original DataFrame:
   DataPointID  Feature_Value
0            1             10
1            2             25
2            3              5
3            4             40
4            5             15
5            6             30
6            7              8
7            8             35
8            9             20
9           10             45


--- Normalizing 'Feature_Value' to the range [0, 1] ---
Original Min: 5
Original Max: 45

DataFrame after Min-Max Normalization:
   DataPointID  Feature_Value  Feature_Value_Normalized
0            1             10                     0.125
1            2             25                     0.500
2            3              5                     0.000
3            4             40                     0.875
4            5             15                     0.250
5            6             30                     0.625
6            7              8                     0.075
7            8             35        