# In [6]:
# Question: Evaluating Data Distribution
# Description: Analyze the distribution of a numeric column using histograms and descriptive statistics.

import pandas as pd
import matplotlib.pyplot as plt

def evaluate_distribution(data, column_name, bins=10, *, show=True):
    """Summarize and optionally plot the distribution of one column.

    The column is coerced to numeric with ``pd.to_numeric(errors='coerce')``,
    so entries that cannot be parsed become NaN and are dropped. Infinite
    values are dropped as well: they make the mean/std meaningless and the
    histogram cannot compute a finite bin range for them.

    Args:
        data: Anything accepted by the ``pd.DataFrame`` constructor.
        column_name: Label of the column to analyze.
        bins: Passed through to the histogram's ``bins`` argument.
            Defaults to 10.
        show: When True (default) draw and display the histogram. When
            False, only the statistics are printed and returned, and
            matplotlib is not touched.

    Returns:
        The result of ``Series.describe()`` on the cleaned column.

    Raises:
        ValueError: If ``column_name`` is not a column of the data, or if
            no finite numeric values remain after cleaning.
    """
    df = pd.DataFrame(data)

    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the dataset.")

    # Coerce to numeric; unparseable entries become NaN and are dropped.
    col_data = pd.to_numeric(df[column_name], errors='coerce').dropna()
    # dropna() keeps +/-inf, which would make the histogram range
    # non-finite and raise, so remove them explicitly.
    col_data = col_data[~col_data.isin([float('inf'), float('-inf')])]

    if col_data.empty:
        raise ValueError(f"Column '{column_name}' has no valid numeric data.")

    # Descriptive statistics
    description = col_data.describe()

    print("Descriptive Statistics:")
    print(description)

    if show:
        fig, ax = plt.subplots(figsize=(8, 5))
        try:
            ax.hist(col_data, bins=bins, color='skyblue', edgecolor='black')
            ax.set_title(f'Distribution of {column_name}')
            ax.set_xlabel(column_name)
            ax.set_ylabel('Frequency')
            ax.grid(True)
            fig.tight_layout()
            plt.show()
        finally:
            # Release the figure so repeated calls do not accumulate open
            # figures (non-interactive backends do not close on show()).
            plt.close(fig)

    return description
