# Assignment 1:
### - The automated_stat_analyzer Function
- Scenario: A retail company needs a utility to quickly summarize sales data. Students must create a function that identifies the 
"Central Tendency" and "Dispersion" of any numerical column.
- ### Requirements:

* Accept a Pandas DataFrame and a column name.

* Calculate the Mean, Median, and Standard Deviation.

* Identify if the data is "Skewed" by comparing the Mean and Median.


* Bonus: If the column is categorical, return the Mode instead.

### Your Data

In [4]:
import pandas as pd
import numpy as np

# Create a synthetic Company Sales Dataset (10 transactions, 5 columns)
data = {
    'Transaction_ID': range(1, 11),
    'Product_Category': ['Electronics', 'Home', 'Electronics', 'Sports', 'Home', 
                         'Electronics', 'Home', 'Sports', 'Electronics', 'Electronics'],
    'Sales_Amount': [150, 200, 155, 300, 210, 180, 205, 1000, 190, 160], # 1000 is an Outlier
    'Customer_Age': [25, 34, np.nan, 45, 23, 31, 29, np.nan, 38, 40],    # Contains Nulls (NaN)
    'Rating': [5, 4, 3, 5, 2, 4, 5, 2, 4, 3]
}

# Shared test frame used by the assignment cells below
df_test = pd.DataFrame(data)

# Save to CSV for students to practice loading files;
# index=False keeps the row index out of the written file.
df_test.to_csv('company_sales_test.csv', index=False)
print("Test dataset created successfully!")

Test dataset created successfully!


In [2]:
# Preview the first rows of the test dataset (notebook cell display)
df_test.head()

Unnamed: 0,Transaction_ID,Product_Category,Sales_Amount,Customer_Age,Rating
0,1,Electronics,150,25.0,5
1,2,Home,200,34.0,4
2,3,Electronics,155,,3
3,4,Sports,300,45.0,5
4,5,Home,210,23.0,2


In [None]:
import pandas as pd

def automated_stat_analyzer(df, column_name, skew_tolerance=0.1):
    """
    Company Task: Provide a summary report of a specific data variable.

    Numerical columns (any integer or float dtype, booleans excluded) are
    summarised by mean, median and standard deviation plus a simple
    skewness flag. Every other column is treated as categorical and
    summarised by its mode.

    Args:
        df: DataFrame holding the column to analyse.
        column_name: Label of the column in ``df``.
        skew_tolerance: Largest absolute gap between mean and median that
            is still reported as "Symmetric". The default of 0.1 matches
            the threshold this function has always used; pass a larger
            value for columns measured on a larger scale.

    Returns:
        For numerical columns, a dict with the keys ``column``, ``type``
        (``'numerical'``), ``mean``, ``median``, ``std_dev`` (each rounded
        to 2 decimals) and ``skewness`` (``"Skewed"`` or ``"Symmetric"``).
        For all other columns, a dict with the keys ``column``, ``type``
        (``'categorical'``) and ``mode`` (``None`` when the column has no
        non-null values).

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    series = df[column_name]

    # Check the dtype *kind* instead of the exact names 'int64'/'float64',
    # so int32, float32 and nullable numeric columns are not misreported
    # as categorical. Booleans stay on the categorical path.
    is_numerical = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )

    if is_numerical:
        # Numerical column processing (pandas skips NaN values by default)
        mean_val = series.mean()
        median_val = series.median()
        std_val = series.std()

        # Determine skewness by comparing mean and median
        if abs(mean_val - median_val) > skew_tolerance:
            skew_status = "Skewed"
        else:
            skew_status = "Symmetric"

        return {
            'column': column_name,
            'type': 'numerical',
            'mean': round(mean_val, 2),
            'median': round(median_val, 2),
            'std_dev': round(std_val, 2),
            'skewness': skew_status
        }

    # Categorical column processing: compute the modes once and take the
    # first one; an all-null column has no mode at all.
    modes = series.mode()
    mode_val = modes.iloc[0] if not modes.empty else None

    return {
        'column': column_name,
        'type': 'categorical',
        'mode': mode_val
    }

# Run the analyzer on three columns of the test frame and print each report
def _print_report(report):
    """Print every entry of a report dict as an indented "key: value" line."""
    for field, val in report.items():
        print(f"  {field}: {val}")


# Sales_Amount: integer column that contains one large outlier
print("Analysis for Sales_Amount:")
result_sales = automated_stat_analyzer(df_test, 'Sales_Amount')
_print_report(result_sales)

# Product_Category: text column
print("\nAnalysis for Product_Category:")
result_category = automated_stat_analyzer(df_test, 'Product_Category')
_print_report(result_category)

# Customer_Age: numeric column that contains NaN values
print("\nAnalysis for Customer_Age:")
result_age = automated_stat_analyzer(df_test, 'Customer_Age')
_print_report(result_age)

## Assignment 2: 
  ### The null_handling_strategy Function


#### Scenario: Incoming user data often has missing values. Students must implement a flexible strategy to handle these "Null Values" to prepare data for Machine Learning.
### Requirements:

* Check for null values in the DataFrame.

* Apply a strategy based on parameters: "drop_rows", "fill_mean", or "fill_median".

* Ensure the function only fills numerical columns when using mean or median.

In [None]:
def null_handling_strategy(df, strategy="fill_mean"):
    """
    Company Task: Clean a dataset by resolving missing (NaN) values.

    The input frame is never modified; a new DataFrame is returned.

    Args:
        df: DataFrame that may contain missing values.
        strategy: One of
            ``"drop_rows"``   - remove every row that has any null value;
            ``"fill_mean"``   - fill nulls in numerical columns with the
                                column mean;
            ``"fill_median"`` - fill nulls in numerical columns with the
                                column median.
            Non-numerical columns are left untouched by the fill strategies.

    Returns:
        A cleaned copy of ``df``.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    # Validate first so an unknown strategy fails before any work is done.
    if strategy not in ("drop_rows", "fill_mean", "fill_median"):
        raise ValueError(f"Unknown strategy: {strategy}")

    if strategy == "drop_rows":
        # dropna() already returns a new frame, leaving the input untouched
        return df.dropna()

    df_copy = df.copy()

    # "number" selects every integer/float width, not only int64/float64,
    # and leaves boolean and text columns out.
    numerical_cols = df_copy.select_dtypes(include="number").columns
    for col in numerical_cols:
        column = df_copy[col]
        if strategy == "fill_mean":
            fill_value = column.mean()
        else:
            fill_value = column.median()
        # Assign the result back instead of `df_copy[col].fillna(...,
        # inplace=True)`: that chained in-place call acts on a temporary
        # object and does not update the frame under pandas Copy-on-Write.
        df_copy[col] = column.fillna(fill_value)

    return df_copy

# Demonstrate each null-handling strategy against the shared test frame
def _banner(title):
    """Print a title framed above and below by a 50-character rule."""
    rule = "=" * 50
    print("\n" + rule)
    print(title)
    print(rule)


# Show the starting point: the raw frame and its per-column null counts
print("Original DataFrame with Missing Values:")
print(df_test)
null_counts_before = df_test.isnull().sum()
print(f"\nNull counts:\n{null_counts_before}")

# drop_rows: rows containing any null are removed, so the shape shrinks
_banner("Strategy: drop_rows")
df_dropped = null_handling_strategy(df_test, strategy="drop_rows")
print(df_dropped)
print(f"Shape changed from {df_test.shape} to {df_dropped.shape}")

# fill_mean: report the null counts that remain afterwards
_banner("Strategy: fill_mean")
df_filled_mean = null_handling_strategy(df_test, strategy="fill_mean")
print(df_filled_mean)
print(f"\nNull counts after fill_mean:\n{df_filled_mean.isnull().sum()}")

# fill_median: report the null counts that remain afterwards
_banner("Strategy: fill_median")
df_filled_median = null_handling_strategy(df_test, strategy="fill_median")
print(df_filled_median)
print(f"\nNull counts after fill_median:\n{df_filled_median.isnull().sum()}")

In [None]:
# Recap of both assignments, using the results computed in the cells above
heavy_rule = "=" * 60
light_rule = "-" * 60

print("\n" + heavy_rule)
print("PANDAS ASSIGNMENT SUMMARY")
print(heavy_rule)

# --- Assignment 1 -----------------------------------------------------
print("\n✓ Assignment 1: automated_stat_analyzer")
print(light_rule)
for summary_line in (
    "Calculates Mean, Median, Std Dev for numerical columns",
    "Calculates Mode for categorical columns",
    "Detects if data is skewed by comparing mean vs median",
):
    print(summary_line)

print("\n✓ Test Results for Sales_Amount:")
# Pair each printed label with the key it reads from the report dict
for label, field in (
    ("Mean", "mean"),
    ("Median", "median"),
    ("Std Dev", "std_dev"),
    ("Skewness", "skewness"),
):
    print(f"  {label}: {result_sales[field]}")

print("\n✓ Test Results for Product_Category:")
category_mode = result_category['mode']
print(f"  Mode: {category_mode}")

# --- Assignment 2 -----------------------------------------------------
print("\n✓ Assignment 2: null_handling_strategy")
print(light_rule)
print("Three strategies for handling missing values:")
for strategy_line in (
    "  1. drop_rows - Remove rows with any NaN",
    "  2. fill_mean - Fill numerical columns with mean",
    "  3. fill_median - Fill numerical columns with median",
):
    print(strategy_line)

print("\n✓ Test Results:")
# Shapes are read one frame at a time, in the same order as they are printed
print(f"  Original shape: {df_test.shape}")
print(f"  After drop_rows: {df_dropped.shape}")
print(f"  After fill_mean: {df_filled_mean.shape}")
print(f"  After fill_median: {df_filled_median.shape}")

print("\n" + heavy_rule)
print("All assignments completed successfully!")
print(heavy_rule)