In [1]:
# Activity 1: Handling Missing Data

# Task A: Dropping vs Imputation

# 1. Dropping Missing Data:
# - Load a dataset (e.g., a CSV file with some missing values like employees.csv ).
# - Inspect the dataset for missing values using a Python library (e.g., Pandas).
# - Drop rows with missing data and save the result.






# 2. Imputation using Mean:
# - Use the same dataset.
# - Fill missing numerical values with the column mean.
# - Save and display the modified data.









# 3. Imputation using Median and Mode:
# - For numerical columns, replace missing values with the median.
# - For categorical columns, use the mode.
# - Display the updated dataset.

import pandas as pd
import numpy as np

# Sample data creation function (replace with your actual data loading)
def create_sample_data():
    data = {
        'EmployeeID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', None, 'Ivy', 'Jack'],
        'Age': [25, 30, None, 28, 35, None, 40, 45, None, 50],
        'Department': ['HR', 'IT', 'IT', None, 'Finance', 'Finance', None, 'HR', 'IT', 'HR'],
        'Salary': [50000, 60000, None, None, 80000, 90000, 100000, 110000, None, 120000],
        'Experience': [2, 5, 1, None, 10, 8, 12, None, 3, 15]
    }
    return pd.DataFrame(data)

# 1. Dropping Missing Data
def drop_missing_data(df):
    print("\n=== Original Data ===")
    print(df)
    
    print("\n=== Missing Values Summary ===")
    print(df.isnull().sum())
    
    # Drop rows with any missing values
    df_dropped = df.dropna()
    
    print("\n=== After Dropping Missing Values ===")
    print(f"Rows dropped: {len(df) - len(df_dropped)}")
    print(f"Remaining rows: {len(df_dropped)}")
    
    # Save to new file
    df_dropped.to_csv('employees_dropped.csv', index=False)
    print("\nSaved cleaned data to 'employees_dropped.csv'")
    
    return df_dropped

# 2. Imputation using Mean
def impute_with_mean(df):
    print("\n=== Imputing Numerical Columns with Mean ===")
    
    # Select numerical columns
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    
    # Create a copy for imputation
    df_mean = df.copy()
    
    # Fill numerical columns with mean
    for col in num_cols:
        col_mean = df_mean[col].mean()
        df_mean[col].fillna(col_mean, inplace=True)
        print(f"Imputed {col} with mean: {col_mean:.2f}")
    
    print("\n=== After Mean Imputation ===")
    print(df_mean)
    
    # Save to new file
    df_mean.to_csv('employees_mean_imputed.csv', index=False)
    print("\nSaved mean-imputed data to 'employees_mean_imputed.csv'")
    
    return df_mean

# 3. Imputation using Median and Mode
def impute_with_median_mode(df):
    print("\n=== Imputing with Median (Numerical) and Mode (Categorical) ===")
    
    # Create a copy for imputation
    df_imputed = df.copy()
    
    # Impute numerical columns with median
    num_cols = df_imputed.select_dtypes(include=['int64', 'float64']).columns
    for col in num_cols:
        col_median = df_imputed[col].median()
        df_imputed[col].fillna(col_median, inplace=True)
        print(f"Imputed {col} with median: {col_median:.2f}")
    
    # Impute categorical columns with mode
    cat_cols = df_imputed.select_dtypes(include=['object']).columns
    for col in cat_cols:
        col_mode = df_imputed[col].mode()[0]
        df_imputed[col].fillna(col_mode, inplace=True)
        print(f"Imputed {col} with mode: {col_mode}")
    
    print("\n=== After Median/Mode Imputation ===")
    print(df_imputed)
    
    # Save to new file
    df_imputed.to_csv('employees_median_mode_imputed.csv', index=False)
    print("\nSaved median/mode-imputed data to 'employees_median_mode_imputed.csv'")
    
    return df_imputed

# Main execution
if __name__ == "__main__":
    # Load or create sample data
    employees_df = create_sample_data()
    
    # 1. Dropping missing data approach
    print("\n" + "="*50)
    print("1. DROPPING MISSING DATA APPROACH")
    print("="*50)
    dropped_df = drop_missing_data(employees_df)
    
    # 2. Mean imputation approach
    print("\n" + "="*50)
    print("2. MEAN IMPUTATION APPROACH")
    print("="*50)
    mean_imputed_df = impute_with_mean(employees_df)
    
    # 3. Median/mode imputation approach
    print("\n" + "="*50)
    print("3. MEDIAN/MODE IMPUTATION APPROACH")
    print("="*50)
    median_mode_df = impute_with_median_mode(employees_df)







1. DROPPING MISSING DATA APPROACH

=== Original Data ===
   EmployeeID     Name   Age Department    Salary  Experience
0           1    Alice  25.0         HR   50000.0         2.0
1           2      Bob  30.0         IT   60000.0         5.0
2           3  Charlie   NaN         IT       NaN         1.0
3           4    David  28.0       None       NaN         NaN
4           5      Eve  35.0    Finance   80000.0        10.0
5           6    Frank   NaN    Finance   90000.0         8.0
6           7    Grace  40.0       None  100000.0        12.0
7           8     None  45.0         HR  110000.0         NaN
8           9      Ivy   NaN         IT       NaN         3.0
9          10     Jack  50.0         HR  120000.0        15.0

=== Missing Values Summary ===
EmployeeID    0
Name          1
Age           3
Department    2
Salary        3
Experience    2
dtype: int64

=== After Dropping Missing Values ===
Rows dropped: 6
Remaining rows: 4

Saved cleaned data to 'employees_dropped.csv'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mean[col].fillna(col_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(col_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [2]:
# Task B: Predictive Imputation

# 4. ML-based Imputation with Simple Imputer:
# - Use SimpleImputer from sklearn to fill missing values.
# - Choose a strategy (e.g., mean) and apply it to the dataset.





# 5. Imputation using a Regression Model:
# - Use a regression model to predict missing values.
# - Train the model on complete cases and fill the missing data.




# 6. K-Nearest Neighbors Imputation:
# - Use KNNImputer from sklearn .
# - Impute missing data based on neighbors' information.




