In [1]:
# Part 1: Detect & Handle Missing Data

# Task 1: Detect Missing Data
#     1. Load the Data:
#     2. Detect Missing Data:



# Task 2: Handle Missing Data by Dropping
#     1. Drop Rows with Missing Values:
        
        

# Task 3: Handle Missing Data by Imputation
#     1. Fill Missing Values:
        
import pandas as pd
import numpy as np

# --- Part 1: Detect & Handle Missing Data ---
# This script demonstrates how to detect missing values, handle them by dropping
# rows/columns, and handle them by imputation (filling values).

# --- Task 1: Detect Missing Data ---

# 1. Load the Data:
print("--- Task 1: Detect Missing Data ---")
print("--- 1. Load the Data (Creating Sample Data) ---")

# Create a sample DataFrame with missing values
data = {
    'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'ProductName': ['Laptop', 'Keyboard', np.nan, 'Monitor', 'Webcam', 'Printer', 'Mouse', 'Keyboard', np.nan, 'Monitor'], # Missing Product Name
    'Price': [1200.50, 75.00, 25.99, 300.00, 50.00, 250.00, 150.00, np.nan, 70.00, 80.00], # Missing Price
    'StockQuantity': [10, 50, 15, 8, np.nan, 5, 12, 30, 8, 20], # Missing Stock Quantity
    'Category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Audio', 'Electronics', 'Audio', np.nan], # Missing Category
    'Rating': [4, 5, 3, 4, 5, 4, 3, 5, 4, 4] # No missing values
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print("\n")

# 2. Detect Missing Data:
print("--- 2. Detect Missing Data ---")

# Use the isnull() method to find missing values.
# This returns a boolean DataFrame of the same shape.
print("Boolean DataFrame indicating missing values:")
print(df.isnull())
print("\n")

# Summarize missing data: Use the sum() function to count the number of missing values in each column.
print("Count of missing values per column:")
print(df.isnull().sum())
print("\n")

# --- Task 2: Handle Missing Data by Dropping ---
print("--- Task 2: Handle Missing Data by Dropping ---")

# Create a fresh copy of the original DataFrame for dropping examples
df_for_dropping = df.copy()

# 1. Drop Rows with Missing Values:
print("--- 1. Drop Rows with Missing Values ---")
# Use the dropna() method to remove rows with missing values.
# By default, dropna() removes rows (axis=0) that have *any* missing values (how='any').
df_rows_dropped = df_for_dropping.dropna(axis=0, how='any')

print("DataFrame after dropping rows with any missing values:")
print(df_rows_dropped)
print(f"\nOriginal number of rows: {len(df_for_dropping)}")
print(f"Number of rows after dropping: {len(df_rows_dropped)}")
print("\n")

# Demonstrate dropping rows only if *all* values in the row are missing
# (Less common, but possible with how='all')
# df_rows_dropped_all = df_for_dropping.dropna(axis=0, how='all')
# print("DataFrame after dropping rows where ALL values are missing:")
# print(df_rows_dropped_all)
# print("\n")


# Demonstrate dropping columns with missing values
print("--- Drop Columns with Missing Values ---")
# Use dropna() with axis=1 to remove columns.
# By default, how='any' is used, removing columns with *any* missing values.
df_cols_dropped = df_for_dropping.dropna(axis=1, how='any')

print("DataFrame after dropping columns with any missing values:")
print(df_cols_dropped)
print(f"\nOriginal columns: {df_for_dropping.columns.tolist()}")
print(f"Columns after dropping: {df_cols_dropped.columns.tolist()}")
print("\n")

# Demonstrate dropping columns only if *all* values in the column are missing
# (Useful if you have entirely empty columns)
# df_cols_dropped_all = df_for_dropping.dropna(axis=1, how='all')
# print("DataFrame after dropping columns where ALL values are missing:")
# print(df_cols_dropped_all)
# print("\n")


# --- Task 3: Handle Missing Data by Imputation ---
print("--- Task 3: Handle Missing Data by Imputation ---")

# Create a fresh copy of the original DataFrame for imputation examples
df_for_imputation = df.copy()

# 1. Fill Missing Values:
print("--- 1. Fill Missing Values ---")

# Example 1: Mean Imputation for a Numerical Column ('Price')
print("Applying Mean Imputation to 'Price'...")
mean_price = df_for_imputation['Price'].mean()
df_for_imputation['Price'].fillna(mean_price, inplace=True)
print(f"Filled missing 'Price' values with the mean ({mean_price:.2f}).")
print("\n")

# Example 2: Mode Imputation for a Categorical Column ('Category')
print("Applying Mode Imputation to 'Category'...")
# .mode() can return multiple values; take the first one [0]
mode_category = df_for_imputation['Category'].mode()[0] if not df_for_imputation['Category'].mode().empty else "Unknown"
df_for_imputation['Category'].fillna(mode_category, inplace=True)
print(f"Filled missing 'Category' values with the mode ('{mode_category}').")
print("\n")

# Example 3: Filling with a Constant Value ('ProductName')
print("Applying Constant Value Imputation to 'ProductName'...")
constant_fill_value = 'Unknown Product'
df_for_imputation['ProductName'].fillna(constant_fill_value, inplace=True)
print(f"Filled missing 'ProductName' values with the constant '{constant_fill_value}'.")
print("\n")

# Example 4: Forward Fill ('StockQuantity') - assuming some order might matter, though not strictly time series here
print("Applying Forward Fill (ffill) to 'StockQuantity'...")
df_for_imputation['StockQuantity'].fillna(method='ffill', inplace=True)
print("Applied ffill to 'StockQuantity'.")
print("\n")

# Note: After ffill, there might still be NaNs at the beginning if the first value was NaN.
# You might need a bfill or other method to handle those if necessary.

print("DataFrame after Imputation:")
print(df_for_imputation)
print("\n")

print("Count of missing values after imputation:")
print(df_for_imputation.isnull().sum()) # Should show 0 for the columns we imputed
print("\n")

# --- Conclusion ---
# This script covered detecting missing data, handling it by dropping
# rows or columns, and handling it by imputing values using mean, mode,
# a constant value, and forward fill. The best method depends on the data
# and the specific analysis or modeling task.


--- Task 1: Detect Missing Data ---
--- 1. Load the Data (Creating Sample Data) ---
Original DataFrame:
   CustomerID ProductName    Price  StockQuantity     Category  Rating
0           1      Laptop  1200.50           10.0  Electronics       4
1           2    Keyboard    75.00           50.0  Electronics       5
2           3         NaN    25.99           15.0  Electronics       3
3           4     Monitor   300.00            8.0  Electronics       4
4           5      Webcam    50.00            NaN  Electronics       5
5           6     Printer   250.00            5.0  Electronics       4
6           7       Mouse   150.00           12.0        Audio       3
7           8    Keyboard      NaN           30.0  Electronics       5
8           9         NaN    70.00            8.0        Audio       4
9          10     Monitor    80.00           20.0          NaN       4


--- 2. Detect Missing Data ---
Boolean DataFrame indicating missing values:
   CustomerID  ProductName  Price  St

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_for_imputation['Price'].fillna(mean_price, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_for_imputation['Category'].fillna(mode_category, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate objec