### Garbage In, Garbage Out (GIGO): Cleaning Missing Data
**Description**: Load a dataset (e.g., Titanic dataset) and identify missing values. Use
appropriate techniques to handle these missing values.

In [1]:
# --- Step 1: Import Required Libraries ---
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Optional: Uncomment below if missingno is not installed
# !pip install missingno

import missingno as msno

# --- Step 2: Load the Titanic Dataset ---
df = sns.load_dataset('titanic')
print("Initial Dataset Shape:", df.shape)
display(df.head())

# --- Step 3: Check for Missing Values ---
print("\nMissing Values Per Column:\n")
print(df.isnull().sum())

# --- Step 4: Visualize Missing Data ---
msno.matrix(df)
plt.title("Missing Values Visualization")
plt.show()

# --- Step 5: Drop Columns with Too Many Missing Values ---
threshold = 0.5  # Drop columns with more than 50% missing data
cols_to_drop = [col for col in df.columns if df[col].isnull().mean() > threshold]
df.drop(columns=cols_to_drop, inplace=True)
print("\nDropped Columns:", cols_to_drop)

# --- Step 6: Fill Remaining Missing Values ---

# Fill numeric column 'age' with median
df['age'].fillna(df['age'].median(), inplace=True)

# Fill categorical columns with mode
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

# For 'embark_town', ensure 'Unknown' is added to the categories before filling
if df['embark_town'].dtype.name == 'category':
    df['embark_town'] = df['embark_town'].cat.add_categories('Unknown')

df['embark_town'].fillna('Unknown', inplace=True)

# Just in case any are still left (safety net)
df.fillna("Unknown", inplace=True)

# --- Step 7: Verify Cleanup ---
print("\nRemaining Missing Values After Cleaning:\n")
print(df.isnull().sum())
print("\nFinal Dataset Shape:", df.shape)

ModuleNotFoundError: No module named 'missingno'