#  01 – EDA: Messy Data vs Raw Data

# 1. Load the messy Telco churn dataset (`df_messy`)
# 2. Explore schema, types, distributions
# 3. Inspect key categorical and numeric variables
# 4. This notebook is **read-only / analysis-only**: no modification of source CSVs
# 5. Compare structure + quality for Messy vs Raw


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# `sns.set` is only an alias of `sns.set_theme`; this notebook already needs
# seaborn >= 0.11 for `sns.histplot`, so the canonical name is always available.
sns.set_theme(style="whitegrid")

# EDA on Messy Data (`df_messy`)

## 1. Load the processed messy dataset and the original raw data, then compare their structure & quality


In [37]:
# Locations of the two CSVs; this cell only reads them.
RAW_PATH = "../data/raw/telco-Customer-Churn.csv"
MESSY_PATH = "../data/processed/telco-Customer-Churn-messy-data.csv"

# Read the raw file first, then the messy one, both with pandas' default parsing.
df_raw, df_messy = (pd.read_csv(csv_path) for csv_path in (RAW_PATH, MESSY_PATH))

In [None]:
# Report the (rows, columns) shape of the messy frame, then leave `head()` as the
# last expression so the notebook renders the first rows.
print("Messy data shape:", df_messy.shape)
df_messy.head()

In [None]:
# `DataFrame.count()` returns the number of non-null entries in each column.
print("Messy data count (non-null per column):")
print(df_messy.count())

In [None]:
# List the dtype pandas assigned to each column of the messy frame.
print("\nMESSY DATA TYPES:\n")
print(df_messy.dtypes)

In [None]:
# `info()` writes its report straight to stdout, so no `print` wrapper is needed.
print("\nMESSY DATA INFO:\n")
df_messy.info()

In [None]:
# include="all" summarises every column, not only the numeric ones; the result is
# the cell's last expression, so the notebook renders it as a table.
print("\nMESSY DATA DESCRIPTION (numeric + categorical):\n")
df_messy.describe(include="all")

## 2. Helper Functions for Quick Inspection

In [None]:
def print_unique_values_in_df(df, max_cols=20, max_values=20):
    """
    Print the unique values of every text-like column of ``df``.

    A column is reported when its dtype is object, a pandas string dtype, or
    categorical. Output is capped to keep it readable.

    Args:
        df: DataFrame to inspect.
        max_cols: Maximum number of matching columns to report.
        max_values: Maximum number of unique values shown per column.

    Returns:
        None. Everything is written to stdout.
    """
    # Checking only `dtype == "object"` would skip categorical and string-dtype
    # columns, so test each dtype against all three text-like kinds.
    obj_cols = [
        col
        for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    ]
    print(f"\nObject columns (showing up to {max_cols} columns): {len(obj_cols)} total\n")
    for col in obj_cols[:max_cols]:
        unique_vals = df[col].unique()
        n_unique = len(unique_vals)
        print(f"Column: {col}  |  n_unique = {n_unique}")
        print("  Sample unique values:", unique_vals[:max_values])
        print("-" * 60)

def plot_missing_bar(df, title):
    """
    Draw a bar chart of the per-column missing-value counts of ``df``.

    Only columns with at least one missing value are plotted, largest count
    first. When nothing is missing, a one-line message is printed and no figure
    is created. ``title`` labels the dataset in both the chart title and that
    message.
    """
    na_per_column = df.isna().sum()
    has_gaps = na_per_column > 0
    ranked = na_per_column[has_gaps].sort_values(ascending=False)

    # Nothing to draw: report it and stop before any figure is opened.
    if len(ranked) == 0:
        print(f"No missing values in {title}.")
        return

    plt.figure(figsize=(10, 4))
    ranked.plot.bar()
    plt.title(f"Missing Values per Column – {title}")
    plt.ylabel("Count of missing values")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

def plot_numeric_distributions(df, numeric_cols, title_prefix="", bins=30):
    """
    Plot one histogram per column listed in ``numeric_cols``.

    Each column is coerced with ``pd.to_numeric(errors="coerce")`` and its
    non-finite entries (NaN, +inf, -inf) are dropped before plotting, so a
    column that stores numbers as text or contains infinities still yields a
    numeric histogram.

    Args:
        df: DataFrame holding the columns.
        numeric_cols: Iterable of column names to plot; an empty iterable plots
            nothing.
        title_prefix: Text prepended to every figure title.
        bins: Number of histogram bins passed to ``sns.histplot``.

    Returns:
        None. One figure per column is shown via ``plt.show()``.
    """
    for col in numeric_cols:
        # Messy columns may hold numbers as text; unparseable entries become NaN.
        values = pd.to_numeric(df[col], errors="coerce")
        # Infinities cannot be binned, so treat them as missing as well.
        values = values.replace([np.inf, -np.inf], np.nan).dropna()

        plt.figure(figsize=(10, 4))
        sns.histplot(values, bins=bins, kde=False)
        plt.title(f"{title_prefix}Distribution of {col}")
        plt.xlabel(col)
        plt.ylabel("Count")
        plt.tight_layout()
        plt.show()

def plot_boxplots(df, numeric_cols, title_prefix=""):
    """
    Draw one boxplot per column listed in ``numeric_cols``.

    Each column is coerced with ``pd.to_numeric(errors="coerce")`` and its
    non-finite entries (NaN, +inf, -inf) are dropped before plotting, so a
    column that stores numbers as text or contains infinities still yields a
    numeric boxplot.

    Args:
        df: DataFrame holding the columns.
        numeric_cols: Iterable of column names to plot; an empty iterable plots
            nothing.
        title_prefix: Text prepended to every figure title.

    Returns:
        None. One figure per column is shown via ``plt.show()``.
    """
    for col in numeric_cols:
        # Messy columns may hold numbers as text; unparseable entries become NaN.
        values = pd.to_numeric(df[col], errors="coerce")
        # Infinities cannot be placed on a finite axis, so treat them as missing.
        values = values.replace([np.inf, -np.inf], np.nan).dropna()

        plt.figure(figsize=(6, 4))
        sns.boxplot(x=values)
        plt.title(f"{title_prefix}Boxplot of {col}")
        plt.xlabel(col)
        plt.tight_layout()
        plt.show()


## 3. Messy Data – Unique Values for Categorical Columns

In [None]:
# Print up to 20 distinct values for each column the helper reports (at most 25 columns).
print_unique_values_in_df(df_messy, max_cols=25, max_values=20)

## 4. Messy Data – Target Variable (`Churn`) Distribution

In [None]:
# Target balance in the messy data: bar chart of absolute counts, followed by
# the relative share of each label. Skipped with a message if the column is absent.
if "Churn" not in df_messy.columns:
    print("Column 'Churn' not found in messy data.")
else:
    plt.figure(figsize=(4, 4))
    df_messy["Churn"].value_counts().plot.bar()
    plt.title("Churn Distribution (Messy Data)")
    plt.xlabel("Churn")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()
    print(df_messy["Churn"].value_counts(normalize=True))


## 5. Raw Data – Numeric Columns: Distributions & Basic Stats

In [None]:
# Treat SeniorCitizen as a label rather than a measurement: moving it to object
# dtype keeps it out of the numeric column list built below.
if "SeniorCitizen" in df_raw.columns:
    df_raw["SeniorCitizen"] = df_raw["SeniorCitizen"].astype("object")

# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
if "TotalCharges" in df_raw.columns:
    df_raw["TotalCharges"] = pd.to_numeric(df_raw["TotalCharges"], errors="coerce")

# This list is the reference numeric schema reused by later cells.
numeric_cols_raw = list(df_raw.select_dtypes(include="number").columns)
print("Numeric columns (raw):", numeric_cols_raw)

df_raw[numeric_cols_raw].describe()

## 6. Messy Data – Numeric Columns: Distributions & Stats

In [None]:
# Stand-alone look at TotalCharges: coerce to numbers and blank out infinities so
# the descriptive statistics stay finite. df_messy itself is not modified.
if "TotalCharges" in df_messy.columns:
    total_tmp = pd.to_numeric(df_messy["TotalCharges"], errors="coerce")
    total_tmp_no_inf = total_tmp.replace([np.inf, -np.inf], np.nan)
    print(
        "Messy TotalCharges summary (coerced to numeric, inf treated as NaN):\n",
        total_tmp_no_inf.describe(),
    )

# The raw frame's numeric columns (computed in an earlier cell) define which
# columns are expected to be numeric; keep those that exist in the messy frame.
numeric_cols_expected = numeric_cols_raw
numeric_cols_messy = list(
    filter(lambda name: name in df_messy.columns, numeric_cols_expected)
)

print("Numeric columns expected (from raw):", numeric_cols_expected)
print("Numeric columns present in messy:", numeric_cols_messy)

if not numeric_cols_messy:
    print("No numeric-like columns found in df_messy based on the raw schema.")
else:
    # Column-wise coercion into a separate frame, used for this summary only.
    messy_numeric = df_messy[numeric_cols_messy].apply(pd.to_numeric, errors="coerce")
    print("\nMessy numeric columns summary (coerced):")
    print(messy_numeric.describe())

In [None]:
# One histogram per raw-schema numeric column that also exists in the messy frame.
# NOTE(review): df_messy itself is passed here, not the coerced `messy_numeric`
# frame built in the previous cell — confirm these columns are numeric when plotted.
plot_numeric_distributions(df_messy, numeric_cols_messy, title_prefix="Messy – ")

In [None]:
# One boxplot per raw-schema numeric column that also exists in the messy frame.
# NOTE(review): df_messy itself is passed here, not the coerced `messy_numeric`
# frame built earlier — confirm these columns are numeric when plotted.
plot_boxplots(df_messy, numeric_cols_messy, title_prefix="Messy – ")

## 7. Messy Data – Missingness Overview

In [None]:
# Per-column NaN counts as text first, then the same counts as a bar chart
# (the helper prints a message instead when no column has missing values).
print("Missing values per column in MESSY data:")
print(df_messy.isna().sum())

plot_missing_bar(df_messy, title="Messy Data")

## 8. Raw vs Messy – Missingness Comparison

In [None]:
# Per-column NaN counts for each dataset.
raw_missing = df_raw.isna().sum()
messy_missing = df_messy.isna().sum()

# Side-by-side table with the change in missing values (messy minus raw),
# ordered so the largest increase comes first.
missing_compare = (
    pd.DataFrame({"raw_missing": raw_missing, "messy_missing": messy_missing})
    .assign(delta=lambda counts: counts["messy_missing"] - counts["raw_missing"])
    .sort_values("delta", ascending=False)
)

print("Missingness comparison (messy - raw):")
missing_compare

In [None]:
# Bar chart of the per-column change in missing-value counts (messy minus raw),
# in the order `missing_compare` was sorted into (descending delta).
plt.figure(figsize=(10, 4))
# Series.plot draws onto the figure that was just made current.
missing_compare["delta"].plot(kind="bar")
plt.title("Increase in Missing Values: Messy vs Raw")
plt.ylabel("Δ missing (messy - raw)")
# Rotate the column names so long labels do not overlap.
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

## 9. Raw vs Messy – Example Column Inspection

In [None]:
# Spot-check a few columns: the ten most frequent values (NaN counted as a value)
# in the raw frame, followed by the same view of the messy frame.
cols_to_inspect = ["InternetService", "Contract", "PaymentMethod", "SeniorCitizen"]
for col in cols_to_inspect:
    # Only columns that exist in both frames can be compared.
    if col not in df_raw.columns or col not in df_messy.columns:
        continue

    print(f"\n=== Column: {col} ===")
    for heading, frame in (
        ("Raw value counts (top 10):", df_raw),
        ("\nMessy value counts (top 10):", df_messy),
    ):
        print(heading)
        print(frame[col].value_counts(dropna=False).head(10))
    print("-" * 60)

## 10. Save Simple Summary for Reference (Optional)

In [None]:
def _total_missing(frame):
    """Return the number of missing cells in ``frame`` as a plain int."""
    return int(frame.isna().sum().sum())


def _churn_shares(frame):
    """Return the relative frequency of each Churn label, or {} if the column is absent."""
    if "Churn" not in frame.columns:
        return {}
    return frame["Churn"].value_counts(normalize=True).to_dict()


# Headline numbers for both datasets, kept in memory only (nothing is written to disk).
summary = {
    "raw_shape": df_raw.shape,
    "messy_shape": df_messy.shape,
    "raw_missing_total": _total_missing(df_raw),
    "messy_missing_total": _total_missing(df_messy),
    "raw_churn_dist": _churn_shares(df_raw),
    "messy_churn_dist": _churn_shares(df_messy),
}

print("Summary of Raw vs Messy Data:")
print(summary)

# Same content as a one-column table; as the last expression it is rendered by the notebook.
summary_s = pd.Series(summary, name="value")
summary_df = summary_s.to_frame()
summary_df