In [10]:
import pandas as pd
import numpy as np

# Paths
# Input workbook to be profiled (absolute, machine-specific location).
data_path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/merged_codige_wide_english_values_translated.xlsx"
# Output workbook; it receives the "column_index" sheet and the profiling sheets.
mapping_book_path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/outputs/profiling/column_index_mapping.xlsx"

# Load data
# No sheet_name is passed, so pandas reads the workbook's first sheet.
df = pd.read_excel(data_path)

# --- Create (or recreate) the column index mapping base sheet ---
def excel_col_letter(n):
    """Return the Excel-style column label for a 1-based column number.

    The label uses bijective base-26 with the letters A-Z, so 1 -> "A",
    26 -> "Z", 27 -> "AA" and so on. Numbers below 1 produce an empty string.
    """
    letters = []
    remaining = n
    while remaining > 0:
        # Shift by one before dividing: the numbering has no zero digit.
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord("A") + offset))
    # Digits were collected least-significant first.
    return "".join(reversed(letters))

# One row per DataFrame column: its name and the spreadsheet letter that
# matches its 1-based position (first column -> "A").
col_map_df = pd.DataFrame({
    "Column Name": df.columns,
    "Excel Column Letter": [
        excel_col_letter(position)
        for position, _ in enumerate(df.columns, start=1)
    ],
})

# Opened in the default write mode, so the workbook is created from scratch
# and initially holds only the "column_index" sheet.
with pd.ExcelWriter(mapping_book_path, engine="xlsxwriter") as writer:
    col_map_df.to_excel(writer, sheet_name="column_index", index=False)

# ------------------ Build profiling pieces ------------------
# 1) Dtypes -- the dtype of every column, rendered as text.
dtypes_df = pd.DataFrame({
    "column": df.columns,
    "dtype": list(map(str, df.dtypes)),
})

# 2) Missingness -- count and percentage of missing cells per column,
#    most incomplete columns first.
_na_mask = df.isna()
missing_df = pd.DataFrame({
    "column": df.columns,
    "missing_count": _na_mask.sum().to_numpy(),
    "missing_pct": (_na_mask.mean().to_numpy() * 100).round(2),
})
missing_df = missing_df.sort_values(by="missing_pct", ascending=False)

# 3) Cardinality / Uniques -- number of distinct non-missing values per
#    column, highest cardinality first.
card_df = pd.DataFrame({
    "column": df.columns,
    "n_unique": df.nunique(dropna=True).to_numpy(),
})
card_df = card_df.sort_values(by="n_unique", ascending=False)

# 4) Numeric summary -- describe() statistics, one row per numeric column.
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
if num_cols:
    numeric_summary = df[num_cols].describe(percentiles=[0.25, 0.5, 0.75]).transpose()
else:
    # No numeric columns: keep an empty frame so later code can test `.empty`.
    numeric_summary = pd.DataFrame()

if not numeric_summary.empty:
    numeric_summary = (
        numeric_summary
        .rename(columns={"25%": "p25", "50%": "median", "75%": "p75"})
        .reset_index()
        .rename(columns={"index": "column"})
    )

# 5) Categorical summary (object columns)
# For every object-dtype column: the number of distinct non-missing values,
# the most frequent value and how often it occurs.
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
cat_rows = []
for c in cat_cols:
    s = df[c].astype("string")
    # dropna=False: missing values compete for "top", so a mostly-empty
    # column reports <NA> as its most frequent value.
    vc = s.value_counts(dropna=False)
    has_values = len(vc) > 0  # False only when the column has no rows at all
    cat_rows.append({
        "column": c,
        "n_unique": int(s.nunique(dropna=True)),
        "top": vc.index[0] if has_values else np.nan,
        "top_freq": int(vc.iloc[0]) if has_values else np.nan,
    })

# Name the columns explicitly: when there are no object columns `cat_rows` is
# empty, and a frame built from an empty list has no "n_unique" column, which
# would make sort_values raise KeyError.
categorical_summary = pd.DataFrame(
    cat_rows, columns=["column", "n_unique", "top", "top_freq"]
).sort_values("n_unique", ascending=False)

# 6) Datetime summary (if any) -- range, missing count and distinct count
#    for each datetime column.
dt_cols = df.select_dtypes(
    include=["datetime64[ns]", "datetime64[ns, UTC]"]
).columns.tolist()

dt_rows = []
for col_name in dt_cols:
    stamps = pd.to_datetime(df[col_name], errors="coerce")
    earliest, latest = stamps.min(), stamps.max()
    dt_rows.append(dict(
        column=col_name,
        min_date=earliest,
        max_date=latest,
        n_missing=stamps.isna().sum(),
        n_unique=stamps.nunique(dropna=True),
    ))

# Stays an empty frame when no datetime columns were selected.
datetime_summary = pd.DataFrame(dt_rows)

# 7) Overview -- a single-row frame with the headline counts of the dataset.
_row_count, _column_count = df.shape[0], df.shape[1]
overview = pd.DataFrame([{
    "n_rows": _row_count,
    "n_columns": _column_count,
    "n_numeric": len(num_cols),
    "n_categorical": len(cat_cols),
    "n_datetime": len(dt_cols),
    # Number of columns that contain at least one missing value.
    "missing_any_cols": missing_df["missing_count"].gt(0).sum(),
}])

# Append profiling sheets
# (sheet name, frame, write even when empty) in the order the sheets are added.
_profiling_sheets = [
    ("overview", overview, True),
    ("dtypes", dtypes_df, True),
    ("missingness", missing_df, True),
    ("cardinality", card_df, True),
    ("numeric_summary", numeric_summary, False),
    ("categorical_summary", categorical_summary, True),
    ("datetime_summary", datetime_summary, False),
]

# Append mode keeps the sheets already in the workbook; a sheet whose name
# already exists is replaced rather than duplicated.
with pd.ExcelWriter(mapping_book_path, mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
    for _sheet_name, _frame, _write_when_empty in _profiling_sheets:
        if _frame.empty and not _write_when_empty:
            continue
        _frame.to_excel(writer, sheet_name=_sheet_name, index=False)

# Last expression of the cell: echoes the path of the workbook just written.
mapping_book_path


'C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/outputs/profiling/column_index_mapping.xlsx'