In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

# --- Read the raw customer data (adjust the path for your environment) ---
original = pd.read_csv("customer_train.csv")

# `ds_jobs` is a second name for the same frame as `original`; the frame is
# reported as-is in the memory summary at the end of the script.
ds_jobs = original

# --- Independent working copy that receives every transformation ---
ds_jobs_transformed = ds_jobs.copy()

# ---------------------------
# Explicit: map job_change -> bool
# ---------------------------
def _to_bool_val(x):
    """Coerce one raw cell value to ``True``, ``False`` or ``pd.NA``.

    Recognised inputs:

    * missing values (``None``, ``NaN``, ``pd.NA``) -> ``pd.NA``
    * real booleans (``bool`` / ``np.bool_``) -> the same truth value
    * numbers equal to 1 or 0 (``1``, ``1.0``, ``np.int64(0)`` ...) ->
      ``True`` / ``False``; any other number -> ``pd.NA``
    * strings, compared after stripping and lower-casing, against the
      spellings ``1/true/yes/y/t`` and ``0/false/no/n/f``; numeric strings
      such as ``"1.0"`` are also accepted

    Anything else maps to ``pd.NA``.
    """
    if pd.isna(x):
        return pd.NA

    # Genuine booleans pass straight through as plain Python bools.
    if isinstance(x, (bool, np.bool_)):
        return bool(x)

    # Numbers are compared by value: str(1.0) is "1.0", which would never
    # match the "1" token below and would silently turn a float-encoded flag
    # column into all-NA.
    if isinstance(x, (int, float, np.integer, np.floating)):
        if x == 1:
            return True
        if x == 0:
            return False
        return pd.NA

    s = str(x).strip().lower()
    if s in {"1", "true", "yes", "y", "t"}:
        return True
    if s in {"0", "false", "no", "n", "f"}:
        return False

    # Last resort: numeric text such as "1.0" / "0.0".
    try:
        number = float(s)
    except ValueError:
        return pd.NA
    if number == 1:
        return True
    if number == 0:
        return False
    return pd.NA

mapped = ds_jobs_transformed['job_change'].map(_to_bool_val)

# Pick the target dtype first, then cast once: the nullable 'boolean' dtype is
# used only when some value is still missing after mapping, plain bool otherwise.
job_change_dtype = 'boolean' if mapped.isna().any() else bool
ds_jobs_transformed['job_change'] = mapped.astype(job_change_dtype)

# ---------------------------
# Convert columns with exactly two non-null unique values to boolean (skip job_change)
# ---------------------------
def _flag_token(value):
    """Return the normalised spelling used to recognise boolean-like values."""
    # Numbers are matched by value so that 1.0 / 0.0 count as "1" / "0";
    # their string form ("1.0") would otherwise never match a token.
    if isinstance(value, (int, float, np.integer, np.floating)):
        if value == 1:
            return "1"
        if value == 0:
            return "0"
    return str(value).strip().lower()


true_tokens = {"1", "true", "yes", "y", "t"}
false_tokens = {"0", "false", "no", "n", "f"}

for col in ds_jobs_transformed.columns:
    if col == 'job_change':
        continue

    column = ds_jobs_transformed[col]
    # consider non-null uniques only (so that NaNs don't mask two-factor detection)
    uniques = column.dropna().unique()
    if len(uniques) != 2:
        continue

    first, second = uniques[0], uniques[1]
    first_token, second_token = _flag_token(first), _flag_token(second)

    # Decide which of the two values means True. The pair is selected by
    # position rather than by object identity: NumPy hands back a fresh scalar
    # object on each element access of a numeric array, so an `is` comparison
    # cannot tell the two values apart reliably.
    if first_token in true_tokens:
        true_val, false_val = first, second
    elif second_token in true_tokens or first_token in false_tokens:
        # Either the second value is affirmative, or the first is an explicit
        # negative ("no", "0", ...) and therefore must not become True.
        true_val, false_val = second, first
    else:
        # No recognisable spelling on the True side: keep the positional
        # convention first -> True, second -> False.
        true_val, false_val = first, second

    # Missing entries stay missing, which requires the nullable 'boolean' dtype.
    target_dtype = 'boolean' if column.isna().any() else bool
    ds_jobs_transformed[col] = column.map({true_val: True, false_val: False}).astype(target_dtype)

# ---------------------------
# Integers -> int32
# ---------------------------
# If integer columns are float (e.g., have NaN), we skip automatic conversion here.
int_cols = ds_jobs_transformed.select_dtypes(include=['int64']).columns.tolist()
float_cols = ds_jobs_transformed.select_dtypes(include=['float64']).columns.tolist()

# ---------------------------
# Floats -> float16 (done in the same pass as the int32 downcast)
# ---------------------------
# Each entry pairs a narrower target dtype with the columns that receive it;
# int64 columns are processed first, then float64 columns.
for narrow_dtype, wide_cols in ((np.int32, int_cols), (np.float16, float_cols)):
    for col in wide_cols:
        ds_jobs_transformed[col] = ds_jobs_transformed[col].astype(narrow_dtype)

# ---------------------------
# Define ordinal categories (ordered) using CategoricalDtype
# ---------------------------
def _ordered(levels):
    """Build an ordered categorical dtype whose ranking follows *levels*."""
    return CategoricalDtype(categories=levels, ordered=True)


# Experience: '<1', then '1' through '20', then '21+'
exp_order = ['<1', *map(str, range(1, 21)), '21+']
exp_dtype = _ordered(exp_order)

# Company size, smallest bucket first
size_order = [
    '<10', '10-49', '50-99', '100-499',
    '500-999', '1000-4999', '5000-9999', '10000+',
]
size_dtype = _ordered(size_order)

# University enrollment, from none to full time
uni_order = ['no_enrollment', 'Part time course', 'Full time course']
uni_dtype = _ordered(uni_order)

# Education level, lowest to highest
edu_order = ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd']
edu_dtype = _ordered(edu_order)

# Last new job: 'never', then '1' through '4', then '5+'
job_order = ['never', '1', '2', '3', '4', '5+']
job_dtype = _ordered(job_order)

# Normalise raw spellings first, then cast each ordinal column to its ordered
# categorical dtype. Every step is guarded by a column-existence check so a
# dataset lacking one of these columns is simply left alone.

if 'experience' in ds_jobs_transformed.columns:
    # Strip whitespace BEFORE replacing, so a padded ' >20' is still unified
    # with '21+'. astype(str) turns missing values into the text 'nan'; that
    # text is not one of the categories, so the categorical cast below turns
    # it back into a missing value.
    ds_jobs_transformed['experience'] = (
        ds_jobs_transformed['experience']
        .astype(str)
        .str.strip()
        .replace({'>20': '21+'})
    )

if 'last_new_job' in ds_jobs_transformed.columns:
    # NOTE(review): the top bucket is declared as '5+'. If the raw file spells
    # it '>4' (the way experience spells its top bucket '>20'), those rows
    # would silently become missing in the cast below -- confirm against the
    # CSV. The replace is a no-op when '>4' does not occur.
    ds_jobs_transformed['last_new_job'] = (
        ds_jobs_transformed['last_new_job'].replace({'>4': '5+'})
    )

# Values that are not listed in a dtype's categories become missing on cast.
ordered_dtypes = {
    'experience': exp_dtype,
    'company_size': size_dtype,
    'enrolled_university': uni_dtype,
    'education_level': edu_dtype,
    'last_new_job': job_dtype,
}
for ordinal_col, ordinal_dtype in ordered_dtypes.items():
    if ordinal_col in ds_jobs_transformed.columns:
        ds_jobs_transformed[ordinal_col] = ds_jobs_transformed[ordinal_col].astype(ordinal_dtype)

# ---------------------------
# Nominal categories -> category (all object columns not in ordinals and not boolean)
# ---------------------------
ordinal_cols = {'experience','company_size','enrolled_university','education_level','last_new_job'}

# Object-typed columns that are neither ordinal nor the job_change target are
# treated as nominal and stored as unordered categoricals.
excluded_cols = ordinal_cols | {'job_change'}
nominal_cols = [
    name
    for name in ds_jobs_transformed.select_dtypes(include=['object']).columns
    if name not in excluded_cols
]
for col in nominal_cols:
    ds_jobs_transformed[col] = ds_jobs_transformed[col].astype('category')

# ---------------------------
# Drop nulls only in filtering columns (experience, company_size)
# ---------------------------
filter_cols = ['experience', 'company_size']
ds_jobs_transformed = ds_jobs_transformed.dropna(subset=filter_cols)

# ---------------------------
# Final filtering: experience of at least '10' and company size of at least '1000-4999'
# ---------------------------
# Both columns are compared against a category label; the comparison follows
# the category order of the column's ordered dtype.
has_min_experience = ds_jobs_transformed['experience'] >= '10'
in_large_company = ds_jobs_transformed['company_size'] >= '1000-4999'
ds_jobs_transformed = ds_jobs_transformed.loc[has_min_experience & in_large_company].copy()

# ---------------------------
# Final: print memory usage and dtype summary
# ---------------------------
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each report.
print("=== ORIGINAL (before transformations) ===")
original.info(memory_usage='deep')
print("\n=== TRANSFORMED & FILTERED (ds_jobs_transformed) ===")
ds_jobs_transformed.info(memory_usage='deep')

# quick preview (transposed so each column of the frame is shown as a row)
print("\nPreview of transformed data (first 5 rows):")
print(ds_jobs_transformed.head().T)
