# In [None]:
# ==============================
# 0. Imports and data loading
# ==============================
import re
import numpy as np
import pandas as pd


import pandas as pd

# Load the review data; the file is read as gzip-compressed JSON Lines
# (one record per line).
DATA_PATH = "renttherunway_final_data.json.gz"
df = pd.read_json(DATA_PATH, orient="records", lines=True)

# Alternative: load from CSV instead (change this to your file path).
# df = pd.read_csv("rent_the_runway.csv")

# If df is already in memory, the load statement above can be skipped;
# everything below only requires that df exists.


# ==========================================
# 1. Helper functions for specific columns
# ==========================================

def parse_height_to_inches(x):
    """
    Turn a feet-and-inches height string into a total number of inches.

    Accepted spellings include 5' 8", 5'8" and 5'8 (the trailing inch mark
    is optional, and a missing inches part counts as zero).

    Returns an int on success, or NaN when the value is missing or does
    not start with a feet component followed by an apostrophe.
    """
    if pd.isna(x):
        return np.nan

    # Feet, an apostrophe, then an optional inches number.
    match = re.match(r"(\d+)\s*'\s*(\d+)?", str(x).strip())
    if match is None:
        return np.nan

    feet_part, inch_part = match.groups()
    total = 12 * int(feet_part)
    if inch_part is not None:
        total += int(inch_part)
    return total


def parse_weight_lbs(x):
    """
    Pull the first number out of a weight value such as '137lbs'.

    Returns that number as a float, or NaN when the value is missing or
    contains no digits.
    """
    if pd.isna(x):
        return np.nan

    # First run of digits, optionally with a decimal part.
    found = re.search(r"(\d+\.?\d*)", str(x))
    return float(found.group(1)) if found else np.nan


def parse_bust_band_and_cup(x):
    """
    Split a bust size such as '34d' into (band, cup) -> (34.0, 'D').

    The value is upper-cased before matching, so the cup is always
    returned in capitals (e.g. 'D', 'DD').  Only the leading digits and
    the letters that directly follow them are used.

    Returns (NaN, NaN) when the value is missing or does not match.
    """
    unparsed = (np.nan, np.nan)
    if pd.isna(x):
        return unparsed

    match = re.match(r"(\d+)\s*([A-Z]+)", str(x).strip().upper())
    if match is None:
        return unparsed

    band_digits, cup_letters = match.groups()
    return float(band_digits), cup_letters


def cup_to_numeric(cup):
    """
    Convert a cup label to its position on an ordered numeric scale.

    The scale runs AA=0, A=1, B=2, C=3, D=4, DD=5, DDD=6, E=7, F=8 and the
    lookup is case-insensitive.  Missing values and labels outside the
    scale give NaN; extend the tuple below if your data has more labels.
    """
    if pd.isna(cup):
        return np.nan

    # Position in this tuple is the numeric code.
    ordered_cups = ("AA", "A", "B", "C", "D", "DD", "DDD", "E", "F")
    rank_of = {label: rank for rank, label in enumerate(ordered_cups)}
    return rank_of.get(str(cup).upper(), np.nan)


def compute_bmi(weight_lbs, height_inches):
    """
    BMI = weight(kg) / height(m)^2, computed from pounds and inches.

    Works on plain scalars and element-wise on array-likes (pandas
    Series, NumPy arrays, lists), so it can be called directly on whole
    DataFrame columns.

    Parameters
    ----------
    weight_lbs : scalar or array-like
        Weight in pounds.
    height_inches : scalar or array-like
        Height in inches.

    Returns
    -------
    float, pandas.Series or numpy.ndarray
        A float for scalar inputs; a Series if either input is a Series;
        otherwise an ndarray.  The result is NaN wherever the weight or
        height is missing or the height is zero.
    """
    lbs_to_kg = 0.453592
    inches_to_m = 0.0254

    # Scalar path: identical to the original behaviour.
    if np.ndim(weight_lbs) == 0 and np.ndim(height_inches) == 0:
        if pd.isna(weight_lbs) or pd.isna(height_inches) or height_inches == 0:
            return np.nan
        return (weight_lbs * lbs_to_kg) / (height_inches * inches_to_m) ** 2

    # Element-wise path.  `or` / `==` truth tests are ambiguous on a
    # Series or array, so zero heights are masked to NaN instead; missing
    # values then propagate through the arithmetic as NaN.
    if not isinstance(weight_lbs, pd.Series):
        weight_lbs = np.asarray(weight_lbs, dtype=float)
    if isinstance(height_inches, pd.Series):
        safe_height = height_inches.where(height_inches != 0)
    else:
        height_arr = np.asarray(height_inches, dtype=float)
        safe_height = np.where(height_arr != 0, height_arr, np.nan)

    return (weight_lbs * lbs_to_kg) / (safe_height * inches_to_m) ** 2


# =============================================
# 2. Basic cleaning + target label processing
# =============================================

# The three fit outcomes we model, and two encodings of them:
#   fit_to_class: small=0, fit=1, large=2   (labels for classification models)
#   fit_to_num:   small=-1, fit=0, large=1  (signed scale for mean fit bias)
valid_fits = {'small', 'fit', 'large'}
fit_to_class = {'small': 0, 'fit': 1, 'large': 2}
fit_to_num = {'small': -1, 'fit': 0, 'large': 1}

# Discard rows without a fit label.
df = df.loc[df['fit'].notna()].copy()

# Normalise the label text (lowercase, no surrounding whitespace) ...
df['fit'] = df['fit'].str.lower().str.strip()

# ... and keep only the three known classes.
df = df.loc[df['fit'].isin(valid_fits)].copy()

# Attach both encodings of the target.
df = df.assign(
    fit_class=df['fit'].map(fit_to_class),
    fit_num=df['fit'].map(fit_to_num),
)


# ======================================
# 3. Process user-level numeric fields
# ======================================

# Height string -> total inches; weight string -> pounds as a float.
df['height_inches'] = df['height'].apply(parse_height_to_inches)
df['weight_lbs'] = df['weight'].apply(parse_weight_lbs)

# Age -> numeric (values that cannot be parsed become NaN).
df['age'] = pd.to_numeric(df['age'], errors='coerce')

# Bust size -> band (float) + cup (upper-case letters).
bust_parsed = [parse_bust_band_and_cup(val) for val in df['bust size']]
bust_band_list = [band for band, _ in bust_parsed]
bust_cup_list = [cup for _, cup in bust_parsed]

df['bust_band'] = bust_band_list
df['bust_cup'] = bust_cup_list

# Cup -> numeric
df['bust_cup_num'] = df['bust_cup'].apply(cup_to_numeric)

# BMI: evaluate row by row so compute_bmi always receives plain scalars
# (truth-testing a whole Series raises "truth value ... is ambiguous").
df['BMI'] = [
    compute_bmi(weight, height)
    for weight, height in zip(df['weight_lbs'], df['height_inches'])
]

# Simple ratio weight/height.  A zero height is masked to NaN first so
# the ratio is NaN rather than inf; inf would survive the later dropna()
# and end up in the model matrix.
nonzero_height = df['height_inches'].where(df['height_inches'] != 0)
df['weight_per_inch'] = df['weight_lbs'] / nonzero_height


# ======================================
# 4. Process categorical features
# ======================================

# Missing body type / category become an explicit 'unknown' level.
df = df.fillna({'body type': 'unknown', 'category': 'unknown'})

# One-hot encode both columns; the new columns are prefixed 'cat_' and
# 'body_' respectively and the source columns are replaced.
df = pd.get_dummies(
    df,
    columns=['category', 'body type'],
    prefix={'category': 'cat', 'body type': 'body'},
    dummy_na=False,
)


# ======================================
# 5. Item-level aggregate features
# ======================================

# NOTE(review): these aggregates are built from every labelled row, so each
# row's own label (and rows that later land in the test split of section 9)
# contribute to its features. That looks like target leakage; consider
# computing them from training rows only.

# Mean signed fit bias per item.
item_stats = (
    df.groupby('item_id')['fit_num']
      .mean()
      .rename('item_fit_mean')
      .reset_index()
)

# Share of each item's reviews labelled 'small' / 'large'.
item_small_rate = (
    df['fit'].eq('small')
      .groupby(df['item_id'])
      .mean()
      .reset_index(name='item_small_rate')
)
item_large_rate = (
    df['fit'].eq('large')
      .groupby(df['item_id'])
      .mean()
      .reset_index(name='item_large_rate')
)

# Attach the three item-level columns to every review row.
df = (
    df.merge(item_stats, on='item_id', how='left')
      .merge(item_small_rate, on='item_id', how='left')
      .merge(item_large_rate, on='item_id', how='left')
)


# ======================================
# 6. User-level aggregate features
# ======================================

# Mean signed fit bias per user (small=-1, fit=0, large=1).
# NOTE(review): the mean includes the row's own label; for a user with a
# single review it equals that label exactly. Confirm this is intended.
user_stats = (
    df.groupby('user_id', as_index=False)['fit_num']
      .mean()
      .rename(columns={'fit_num': 'user_fit_mean'})
)

df = df.merge(user_stats, on='user_id', how='left')


# ======================================
# 7. Simple target encoding of user_id / item_id
#    (using fit_class for classification signal)
# ======================================

# NOTE(review): fit_class is fit_num + 1 for every label, so these means are
# the section 5/6 fit_num means shifted by one, i.e. perfectly collinear
# with user_fit_mean / item_fit_mean. Confirm both sets are wanted.

# user_id -> average class value
user_te = (
    df.groupby('user_id')['fit_class']
      .mean()
      .rename('user_fit_class_mean')
      .reset_index()
)
df = df.merge(user_te, on='user_id', how='left')

# item_id -> average class value
item_te = (
    df.groupby('item_id')['fit_class']
      .mean()
      .rename('item_fit_class_mean')
      .reset_index()
)
df = df.merge(item_te, on='item_id', how='left')


# ======================================
# 8. Final feature selection
# ======================================

# Features are assembled group by group; adjust the groups based on
# experiments.

# User-level numeric measurements.
user_numeric_cols = [
    'height_inches',
    'weight_lbs',
    'age',
    'bust_band',
    'bust_cup_num',
    'BMI',
    'weight_per_inch',
]

# Selected clothing size (expected to be numeric in the dataset).
size_cols = ['size']

# Item-level and user-level aggregates of the fit label.
item_aggregate_cols = ['item_fit_mean', 'item_small_rate', 'item_large_rate']
user_aggregate_cols = ['user_fit_mean']

# Target encodings.
target_encoding_cols = ['user_fit_class_mean', 'item_fit_class_mean']

# Every one-hot column produced for category and body type.
one_hot_cols = [c for c in df.columns if c.startswith(('cat_', 'body_'))]

feature_cols = [
    *user_numeric_cols,
    *size_cols,
    *item_aggregate_cols,
    *user_aggregate_cols,
    *target_encoding_cols,
    *one_hot_cols,
]

# Simple missing-value policy: drop any row lacking a selected feature
# or the class label.
df_model = df.dropna(subset=[*feature_cols, 'fit_class']).copy()

X = df_model[feature_cols]
y = df_model['fit_class']

print("Number of samples after preprocessing:", len(df_model))
print("Number of features:", X.shape[1])


# ======================================
# 9. (Optional) Train/validation split by user_id
# ======================================
from sklearn.model_selection import train_test_split

# Split on whole users rather than on individual rows, so that no user
# has reviews on both sides of the split.
# NOTE(review): the item/user aggregate and target-encoding features were
# computed before this split, from all rows; splitting by user does not
# remove that source of leakage.
users = df_model['user_id'].unique()
train_users, test_users = train_test_split(users, test_size=0.2, random_state=42)

row_user = df_model['user_id']
train_mask = row_user.isin(train_users)
test_mask = row_user.isin(test_users)

X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

print("Train samples:", len(X_train))
print("Test samples:", len(X_test))
