# In [None]:
# ===============================
# 0️⃣ Safety: clean inf / NaN
# ===============================
# Convert +/-inf to NaN first so a single dropna() removes every unusable row.
trend.replace([np.inf, -np.inf], np.nan, inplace=True)
trend.dropna(inplace=True)

# ===============================
# 1️⃣ Create lag features (3 previous seasons)
# ===============================
# Ensure season order
season_order = ['Winter', 'Spring', 'Summer', 'Fall']
season_to_num = {s: i for i, s in enumerate(season_order)}
trend['season_id'] = trend['season'].map(season_to_num)

# Rows of each color must be in time order so that shift(lag) looks back
# `lag` rows within that color.
# NOTE: shift() moves by rows, so "lag k" means "k seasons ago" only when a
# color has a row for every consecutive season.
trend = trend.sort_values(['color', 'year', 'season_id'])

# Lag features
for lag in range(1, 4):
    trend[f'prev_addCount_{lag}'] = trend.groupby('color')['addCount'].shift(lag)

# mean(axis=1) skips NaN, so early rows average whichever lags exist.
trend['avg_prev3'] = trend[[f'prev_addCount_{i}' for i in range(1, 4)]].mean(axis=1)

# Ratio-of-lags features (kept under their historical "pct_change" names:
# a value of 1.0 means "no change").
trend['pct_change_1'] = trend['prev_addCount_1'] / trend['prev_addCount_2']
trend['pct_change_2'] = trend['prev_addCount_2'] / trend['prev_addCount_3']

# Clean again after creating lag/pct_change: division by a zero lag yields
# inf, missing lags yield NaN; both are mapped to 0.
trend.replace([np.inf, -np.inf], np.nan, inplace=True)
trend.fillna(0, inplace=True)

# Put rows in chronological order. The train/test split that follows is
# unshuffled and simply holds out the last rows; with the color-major order
# used for the lags it would hold out the alphabetically-last colors instead
# of the most recent seasons.
trend = trend.sort_values(['year', 'season_id', 'color'])

# ===============================
# 2️⃣ Encode season & color
# ===============================
# Encode only after the final sort: the encoded matrix is a plain array with
# no index, so it lines up with X_num/y purely by row position. Encoding
# before re-sorting `trend` would pair each row with another row's one-hots.
# sparse_output=False gives a dense array that np.hstack can concatenate.
encoder = OneHotEncoder(sparse_output=False)
X_cat = trend[['season', 'color']]
X_encoded = encoder.fit_transform(X_cat)

# ===============================
# 3️⃣ Prepare feature matrix
# ===============================
X_num = trend[['year', 'prev_addCount_1', 'prev_addCount_2', 'prev_addCount_3',
               'avg_prev3', 'pct_change_1', 'pct_change_2']].to_numpy()
# Column layout: 7 numeric features followed by the one-hot season/color block.
X = np.hstack([X_num, X_encoded])
y = trend['relative_popularity'].to_numpy()

# ===============================
# 4️⃣ Train/test split
# ===============================
# Unshuffled split: the leading 80% of rows train the model and the trailing
# 20% are held out, so the row order of X/y decides what lands in the test set.
HOLDOUT_FRACTION = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=HOLDOUT_FRACTION,
    shuffle=False,
)

# ===============================
# 5️⃣ Train Random Forest
# ===============================
# Fixed random_state keeps runs reproducible; n_jobs=-1 uses every core.
forest_params = {
    'n_estimators': 500,
    'max_depth': None,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

# ===============================
# 6️⃣ Evaluate
# ===============================
# Score the held-out rows with both error metrics.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"✅ Model trained — RMSE: {rmse:.4f}, MAE: {mae:.4f}")

# ===============================
# 7️⃣ Predict next season
# ===============================
# Forecast target: the year right after the newest year present in `trend`.
latest_year = trend['year'].max()
next_year = latest_year + 1
seasons = trend['season'].unique()
colors = trend['color'].unique()

# One row per (season, color) combination, seasons varying slowest.
season_color_grid = [
    (next_year, season_name, color_name)
    for season_name in seasons
    for color_name in colors
]
future_rows = pd.DataFrame(season_color_grid, columns=['year', 'season', 'color'])

# Encode future categorical features with the encoder fitted on the
# training data, so the one-hot column layout is the same.
X_future_cat = future_rows.loc[:, ['season', 'color']]
X_future_encoded = encoder.transform(X_future_cat)

# Lookup of addCount keyed by (season, color), taken from the rows of the
# latest year. If a key occurs more than once, the last row wins.
last_season = trend.loc[trend['year'] == latest_year].copy()
lag_dict = last_season.set_index(['season', 'color'])['addCount'].to_dict()

def get_lags(row):
    """Build the six lag-derived features for one future (season, color) row.

    Looks up the row's ``(season, color)`` pair in the module-level
    ``lag_dict`` and uses that single value (0 when the pair is absent) for
    all three lag slots.

    Args:
        row: Mapping-like object exposing ``'season'`` and ``'color'``
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        ``pd.Series`` of six values in this order: prev 1, prev 2, prev 3,
        their mean, ratio prev1/prev2, ratio prev2/prev3. Because the three
        lags are the same value, both ratios are 1 for a non-zero lookup and
        0 when the lookup is 0.
    """
    last_count = lag_dict.get((row['season'], row['color']), 0)
    mean_of_lags = np.mean([last_count] * 3)

    # Guard the division: a zero count would otherwise divide by zero.
    if last_count != 0:
        ratio = last_count / last_count
    else:
        ratio = 0

    return pd.Series(
        [last_count, last_count, last_count, mean_of_lags, ratio, ratio]
    )

# Attach the lag-derived columns; get_lags returns them in exactly this order.
lag_feature_cols = ['prev_addCount_1', 'prev_addCount_2', 'prev_addCount_3',
                    'avg_prev3', 'pct_change_1', 'pct_change_2']
future_rows[lag_feature_cols] = future_rows.apply(get_lags, axis=1)

# Same layout as the training matrix: numeric columns first, one-hots after.
X_future_num = future_rows[['year'] + lag_feature_cols].to_numpy()
X_future = np.hstack([X_future_num, X_future_encoded])

# Predict relative popularity
raw_scores = model.predict(X_future)
future_rows['predicted_popularity'] = raw_scores

# Normalize per season: divide each prediction by its season's total so the
# values within a season add up to 1.
scores_by_season = future_rows.groupby('season')['predicted_popularity']
future_rows['predicted_popularity'] = scores_by_season.transform(lambda x: x/x.sum())

# ===============================
# 8️⃣ Top-5 colors per season
# ===============================
# Rank colors inside each season (highest share first), keep the first five.
ranked = future_rows.sort_values(
    ['season', 'predicted_popularity'],
    ascending=[True, False],
)
top_colors_per_season = ranked.groupby('season').head(5).reset_index(drop=True)

for season in seasons:
    print(f"\n🎨 Top 5 Colors for {season} {next_year}:")
    in_season = top_colors_per_season['season'] == season
    display(top_colors_per_season.loc[in_season, ['color', 'predicted_popularity']])



# ===============================
# 0️⃣ Load & Prepare Data
# ===============================
df = pd.read_csv("../data/genz_fashion_cleaned.csv")

# Parse the date column once; errors='coerce' turns unparseable values into
# NaT, which in turn gives a missing year/month for those rows.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')
df['date'] = parsed_dates
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month

# Map month to season
def get_season(month):
    """Map a calendar month number to a season name.

    Args:
        month: Month number in 1-12. Whole-number floats such as ``3.0``
            are accepted, which is what a month column holds once it
            contains missing values.

    Returns:
        ``'Winter'`` for 12, 1, 2; ``'Spring'`` for 3-5; ``'Summer'`` for
        6-8; ``'Fall'`` for 9-11. ``None`` when ``month`` is missing (NaN /
        ``None``) or not a month number, so bad dates are not silently
        counted as Fall.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }
    # dict.get returns None for anything that is not an exact month key;
    # NaN never compares equal to a key, so it falls through here too.
    return season_by_month.get(month)
df['season'] = df['month'].apply(get_season)

# Normalize colors: lowercase everything, then fold synonyms onto a single
# canonical label (whole-value replacement, not substring matching).
df['color'] = df['color'].str.lower().replace({
    'neutral': 'beige',
    'multi': 'multicolor',
    'print': 'patterned'
})

# Aggregate by year, season, color
trend = df.groupby(['year', 'season', 'color'])['addCount'].sum().reset_index()

# ===============================
# 1️⃣ Encode season and color
# ===============================
le_color = LabelEncoder()
le_season = LabelEncoder()

trend['color_enc'] = le_color.fit_transform(trend['color'])
trend['season_enc'] = le_season.fit_transform(trend['season'])

# Create season_id for sorting. This is a separate numbering from season_enc:
# season_id follows season_order, season_enc follows the LabelEncoder.
season_order = ['Spring', 'Summer', 'Fall', 'Winter']
season_to_num = {s: i for i, s in enumerate(season_order)}
trend['season_id'] = trend['season'].map(season_to_num)

# ===============================
# 2️⃣ Compute relative popularity
# ===============================
# Done BEFORE rows without lag history are dropped, so each season's total
# covers every color sold in it. Computing it after the drop would leave the
# dropped rows' volume out of the denominator and inflate the other shares.
# A season whose total is 0 yields NaN here and is removed by dropna() below.
trend['season_total'] = trend.groupby(['year', 'season'])['addCount'].transform('sum')
trend['relative_popularity'] = trend['addCount'] / trend['season_total']

# ===============================
# 3️⃣ Lag Features (previous 3 seasons)
# ===============================
# Time-order the rows of each color so shift(lag) looks back `lag` rows
# within that color.
trend = trend.sort_values(['color', 'year', 'season_id'])

for lag in range(1, 4):
    trend[f'prev_addCount_{lag}'] = trend.groupby('color')['addCount'].shift(lag)

# Rolling average of previous 3 seasons
trend['avg_prev3'] = trend[['prev_addCount_1', 'prev_addCount_2', 'prev_addCount_3']].mean(axis=1)

# Drop rows with missing lag values (the first three rows of every color)
trend = trend.dropna()

# Re-order chronologically: the split below is unshuffled and holds out the
# last rows, which should be the most recent seasons rather than the
# alphabetically-last colors left at the end by the color-major sort above.
trend = trend.sort_values(['year', 'season_id', 'color'])

# ===============================
# 4️⃣ Features & Target
# ===============================
features = ['year', 'season_enc', 'color_enc',
            'prev_addCount_1', 'prev_addCount_2', 'prev_addCount_3', 'avg_prev3']

X = trend[features]
y = trend['relative_popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# ===============================
# 5️⃣ Train Random Forest
# ===============================
# Fixed random_state keeps runs reproducible; n_jobs=-1 uses every core.
model = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

# ===============================
# 6️⃣ Evaluate Model
# ===============================
y_pred = model.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"✅ Model trained — RMSE: {rmse:.4f}, MAE: {mae:.4f}\n")

# ===============================
# 7️⃣ Prepare unique top-5 accuracy
# ===============================
def unique_top5_accuracy(actual, predicted):
    """Return the share of distinct ``actual`` items that appear in ``predicted``.

    Duplicates on either side are ignored and order does not matter.

    Args:
        actual: Iterable of hashable reference items.
        predicted: Iterable of hashable predicted items.

    Returns:
        A float in [0, 1]; ``0.0`` when ``actual`` has no items.
    """
    wanted, found = set(actual), set(predicted)
    if not wanted:
        # Nothing to match against: report zero rather than divide by zero.
        return 0.0
    hits = sum(1 for item in wanted if item in found)
    return hits / len(wanted)

# Actual top-5 per season (unique): for every season, pooled over all years,
# rank rows by relative popularity and keep the first five distinct colors.
actual_top = {
    season_label: (
        season_rows.sort_values('relative_popularity', ascending=False)['color']
        .unique()[:5]
        .tolist()
    )
    for season_label, season_rows in trend.groupby('season')
}

# ===============================
# 8️⃣ Multi-Season Forecast (Next 4 Seasons)
# ===============================
# Roll the model forward four seasons: every step shifts the lag columns by
# one season, predicts, and feeds the prediction back in as the newest lag.
latest_year = trend['year'].max()
latest_season_idx = trend.loc[trend['year'] == latest_year, 'season_id'].max()

# The model was trained on season_enc values produced by le_season, so future
# rows must be encoded with that same mapping. season_to_num follows
# season_order and assigns different numbers to the same seasons.
season_code = dict(zip(le_season.classes_, le_season.transform(le_season.classes_)))

future_forecasts = []
# Seed with the most recent observed season only. Taking every season of the
# latest year would carry several rows per color (one per season) through all
# the steps, each with stale lag values.
is_latest_season = (trend['year'] == latest_year) & (trend['season_id'] == latest_season_idx)
latest = trend[is_latest_season].copy()

# Observed volume of the seed season. Predictions are shares of a season, so
# they are multiplied by this to get back to addCount units before being used
# as a lag feature.
season_volume = latest['addCount'].sum()

for step in range(4):
    # Advance one season; the year rolls over when wrapping back to index 0.
    next_season_idx = (latest_season_idx + 1) % 4
    next_season = season_order[next_season_idx]
    next_year = latest_year + 1 if next_season_idx == 0 else latest_year

    future = latest.copy()
    future['year'] = next_year
    future['season'] = next_season
    future['season_id'] = next_season_idx
    future['season_enc'] = season_code[next_season]

    # Update lag features. Order matters: shift the oldest slot first so no
    # value is overwritten before it has been moved along.
    future['prev_addCount_3'] = future['prev_addCount_2']
    future['prev_addCount_2'] = future['prev_addCount_1']
    future['prev_addCount_1'] = future['addCount']
    future['avg_prev3'] = future[['prev_addCount_1','prev_addCount_2','prev_addCount_3']].mean(axis=1)

    # Predict
    X_future = future[features]
    future['predicted_popularity'] = model.predict(X_future)

    preds = future[['year','season','color','predicted_popularity']].copy()
    preds['forecast_horizon'] = f"{next_season} {next_year}"
    future_forecasts.append(preds)

    # Prepare for next iteration: the predicted share, converted to addCount
    # units, becomes the "current" count that the next step shifts into lag 1.
    latest = future.copy()
    latest['addCount'] = future['predicted_popularity'] * season_volume
    latest_year, latest_season_idx = next_year, next_season_idx

forecast_df = pd.concat(future_forecasts, ignore_index=True)

# ===============================
# 9️⃣ Compute predicted top-5 (unique)
# ===============================
# For every forecast horizon, rank colors by predicted share and keep the
# first five distinct ones.
predicted_top = {
    horizon_label: (
        horizon_rows.sort_values('predicted_popularity', ascending=False)['color']
        .unique()[:5]
        .tolist()
    )
    for horizon_label, horizon_rows in forecast_df.groupby('forecast_horizon')
}

# ===============================
# 1️⃣0️⃣ Evaluate top-5 accuracy
# ===============================
# Each forecast is compared with the historical top-5 of the same season
# name (the first word of the horizon label, e.g. "Spring 2025" -> "Spring").
for horizon, predicted_colors in predicted_top.items():
    season_name = horizon.split()[0]
    actual_colors = actual_top.get(season_name, [])
    accuracy = unique_top5_accuracy(actual_colors, predicted_colors)
    print(f"{horizon} top-5 accuracy: {accuracy:.2f}")
    print(f"  Actual top 5: {list(actual_colors)}")
    print(f"  Predicted top 5: {predicted_colors}\n")