In [70]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Dummy model

Load and preprocess data

In [71]:
df = pd.read_csv("../data/genz_fashion_cleaned.csv")

# Convert date and get year/month
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

# Season
def get_season(month):
    if month in [12,1,2]: return 'Winter'
    elif month in [3,4,5]: return 'Spring'
    elif month in [6,7,8]: return 'Summer'
    else: return 'Fall'

df['season'] = df['month'].apply(get_season)

# Normalize colours
df['color'] = df['color'].str.lower().replace({
    'neutral': 'beige',
    'multi': 'multicolor',
    'print': 'patterned'
})

Aggregate counts

In [72]:
trend = df.groupby(['year', 'season', 'color'])['addCount'].sum().reset_index()
trend = trend.sort_values(['color', 'year', 'season'])

Lag features

In [73]:
trend['prev_addCount'] = trend.groupby('color')['addCount'].shift(1)
trend['prev2_addCount'] = trend.groupby('color')['addCount'].shift(2)
trend['avg_prev2'] = trend[['prev_addCount','prev2_addCount']].mean(axis=1)

trend = trend.dropna()

Encode categorical features

In [74]:
le_color = LabelEncoder()
le_season = LabelEncoder()

trend['color_enc'] = le_color.fit_transform(trend['color'])
trend['season_enc'] = le_season.fit_transform(trend['season'])

Compute relative popularity

In [75]:
trend['season_total'] = trend.groupby(['year','season'])['addCount'].transform('sum')
trend['relative_popularity'] = trend['addCount'] / trend['season_total']

# Add average seasonal popularity per color
trend['avg_season_popularity'] = trend.groupby(['color','season'])['relative_popularity'].transform('mean')

Prepare features & target

In [76]:
features = ['year','season_enc','color_enc','prev_addCount','prev2_addCount','avg_prev2','avg_season_popularity']
X = trend[features]
y = trend['relative_popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

Dummy Regression model

In [77]:
from sklearn.dummy import DummyRegressor

# Dummy model that predicts the mean relative popularity
dummy_model = DummyRegressor(strategy="mean")
dummy_model.fit(X_train, y_train)

# Predict with dummy model
y_pred_dummy = dummy_model.predict(X_test)

# Evaluate dummy performance
rmse_dummy = sqrt(mean_squared_error(y_test, y_pred_dummy))
mae_dummy = mean_absolute_error(y_test, y_pred_dummy)

print(f"🪄 DummyRegressor — RMSE: {rmse_dummy:.4f}, MAE: {mae_dummy:.4f}")

🪄 DummyRegressor — RMSE: 0.0623, MAE: 0.0588


Dummy Random Forest

In [78]:
model = RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"✅ Model trained — RMSE: {rmse:.4f}, MAE: {mae:.4f}")

✅ Model trained — RMSE: 0.0329, MAE: 0.0215


Forecast next year for all seasons

In [79]:
latest_year = trend['year'].max()
season_order = ['Spring','Summer','Fall','Winter']

future_rows = pd.DataFrame([
    (latest_year+1, season, color) 
    for season in season_order 
    for color in trend['color'].unique()
], columns=['year','season','color'])

# Lag features from last known counts
last_counts = trend.groupby('color')['addCount'].last()
second_last_counts = trend.groupby('color')['addCount'].nth(-2)

future_rows['prev_addCount'] = future_rows['color'].map(last_counts)
future_rows['prev2_addCount'] = future_rows['color'].map(second_last_counts)
future_rows['avg_prev2'] = (future_rows['prev_addCount'] + future_rows['prev2_addCount'])/2

# Encode categories
future_rows['color_enc'] = le_color.transform(future_rows['color'])
future_rows['season_enc'] = le_season.transform(future_rows['season'])

# Avg seasonal popularity
future_rows['avg_season_popularity'] = future_rows.groupby(['color','season'])['prev_addCount'].transform('mean')

# Predict
X_future = future_rows[features]
future_rows['predicted_popularity'] = model.predict(X_future)

# Normalize per season
future_rows['predicted_popularity'] = future_rows.groupby('season')['predicted_popularity'].transform(lambda x: x / x.sum())


Compute top-5 accuracy

In [80]:
# Actual top-5 colors per season
actual_top = trend.groupby('season').apply(lambda x: list(pd.unique(x.sort_values('relative_popularity', ascending=False)['color']))[:5]).to_dict()

# Predicted top-5 colors per season (unique colors only)
predicted_top = future_rows.groupby('season').apply(lambda x: list(pd.unique(x.sort_values('predicted_popularity', ascending=False)['color']))[:5]).to_dict()

for season in actual_top.keys():
    actual_set = set(actual_top[season])
    predicted_set = set(predicted_top.get(season, []))
    intersection = actual_set & predicted_set
    accuracy = len(intersection)/len(actual_set) if actual_set else 0
    print(f"{season} top-5 accuracy: {accuracy:.2f}")
    print(f"  Actual top 5: {actual_top[season]}")
    print(f"  Predicted top 5: {predicted_top.get(season, [])}\n")


Fall top-5 accuracy: 0.00
  Actual top 5: ['gold', 'black', 'red', 'white', 'green']
  Predicted top 5: ['grey', 'orange', 'purple', 'yellow', 'silver']

Spring top-5 accuracy: 0.20
  Actual top 5: ['orange', 'black', 'blue', 'pink', 'red']
  Predicted top 5: ['grey', 'orange', 'purple', 'yellow', 'white']

Summer top-5 accuracy: 0.20
  Actual top 5: ['black', 'beige', 'blue', 'brown', 'white']
  Predicted top 5: ['grey', 'orange', 'purple', 'yellow', 'white']

Winter top-5 accuracy: 0.00
  Actual top 5: ['black', 'beige', 'blue', 'pink', 'red']
  Predicted top 5: ['grey', 'orange', 'purple', 'yellow', 'white']



  actual_top = trend.groupby('season').apply(lambda x: list(pd.unique(x.sort_values('relative_popularity', ascending=False)['color']))[:5]).to_dict()
  predicted_top = future_rows.groupby('season').apply(lambda x: list(pd.unique(x.sort_values('predicted_popularity', ascending=False)['color']))[:5]).to_dict()


In [81]:
for season in season_order:
    print(f"🎨 Top 5 Colors for {season} {latest_year+1}:")
    display(future_rows[future_rows['season']==season].sort_values('predicted_popularity', ascending=False)[['color','predicted_popularity']].head(5))


🎨 Top 5 Colors for Spring 2023:


Unnamed: 0,color,predicted_popularity
6,grey,0.062958
8,orange,0.062728
11,purple,0.062263
16,yellow,0.062263
15,white,0.061342


🎨 Top 5 Colors for Summer 2023:


Unnamed: 0,color,predicted_popularity
23,grey,0.063004
25,orange,0.062774
28,purple,0.062257
33,yellow,0.062257
32,white,0.061344


🎨 Top 5 Colors for Fall 2023:


Unnamed: 0,color,predicted_popularity
40,grey,0.06315
42,orange,0.062925
45,purple,0.061894
50,yellow,0.061894
47,silver,0.061476


🎨 Top 5 Colors for Winter 2023:


Unnamed: 0,color,predicted_popularity
57,grey,0.062929
59,orange,0.062702
62,purple,0.062169
67,yellow,0.062169
66,white,0.061322


# Training Model

Load & Prepare Data

In [82]:
df = pd.read_csv("../data/genz_fashion_cleaned.csv")

df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

def get_season(month):
    if month in [12, 1, 2]: return 'Winter'
    elif month in [3, 4, 5]: return 'Spring'
    elif month in [6, 7, 8]: return 'Summer'
    else: return 'Fall'
df['season'] = df['month'].apply(get_season)

df['color'] = df['color'].str.lower().replace({
    'neutral': 'beige',
    'multi': 'multicolor',
    'print': 'patterned'
})

trend = df.groupby(['year', 'season', 'color'])['addCount'].sum().reset_index()

Encode season and color

In [83]:
le_color = LabelEncoder()
le_season = LabelEncoder()

trend['color_enc'] = le_color.fit_transform(trend['color'])
trend['season_enc'] = le_season.fit_transform(trend['season'])

# season_id for ordering
season_order = ['Spring', 'Summer', 'Fall', 'Winter']
season_to_num = {s: i for i, s in enumerate(season_order)}
trend['season_id'] = trend['season'].map(season_to_num)

Lag Features (previous 3 seasons)

In [84]:
trend = trend.sort_values(['color', 'year', 'season_id'])

for lag in range(1, 4):
    trend[f'prev_addCount_{lag}'] = trend.groupby('color')['addCount'].shift(lag)

trend['avg_prev3'] = trend[['prev_addCount_1', 'prev_addCount_2', 'prev_addCount_3']].mean(axis=1)

trend.dropna(inplace=True)

Compute relative popularity

In [85]:
trend['season_total'] = trend.groupby(['year', 'season'])['addCount'].transform('sum')
trend['relative_popularity'] = trend['addCount'] / trend['season_total']

Features & Target

In [86]:
features = ['year', 'season_enc', 'color_enc',
            'prev_addCount_1', 'prev_addCount_2', 'prev_addCount_3', 'avg_prev3']
X = trend[features]
y = trend['relative_popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

Train Random Forest model

In [87]:
model = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Evaluate Model

In [88]:
y_pred = model.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"✅ Model trained — RMSE: {rmse:.4f}, MAE: {mae:.4f}\n")

def unique_top5_accuracy(actual, predicted):
    actual_set, predicted_set = set(actual), set(predicted)
    if not actual_set:
        return 0.0
    return len(actual_set & predicted_set) / len(actual_set)

# Actual top-5 per season
actual_top = trend.groupby('season').apply(
    lambda x: x.sort_values('relative_popularity', ascending=False)['color'].unique()[:5].tolist()
).to_dict()

✅ Model trained — RMSE: 0.0863, MAE: 0.0781



  actual_top = trend.groupby('season').apply(


Multi-Season Forecast (Next 4)

In [89]:
latest_year = trend['year'].max()
latest_season_idx = max(trend[trend['year']==latest_year]['season_id'])
future_forecasts = []
latest = trend[trend['year'] == latest_year].copy()

for step in range(4):
    next_season_idx = (latest_season_idx + 1) % 4
    next_season = season_order[next_season_idx]
    next_year = latest_year + 1 if next_season_idx == 0 else latest_year

    future = latest.copy()
    future['year'] = next_year
    future['season'] = next_season
    future['season_enc'] = season_to_num[next_season]

    # Shift lag features
    future['prev_addCount_3'] = future['prev_addCount_2']
    future['prev_addCount_2'] = future['prev_addCount_1']
    future['prev_addCount_1'] = future['addCount']
    future['avg_prev3'] = future[['prev_addCount_1','prev_addCount_2','prev_addCount_3']].mean(axis=1)

    # Predict
    X_future = future[features]
    future['predicted_popularity'] = model.predict(X_future)

    preds = future[['year','season','color','predicted_popularity']].copy()
    preds['forecast_horizon'] = f"{next_season} {next_year}"
    future_forecasts.append(preds)

    # Update for next step
    latest = future.copy()
    latest['addCount'] = future['predicted_popularity'] * 1000
    latest_year, latest_season_idx = next_year, next_season_idx

forecast_df = pd.concat(future_forecasts, ignore_index=True)

Predicted Top-5 and accuracy

In [90]:
predicted_top = forecast_df.groupby('forecast_horizon').apply(
    lambda x: x.sort_values('predicted_popularity', ascending=False)['color'].unique()[:5].tolist()
).to_dict()

for horizon in predicted_top.keys():
    season_name = horizon.split()[0]
    accuracy = unique_top5_accuracy(actual_top.get(season_name, []), predicted_top[horizon])
    print(f"{horizon} top-5 accuracy: {accuracy:.2f}")
    print(f"  Actual top 5: {list(actual_top.get(season_name, []))}")
    print(f"  Predicted top 5: {predicted_top[horizon]}\n")

Fall 2023 top-5 accuracy: 0.40
  Actual top 5: ['gold', 'black', 'white', 'red', 'green']
  Predicted top 5: ['black', 'blue', 'beige', 'silver', 'white']

Spring 2023 top-5 accuracy: 0.60
  Actual top 5: ['black', 'blue', 'pink', 'red', 'white']
  Predicted top 5: ['pink', 'black', 'blue', 'beige', 'green']

Summer 2023 top-5 accuracy: 0.80
  Actual top 5: ['black', 'beige', 'blue', 'brown', 'white']
  Predicted top 5: ['black', 'blue', 'beige', 'red', 'brown']

Winter 2023 top-5 accuracy: 0.80
  Actual top 5: ['black', 'beige', 'blue', 'pink', 'red']
  Predicted top 5: ['black', 'blue', 'beige', 'white', 'red']



  predicted_top = forecast_df.groupby('forecast_horizon').apply(
