In [1]:
import os
from pathlib import Path

print("Current working directory:", os.getcwd())
print("Files here:", os.listdir())


Current working directory: /Users/elsablease
Files here: ['Untitled Folder', '.Rhistory', '.config', 'Music', 'customers.csv', '.condarc', 'HMFinal.ipynb', 'Untitled1.ipynb', 'all_seasons_2.csv', 'Types of Data & Slope Parameter Interpretation', 'FashionTrendVisuals.ipynb', 'HW1 Moneyball EGB', '.DS_Store', 'ElsaBlease_HW2_code.ipynb', '2_14Lab.ipynb', '.CFUserTextEncoding', 'NBA_Player_Salaries.numbers', '.pytest_cache', '.xonshrc', 'conda', 'anaconda_projects', 'Untitled3.ipynb', 'Untitled.ipynb', '.zshrc', 'W5HW', 'Grass', 'HW1 Moneyball', '.local', 'BokehLab.ipynb', 'Pictures', 'ElsaBlease_HW3.ipynb', 'altair_lab.ipynb', '.Terminal.plist', 'GeoPandas_Lab.ipynb', 'BleaseE W5W In-Class', 'Elsa_Blease_HW3.ipynb', 'articles.csv', 'PlotlyLab.ipynb', 'Untitled2.ipynb', '.ipython', 'Desktop', 'Library', '.vpn', 'test_graph.html', '.matplotlib', 'HeteroskedasticityEGB', 'models', 'untitled folder 3', 'testpython', '3D_plotly.html', 'Zotero', '.bash_profile.pysave', 'barcelona_airbnb_map.ht

In [2]:
list((Path.cwd() / "data").glob("*"))

[]

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

from sklearn.model_selection import GroupShuffleSplit, ParameterGrid
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
import joblib

# paths (Jupyter-safe)
PROJECT_DIR = Path.cwd()
DATA_DIR = PROJECT_DIR / "data"
MODELS_DIR = PROJECT_DIR / "models"
OUTPUTS_DIR = PROJECT_DIR / "outputs"

MODELS_DIR.mkdir(exist_ok=True)
OUTPUTS_DIR.mkdir(exist_ok=True)

NEGATIVE_SAMPLES_PER_POSITIVE = 5
RANDOM_SEED = 42


In [4]:
from pathlib import Path

DATA_DIR = Path.cwd() / "H&MFinal"


In [5]:
for f in ["articles.csv", "customers.csv", "transactions_train.csv"]:
    p = DATA_DIR / f
    print(f, "->", p, "| exists:", p.exists())


articles.csv -> /Users/elsablease/H&MFinal/articles.csv | exists: True
customers.csv -> /Users/elsablease/H&MFinal/customers.csv | exists: True
transactions_train.csv -> /Users/elsablease/H&MFinal/transactions_train.csv | exists: True


In [6]:
import pandas as pd

articles  = pd.read_csv(DATA_DIR / "articles.csv")
customers = pd.read_csv(DATA_DIR / "customers.csv")
tx        = pd.read_csv(DATA_DIR / "transactions_train.csv")


In [7]:
tx = pd.read_csv(
    DATA_DIR / "transactions_train.csv",
    nrows=2_000_000
)

In [8]:
tx["t_dat"] = pd.to_datetime(tx["t_dat"], errors="coerce")
tx = tx.dropna(subset=["t_dat", "customer_id", "article_id"])

tx["article_id"] = tx["article_id"].astype(int)
articles["article_id"] = articles["article_id"].astype(int)

customers = customers[
    ["customer_id", "FN", "Active", "club_member_status",
     "fashion_news_frequency", "age"]
]


In [9]:
cutoff = tx["t_dat"].max()

hist = tx[tx["t_dat"] < cutoff]

beh = (
    hist.groupby("customer_id")
    .agg(
        freq=("article_id", "count"),
        avg_price=("price", "mean"),
        std_price=("price", "std"),
        last_purchase=("t_dat", "max"),
    )
    .reset_index()
)

beh["recency_days"] = (cutoff - beh["last_purchase"]).dt.days
beh["std_price"] = beh["std_price"].fillna(0)
beh = beh.drop(columns="last_purchase")

In [10]:
rng = np.random.default_rng(RANDOM_SEED)
all_articles = articles["article_id"].unique()

positives = tx[["customer_id", "article_id", "t_dat", "price"]].copy()
positives["label"] = 1

purchased = positives.groupby("customer_id")["article_id"].apply(set)

neg_rows = []
for _, row in positives.iterrows():
    cust = row["customer_id"]
    bought = purchased[cust]
    samples = []

    while len(samples) < NEGATIVE_SAMPLES_PER_POSITIVE:
        cand = int(rng.choice(all_articles))
        if cand not in bought:
            samples.append(cand)

    for a in samples:
        neg_rows.append((cust, a, row["t_dat"]))

negatives = pd.DataFrame(
    neg_rows, columns=["customer_id", "article_id", "t_dat"]
)
negatives["price"] = np.nan
negatives["label"] = 0

data = pd.concat([positives, negatives], ignore_index=True)

In [11]:
df = (
    data
    .merge(customers, on="customer_id", how="left")
    .merge(beh, on="customer_id", how="left")
    .merge(articles, on="article_id", how="left")
)

df["age"] = df["age"].fillna(df["age"].median())
df["freq"] = df["freq"].fillna(0)
df["recency_days"] = df["recency_days"].fillna(df["recency_days"].median())
df["avg_price"] = df["avg_price"].fillna(df["avg_price"].median())
df["std_price"] = df["std_price"].fillna(0)

In [12]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_SEED)
train_idx, test_idx = next(gss.split(df, groups=df["customer_id"]))
train, test = df.iloc[train_idx], df.iloc[test_idx]

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=RANDOM_SEED)
tr_idx, val_idx = next(gss2.split(train, groups=train["customer_id"]))
tr, val = train.iloc[tr_idx], train.iloc[val_idx]


In [13]:
TARGET = "label"
X_cols = [c for c in tr.columns if c not in {"label", "customer_id", "article_id", "t_dat"}]

num_cols = tr[X_cols].select_dtypes(include="number").columns
cat_cols = [c for c in X_cols if c not in num_cols]

pre = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), num_cols),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
])

pipe = Pipeline([
    ("pre", pre),
    ("model", LogisticRegression(max_iter=200, class_weight="balanced"))
])

pipe.fit(tr[X_cols], tr[TARGET])

In [14]:
p_val = pipe.predict_proba(val[X_cols])[:, 1]
p_test = pipe.predict_proba(test[X_cols])[:, 1]

print("Validation AUC:", roc_auc_score(val[TARGET], p_val))
print("Test AUC:", roc_auc_score(test[TARGET], p_test))

Validation AUC: 0.7987477478430773
Test AUC: 0.7994062591917213


In [15]:
joblib.dump({"pipeline": pipe, "features": X_cols},
            MODELS_DIR / "purchase_model.joblib")

['/Users/elsablease/models/purchase_model.joblib']

In [16]:
# Create a one-row Excel template for prediction
template = pd.DataFrame(columns=X_cols)

template_path = DATA_DIR / "new_datapoint.xlsx"
template.to_excel(template_path, index=False)

print("Created:", template_path)

Created: /Users/elsablease/H&MFinal/new_datapoint.xlsx


In [17]:
new = pd.read_excel(DATA_DIR / "new_datapoint.xlsx")
print("new.shape =", new.shape)
new.head()


new.shape = (0, 34)


Unnamed: 0,price,FN,Active,club_member_status,fashion_news_frequency,age,freq,avg_price,std_price,recency_days,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc


In [18]:
new = pd.read_excel(DATA_DIR / "new_datapoint.xlsx")

# if user accidentally left extra empty rows, keep the first non-empty row
new = new.dropna(how="all")
new = new.iloc[:1]

print("After cleanup new.shape =", new.shape)


After cleanup new.shape = (0, 34)


In [19]:
new = pd.read_excel(DATA_DIR / "new_datapoint.xlsx")
print(new.shape)
print(new.columns[:10])
print(new.head(3))


(0, 34)
Index(['price', 'FN', 'Active', 'club_member_status', 'fashion_news_frequency',
       'age', 'freq', 'avg_price', 'std_price', 'recency_days'],
      dtype='object')
Empty DataFrame
Columns: [price, FN, Active, club_member_status, fashion_news_frequency, age, freq, avg_price, std_price, recency_days, product_code, prod_name, product_type_no, product_type_name, product_group_name, graphical_appearance_no, graphical_appearance_name, colour_group_code, colour_group_name, perceived_colour_value_id, perceived_colour_value_name, perceived_colour_master_id, perceived_colour_master_name, department_no, department_name, index_code, index_name, index_group_no, index_group_name, section_no, section_name, garment_group_no, garment_group_name, detail_desc]
Index: []

[0 rows x 34 columns]


In [20]:
import pandas as pd

# 1) Build one-row example using whatever columns your model expects
example = pd.DataFrame([{
    "age": 25,
    "club_member_status": "ACTIVE",
    "fashion_news_frequency": "Regularly",
    "freq": 10,
    "recency_days": 30,
    "avg_price": 0.03,
    "std_price": 0.01
}])

# 2) Force it to have exactly the same columns (missing ones become NaN)
example = example.reindex(columns=X_cols)

# 3) Save to Excel (overwrites the blank template)
example_path = DATA_DIR / "new_datapoint.xlsx"
example.to_excel(example_path, index=False)

print("Wrote 1-row file to:", example_path)
print("example.shape =", example.shape)
example.head()


Wrote 1-row file to: /Users/elsablease/H&MFinal/new_datapoint.xlsx
example.shape = (1, 34)


Unnamed: 0,price,FN,Active,club_member_status,fashion_news_frequency,age,freq,avg_price,std_price,recency_days,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,,,,ACTIVE,Regularly,25,10,0.03,0.01,30,...,,,,,,,,,,


In [21]:
new = pd.read_excel(DATA_DIR / "new_datapoint.xlsx")
print("new.shape =", new.shape)
display(new.head())

prob = pipe.predict_proba(new[X_cols])[:, 1]
print("Predicted purchase probability:", prob[0])


new.shape = (1, 34)


Unnamed: 0,price,FN,Active,club_member_status,fashion_news_frequency,age,freq,avg_price,std_price,recency_days,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,,,,ACTIVE,Regularly,25,10,0.03,0.01,30,...,,,,,,,,,,


Predicted purchase probability: 0.4323184553994005


In [22]:
prob = pipe.predict_proba(new[X_cols])[:, 1]

In [23]:
missing = [c for c in X_cols if c not in new.columns]
print("Missing columns:", missing)

Missing columns: []
