In [35]:
import torch

print(torch.__version__)        
print(torch.version.cuda)          
print(torch.cuda.is_available())     

1.11.0+cu113
11.3
True


In [36]:
import gc
import numpy as np
import pandas as pd

df = pd.read_csv("/HMData/transactions_train.csv", 
                 dtype={"article_id": str})
print(df.shape)
df.head()

(31788324, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [37]:
df["t_dat"] = pd.to_datetime(df["t_dat"])
df["t_dat"].max()

Timestamp('2020-09-22 00:00:00')

In [38]:
active_articles = df.groupby("article_id")["t_dat"].max().reset_index()
active_articles = active_articles[active_articles["t_dat"] >= "2019-09-01"].reset_index()
active_articles.shape

(72581, 3)

In [39]:
df = df[df["article_id"].isin(active_articles["article_id"])].reset_index(drop=True)
df.shape

(29634404, 5)

In [40]:
df["week"] = (df["t_dat"].max() - df["t_dat"]).dt.days // 7
df["week"].value_counts()

65     620104
13     549443
42     518403
12     517428
64     508664
        ...  
93     174190
102    164298
104    163143
97     162580
94     152807
Name: week, Length: 105, dtype: int64

In [41]:
from sklearn.preprocessing import LabelEncoder


article_ids = np.concatenate([["placeholder"], np.unique(df["article_id"].values)])

le_article = LabelEncoder()
le_article.fit(article_ids)
df["article_id"] = le_article.transform(df["article_id"])

In [42]:
WEEK_HIST_MAX = 5

def create_dataset(df, week):
    hist_df = df[(df["week"] > week) & (df["week"] <= week + WEEK_HIST_MAX)]
    hist_df = hist_df.groupby("customer_id").agg({"article_id": list, "week": list}).reset_index()
    hist_df.rename(columns={"week": 'week_history'}, inplace=True)
    
    target_df = df[df["week"] == week]
    target_df = target_df.groupby("customer_id").agg({"article_id": list}).reset_index()
    target_df.rename(columns={"article_id": "target"}, inplace=True)
    target_df["week"] = week
    
    return target_df.merge(hist_df, on="customer_id", how="left")

val_weeks = [0]
train_weeks = [i for i in range(WEEK_HIST_MAX)]

val_df = pd.concat([create_dataset(df, w) for w in val_weeks]).reset_index(drop=True)
train_df = pd.concat([create_dataset(df, w) for w in train_weeks]).reset_index(drop=True)

train_df = train_df.sample(n = 10000)
print(gc.collect())

train_articles = train_df['customer_id'].unique().tolist()
train_df.shape

0


(10000, 5)

In [43]:
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

class HMDataset(Dataset):
    def __init__(self, df, seq_len, is_test=False):
        self.df = df.reset_index(drop=True)
        self.seq_len = seq_len
        self.is_test = is_test
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        if self.is_test:
            target = torch.zeros(2).float()
        else:
            target = torch.zeros(len(article_ids)).float()
            for t in row.target:
                target[t] = 1.0
            
        article_hist = torch.zeros(self.seq_len).long()
        week_hist = torch.ones(self.seq_len).float()
        
        
        if isinstance(row.article_id, list):
            if len(row.article_id) >= self.seq_len:
                article_hist = torch.LongTensor(row.article_id[-self.seq_len:])
                week_hist = (torch.LongTensor(row.week_history[-self.seq_len:]) - row.week)/WEEK_HIST_MAX/2
            else:
                article_hist[-len(row.article_id):] = torch.LongTensor(row.article_id)
                week_hist[-len(row.article_id):] = (torch.LongTensor(row.week_history) - row.week)/WEEK_HIST_MAX/2
                
        return article_hist, week_hist, target
    

In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class HMModel(nn.Module):
    def __init__(self, article_shape):
        super(HMModel, self).__init__()
        
        self.article_emb = nn.Embedding(article_shape[0], embedding_dim=article_shape[1])
        
        self.article_likelihood = nn.Parameter(torch.zeros(article_shape[0]), requires_grad=True)
        self.top = nn.Sequential(nn.Conv1d(3, 8, kernel_size=1), nn.LeakyReLU(),nn.BatchNorm1d(8),
                                 nn.Conv1d(8, 32, kernel_size=1), nn.LeakyReLU(),nn.BatchNorm1d(32),
                                 nn.Conv1d(32, 8, kernel_size=1), nn.LeakyReLU(),nn.BatchNorm1d(8),
                                 nn.Conv1d(8, 1, kernel_size=1),nn.LeakyReLU(),)
        
    def forward(self, inputs):
        article_hist, week_hist = inputs[0], inputs[1]
        x = self.article_emb(article_hist)
        x = F.normalize(x, dim=2)
        
        x = x@F.normalize(self.article_emb.weight).T
        
        x, indices = x.max(axis=1)
        x = x.clamp(1e-3, 0.999)
        x = -torch.log(1/x - 1)
        
        max_week = week_hist.unsqueeze(2).repeat(1, 1, x.shape[-1]).gather(1, indices.unsqueeze(1).repeat(1, 
            week_hist.shape[1], 1))
        max_week = max_week.mean(axis=1).unsqueeze(1)
        
        x = torch.cat([x.unsqueeze(1), max_week,
                       self.article_likelihood[None, None, :].repeat(x.shape[0], 1, 1)], axis=1)
        
        x = self.top(x).squeeze(1)
        return x
    
device = "cuda"
model = HMModel((len(le_article.classes_), 512))

model = model.to(device)

In [45]:
import sys

def mean_average_precision(topk_preds, target_array, k=12):
    metric = []
    tp, fp = 0, 0
    
    for pred in topk_preds:
        if target_array[pred]:
            tp += 1
            metric.append(tp/(tp + fp))
        else:
            fp += 1
            
    return np.sum(metric) / min(k, target_array.sum())

def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()


def validate(model, val_loader, k=12):
    model.eval()
    
    tbar = tqdm(val_loader, file=sys.stdout)
    
    maps = []
    
    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            logits = model(inputs)

            _, indices = torch.topk(logits, k, dim=1)

            indices = indices.detach().cpu().numpy()
            target = target.detach().cpu().numpy()

            for i in range(indices.shape[0]):
                maps.append(mean_average_precision(indices[i], target[i]))
        
    
    return np.mean(maps)


In [46]:
from torchsummary import summary
def train(model, train_loader, epochs, val_loader):
    np.random.seed(10)
    
    optimizer = optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

    scaler = torch.cuda.amp.GradScaler()

    criterion = torch.nn.BCEWithLogitsLoss()
    
    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        
        loss_list = []

        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            optimizer.zero_grad()
            
            with torch.cuda.amp.autocast():
                logits = model(inputs)
                loss = criterion(logits, target)
            
            
            #loss.backward()
            scaler.scale(loss).backward()
            #optimizer.step()
            scaler.step(optimizer)
            scaler.update()
            
            loss_list.append(loss.detach().cpu().item())
            
            avg_loss = np.round(100*np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss}")
            
        model.eval()
        
        tbar = tqdm(val_loader, file=sys.stdout)
        
        maps = []
        
        with torch.no_grad():
            for idx, data in enumerate(tbar):
                inputs, target = read_data(data)

                logits = model(inputs)

                _, indices = torch.topk(logits, 12, dim=1)

                indices = indices.detach().cpu().numpy()
                target = target.detach().cpu().numpy()

                for i in range(indices.shape[0]):
                    maps.append(mean_average_precision(indices[i], target[i]))
            
        
        accuracy = np.mean(maps)

        log_text = f"Epoch {e+1}\nEval Acc: {accuracy}"
            
        print(log_text)
        
    return model
SEED = 10

val_dataset = HMDataset(val_df, 16)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, pin_memory=True)

train_dataset = HMDataset(train_df, 16)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, pin_memory=True)

#print(model)
#print(summary(model, input_size=(16, 1, 72582, 5)))

model = train(model, train_loader, 10, val_loader)

Epoch 1 Loss: 78.729: 100%|██████████| 40/40 [00:35<00:00,  1.13it/s] 
100%|██████████| 270/270 [01:58<00:00,  2.29it/s]
Epoch 1
Eval Acc: 0.0003202451966709931
Epoch 2 Loss: 76.2165: 100%|██████████| 40/40 [00:33<00:00,  1.19it/s]
100%|██████████| 270/270 [01:58<00:00,  2.27it/s]
Epoch 2
Eval Acc: 1.818802243538414e-05
Epoch 3 Loss: 75.2791: 100%|██████████| 40/40 [00:35<00:00,  1.14it/s]
100%|██████████| 270/270 [01:56<00:00,  2.32it/s]
Epoch 3
Eval Acc: 2.666254088685377e-05
Epoch 4 Loss: 74.7369: 100%|██████████| 40/40 [00:34<00:00,  1.16it/s]
100%|██████████| 270/270 [02:05<00:00,  2.15it/s]
Epoch 4
Eval Acc: 0.00011089501029024618
Epoch 5 Loss: 74.17: 100%|██████████| 40/40 [00:34<00:00,  1.14it/s]  
100%|██████████| 270/270 [02:03<00:00,  2.19it/s]
Epoch 5
Eval Acc: 7.302862457680966e-05
Epoch 6 Loss: 73.6444: 100%|██████████| 40/40 [00:34<00:00,  1.15it/s]
100%|██████████| 270/270 [02:00<00:00,  2.24it/s]
Epoch 6
Eval Acc: 4.1974958881156587e-05
Epoch 7 Loss: 73.1511: 100%|████

In [47]:
test_df = pd.read_csv('/hmData/sample_submission.csv').drop("prediction", axis=1)
test_df = test_df.loc[test_df['customer_id'].isin(train_articles)] # Predict only for trained customers
print(test_df.shape)
test_df.head()

(9904, 1)


Unnamed: 0,customer_id
185,0007c3ba357b0e54131e54d7a5d619263c5a9e6826308b...
327,000fa1b80857fa40bf25990bc1b1b65afc63923a8e4b57...
330,000fb6e772c5d0023892065e659963da90b1866035558e...
349,0010e8eb18f131e724d6997909af0808adbba057529edb...
367,0011d4b6e8a2fe30df2fc31eec0207c17a4b0dd9dd2997...


In [48]:
def create_test_dataset(test_df):
    week = -1
    test_df["week"] = week
    
    hist_df = df[(df["week"] > week) & (df["week"] <= week + WEEK_HIST_MAX)]
    hist_df = hist_df.groupby("customer_id").agg({"article_id": list, "week": list}).reset_index()
    hist_df.rename(columns={"week": 'week_history'}, inplace=True)
    
    
    return test_df.merge(hist_df, on="customer_id", how="left")

test_df = create_test_dataset(test_df)
test_df.head()

Unnamed: 0,customer_id,week,article_id,week_history
0,0007c3ba357b0e54131e54d7a5d619263c5a9e6826308b...,-1,"[63200, 72334]","[3, 3]"
1,000fa1b80857fa40bf25990bc1b1b65afc63923a8e4b57...,-1,"[20994, 21017, 62450, 38240, 48205, 39250, 353...","[2, 2, 2, 1, 1, 1, 1, 1, 1, 1]"
2,000fb6e772c5d0023892065e659963da90b1866035558e...,-1,"[68883, 70515, 69487, 59095, 10784, 54486, 474...","[3, 3, 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, ..."
3,0010e8eb18f131e724d6997909af0808adbba057529edb...,-1,"[55338, 66851, 1341, 281, 6447, 10691, 2012, 7...","[4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,0011d4b6e8a2fe30df2fc31eec0207c17a4b0dd9dd2997...,-1,[27642],[4]


In [49]:
test_ds = HMDataset(test_df, 16, is_test=True)
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False,
                          pin_memory=True)


def inference(model, loader, k=12):
    model.eval()
    
    tbar = tqdm(loader, file=sys.stdout)
    
    preds = []
    
    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            logits = model(inputs)

            _, indices = torch.topk(logits, k, dim=1)

            indices = indices.detach().cpu().numpy()
            target = target.detach().cpu().numpy()

            for i in range(indices.shape[0]):
                preds.append(" ".join(list(le_article.inverse_transform(indices[i]))))
        
    
    return preds


test_df["prediction"] = inference(model, test_loader)
test_df = test_df[['customer_id', 'prediction']]

100%|██████████| 39/39 [00:27<00:00,  1.43it/s]


In [50]:
# Merge with baseline [0.0236]
sub_df = pd.read_csv('submission.csv')
sub_df = sub_df.loc[~sub_df['customer_id'].isin(train_articles)]

sub_df = pd.concat([sub_df, test_df])
sub_df = sub_df.sort_values('customer_id')

In [51]:
sub_df.to_csv("cnn_large_data_submission.csv", index=False, columns=["customer_id", "prediction"])

Reference: https://www.kaggle.com/code/aerdem4/h-m-pure-pytorch-baseline