In [1]:
pip install torch pandas numpy scikit-learn tqdm transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
# Colab-only: mount Google Drive at /content/drive so files stored on Drive
# can be read and written through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

#  ======= Hyperparameters =======
MAX_LEN = 50        # number of positions in every model input sequence
BATCH_SIZE = 256    # samples per DataLoader batch
EPOCHS = 3          # full passes over the training sequences
MASK_PROB = 0.15    # per-item probability of replacing an item with the mask token
EMB_DIM = 512       # passed to BERT4Rec as d_model
LR = 1e-3           # Adam learning rate
# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#  ======= File paths =======
# All inputs and outputs live in one folder on the mounted Google Drive.
drive_path = "/content/drive/MyDrive/RS"
train_csv = os.path.join(drive_path, "train.csv")                  # interactions; read for user_id / item_id
sample_csv = os.path.join(drive_path, "sample_submission.csv")     # rows to predict; read for ID / user_id
meta_csv = os.path.join(drive_path, "item_meta.csv")               # item metadata; read for item_id / title
model_path = os.path.join(drive_path, "bert4rec.pt")               # BERT4Rec state_dict checkpoint
output_csv = os.path.join(drive_path, "submission.csv")            # generated submission file

#  ======= Dataset =======
class BERT4RecDataset(Dataset):
    """Masked-item training samples built from per-user interaction sequences.

    Each sample is one user's history, truncated to its most recent
    ``max_len`` items and left-padded with 0. Every real item is replaced by
    the mask token (``num_items + 1``) with probability ``mask_prob``. The
    label at a masked position is the original item ID; all other positions
    are labelled 0.

    Args:
        user_sequences: Indexable collection of item-ID sequences, one per
            user. Item ID 0 is treated as padding.
        max_len: Length of every returned sequence.
        mask_prob: Probability of masking each real item.
        num_items: Largest item ID; ``num_items + 1`` is used as the mask token.
    """

    def __init__(self, user_sequences, max_len=50, mask_prob=0.2, num_items=10000):
        self.user_sequences = user_sequences
        self.max_len = max_len
        self.mask_prob = mask_prob
        self.num_items = num_items

    def __len__(self):
        """Return the number of user sequences."""
        return len(self.user_sequences)

    def __getitem__(self, idx):
        """Return ``(tokens, labels)`` LongTensors of length ``max_len`` for user ``idx``."""
        seq = list(self.user_sequences[idx][-self.max_len:])
        pad_len = self.max_len - len(seq)
        seq = [0] * pad_len + seq
        mask_token = self.num_items + 1

        tokens = []
        labels = []

        for item in seq:
            if item == 0:
                # Padding: nothing to predict here.
                tokens.append(0)
                labels.append(0)
            elif np.random.rand() < self.mask_prob:
                tokens.append(mask_token)
                labels.append(item)
            else:
                tokens.append(item)
                labels.append(0)

        # Short histories frequently end up with no masked position at all.
        # Such a sample has no training target, and a batch made only of such
        # samples gives a NaN mean loss when label 0 is ignored. Fall back to
        # masking the most recent item so every non-empty history yields a target.
        if seq and seq[-1] != 0 and not any(labels):
            tokens[-1] = mask_token
            labels[-1] = seq[-1]

        return torch.LongTensor(tokens), torch.LongTensor(labels)

#  ======= BERT4Rec  =======
class BERT4Rec(nn.Module):
    """Transformer encoder over item-ID sequences that emits per-position item logits.

    The vocabulary has ``num_items + 2`` entries: index 0 is padding (its
    embedding is pinned via ``padding_idx``) and the extra index leaves room
    for the ``num_items + 1`` mask token used by the dataset in this file.

    Args:
        num_items: Largest item ID.
        d_model: Embedding / hidden width.
        max_len: Maximum sequence length (size of the position table).
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate inside the encoder layers.
    """

    def __init__(self, num_items, d_model=128, max_len=50, num_heads=4, num_layers=2, dropout=0.1):
        super(BERT4Rec, self).__init__()
        self.item_embedding = nn.Embedding(num_items + 2, d_model, padding_idx=0)
        self.position_embedding = nn.Embedding(max_len, d_model)
        encoder_layer = TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = TransformerEncoder(encoder_layer, num_layers)
        self.output_layer = nn.Linear(d_model, num_items + 2)
        self.max_len = max_len

    def forward(self, input_seq):
        """Compute logits for every position.

        Args:
            input_seq: Long tensor of token IDs, shape ``(batch, seq_len)``
                with ``seq_len <= max_len``.

        Returns:
            Tensor of shape ``(batch, seq_len, num_items + 2)``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_len``.
        """
        seq_len = input_seq.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"input sequence length {seq_len} exceeds max_len {self.max_len}"
            )
        # Build positions from the actual input length instead of max_len so
        # shorter sequences work; for seq_len == max_len this is unchanged.
        # The (seq_len, d_model) position table broadcasts over the batch axis.
        pos = torch.arange(seq_len, device=input_seq.device)
        x = self.item_embedding(input_seq) + self.position_embedding(pos)
        x = self.transformer(x)
        return self.output_layer(x)

#  ======= NCF =======
class NCF(nn.Module):
    """Scores (user, item) pairs from concatenated embeddings through a small MLP.

    Args:
        num_users: Largest user ID (the table holds ``num_users + 1`` rows).
        num_items: Largest item ID (the table holds ``num_items + 1`` rows).
        emb_dim: Width of each embedding.
    """

    def __init__(self, num_users, num_items, emb_dim=64):
        super(NCF, self).__init__()
        self.user_emb = nn.Embedding(num_users + 1, emb_dim)
        self.item_emb = nn.Embedding(num_items + 1, emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, user, item):
        """Return one score per pair.

        Args:
            user: Long tensor of user IDs, shape ``(batch,)``.
            item: Long tensor of item IDs, shape ``(batch,)``.

        Returns:
            Tensor of shape ``(batch,)``.
        """
        u = self.user_emb(user)
        i = self.item_emb(item)
        x = torch.cat([u, i], dim=1)
        # Drop only the trailing singleton dimension. A bare squeeze() would
        # also remove the batch axis for a batch of one and return a 0-dim tensor.
        return self.mlp(x).squeeze(-1)

#  ======= Data loading =======
def get_user_sequences(csv_path):
    """Load interactions and group item IDs by user.

    Args:
        csv_path: CSV file with ``user_id`` and ``item_id`` columns.

    Returns:
        A tuple ``(sequences, max_item_id)``. ``sequences`` holds one list of
        item IDs per user, ordered by ascending ``user_id``, with items in the
        order their rows appear in the file. ``max_item_id`` is the largest
        value in the ``item_id`` column.
    """
    interactions = pd.read_csv(csv_path)
    items_by_user = interactions.groupby('user_id')['item_id']
    sequences = [group.tolist() for _, group in items_by_user]
    largest_item_id = interactions['item_id'].max()
    return sequences, largest_item_id

#  ======= Item Popularity  =======
def get_item_popularity(train_df):
    """Map each item ID to its share of all interactions.

    Args:
        train_df: DataFrame with an ``item_id`` column.

    Returns:
        Dict of ``item_id -> relative frequency``, ordered from most to least
        frequent (the order produced by ``value_counts``).
    """
    shares = train_df['item_id'].value_counts(normalize=True)
    return dict(zip(shares.index, shares.to_numpy()))

#  ======= Reciprocal Rank Fusion =======
def rrf_fusion(rank_lists, k=60):
    """Merge several ranked lists with reciprocal-rank scoring.

    An item at zero-based position ``i`` of a list contributes ``1 / (k + i)``
    to its total; contributions are summed across lists.

    Args:
        rank_lists: Iterable of ranked item lists, best item first.
        k: Smoothing constant added to each position.

    Returns:
        All items seen, sorted by descending fused score. Ties keep the order
        in which the items were first encountered.
    """
    fused_scores = {}
    for ranking in rank_lists:
        for position, candidate in enumerate(ranking):
            fused_scores[candidate] = fused_scores.get(candidate, 0) + 1 / (k + position)
    # reverse=True keeps the sort stable, so tied items stay in insertion order.
    return sorted(fused_scores, key=fused_scores.get, reverse=True)

#  ======= Title embedding  =======
def load_title_embeddings(meta_csv, model_name="distilbert-base-uncased", device="cpu"):
    """Encode every item title with a pretrained transformer.

    Args:
        meta_csv: CSV file with ``item_id`` and ``title`` columns.
        model_name: Hugging Face model identifier for tokenizer and encoder.
        device: Device the encoder runs on.

    Returns:
        Dict of ``item_id -> CPU tensor`` holding the hidden state at the
        first token position of the title. An empty dict is returned when
        ``meta_csv`` does not exist.
    """
    if not os.path.exists(meta_csv):
        return {}

    metadata = pd.read_csv(meta_csv)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name).to(device)
    encoder.eval()

    embeddings_by_item = {}
    progress = tqdm(metadata.iterrows(), total=len(metadata), desc=" Edit title")
    # No gradients are needed anywhere in this loop; disable them once.
    with torch.no_grad():
        for _, record in progress:
            encoded = tokenizer(
                str(record["title"]),
                return_tensors="pt",
                truncation=True,
                padding=True,
            ).to(device)
            # Position 0 is the [CLS] token.
            cls_state = encoder(**encoded).last_hidden_state[:, 0, :]
            embeddings_by_item[record["item_id"]] = cls_state.squeeze().cpu()

    return embeddings_by_item

#  ======= Inference & fusion =======
def _bert_candidates(model, history, num_items, num_candidates):
    """Rank next-item candidates by scoring a mask token appended after the history."""
    if not history or MAX_LEN < 2:
        # No personal signal to condition on; leave this source empty.
        return []

    mask_token = num_items + 1
    # Keep the newest MAX_LEN - 1 items so the appended mask still fits.
    tokens = list(history[-(MAX_LEN - 1):]) + [mask_token]
    tokens = [0] * (MAX_LEN - len(tokens)) + tokens
    input_seq = torch.LongTensor(tokens).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        scores = model(input_seq)[0, -1]  # logits at the masked (next-item) position
        # Index 0 is padding and num_items + 1 is the mask token; neither is a
        # real item, so they must never be recommended.
        scores[0] = float("-inf")
        scores[mask_token] = float("-inf")
        k = min(num_candidates, num_items)
        return torch.topk(scores, k).indices.cpu().tolist()


def _title_candidates(last_item, title_embeddings, all_items, all_embs, num_candidates):
    """Return the items whose title embeddings are most similar to ``last_item``'s."""
    if all_embs is None or last_item not in title_embeddings:
        return []

    query_emb = title_embeddings[last_item].to(DEVICE)
    sim = F.cosine_similarity(query_emb.unsqueeze(0), all_embs)
    k = min(num_candidates, len(all_items))
    topk_idx = sim.topk(k).indices.cpu().tolist()
    # NOTE(review): the `>= 2` filter is kept from the original code; it drops
    # metadata item IDs 0 and 1. Confirm whether ID 1 is a real item.
    return [all_items[i] for i in topk_idx if all_items[i] >= 2]


def inference_and_fusion(num_candidates=100, top_k=10):
    """Generate the submission by fusing three candidate sources per user.

    For every row of the sample submission, candidates come from:
      1. the saved BERT4Rec model, scoring a mask token placed after the
         user's history (deterministic next-item prediction);
      2. global item popularity in the training data;
      3. title-embedding neighbours of the user's most recent item.
    The three lists are merged with ``rrf_fusion`` and the first ``top_k``
    items are written to ``output_csv`` as a comma-separated string.

    Args:
        num_candidates: Maximum candidates taken from each source.
        top_k: Number of fused items written per user.
    """
    train_df = pd.read_csv(train_csv)
    sample_df = pd.read_csv(sample_csv)
    user_sequences = train_df.groupby("user_id")["item_id"].apply(list).to_dict()
    num_items = int(train_df["item_id"].max())

    model = BERT4Rec(num_items=num_items, d_model=EMB_DIM, max_len=MAX_LEN, num_layers=3).to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    title_embeddings = load_title_embeddings(meta_csv, device=DEVICE)
    all_items = list(title_embeddings.keys())
    # torch.stack raises on an empty list, which happens when the metadata
    # file is missing; in that case the title source is simply skipped.
    all_embs = (
        torch.stack([title_embeddings[i] for i in all_items]).to(DEVICE)
        if all_items
        else None
    )

    # The popularity ranking is identical for every user, so build it once.
    popularity = get_item_popularity(train_df)
    pop_sorted = [i for i, _ in sorted(popularity.items(), key=lambda x: -x[1])]
    topk_pop = pop_sorted[:num_candidates]

    submission = []
    print(" Fusion recommendations are being generated...")
    for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
        user_id = row["user_id"]
        history = user_sequences.get(user_id, [])

        topk_bert = _bert_candidates(model, history, num_items, num_candidates)

        # Use the most recent interaction as the query for title similarity.
        last_item = history[-1] if history else None
        topk_title = _title_candidates(
            last_item, title_embeddings, all_items, all_embs, num_candidates
        )

        fused = rrf_fusion([topk_bert, topk_pop, topk_title])[:top_k]

        submission.append({
            "ID": row["ID"],
            "user_id": user_id,
            "item_id": ",".join(map(str, fused))
        })

    pd.DataFrame(submission).to_csv(output_csv, index=False)
    print(f" Submissions have been saved to the：{output_csv}")

#  ======= Training =======
def train():
    """Train BERT4Rec on masked user sequences and save its weights.

    Reads the interactions from ``train_csv``, trains for ``EPOCHS`` epochs
    with Adam and cross-entropy over masked positions (label 0 is ignored),
    prints the average loss per epoch, and writes the model's ``state_dict``
    to ``model_path``.
    """
    sequences, max_item_id = get_user_sequences(train_csv)
    dataset = BERT4RecDataset(sequences, max_len=MAX_LEN, mask_prob=MASK_PROB, num_items=max_item_id)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    model = BERT4Rec(num_items=max_item_id, d_model=EMB_DIM, max_len=MAX_LEN, num_layers=3).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    model.train()
    for epoch in range(1, EPOCHS + 1):
        total_loss = 0.0
        counted_batches = 0
        loop = tqdm(dataloader, desc=f"Epoch {epoch}")
        for tokens, labels in loop:
            # A batch in which no position was masked has only ignored labels.
            # The mean cross-entropy is then NaN, and stepping on it would
            # overwrite every weight with NaN, so skip such batches.
            if not labels.ne(0).any():
                continue

            tokens, labels = tokens.to(DEVICE), labels.to(DEVICE)
            logits = model(tokens)
            # Flatten (batch, seq, vocab) -> (batch * seq, vocab) for the loss.
            logits = logits.view(-1, logits.size(-1))
            labels = labels.view(-1)
            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            counted_batches += 1
            loop.set_postfix(loss=batch_loss)

        # Average over the batches that actually contributed a loss.
        avg_loss = total_loss / max(counted_batches, 1)
        print(f"[Epoch {epoch}] Avg Loss: {avg_loss:.4f}")

    torch.save(model.state_dict(), model_path)
    print(f" The model has been saved to {model_path}")

# Entry point: train the model and save it, then build the fused submission.
# Training runs on every execution; nothing here checks for an existing
# checkpoint before retraining.
if __name__ == "__main__":
    train()
    inference_and_fusion()




Epoch 1:   6%|▌         | 78/1265 [00:36<09:07,  2.17it/s, loss=11.3]