<a href="https://colab.research.google.com/github/alisa7979/20252R0136DATA30400/blob/main/2023320036_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORT LIBRARIES & DATA PATH

In [5]:
import json
import csv
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive

# configuration
# Single source of truth for every RNG seeded below; change it here only.
SEED = 42

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mount Google Drive so the dataset folder below is reachable from this runtime.
drive.mount('/content/drive')
ROOT = Path("/content/drive/MyDrive/Amazon_products")
TRAIN_CORPUS = ROOT / "train" / "train_corpus.txt"
TEST_CORPUS = ROOT / "test" / "test_corpus.txt"
KEYWORDS_PATH = ROOT / "class_related_keywords.txt"
CLASSES_PATH = ROOT / "classes.txt"


# set seeds for reproducibility (Python, NumPy, torch CPU and all CUDA devices)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

Mounted at /content/drive


In [9]:
# load class
# Each usable line of the classes file is "<class_id>\t<class_name>",
# e.g. "0\tgrocery_gourmet_food"; lines with fewer than two fields are skipped.
cid2name = {}
with open(CLASSES_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')                            # parts = ['0', 'grocery_gourmet_food']
        if len(parts) >= 2:
            cid2name[int(parts[0])] = parts[1]                      # cid2name[0] = 'grocery_gourmet_food'

# dictionary cid2name = {0: 'grocery_gourmet_food', 1: 'meat_poultry', ...}

# Reverse index (class name -> class id) so every keyword line is resolved with
# one dict lookup instead of a scan over all classes. setdefault keeps the
# first id seen for a repeated name, which is what a front-to-back scan of
# cid2name would return.
name2cid = {}
for cid, cname in cid2name.items():
    name2cid.setdefault(cname, cid)

# Every class description starts as the bare class name; keywords are appended below.
cid2text = dict(cid2name)

# Keyword lines look like "<class_name>:<kw1>,<kw2>,...". Lines without ':' or
# naming a class that is not in cid2name are ignored.
with open(KEYWORDS_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        if ':' not in line:
            continue
        name_part, keyword_part = line.strip().split(':', 1)        # 'grocery_gourmet_food', 'snacks,condiments,beverages...'
        class_name = name_part.strip()
        found_id = name2cid.get(class_name)                         # None when the class name is unknown

        # compare against None explicitly: class id 0 is valid but falsy
        if found_id is not None:
            # replace commas with spaces for TF-IDF
            clean_keyword = keyword_part.replace(',', ' ')          # 'snacks condiments beverages ...'
            cid2text[found_id] += " " + clean_keyword               # 'grocery_gourmet_food snacks condiments beverages ...'

# sort class ids for safety so class_texts[i] always lines up with sorted_cids[i]
sorted_cids = sorted(cid2text.keys())                               # sorted_cids = [0, 1, 2, ..., 530]
class_texts = [cid2text[cid] for cid in sorted_cids]                # one "name + keywords" string per class

print(f"Prepared {len(class_texts)} Class Prototypes.")

# load train data
# Each usable line is "<pid>\t<text>". Split on the FIRST tab only: with an
# unbounded split, a tab inside the text would leave only the last fragment
# in train_texts. Lines without a tab are skipped.
train_pids, train_texts = [], []
with open(TRAIN_CORPUS, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t', 1)                         # parts = ['0', 'omron hem 790it automatic...']
        if len(parts) == 2:
            train_pids.append(parts[0])                             # train_pids = ['0', '1', '2',...]
            train_texts.append(parts[1])                            # train_texts = ['omron hem 790it automatic...', ...]
print('Loaded Training Corpus.')

Prepared 531 Class Prototypes.
Loaded Training Corpus.


# KAGGLE SUBMISSION

In [10]:
# ------------------------
# Dummy baseline for Kaggle submission
# Generates random multi-label predictions
# ------------------------
import os
import csv
import random
from tqdm import tqdm

# --- Output location ---
SUBMISSION_PATH = "submission.csv"   # name of the submission CSV

# --- Label-space settings ---
NUM_CLASSES = 531                    # class ids run from 0 to 530
MIN_LABELS, MAX_LABELS = 1, 3        # fewest / most labels drawn for one sample

# --- Load test corpus ---
def load_corpus(path):
    """Read a tab-separated corpus file and return a {pid: text} dict.

    Each line is stripped of surrounding whitespace and cut at its first tab:
    the left part is the pid, the remainder is the text. Lines that contain
    no tab after stripping are skipped. If a pid occurs more than once, the
    text from its last occurrence is kept.
    """
    corpus = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            pid, tab, text = raw_line.strip().partition("\t")
            if not tab:
                # no separator on this line -> nothing to record
                continue
            corpus[pid] = text
    return corpus

# Read the test split and keep its pids as a list.
pid2text_test = load_corpus(TEST_CORPUS)
pid_list_test = list(pid2text_test)

# --- Draw a random label set for every test pid ---
# For each pid: pick how many labels (MIN_LABELS..MAX_LABELS), sample that many
# distinct class ids from 0..NUM_CLASSES-1, and store them in ascending order.
all_pids = list(pid_list_test)
all_labels = [
    sorted(random.sample(range(NUM_CLASSES), random.randint(MIN_LABELS, MAX_LABELS)))
    for _ in tqdm(pid_list_test, desc="Generating dummy predictions")
]

# --- Save submission file ---
# Write to SUBMISSION_PATH (not a repeated literal) so the file on disk is
# always the one reported by the message printed below.
with open(SUBMISSION_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)

    # header row
    writer.writerow(["id", "label"])

    for pid, labels in zip(all_pids, all_labels):
        # All labels of a sample go into one comma-joined field; csv.writer
        # quotes that field automatically because it contains the delimiter.
        writer.writerow([pid, ",".join(map(str, labels))])

print(f"Dummy submission file saved to: {SUBMISSION_PATH}")
print(f"Total samples: {len(all_pids)}, Classes per sample: {MIN_LABELS}-{MAX_LABELS}")

Generating dummy predictions: 100%|██████████| 19658/19658 [00:00<00:00, 238309.15it/s]

Dummy submission file saved to: submission.csv
Total samples: 19658, Classes per sample: 1-3



