In [2]:
# Relative paths of the train and test CSV files that are read with pd.read_csv.
TRAIN_DATA_PATH = "./data/dataset/train.csv"
TEST_DATA_PATH = "./data/dataset/test.csv"

In [3]:
import pandas as pd
import re

# Read the train CSV, then the test CSV, into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

def fillMissing(df):
    """Replace missing text fields with the placeholder string "missing".

    The TITLE, BULLET_POINTS and DESCRIPTION columns of ``df`` are overwritten
    in place; every other column is left untouched.

    Args:
        df: DataFrame containing the three text columns.

    Returns:
        The same ``df`` object, so the call can be used in an assignment.

    Raises:
        KeyError: if one of the three text columns is absent.
    """
    # Assign the filled column back instead of calling
    # ``df.COL.fillna(..., inplace=True)``: that form is chained assignment on
    # an intermediate Series, which pandas warns about and which no longer
    # updates ``df`` once Copy-on-Write is active.
    for column in ("TITLE", "BULLET_POINTS", "DESCRIPTION"):
        df[column] = df[column].fillna("missing")
    return df

def normalize_text(text):
    """Clean a product string before tokenization.

    Steps, applied in this order:
      1. A doubled double-quote becomes " inch"; any remaining double-quote
         is dropped.
      2. The literal substrings "/p" and "/b" are removed.
      3. Every hyphen becomes " to ".
      4. Characters other than ASCII letters, digits, '.', ':', '/', '%',
         '_' and whitespace are deleted, as is an underscore sitting
         between two digits.
      5. Remaining underscores become spaces.
      6. Leading/trailing whitespace is stripped and runs of spaces are
         collapsed to one. Only the literal space is treated as a separator,
         so tabs or newlines inside the text are kept as they are.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Literal substitutions; the order matters because the doubled quote
    # must be rewritten before single quotes are stripped.
    literal_rules = (
        ('""', ' inch'),
        ('"', ''),
        ('/p', ''),
        ('/b', ''),
        ('-', ' to '),
    )
    for needle, replacement in literal_rules:
        text = text.replace(needle, replacement)

    text = re.sub(r'[^a-zA-Z0-9.:/\s%_"]|(?<=\d)_(?=\d)', '', text)
    text = text.replace('_', ' ')

    # Splitting on a single space yields empty strings for repeated spaces;
    # dropping them collapses each run into one separator.
    words = (piece for piece in text.strip().split(' ') if piece)
    return u" ".join(words)

def lowercase_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

# Put the "missing" placeholder into empty text cells before the string
# transforms run.
train_data = fillMissing(train_data)
test_data = fillMissing(test_data)

# Lower-case, then normalize, each text column of the train and test frames.
for split_df in (train_data, test_data):
    for text_col in ("TITLE", "BULLET_POINTS", "DESCRIPTION"):
        split_df[text_col] = (
            split_df[text_col].apply(lowercase_text).apply(normalize_text)
        )

train_data

KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn

# Use the CUDA device when torch reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

# Special-token ids and the sequence-length cap used by the pipeline below.
# NOTE(review): the ids presumably match the pad / begin-of-sequence /
# end-of-sequence entries of the XLM-R vocabulary — confirm against the vocab file.
padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256
# Remote locations of the XLM-R vocabulary and SentencePiece model.
xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

# Text -> token-id pipeline, applied in this order: SentencePiece tokenization,
# vocabulary lookup, truncation to max_seq_len - 2 (leaving room for the two
# tokens added next), then bos_idx is added at the start and eos_idx at the end.
text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path),
    T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
    T.Truncate(max_seq_len - 2),
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
)


from torch.utils.data import DataLoader, Dataset

In [None]:
class AmazonDataset(Dataset):
    """Map-style dataset exposing the rows of a product DataFrame as tuples.

    Each item is ``(title, description, bullets, product_type, product_length)``
    read from the TITLE, DESCRIPTION, BULLET_POINTS, PRODUCT_TYPE_ID and
    PRODUCT_LENGTH columns of the wrapped frame.
    """

    def __init__(self, meta_df):
        """Keep a reference to ``meta_df``; the frame is not copied."""
        self.df = meta_df

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the row at integer position ``index`` as a 5-tuple.

        ``product_length`` is ``None`` when the frame has no PRODUCT_LENGTH
        column, so the same class can wrap a frame that carries no target
        values (this class is also instantiated on the test frame).

        Raises:
            IndexError: if ``index`` is outside the frame.
            KeyError: if one of the other four columns is absent.
        """
        item = self.df.iloc[index]
        title = item["TITLE"]
        description = item["DESCRIPTION"]
        bullets = item["BULLET_POINTS"]
        product_type = item["PRODUCT_TYPE_ID"]
        # .get returns the stored value when the column exists and None
        # otherwise, instead of raising KeyError on frames without the target.
        product_length = item.get("PRODUCT_LENGTH")
        return (title, description, bullets, product_type, product_length)
    
dev_dataset = AmazonDataset(train_data)
test_dataset = AmazonDataset(test_data)

# 80/20 train/validation split of the rows loaded from the training CSV.
train_set_size = int(len(dev_dataset) * 0.8)
valid_set_size = len(dev_dataset) - train_set_size

# Seed the split so the same rows land in train and validation on every
# Restart & Run All; an unseeded random_split draws a new permutation each run.
SPLIT_SEED = 42
train_dataset, validation_dataset = torch.utils.data.random_split(
    dev_dataset,
    [train_set_size, valid_set_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [7]:
# Show one training sample: (title, description, bullets, product_type, product_length).
train_dataset[0]

('dickies to herren 13 inch multi pocket work short charcoal grey',
 'missing',
 'sits at waist roomier in seat and thigh skims the kneeflat front with permanent crease 8.5 oz. heavyweight twill 65% polyester/35% cottonsignature long to tunnel belt loops hold heavy tool beltsmulti to use pocket on right legeasy to care stain release wrinkle resistant',
 2854,
 2350.0)

In [8]:
import os

# Number of samples per batch used by the data loaders.
BATCH_SIZE = 16
# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to 0 (load in the main process) so num_workers is always an int.
NUM_WORKERS = os.cpu_count() or 0

def data_collate(data, data_transform):
    """Collate raw dataset samples into a batch of title features and targets.

    Args:
        data: iterable of samples laid out like ``AmazonDataset`` items, i.e.
            ``(title, description, bullets, product_type, product_length)``.
        data_transform: callable applied to each title string to produce its
            features.

    Returns:
        ``(title_feats, lengths)``: two lists aligned with ``data`` holding the
        transformed titles and the ``product_length`` values.
    """
    title_feats = []
    lengths = []

    for sample in data:
        # Use the transform that was passed in rather than a module-level
        # global, so the function honours its own parameter.
        title_feats.append(data_transform(sample[0]))
        lengths.append(sample[4])

    # Description and bullet features are not part of the returned batch, so
    # they are not computed here.
    return title_feats, lengths

# Settings common to both loaders: batch size and the collate function that
# runs each batch through data_collate with text_transform.
shared_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    collate_fn=lambda samples: data_collate(samples, text_transform),
)

# Training loader: shuffle=True, loading done by NUM_WORKERS workers.
train_loader = torch.utils.data.DataLoader(
    train_dataset, shuffle=True, num_workers=NUM_WORKERS, **shared_loader_kwargs
)

# Validation loader: shuffle=False, default num_workers.
validation_loader = torch.utils.data.DataLoader(
    validation_dataset, shuffle=False, **shared_loader_kwargs
)

In [9]:
# Pull a single batch from the training loader.
batch = next(iter(train_loader))