In [1]:
TRAIN_DATA_PATH = "./data/dataset/train.csv"
TEST_DATA_PATH = "./data/dataset/test.csv"

In [2]:
import pandas as pd
import re

train_data = pd.read_csv(TRAIN_DATA_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)

def fillMissing(df):
    df.TITLE.fillna(value="missing", inplace=True)
    df.BULLET_POINTS.fillna(value="missing", inplace=True)
    df.DESCRIPTION.fillna(value="missing", inplace=True)
    return df

def normalize_text(text):
    text = text.replace('""', ' inch')
    text = text.replace('"', '')
    text = text.replace('/p', '')
    text = text.replace('/b', '')
    text = text.replace('-', ' to ')
    text = re.sub(r'[^a-zA-Z0-9.:/\s%_"]|(?<=\d)_(?=\d)', '', text)
    text = text.replace('_', ' ')
    splits = text.strip().split(' ')
    return u" ".join([x for x in splits if len(x) >= 1])

def lowercase_text(text):
    return text.lower()

train_data = fillMissing(train_data)
test_data = fillMissing(test_data)

# train data
train_data["TITLE"] = train_data["TITLE"].apply(lowercase_text)
train_data["BULLET_POINTS"] = train_data["BULLET_POINTS"].apply(lowercase_text)
train_data["DESCRIPTION"] = train_data["DESCRIPTION"].apply(lowercase_text)

train_data["TITLE"] = train_data["TITLE"].apply(normalize_text)
train_data["BULLET_POINTS"] = train_data["BULLET_POINTS"].apply(normalize_text)
train_data["DESCRIPTION"] = train_data["DESCRIPTION"].apply(normalize_text)

# test data
test_data["TITLE"] = test_data["TITLE"].apply(lowercase_text)
test_data["BULLET_POINTS"] = test_data["BULLET_POINTS"].apply(lowercase_text)
test_data["DESCRIPTION"] = test_data["DESCRIPTION"].apply(lowercase_text)

test_data["TITLE"] = test_data["TITLE"].apply(normalize_text)
test_data["BULLET_POINTS"] = test_data["BULLET_POINTS"].apply(normalize_text)
test_data["DESCRIPTION"] = test_data["DESCRIPTION"].apply(normalize_text)

train_data

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,artzfolio tulip flowers blackout curtain for d...,luxurious appealing: beautiful custom to made ...,missing,1650,2125.980000
1,2673191,marks spencer girls pyjama sets t862561c navy ...,harry potter hedwig pyjamas 6 to 16 yrs100% co...,missing,2755,393.700000
2,2765088,priknik horn red electric air horn compressor ...,loud dual tone trumpet horn compatible with sx...,specifications: color: red material: aluminium...,7537,748.031495
3,1594019,alishah womens cotton ankle length leggings co...,made by 95%cotton and 5% lycra which gives you...,aishah womens lycra cotton ankel leggings. bra...,2996,787.401574
4,283658,the united empire loyalists: a chronicle of th...,missing,missing,6112,598.424000
...,...,...,...,...,...,...
2249693,2422167,nike womens as w ny df swsh hn kh bra cz7610 t...,material : polyester,missing,3009,1181.100000
2249694,2766635,3pcs goose game cute cartoon enamel pins funny...,inspiration inspired by the untitled goose gam...,pbbrand: xvieonr pbr pbproduct name: fashion c...,3413,125.984252
2249695,1987786,kangroo sweep movement printed wooden wall clo...,dial size: 12 inches in diameterbig clear repr...,wall clocks are very attractive in looks and e...,1574,1200.000000
2249696,1165754,electro voice ekx to brkt15 wall mount bracket...,missing,missing,592,2900.000000


In [7]:
import torch
import torch.nn as nn

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [8]:
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256
xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path),
    T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
    T.Truncate(max_seq_len - 2),
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
)

from torch.utils.data import DataLoader, Dataset

In [9]:
class AmazonDataset(Dataset):
    
    def __init__(self, meta_df):
        
        self.df = meta_df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        item = self.df.iloc[index]
        title = item["TITLE"]
        description = item["DESCRIPTION"]
        bullets = item["BULLET_POINTS"]
        product_type = item["PRODUCT_TYPE_ID"]
        product_length = item["PRODUCT_LENGTH"]
        return (title, description, bullets, product_type, product_length)
    
dev_dataset = AmazonDataset(train_data)
test_dataset = AmazonDataset(test_data)

train_set_size = int(len(dev_dataset) * 0.8)
valid_set_size = len(dev_dataset) - train_set_size

train_dataset, validation_dataset = torch.utils.data.random_split(dev_dataset, [train_set_size, valid_set_size])

In [17]:
import os
import torchtext
from torchtext.functional import to_tensor

BATCH_SIZE = 64
NUM_WORKERS = os.cpu_count()

xlmr_base = torchtext.models.XLMR_BASE_ENCODER
xlmr_model = xlmr_base.get_model()
xlmr_transform = xlmr_base.transform()

def data_collate(data, data_transform):
    title_feats = []
    desc_feats = []
    bullet_feats = []
    lengths = []
    
    for sample in data:
        title_feats.append(sample[0]) #title
        desc_feats.append(sample[1]) # desc
        bullet_feats.append(sample[2]) #bullets
        lengths.append(sample[4])
    
    title_feats = to_tensor(data_transform(title_feats), padding_value=1)
    desc_feats = to_tensor(data_transform(desc_feats), padding_value=1)
    bullet_feats = to_tensor(data_transform(bullet_feats), padding_value=1)
    lengths = torch.Tensor(lengths)
    
    title_feats = xlmr_model(title_feats).detach()
    desc_feats = xlmr_model(desc_feats).detach()
    bullet_feats = xlmr_model(bullet_feats).detach()
            
    return title_feats, desc_feats, bullet_feats, lengths

train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda x: data_collate(x, xlmr_transform),
    num_workers= 1
)

validation_loader = torch.utils.data.DataLoader(
    dataset=validation_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=lambda x: data_collate(x, xlmr_transform)
)

In [24]:
import torch.nn.functional as F

class Regressor(torch.nn.Module):
    
    def __init__(self, dim_cell=768, dim_emb=512):
        super().__init__()
        
        # attentive pooling layers
        self.embedding = nn.Linear(dim_cell, dim_emb)
        self.linear = nn.Linear(dim_emb, 1)
        
        self.linear2 = nn.Linear(dim_emb, 256)
        self.estimateLayer = nn.Linear(256, 1)
            
    def forward(self, title, desc, bullets):
        
        encoder_out = torch.cat((title, desc, bullets), dim=1)
                
        embeds = torch.tanh(self.embedding(encoder_out))  # (batch, seg_len, dim_emb)
        attn_weights = F.softmax(self.linear(embeds), dim=1)
        embeds = torch.sum(embeds * attn_weights, dim=1)
        embedding = embeds.div(embeds.norm(p=2, dim=-1, keepdim=True)).unsqueeze(1)
        embedding = embedding.squeeze(dim=1)
        
        out = F.relu(self.linear2(embedding))
        estimate = F.relu(self.estimateLayer(out))
        
        return estimate

In [23]:
packageEstimator = Regressor(dim_cell=768, dim_emb=256)
batch = next(iter(train_loader))
out = packageEstimator(batch[0], batch[1], batch[2])

out.shape

torch.Size([16, 555, 768])


torch.Size([16, 1])