In [1]:
import numpy as np 
import pandas as pd 
from copy import deepcopy

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, Normalizer

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
df = pd.read_csv("Downloads/amz_us_price_prediction_dataset.csv")

In [3]:
df.shape

(1735414, 9)

In [4]:
df["title"] = df["title"] + ". " + df["category"]

In [5]:
df = df[["title", "stars", "isBestSeller", "boughtInLastMonth", "price"]].copy()

In [6]:
categories = "isBestSeller"
le = LabelEncoder()
df[categories] = le.fit_transform(df[categories])

In [7]:
numericals = ["stars", "boughtInLastMonth", "price"]
norm = Normalizer()
df[numericals] = norm.fit_transform(df[numericals])

In [8]:
df.dropna(inplace=True)

In [9]:
tokenizer = get_tokenizer("basic_english")

def extract_tokens(x):
    for txt, _, _, _, _ in x:
        yield tokenizer(txt)
        
        
vocab = build_vocab_from_iterator(extract_tokens(df.values), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [10]:
text_pipeline = lambda x: vocab(tokenizer(x))

In [11]:
def collate(batch):
    text, numericals, target, offsets = [], [], [], [0]
    for txt, stars, seller, bought, price in batch:
        processed_text = torch.tensor(text_pipeline(txt))
        text.append(processed_text)
        numericals.append([stars, seller, bought])
        target.append(price)
        offsets.append(processed_text.size(0))
        
    text = torch.cat(text)
    numericals = torch.tensor(numericals)
    target = torch.tensor(target)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    
    return text, numericals, target, offsets

In [12]:
training_phase, testing = train_test_split(df.values, random_state=42, test_size=0.2)
train, val = train_test_split(training_phase, random_state=42, test_size=0.3)

In [13]:
BATCH = 128
LR = 0.1
EPOCHS = 5

In [14]:
train_dl = DataLoader(train, batch_size=BATCH, shuffle=True, collate_fn=collate)
val_dl = DataLoader(val, batch_size=BATCH, shuffle=True, collate_fn=collate)

In [15]:
class Price(nn.Module):
    def __init__(self, vocab_size, embed_size, feat_size):
        super(Price, self).__init__()
        self.embed = nn.EmbeddingBag(vocab_size, embed_size)
        self.layer = nn.Sequential(nn.Linear(embed_size, 512),
                                  nn.ReLU(),
                                  nn.BatchNorm1d(512),
                                  nn.Linear(512, 256),
                                  nn.ReLU(),
                                  nn.BatchNorm1d(256),
                                  nn.Linear(256, feat_size),
                                  nn.ReLU(),
                                  nn.BatchNorm1d(feat_size))
        
        self.fc = nn.Linear(feat_size*2, 1)
        
    def forward(self, x, nums, off):
        x = self.embed(x, off)
        x = self.layer(x)
        x = torch.cat((x, nums), dim=1)
        x = self.fc(x)
        return nn.functional.relu(x)

In [16]:
vocab_size = len(vocab)
embed_size = 256
feats = len(numericals)

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [18]:
model = Price(vocab_size, embed_size, feats)
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
criterion = nn.MSELoss()

In [None]:
best_model = deepcopy(model)
best_loss = 10e9
train_history = []
val_history  = []

for i in range(1, EPOCHS+1):
    model.train()
    train_loss = 0.0
    train_total = 0
    for txt, nums, target, off in train_dl:
        optimizer.zero_grad()
        if torch.cuda.is_available():
            txt, nums, target, off = txt.cuda(), nums.cuda(), target.cuda(), off.cuda()
        
        out = model(txt, nums, off)
        loss = criterion(out, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_total += out.size(0)
    train_loss = train_loss/train_total
    train_history += [train_loss]
    
    model.eval()
    val_loss = 0.0
    val_total = 0
    with torch.no_grad():
        for txt, nums, target, off in val_dl:
            if torch.cuda.is_available():
                txt, nums, target, off = txt.cuda(), nums.cuda(), target.cuda(), off.cuda()

            out = model(txt, nums, off)
            loss = criterion(out, target)
            val_loss += loss.item()
            val_total += out.size(0)
            
    val_loss = val_loss/val_total
    val_history += [val_loss]
    if val_loss < best_loss:
        best_model = deepcopy(model)
        best_loss = val_loss
        
    print("Epoch {} train loss {} val loss {}".format(i, train_loss, val_loss))

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1 train loss 0.0013114311904987872 val loss 0.0013117852678397387
Epoch 2 train loss 0.0013050373146411794 val loss 0.0013086697098350746
Epoch 3 train loss 0.0013050795582389473 val loss 0.001309809182865362


In [None]:
epochs = list(range(1, EPOCHS+1))
plt.plot(epochs, train_history)
plt.plot(epochs, val_history)
plt.legend(["training", "validation"])
plt.title("Training performance")
plt.show()

In [None]:
def predict(x):
    txt, stars, seller, bought, price = x
    processed = torch.tensor(text_pipeline(txt))
    numericals = [[stars, seller, bought]]
    numericals = torch.tensor(numericals)
    off = torch.tensor([0])
    model.eval()
    with torch.no_grad():
        if torch.cuda.is_available():
            processed, numericals, off = processed.cuda(), numericals.cuda(), off.cuda()
        out = model(processed, numericals, off)
            
    return out.item()

In [None]:
predicted = []
real = []
for i in range(len(testing)):
    predicted += [predict(testing[i])]
    real += [testing[i, -1]]

In [None]:
mse = mean_squared_error(real, predicted)
mae = mean_absolute_error(real, predicted)
r2 = r2_score(real, predicted)

print("MSE: {}\nMAE: {}\nr2 score: {}".format(mse, mae, r2))