## Preprocessing Amazon Fashion Dataset

In [1]:
import pandas as pd
import gzip
import json

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  df = {}
  for i, d in enumerate(parse(path)):
    df[i] = d
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('../../../data/raw/AMAZON_FASHION.json.gz')

In [2]:
# Remove duplicate reviews
# Any review with the same reviewText, reviewerID, overall ranking is considered a duplicate
df = df.drop_duplicates(subset=['reviewText', 'reviewerID', 'overall'])

# Drop reviews with no reviewText since we are primarily interested in analyzing review text
df = df.dropna(subset=['reviewText'])

df['overallInt'] = df['overall'].astype(int)
df['reviewFull'] = df['reviewText'] + ' ' + df['summary']
df['reviewFull'] = df['reviewFull'].astype(str)
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image,overallInt,reviewFull
0,5.0,True,"10 20, 2014",A1D4G1SNUZWQOT,7106116521,Tracy,Exactly what I needed.,perfect replacements!!,1413763200,,,,5,Exactly what I needed. perfect replacements!!
1,2.0,True,"09 28, 2014",A3DDWDH9PX2YX2,7106116521,Sonja Lau,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",1411862400,3.0,,,2,"I agree with the other review, the opening is ..."
2,4.0,False,"08 25, 2014",A2MWC41EW7XL15,7106116521,Kathleen,Love these... I am going to order another pack...,My New 'Friends' !!,1408924800,,,,4,Love these... I am going to order another pack...
3,2.0,True,"08 24, 2014",A2UH2QQ275NV45,7106116521,Jodi Stoner,too tiny an opening,Two Stars,1408838400,,,,2,too tiny an opening Two Stars
4,3.0,False,"07 27, 2014",A89F3LQADZBS5,7106116521,Alexander D.,Okay,Three Stars,1406419200,,,,3,Okay Three Stars


In [3]:
# Keep relevant columns
df = df[['reviewText', 'summary', 'overall', 'overallInt', 'reviewFull']]
df.head()

Unnamed: 0,reviewText,summary,overall,overallInt,reviewFull
0,Exactly what I needed.,perfect replacements!!,5.0,5,Exactly what I needed. perfect replacements!!
1,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",2.0,2,"I agree with the other review, the opening is ..."
2,Love these... I am going to order another pack...,My New 'Friends' !!,4.0,4,Love these... I am going to order another pack...
3,too tiny an opening,Two Stars,2.0,2,too tiny an opening Two Stars
4,Okay,Three Stars,3.0,3,Okay Three Stars


In [4]:
# Save preprocessed data
df.to_csv('../../../data/processed/bert_amazon_fashion.csv', index=False)

## Bert for Amazon Fashion Dataset star rating prediction

In [5]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification, AdamW

# Load preprocessed data
df = pd.read_csv('../../../data/processed/bert_amazon_fashion.csv')

print(set(df['overallInt']))
df['overallInt'] = df['overallInt'].apply(lambda x: x - 1)
print(set(df['overallInt']))

# Split data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


{1, 2, 3, 4, 5}
{0, 1, 2, 3, 4}


In [6]:
# Define custom dataset
class AmazonFashionDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        review = row['reviewFull']
        rating = row['overallInt']

        review = str(review)
        rating = int(rating)

        inputs = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        inputs['input_ids'] = inputs['input_ids'].squeeze(0)
        inputs['attention_mask'] = inputs['attention_mask'].squeeze(0)
        inputs['labels'] = torch.tensor(rating)
        return inputs


In [7]:

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = BertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

# Create dataloaders
train_dataset = AmazonFashionDataset(train_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

test_dataset = AmazonFashionDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForSequenceClassification: ['distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.1.attention.v_lin.bias', 'distilbert.transformer.layer.5.ffn.lin2.weight', 'distilbert.transformer.layer.4.attention.v_lin.bias', 'distilbert.transformer.layer.5.ffn.lin2.bias', 'distilbert.transformer.layer.5.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.4.ffn.lin1.bias', 'distilbert.transformer.layer.1.ffn.lin2.bias', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.output_layer_norm.weight', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.

In [8]:
# Sample batch
for batch in tqdm(train_loader):
    print(batch['input_ids'].shape)
    print(batch['input_ids'][0].shape)
    print(batch['input_ids'][1].shape)
    print(batch['attention_mask'].shape)
    print(batch['labels'].shape)
    print(batch['labels'])
    break

  0%|          | 0/170348 [00:00<?, ?it/s]

torch.Size([4, 512])
torch.Size([512])
torch.Size([512])
torch.Size([4, 512])
torch.Size([4])
tensor([4, 1, 4, 0])





In [9]:
train_dataset[0]['input_ids'].shape

torch.Size([512])

In [10]:
# Train model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [11]:
def evaluate(model, test):
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for batch in test:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1) + 1

            total += labels.size(0)
            correct += (predictions == labels).sum().item()

    return correct / total

In [None]:
accumulation_steps = 4
optimizer.zero_grad()

for epoch in range(3):
    model.train()
    for i, batch in enumerate(tqdm(train_loader)):
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        input_ids = batch['input_ids'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss = loss / accumulation_steps  # Normalize our loss (if averaged)
        loss.backward()

        if (i+1) % accumulation_steps == 0:  # Wait for several backward steps
            optimizer.step()  # Now we can do an optimizer step
            optimizer.zero_grad()

    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')
    print("Test Set: Accuracy", str(evaluate(model, test_loader)))
    model.save_pretrained('../../../models/bert_amazon_fashion_epoch' + str(epoch))

 51%|█████     | 86580/170348 [4:26:16<4:30:57,  5.15it/s] 

In [None]:
# Save model
model.save_pretrained('../../../models/bert_amazon_fashion')
tokenizer.save_pretrained('../../../models/bert_amazon_fashion_tokenizer')

In [None]:
print("Accuracy", str(evaluate(model, test_loader)))

In [None]:
def predict_rating(review, model, tokenizer):
    inputs = tokenizer(review, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    prediction = torch.argmax(outputs.logits, dim=1) + 1
    return prediction.item()


## Conclusion
