In [19]:
!pip install evaluate
import torch
import numpy as np
import evaluate
from datasets import load_dataset
from datasets import Dataset
from transformers import pipeline
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM
import os
os.environ["WANDB_DISABLED"] = "true"



In [20]:
trainset = np.load('train_with_label.npy', allow_pickle=True)
testset = np.load('test_with_label.npy', allow_pickle=True)

In [21]:
trainset = [{'title': data['title'], 'label':data['label']} for data in trainset]
testset = [{'title': data['title'], 'label':data['label']} for data in testset]
train_dataset = Dataset.from_list(trainset)
test_dataset = Dataset.from_list(testset)

Change the label to binary 0 is smartphone, 1 is other.(This is because at first, we manually label data as 0 smartphone, 1 phone associate, 2 other. However, the only needs is to classify phone from other, so we chance this to two labels)

In [23]:
trainset = [{'title': data['title'], 'label':data['label']} for data in trainset]
testset = [{'title': data['title'], 'label':data['label']} for data in testset]

In [24]:
for data in trainset:
  if data['label'] == 2:
    data['label'] = 1
for data in testset:
  if data['label'] == 2:
    data['label'] = 1

In [25]:
train_dataset = Dataset.from_list(trainset)
test_dataset = Dataset.from_list(testset)
train_dataset

Dataset({
    features: ['title', 'label'],
    num_rows: 125
})

### Bert with 2 label

In [26]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_function(examples):
    return tokenizer(text=examples["title"], padding="max_length", truncation=True)
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [27]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2, device_map="cuda")
training_args = TrainingArguments(output_dir="test_trainer")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer = tokenizer
)
metric = evaluate.load("accuracy")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [28]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=48, training_loss=0.2568815549214681, metrics={'train_runtime': 52.0886, 'train_samples_per_second': 7.199, 'train_steps_per_second': 0.922, 'total_flos': 98666645760000.0, 'train_loss': 0.2568815549214681, 'epoch': 3.0})

In [29]:
predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=tokenized_test['label'])

{'accuracy': 0.92}

In [30]:
data = np.load('filtered_price_50.npy', allow_pickle=True)

In [31]:
data.shape

(7271,)

In [32]:
unlabel_data = data[150:]
generalization_set = [{'title': data['title']} for data in unlabel_data]
Generalization_set = Dataset.from_list(generalization_set)
tokenized_gen = Generalization_set.map(tokenize_function, batched = True)

Map:   0%|          | 0/7121 [00:00<?, ? examples/s]

In [33]:
model_labels = trainer.predict(tokenized_gen)
labels = np.argmax(model_labels.predictions, axis=-1)
labels.shape

(7121,)

In [34]:
for data, label in zip(unlabel_data, labels):
  data['label'] = label

In [35]:
unlabel_data[370]

{'main_category': 'Cell Phones & Accessories',
 'title': "LG G7 ThinQ G710 64GB Unlocked GSM Phone w/ Dual 16MP Camera's - New Aurora Black (Renewed)",
 'average_rating': 3.9,
 'rating_number': 37,
 'features': ['6.1in QHD+ full vision display with notch',
  'Super bright Camera',
  'Dual 16MP/16MP wide rear Camera w/ real glass lens',
  'Boom box speaker'],
 'description': ['This pre-owned or refurbished product has been professionally inspected and tested to work and look like new. How a product becomes part of Amazon Renewed, your destination for pre-owned, refurbished products: A customer buys a new product and returns it or trades it in for a newer or different model. That product is inspected and tested to work and look like new by Amazon-qualified suppliers. Then, the product is sold as an Amazon Renewed product on Amazon. If not satisfied with the purchase, renewed products are eligible for replacement or refund under the Amazon Renewed Guarantee.'],
 'price': '99.0',
 'images'

In [36]:
np.save('bert_labeled_data.npy', unlabel_data, allow_pickle=True)