In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu


# Pretrained checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = 'bert-base-uncased'
# One output class per star rating (review scores are assumed to range from 1 to 5).
NUM_LABELS = 5

# Load the tokenizer, then the classification model. The classification head is
# freshly initialised on top of the pretrained encoder, and the model is moved
# to the GPU.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
model = model.cuda()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
def parse_reviews(lines):
    """Parse blank-line-separated ``key: value`` records into a list of dicts.

    Each non-blank line is split on the first ``': '`` into a key and a value
    and stored in the current record. A blank line closes the current record;
    consecutive blank lines do not produce empty records. A trailing record
    that is not followed by a blank line is still returned.

    A field with an empty value (``"key: "``) is kept as ``{key: ''}``.
    Because surrounding whitespace is stripped first, such a line arrives as
    ``"key:"`` and would otherwise be dropped for lacking ``': '``.

    Args:
        lines: Iterable of strings (e.g. an open text file or a list of lines).

    Returns:
        A list of dicts, one per record, mapping field names to string values.
        Lines that are neither ``key: value`` nor ``key:`` are ignored.
    """
    reviews = []
    review = {}

    for line in lines:
        line = line.strip()
        if not line:
            # Blank line: close the record in progress, if there is one.
            if review:
                reviews.append(review)
                review = {}
            continue

        key, sep, value = line.partition(': ')
        if not sep:
            # strip() removed the space after the colon of an empty-valued
            # field, so accept a bare trailing colon as "key with empty value".
            if len(line) > 1 and line.endswith(':'):
                key, value = line[:-1], ''
            else:
                # Not a field line at all; skip it as before.
                continue
        review[key] = value

    # Flush the final record when the input does not end with a blank line.
    if review:
        reviews.append(review)

    return reviews

# Path of the raw review file (under the Kaggle input directory).
file_path = '/kaggle/input/amazon/Beauty.txt'

# Read every line into memory; the handle is closed when the block exits.
with open(file_path, 'r') as file:
    lines = list(file)

# Group the lines into one dict per review, then tabulate them (one row per review).
parsed_data = parse_reviews(lines)
df = pd.DataFrame(parsed_data)
print(df)

       product/productId                                      product/title  \
0             B00064C0IU  Oscar Eau de Toilette for Women by Oscar de La...   
1             B00064C0IU  Oscar Eau de Toilette for Women by Oscar de La...   
2             B00064C0IU  Oscar Eau de Toilette for Women by Oscar de La...   
3             B00064C0IU  Oscar Eau de Toilette for Women by Oscar de La...   
4             B000K5JBZU  Optimum Care Anti-Breakage Therapy Moisture Re...   
...                  ...                                                ...   
252051        B000FKGRSO  Artec Kiwi Coloreflector Shine Wax, 2-Ounce Ja...   
252052        B00025X06E                 Goldleaf Perfumed Body Cream 230ml   
252053        B00025X06E                 Goldleaf Perfumed Body Cream 230ml   
252054        B000BR64OS                   Guerlain Vetiver Eau de Toilette   
252055        B000BR64OS                   Guerlain Vetiver Eau de Toilette   

       product/price   review/userId            rev

In [3]:
# Parsed values are strings; cast through float before int
# (presumably the scores look like "5.0" -- TODO confirm).
df['review/score'] = (
    df['review/score']
    .astype(float)
    .astype(int)
)

# Reproducible 80/20 split: sample the training rows, the remainder is the test set.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(index=train_df.index)

# Review bodies as a plain list, and scores shifted by one so that labels start at 0.
train_texts = list(train_df['review/text'])
train_labels = train_df['review/score'] - 1

# Tokenize the whole training set at once: pad to the longest sequence and
# truncate anything beyond 512 tokens.
train_inputs = tokenizer(
    train_texts,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512,
)


In [4]:
# --- Training tensors -------------------------------------------------------
# The full tokenized training set is placed on the GPU up front.
train_input_ids = train_inputs['input_ids'].cuda()
train_attention_masks = train_inputs['attention_mask'].cuda()
# Labels are rebuilt from train_df rather than from the previous value of
# train_labels, so re-running this cell works. The old code replaced the
# Series with a tensor and then failed on `.values` the second time.
train_labels = torch.tensor(train_df['review/score'].astype(int).to_numpy() - 1).cuda()

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# --- Test tensors -----------------------------------------------------------
test_texts = test_df['review/text'].tolist()

# Same tokenizer settings as for the training set.
test_inputs = tokenizer(test_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)

test_input_ids = test_inputs['input_ids'].cuda()
test_attention_masks = test_inputs['attention_mask'].cuda()
# Scores shifted by one so that labels start at 0, built directly from test_df.
test_labels = torch.tensor(test_df['review/score'].astype(int).to_numpy() - 1).cuda()

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
# Evaluation order is kept fixed (no shuffling).
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# --- Optimizer --------------------------------------------------------------
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values transformers.AdamW used by
# default, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [None]:
import torch.profiler

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Profile the first training steps and write TensorBoard traces to ./log.
# Schedule per cycle: skip 1 step, warm up for 1, record 3; run 2 cycles.
# Later steps run without recording.
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    for epoch in range(1):
        for batch in train_loader:
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            # Clear gradients left over from the previous step; without this
            # they accumulate across batches.
            optimizer.zero_grad()

            # Passing labels makes the model compute the classification loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            # Apply the update. The original loop never stepped the optimizer,
            # so the weights were never trained.
            optimizer.step()

            # Tell the profiler that one training step has finished.
            prof.step()


STAGE:2024-05-27 07:56:08 34:34 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-05-27 07:56:10 34:34 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-05-27 07:56:10 34:34 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
STAGE:2024-05-27 07:56:13 34:34 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-05-27 07:56:14 34:34 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-05-27 07:56:14 34:34 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [None]:
# Switch to inference mode (disables dropout) and collect predictions over the test set.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        # Move the batch to wherever the model lives, mirroring the training loop,
        # so evaluation does not depend on the dataset having been placed on the GPU.
        input_ids, attention_mask, labels = [t.to(model.device) for t in batch]

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Predicted class = index of the largest logit for each example.
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')

# Calculate BLEU score
# FIXME(review): this number is not a meaningful metric. `predictions` holds
# class ids, and they are used here as row indices into test_texts. Each
# reference review is therefore compared with one of the first few test
# reviews, not with generated text. The computation is kept as-is so the
# output does not change; it should be replaced or removed once the intended
# metric is confirmed.
bleu_scores = []
for ref, hyp in zip(test_texts, [test_texts[i] for i in predictions]):
    bleu_scores.append(sentence_bleu([ref.split()], hyp.split()))
# Guard against an empty test set to avoid a ZeroDivisionError.
bleu_score_avg = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

print(f'BLEU Score: {bleu_score_avg}')