In [None]:
import zipfile
import os

from google.colab import drive
drive.mount('/content/drive')

zip_path = "/content/drive/My Drive/yakaboo-book-reviews-dataset.zip"
extract_path = "/content/dataset"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Архів успішно розпаковано в:", extract_path)

In [None]:
import zipfile

zip_files = ["reviews.jsonlines.zip", "books.jsonlines.zip"]

for zip_file in zip_files:
    zip_path = os.path.join(extract_path, zip_file) 
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)  
    print(f"{zip_file} розпаковано")

print("Файли після розпакування:", os.listdir(extract_path))

In [None]:
import pandas as pd
import os

reviews_path = os.path.join(extract_path, "reviews.jsonlines")

In [None]:
df_reviews = pd.read_json(reviews_path, lines=True)

print(df_reviews.head()) 

In [None]:
def get_sentiment(rating):
    if rating >= 4:
        return 0   # "POSITIVE"
    else:
        return 1  # "NEGATIVE"

In [None]:
df_reviews['sentiment'] = df_reviews['rating'].apply(get_sentiment)

print(df_reviews[['review', 'sentiment']])

In [None]:
sentiment_counts = df_reviews["sentiment"].value_counts()
print(sentiment_counts)

num_positive = (df_reviews["sentiment"] == 0).sum()
num_negative = (df_reviews["sentiment"] == 1).sum()

print(f"Позитивних: {num_positive}")
print(f"Негативних: {num_negative}")


sentiment
0    66200
1     3056
Name: count, dtype: int64
Позитивних: 66200
Негативних: 3056


In [None]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

df_positive = df_reviews[df_reviews["sentiment"] == 0]
df_negative = df_reviews[df_reviews["sentiment"] == 1]

df_positive_balanced = resample(df_positive,
                                replace=False,
                                n_samples=len(df_negative),
                                random_state=42)

df_balanced = pd.concat([df_positive_balanced, df_negative])

train_reviews, test_reviews, train_sentiment, test_sentiment = train_test_split(
    df_balanced["review"], df_balanced["sentiment"], test_size=0.2, random_state=42
)

print(f"Train size: {len(train_reviews)}, Test size: {len(test_reviews)}")
print(df_balanced["sentiment"].value_counts())

Train size: 4889, Test size: 1223
sentiment
0    3056
1    3056
Name: count, dtype: int64


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Підготовка даних
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  
            max_length=self.max_len, 
            padding='max_length',  
            truncation=True,  
            return_tensors='pt', 
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


max_len = 50
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_texts = train_reviews.reset_index(drop=True)
val_texts = test_reviews.reset_index(drop=True)
train_labels = train_sentiment.reset_index(drop=True)
val_labels = test_sentiment.reset_index(drop=True)

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, max_len)

train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=256)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

In [None]:
def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    losses = []
    correct_predictions = 0

    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        losses.append(loss.item())
        preds = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)

        loss.backward()
        optimizer.step()

    return correct_predictions.double() / len(data_loader.dataset), sum(losses) / len(data_loader)

def eval_epoch(model, data_loader, device):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            losses.append(loss.item())
            preds = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(data_loader.dataset), sum(losses) / len(data_loader)

In [None]:
for epoch in range(40):
    print(f"Epoch {epoch + 1}")

    train_accuracy, train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Train loss {train_loss} accuracy {train_accuracy}")

    val_accuracy, val_loss = eval_epoch(model, val_dataloader, device)
    print(f"Validation loss {val_loss} accuracy {val_accuracy}")

Epoch 1


100%|██████████| 20/20 [01:13<00:00,  3.68s/it]


Train loss 0.7002316117286682 accuracy 0.4913070157496421


100%|██████████| 5/5 [00:12<00:00,  2.60s/it]


Validation loss 0.6905019760131836 accuracy 0.5331152902698283
Epoch 2


100%|██████████| 20/20 [01:15<00:00,  3.75s/it]


Train loss 0.693141308426857 accuracy 0.5183064021272245


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Validation loss 0.688468074798584 accuracy 0.5347506132461162
Epoch 3


100%|██████████| 20/20 [01:15<00:00,  3.77s/it]


Train loss 0.6897245168685913 accuracy 0.539578645939865


100%|██████████| 5/5 [00:13<00:00,  2.62s/it]


Validation loss 0.6911427259445191 accuracy 0.5200327064595258
Epoch 4


100%|██████████| 20/20 [01:14<00:00,  3.74s/it]


Train loss 0.6842154294252396 accuracy 0.5698506852116998


100%|██████████| 5/5 [00:13<00:00,  2.65s/it]


Validation loss 0.6856559038162231 accuracy 0.5682747342600164
Epoch 5


100%|██████████| 20/20 [01:14<00:00,  3.75s/it]


Train loss 0.6768446743488312 accuracy 0.5604418081407241


100%|██████████| 5/5 [00:13<00:00,  2.64s/it]


Validation loss 0.6723822951316833 accuracy 0.5870809484873263
Epoch 6


100%|██████████| 20/20 [01:15<00:00,  3.77s/it]


Train loss 0.6598524749279022 accuracy 0.60687257107793


100%|██████████| 5/5 [00:13<00:00,  2.62s/it]


Validation loss 0.667347502708435 accuracy 0.6009811937857727
Epoch 7


100%|██████████| 20/20 [01:14<00:00,  3.73s/it]


Train loss 0.6411153227090836 accuracy 0.6281448148905707


100%|██████████| 5/5 [00:13<00:00,  2.67s/it]


Validation loss 0.6663802146911622 accuracy 0.6083401471790679
Epoch 8


100%|██████████| 20/20 [01:17<00:00,  3.86s/it]


Train loss 0.6147601753473282 accuracy 0.6504397627326652


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Validation loss 0.6740730762481689 accuracy 0.6099754701553557
Epoch 9


100%|██████████| 20/20 [01:15<00:00,  3.78s/it]


Train loss 0.6008154824376106 accuracy 0.6694620576805073


100%|██████████| 5/5 [00:13<00:00,  2.62s/it]


Validation loss 0.6793051838874817 accuracy 0.6099754701553557
Epoch 10


100%|██████████| 20/20 [01:15<00:00,  3.78s/it]


Train loss 0.5674305856227875 accuracy 0.7083248107997546


100%|██████████| 5/5 [00:13<00:00,  2.61s/it]


Validation loss 0.7002516269683838 accuracy 0.6181520850367948
Epoch 11


100%|██████████| 20/20 [01:15<00:00,  3.77s/it]


Train loss 0.5305296316742897 accuracy 0.7340969523419922


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Validation loss 0.7177393436431885 accuracy 0.6238757154538022
Epoch 12


100%|██████████| 20/20 [01:15<00:00,  3.78s/it]


Train loss 0.49481216371059417 accuracy 0.7698915933728779


100%|██████████| 5/5 [00:12<00:00,  2.58s/it]


Validation loss 0.7535698175430298 accuracy 0.6214227309893704
Epoch 13


100%|██████████| 20/20 [01:14<00:00,  3.75s/it]


Train loss 0.4559673607349396 accuracy 0.7823685825322152


100%|██████████| 5/5 [00:13<00:00,  2.65s/it]


Validation loss 0.7552526593208313 accuracy 0.634505314799673
Epoch 14


100%|██████████| 20/20 [01:15<00:00,  3.79s/it]


Train loss 0.4204413786530495 accuracy 0.8114133769687053


100%|██████████| 5/5 [00:13<00:00,  2.64s/it]


Validation loss 0.8386084556579589 accuracy 0.6116107931316436
Epoch 15


100%|██████████| 20/20 [01:14<00:00,  3.74s/it]


Train loss 0.3885962441563606 accuracy 0.8304356719165474


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Validation loss 0.8605525016784668 accuracy 0.6132461161079313
Epoch 16


100%|██████████| 20/20 [01:15<00:00,  3.79s/it]


Train loss 0.34331168085336683 accuracy 0.8570259766823481


100%|██████████| 5/5 [00:13<00:00,  2.61s/it]


Validation loss 0.9061566352844238 accuracy 0.6206050695012265
Epoch 17


100%|██████████| 20/20 [01:14<00:00,  3.70s/it]


Train loss 0.309446557611227 accuracy 0.8707302106770302


100%|██████████| 5/5 [00:13<00:00,  2.62s/it]


Validation loss 0.966775918006897 accuracy 0.6132461161079313
Epoch 18


100%|██████████| 20/20 [01:15<00:00,  3.76s/it]


Train loss 0.2685634844005108 accuracy 0.892616076907343


100%|██████████| 5/5 [00:13<00:00,  2.67s/it]


Validation loss 0.9984898209571839 accuracy 0.6336876533115291
Epoch 19


100%|██████████| 20/20 [01:14<00:00,  3.72s/it]


Train loss 0.24524682462215425 accuracy 0.9069339333196973


100%|██████████| 5/5 [00:13<00:00,  2.62s/it]


Validation loss 1.0845348834991455 accuracy 0.6287816843826656
Epoch 20


100%|██████████| 20/20 [01:15<00:00,  3.75s/it]


Train loss 0.2409287102520466 accuracy 0.9036612804254449


100%|██████████| 5/5 [00:13<00:00,  2.62s/it]


Validation loss 1.1257475376129151 accuracy 0.6140637775960752
Epoch 21


100%|██████████| 20/20 [01:16<00:00,  3.83s/it]


Train loss 0.2068902775645256 accuracy 0.921865412149724


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Validation loss 1.1663759708404542 accuracy 0.6189697465249387
Epoch 22


100%|██████████| 20/20 [01:13<00:00,  3.69s/it]


Train loss 0.19578770846128463 accuracy 0.9300470443853549


100%|██████████| 5/5 [00:13<00:00,  2.60s/it]


Validation loss 1.1829889059066772 accuracy 0.6287816843826656
Epoch 23


100%|██████████| 20/20 [01:14<00:00,  3.74s/it]


Train loss 0.17623313404619695 accuracy 0.9398650030681122


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Validation loss 1.2588499069213868 accuracy 0.6116107931316436
Epoch 24


100%|██████████| 20/20 [01:15<00:00,  3.78s/it]


Train loss 0.18169318363070489 accuracy 0.9314788300265904


100%|██████████| 5/5 [00:13<00:00,  2.79s/it]


Validation loss 1.2294366598129272 accuracy 0.6009811937857727
Epoch 25


100%|██████████| 20/20 [01:18<00:00,  3.93s/it]


Train loss 0.14085316024720668 accuracy 0.949069339333197


100%|██████████| 5/5 [00:13<00:00,  2.67s/it]


Validation loss 1.3472553014755249 accuracy 0.598528209321341
Epoch 26


100%|██████████| 20/20 [01:16<00:00,  3.80s/it]


Train loss 0.14817148875445127 accuracy 0.9406831662916753


100%|██████████| 5/5 [00:13<00:00,  2.66s/it]


Validation loss 1.4465530395507813 accuracy 0.598528209321341
Epoch 27


100%|██████████| 20/20 [01:16<00:00,  3.80s/it]


Train loss 0.09840396977961063 accuracy 0.967273471057476


100%|██████████| 5/5 [00:13<00:00,  2.72s/it]


Validation loss 1.457781767845154 accuracy 0.6034341782502044
Epoch 28


100%|██████████| 20/20 [01:16<00:00,  3.81s/it]


Train loss 0.0925604447722435 accuracy 0.963591736551442


100%|██████████| 5/5 [00:13<00:00,  2.67s/it]


Validation loss 1.589827060699463 accuracy 0.5854456255110384
Epoch 29


100%|██████████| 20/20 [01:16<00:00,  3.80s/it]


Train loss 0.09320282004773617 accuracy 0.9642053589691144


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Validation loss 1.5461992263793944 accuracy 0.6107931316434996
Epoch 30


100%|██████████| 20/20 [01:14<00:00,  3.71s/it]


Train loss 0.08231473341584206 accuracy 0.9719779095929638


100%|██████████| 5/5 [00:13<00:00,  2.66s/it]


Validation loss 1.5671987056732177 accuracy 0.6009811937857727
Epoch 31


100%|██████████| 20/20 [01:17<00:00,  3.87s/it]


Train loss 0.08136124350130558 accuracy 0.9697279607281654


100%|██████████| 5/5 [00:13<00:00,  2.70s/it]


Validation loss 1.6221316576004028 accuracy 0.6058871627146362
Epoch 32


100%|██████████| 20/20 [01:16<00:00,  3.84s/it]


Train loss 0.061911534238606694 accuracy 0.9744323992636531


100%|██████████| 5/5 [00:13<00:00,  2.66s/it]


Validation loss 1.689212441444397 accuracy 0.6026165167620605
Epoch 33


100%|██████████| 20/20 [01:14<00:00,  3.74s/it]


Train loss 0.07142301527783275 accuracy 0.9730006136224177


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Validation loss 1.7139668941497803 accuracy 0.6034341782502044
Epoch 34


100%|██████████| 20/20 [01:16<00:00,  3.83s/it]


Train loss 0.0512307851575315 accuracy 0.9822049498875026


100%|██████████| 5/5 [00:13<00:00,  2.64s/it]


Validation loss 1.773531150817871 accuracy 0.6026165167620605
Epoch 35


100%|██████████| 20/20 [01:14<00:00,  3.75s/it]


Train loss 0.06389850648120046 accuracy 0.9779095929637963


100%|██████████| 5/5 [00:13<00:00,  2.62s/it]


Validation loss 1.7890857458114624 accuracy 0.6116107931316436
Epoch 36


100%|██████████| 20/20 [01:16<00:00,  3.80s/it]


Train loss 0.0457100591622293 accuracy 0.9842503579464104


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Validation loss 1.8575234889984131 accuracy 0.6026165167620605
Epoch 37


100%|██████████| 20/20 [01:16<00:00,  3.84s/it]


Train loss 0.04738073125481605 accuracy 0.9840458171405195


100%|██████████| 5/5 [00:13<00:00,  2.64s/it]


Validation loss 1.9208091497421265 accuracy 0.6107931316434996
Epoch 38


100%|██████████| 20/20 [01:14<00:00,  3.75s/it]


Train loss 0.056709591671824454 accuracy 0.9793413786050318


100%|██████████| 5/5 [00:13<00:00,  2.64s/it]


Validation loss 1.8596239805221557 accuracy 0.598528209321341
Epoch 39


100%|██████████| 20/20 [01:16<00:00,  3.80s/it]


Train loss 0.04218751190928742 accuracy 0.9852730619758642


100%|██████████| 5/5 [00:13<00:00,  2.67s/it]


Validation loss 1.8515033006668091 accuracy 0.6197874080130826
Epoch 40


100%|██████████| 20/20 [01:17<00:00,  3.88s/it]


Train loss 0.0468123019207269 accuracy 0.9801595418285949


100%|██████████| 5/5 [00:13<00:00,  2.67s/it]

Validation loss 1.8995713472366333 accuracy 0.6238757154538022





In [None]:
from google.colab import drive
drive.mount('/content/drive')

model_save_path = '/content/drive/MyDrive/bert_model_v2/bert_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

tokenizer_save_path = '/content/drive/MyDrive/bert_model_v2/bert_tokenizer'
tokenizer.save_pretrained(tokenizer_save_path)
print(f"Tokenizer saved to {tokenizer_save_path}")