In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [4]:
import pandas as pd
import numpy as np

In [5]:
train_df = pd.read_csv("train.csv")

train_df.head()

Unnamed: 0,ID,url,title,label
0,0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0
1,1,www.kp.by,Эта песня стала известна многим телезрителям б...,0
2,2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0
3,3,colorbox.spb.ru,Не Беси Меня Картинки,0
4,4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,0


In [6]:
test_df = pd.read_csv("test.csv")

test_df.head()

Unnamed: 0,ID,url,title
0,135309,www.kommersant.ru,Шестой кассационный суд в Самаре начнет работу...
1,135310,urexpert.online,"Что такое индексация алиментов, кем и в каких ..."
2,135311,imperimeha.ru,Женщинам | Империя Меха - Part 12
3,135312,national-porn.com,"Небритые, волосатые киски: Порно всех стран и ..."
4,135313,2gis.ru,67


In [10]:
train_df["label"][:10000].sum()

1209

In [11]:
texts = train_df["title"][:10000]
labels = train_df["label"][:10000]

In [12]:
texts, labels

(0       Экс-министр экономики Молдовы - главе МИДЭИ, ц...
 1       Эта песня стала известна многим телезрителям б...
 2       Банши 4 сезон 2 серия Бремя красоты смотреть о...
 3                                   Не Беси Меня Картинки
 4       В Новомосковске сыграют следж-хоккеисты алекси...
                               ...                        
 9995    Скачать песню Трофим (Сергей Трофимов) — Не Зр...
 9996    Xalq artistinə səhvən xərзəng diaqnozu qoyuldu...
 9997    Вакансия Комплектовщик в Улан-Удэ, ООО "ПРЕМИУ...
 9998    Goth Charlotte Anal Creampie - Charlotte Sartr...
 9999    ForteBank на просп. Богенбай Батыра, 28, ВП-4 ...
 Name: title, Length: 10000, dtype: object,
 0       0
 1       0
 2       0
 3       0
 4       0
        ..
 9995    0
 9996    0
 9997    0
 9998    1
 9999    0
 Name: label, Length: 10000, dtype: int64)

In [13]:
encoded_inputs = tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt')

In [14]:
input_ids = encoded_inputs['input_ids']
attention_mask =  encoded_inputs['attention_mask']
labels = torch.tensor(labels.tolist())

In [15]:
dataset = TensorDataset(input_ids, attention_mask, labels)

In [16]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [17]:
batch_size = 16
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [18]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=len(train_dataloader), gamma=0.1)

In [19]:
epochs = 1
model.train()  # Set the model to training mode
for epoch in range(epochs):
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

In [20]:
model.eval()  # Set the model to evaluation mode
predictions = []
true_labels = []

for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predictions.append(logits.detach().cpu().numpy())
    true_labels.append(batch[2].detach().cpu().numpy())

In [21]:
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)
flat_true_labels = np.concatenate(true_labels, axis=0)
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(flat_true_labels, flat_predictions, average='binary')


In [22]:
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.9710
Precision: 0.8649
Recall: 0.8727
F1 Score: 0.8688


In [23]:
max_length = 128

In [24]:
def tokenize_text(text, max_length):
    return tokenizer(text, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')


In [25]:
train_title_encoded = tokenize_text(test_df["title"].tolist(), max_length)

In [30]:
from transformers import TextClassificationPipeline

2024-04-11 10:03:45.919541: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-11 10:03:46.724721: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-04-11 10:03:46.724862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [34]:
device = torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [40]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)#, return_all_scores=True)

In [53]:
def f(s):
    return int(pipe(s)[0]["label"] == "LABEL_1")

1

In [57]:
test_df["label"] = test_df["title"].apply(lambda x: f(str(x)))

In [58]:
test_df[["ID", "label"]].to_csv("bert_try_10k.csv", index=False)

!cat bert_try_10k.csv | head

cat: ml_baseline.csv: No such file or directory


In [59]:
!cat bert_try_10k.csv | head

ID,label
135309,0
135310,0
135311,0
135312,1
135313,0
135314,0
135315,0
135316,0
135317,0
cat: write error: Broken pipe
