In [30]:
import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from dataset import NewsDataset
import torch

from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
train_stances = pd.read_csv(filepath_or_buffer='train_stances.csv')
train_bodies = pd.read_csv(filepath_or_buffer='train_bodies.csv')
# train_stances = pd.read_csv(filepath_or_buffer='train_stances.csv')
# train_stances = pd.read_csv(filepath_or_buffer='train_stances.csv')

In [None]:
count=0
for i in tqdm.tqdm(range(train_stances.shape[0])):
    for j in range(train_bodies.shape[0]):
        if train_bodies.loc[j,'Body ID']==train_stances.loc[i,'Body ID']:
            train_stances.loc[i,'articleBody'] = train_bodies.loc[j,'articleBody']
            train_stances.loc[i,'articleBody1'] = train_bodies.loc[j,'articleBody1']
            train_stances.loc[i,'articleBody2'] = train_bodies.loc[j,'articleBody2']
    train_stances.to_csv('data_combined.csv',index=False)

100%|██████████| 4408/4408 [07:26<00:00,  9.87it/s]


In [3]:
train_data = pd.read_csv('data_combined.csv')

In [4]:
train_data = train_data[['articleBody', 'Stance']]
train_data['label'] = train_data['Stance'].map({'agree': 0, 'disagree': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['label'] = train_data['Stance'].map({'agree': 0, 'disagree': 1})


In [5]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['articleBody'].tolist(),
    train_data['label'].tolist(),
    test_size=0.2,
    random_state=42
)

In [19]:
!nvidia-smi

Sat Sep 28 22:40:56 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0              30W /  70W |   5531MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [20]:
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=2).to('cuda')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

In [22]:
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)

In [23]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs'
)

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
trainer.train()

In [26]:
trainer.evaluate()

Step,Training Loss,Validation Loss
500,0.1126,0.027156


{'eval_loss': 0.027156051248311996}

In [None]:
model.save_pretrained('./fine-tuned-rubert')
tokenizer.save_pretrained('./fine-tuned-rubert')

In [39]:
import torch.nn.functional as F

In [57]:
# Подготовка текста
text = "Разница между ожидаемыми по итогам марта нефтегазовыми доходами и фактически поступившими в российский бюджет составляет 302 миллиарда рублей, сообщает «Интерфакс» со ссылкой на Минфин РФ. Ранее планировалось, что в марте в федеральный бюджет поступят 790 миллиардов рублей.   «Корректировка обусловлена отклонением объемов экспорта газа от прогнозного уровня, а также более низкой динамикой поступлений в рамках режима НДД и ростом возмещения акциза на нефтяное сырье, в том числе в связи с изменением структуры отдельных элементов налогообложения», — говорится в сообщении. В апреле Минфин ожидает, что в федеральный бюджет поступят 798,4 миллиарда рублей дополнительных нефтегазовых доходов."
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to('cuda')

# Предсказание
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    probabilities = F.softmax(logits, dim=1)

# Интерпретация результата
class_names = {0: 'Правдивая', 1: 'Фейковая'}
print(f'Предсказанный класс: {class_names[predicted_class]}')
# print(F.softmax(logits, dim=1))
print(probabilities)
print(f'Вероятности: Правдивая - {probabilities[0][0].item():.4f}, Фейковая - {probabilities[0][1].item():.4f}')

Предсказанный класс: Правдивая
tensor([[9.9982e-01, 1.7735e-04]], device='cuda:0')
Вероятности: Правдивая - 0.9998, Фейковая - 0.0002


In [56]:
!pip freeze > requirements.txt