In [None]:
!pip install clean-text
!pip install transformers
!pip install torchmetrics

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader

from utils.datasets import TextDataset, collate_fn
from utils.models import BertClassifier
from utils.utils import clean_func, train_model, test_model

# Seed every random number generator seeded by the original cell (torch CPU,
# torch CUDA, numpy) from a single constant.
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)

# Root folders for the cleaned CSV splits and for the model checkpoint.
DATA_PATH = './data'
MODEL_PATH = './models'

# Create both folders up front: pandas' `to_csv` raises when the parent directory
# is missing, so on a fresh checkout the CSV export would otherwise fail.
os.makedirs(DATA_PATH, exist_ok=True)
os.makedirs(MODEL_PATH, exist_ok=True)

# Registers `progress_apply` on pandas objects (used by the preprocessing cell).
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm
Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
[nltk_data] Downloading package wordnet to /home/alex-utk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Препроцессинг

In [2]:
# Load the "rotten_tomatoes" dataset and convert each of its three splits
# into a pandas DataFrame.
dataset = load_dataset("rotten_tomatoes")
train_data, test_data, val_data = (
    dataset[split_name].to_pandas()
    for split_name in ('train', 'test', 'validation')
)

# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [4]:
# Pair every split with the CSV file it is exported to.
frames_by_file = {
    'train.csv': train_data,
    'test.csv': test_data,
    'val.csv': val_data,
}

# Clean the `text` column of each split in place; `progress_apply` shows one
# progress bar per split.
for frame in frames_by_file.values():
    frame['text'] = frame['text'].progress_apply(clean_func)

# Write the cleaned splits to DATA_PATH (without the index column) so the
# training section can start from the CSV files.
for file_name, frame in frames_by_file.items():
    frame.to_csv(os.path.join(DATA_PATH, file_name), index=False)

100%|██████████| 8530/8530 [24:58<00:00,  5.69it/s]
100%|██████████| 1066/1066 [03:14<00:00,  5.48it/s]
100%|██████████| 1066/1066 [03:12<00:00,  5.55it/s]


### Обучение

In [2]:
# Reload the cleaned splits that the preprocessing section wrote to DATA_PATH.
train, test, val = (
    pd.read_csv(os.path.join(DATA_PATH, file_name))
    for file_name in ('train.csv', 'test.csv', 'val.csv')
)

# Wrap each DataFrame in the project's TextDataset.
train_dataset = TextDataset(train)
val_dataset = TextDataset(val)
test_dataset = TextDataset(test)

# Both loaders shuffle, drop the last incomplete batch and use the project's
# collate_fn; only the batch size differs.
loader_options = dict(shuffle=True, drop_last=True, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, batch_size=2843, **loader_options)
val_loader = DataLoader(val_dataset, batch_size=1066, **loader_options)

Сначала попробуем нашу модель полностью необученной

In [3]:
# Baseline: score a freshly constructed classifier on the test split before any
# training has been run in this notebook.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = BertClassifier()
model.eval()
model.to(device)

# One sample per batch, in the original order, keeping every sample.
test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
)

precision, recall, f1 = test_model(model, test_loader)

# A single print call that emits the same four lines as separate prints would.
print(
    'Необученная модель',
    f'Precision - {precision}',
    f'Recall - {recall}',
    f'F1 - {f1}',
    sep='\n',
)

Необученная модель
Precision - 0.4749498963356018
Recall - 0.4446529150009155
F1 - 0.4593023359775543


Теперь то же самое, но с обучением

In [None]:
# Training cell. Original author's note: this was trained on Kaggle.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Start from a freshly constructed classifier on the selected device.
model = BertClassifier()
model.to(device)

# Binary cross-entropy loss, Adam over all model parameters, and a scheduler that
# lowers the learning rate when the monitored metric stops improving for 4 epochs.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=4)

n_epochs = 10000
# File path handed to train_model; the evaluation cell loads the model from it.
checkpoint_path = os.path.join(MODEL_PATH, 'BertClassifier.pt')

# NOTE(review): arguments are passed positionally in the original order
# (criterion, scheduler, optimizer) — confirm against train_model's signature.
train_model(model, train_loader, val_loader, criterion, scheduler, optimizer,
            n_epochs, checkpoint_path)

In [4]:
# Metric evaluation of the trained checkpoint.
# The device and a fresh model are created here so the cell does not depend on
# whichever `model` object an earlier cell happened to leave in the kernel
# (the training cell may be skipped when a checkpoint already exists).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertClassifier()

# `map_location` remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU machine also loads on a CPU-only one.
checkpoint_path = os.path.join(MODEL_PATH, 'BertClassifier.pt')
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()
model.to(device)

# One sample per batch, in the original order, keeping every sample.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False, collate_fn=collate_fn)

precision, recall, f1 = test_model(model, test_loader)
print('Обученная модель')
print(f'Precision - {precision}')
print(f'Recall - {recall}')
print(f'F1 - {f1}')

Обученная модель
Precision - 0.756302535533905
Recall - 0.6754221320152283
F1 - 0.7135778069496155
