In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

data = pd.read_csv('../data/training.1600000.processed.noemoticon.csv', encoding='latin-1')

data = data.sample(frac=1, random_state=42).reset_index(drop=True)

data = data.sample(20000, random_state=42)

data.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Korisnik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Korisnik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Korisnik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


predprocesiranje teksta

In [4]:
def preprocess_text(text):

    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    tokens = nltk.word_tokenize(text)
    
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.lower() not in stop_words]
    
    return ' '.join(tokens)

In [5]:
data['text'] = data['text'].apply(preprocess_text)

data['target'] = data['target'].replace(4, 1)

train_texts, test_texts, train_labels, test_labels = train_test_split(data['text'], data['target'], test_size=0.2, random_state=42)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512)

In [6]:
unique_targets_full = data['target'].unique()
print(f'Jedinstvene oznake u celom skupu podataka: {unique_targets_full}')


Jedinstvene oznake u celom skupu podataka: [0 1]


In [12]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [13]:
train_dataset = SentimentDataset(train_encodings, train_labels.tolist())
test_dataset = SentimentDataset(test_encodings, test_labels.tolist())


In [14]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
print(torch.cuda.is_available())  

False


In [17]:
last_checkpoint = './results/checkpoint-4500'

trainer.train(resume_from_checkpoint=last_checkpoint)

  0%|          | 0/6000 [00:00<?, ?it/s]

{'loss': 0.3286, 'grad_norm': 15.167470932006836, 'learning_rate': 1.3545454545454547e-05, 'epoch': 2.25}
{'loss': 0.3741, 'grad_norm': 0.8420503735542297, 'learning_rate': 1.3454545454545457e-05, 'epoch': 2.26}
{'loss': 0.1901, 'grad_norm': 29.57876968383789, 'learning_rate': 1.3363636363636364e-05, 'epoch': 2.27}
{'loss': 0.3552, 'grad_norm': 6.342377185821533, 'learning_rate': 1.3272727272727273e-05, 'epoch': 2.27}
{'loss': 0.2001, 'grad_norm': 0.4319951832294464, 'learning_rate': 1.318181818181818e-05, 'epoch': 2.27}
{'loss': 0.2636, 'grad_norm': 0.41427087783813477, 'learning_rate': 1.3090909090909093e-05, 'epoch': 2.28}
{'loss': 0.2241, 'grad_norm': 4.239613056182861, 'learning_rate': 1.3000000000000001e-05, 'epoch': 2.29}
{'loss': 0.358, 'grad_norm': 0.40873804688453674, 'learning_rate': 1.290909090909091e-05, 'epoch': 2.29}
{'loss': 0.2144, 'grad_norm': 0.32359832525253296, 'learning_rate': 1.2818181818181818e-05, 'epoch': 2.29}
{'loss': 0.4775, 'grad_norm': 31.846782684326172,

TrainOutput(global_step=6000, training_loss=0.0693953210785985, metrics={'train_runtime': 24184.1941, 'train_samples_per_second': 1.985, 'train_steps_per_second': 0.248, 'total_flos': 2614689588672000.0, 'train_loss': 0.0693953210785985, 'epoch': 3.0})

In [19]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [20]:
results = trainer.evaluate()
print(results)

  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.9529119729995728, 'eval_accuracy': 0.75475, 'eval_precision': 0.7558748992722419, 'eval_recall': 0.75475, 'eval_f1': 0.7543155648032276, 'eval_runtime': 4186.5257, 'eval_samples_per_second': 0.955, 'eval_steps_per_second': 0.119}


In [21]:
# Sačuvajte model
model.save_pretrained('./saved_model')
# Sačuvajte tokenizer
tokenizer.save_pretrained('./saved_model')

('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json')