## STEP 1: Preprocessing text

In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!unzip 'drive/MyDrive/sentiment_dataset.zip'

Archive:  drive/MyDrive/sentiment_dataset.zip
  inflating: issuers.xlsx            
  inflating: mentions texts.pickle   
  inflating: mentions.csv            
  inflating: sentiment.csv           
  inflating: sentiment_texts.pickle  


In [89]:
# Список синонимов для каждой компании
synonyms_db = pd.read_csv('drive/MyDrive/new_names_and_synonyms.csv')

In [90]:
stripped_series = synonyms_db['EMITENT_FULL_NAME'].str.strip().str.lower().dropna()
# Сплитим, удаляем пробельные символы в начале и в конце строки
synonyms = stripped_series.str.split(',').apply(lambda lst: np.array(list(set([item.strip() for item in lst if item.strip()])))).values

In [91]:
mentions = pd.read_pickle("mentions texts.pickle")
sentiment = pd.read_pickle("sentiment_texts.pickle")

In [62]:
xl = pd.ExcelFile("issuers.xlsx")
issuers = xl.parse("Sheet1")
issuers.tail()

Unnamed: 0.1,Unnamed: 0,issuerid,EMITENT_FULL_NAME,datetrackstart,datetrackend,BGTicker,OtherTicker
250,250,270,Henderson,2024-01-24 00:00:00.000,,,HNFG
251,251,271,Совкомбанк,2024-01-24 00:00:00.000,,,SVCB
252,252,272,Евротранс,2024-01-24 00:00:00.000,,,EUTR
253,253,273,"""Каршеринг Руссия"", ПАО",2024-02-20 00:00:00.000,,,DELI
254,254,274,Диасофт,2024-02-20 00:00:00.000,,,DIAS


Будем искать вхождения компаний в тексте с помощью алгоритма Ахо Карасик \
https://ru.algorithmica.org/cs/string-structures/aho-corasick/

In [11]:
!pip install -q pyahocorasick

In [9]:
from ahocorasick import Automaton

def find_word_indices(texts: list[str], words: list[str], name_to_id: dict):
    '''
    Возвращает два массива: в первом для каждого сообщения перечислены issuer_id,
    синонимы к которым были найдены, во втором перечислены сами синонимы
    :param texts: массив сообщений
    :param words: массив синонимов, вхождения которых будем искать в сообщениях
    :param name_to_id: Словарь, где каждому синониму соответствует некоторый issuer_id
    '''
    # Инициализируем автомата для Ахо-Корасик
    automaton = Automaton()

    # Добавляем слова в автомат
    for idx, word in enumerate(words):
        automaton.add_word(word, idx)

    # Строим автомат
    automaton.make_automaton()

    id_list = []
    word_list = []
    # Итерируемся по всем сообщениям
    for text in texts:
        # Мапим issuer_id к синониму, который встретился в сообщении
        ids_found_in_text = set()
        words_found_in_text = set()
        # Находим вхождения словарных слов в сообщения и сохраняем их
        for end_index, word_index in automaton.iter(text.lower()):
            if name_to_id[words[word_index]] not in ids_found_in_text:
                ids_found_in_text.add(name_to_id[words[word_index]])
                words_found_in_text.add(words[word_index])

        # Список issuer_id найденных в каждом сообщении
        id_list.append(list(ids_found_in_text))
        # Список самих синонимов, которые были найдены
        word_list.append(list(words_found_in_text))

    return id_list, word_list

# Пример использования
texts = ['aboba biba boba', 'baibd aboba vkejr lsrnb', 'wsoure. guierbg hsge gher']

words = ['aboba', 'hsge', 'baibd']

id_list, word_list = find_word_indices(texts, words, {'aboba': 15, "hsge": 2, 'baibd': 7})
print(id_list, word_list)

[[15], [15, 7], [2]] [['aboba'], ['aboba', 'baibd'], ['hsge']]


In [92]:
import string
def process_all_messages(texts: list[str], synonyms: list[list[str]]):
    '''
    Обёртка над функцией find_word_indices
    :param texts: текст сообщения
    :param synonyms: синонимы для каждой компании
    '''
    # По синониму к названию компании находим issuerid
    name_to_id = dict()
    punct = [' ', ',', ';', '.']
    # Заполняем словарь
    new_synonyms = np.array([])
    for id, company_synonyms in enumerate(synonyms):
        issuer_id = issuers.iat[id, 1]
        tmp = list()
        for word in company_synonyms:
            if len(word) < 5:
                for symb in punct:
                    tmp.append(word + symb)
            else:
                tmp.append(word)

        for word in tmp:
            if word not in name_to_id:
                name_to_id[word] = issuer_id

        new_synonyms = np.append(new_synonyms, np.array(tmp))

    return find_word_indices(texts, new_synonyms, name_to_id)

Почистим входные данные

In [None]:
def remove_emoji(text):
    '''
    Удаление смайликов из текста
    :param text:  - текст сообщения, которое необходимо обработать
    '''
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def preprocessing_text(text):
    # Удаление пунктуации
    text = text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

    # Удаление стоп слов
    text = " ".join([word for word in str(text).split() if word not in stop_words])

    # Удаление смайликов
    text = remove_emoji(text)

    return text

In [12]:
def preprocessing_dataset(df):
    #df["preprocessed_MessageText"] = df["MessageText"].apply(lambda text: preprocessing_text(text))
    df = df.dropna()
    df["CompanyId"], df["CompanyName"] = process_all_messages(df["MessageText"], synonyms)
    df = df.explode(["CompanyId", "CompanyName"])
    return df

В данных есть несколько объектов с нулевым SentimentScore - это выбросы, удалим их из датасета

In [80]:
sentiment = sentiment[sentiment["SentimentScore"] > 0]
sentiment["SentimentScore"] -= 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentiment["SentimentScore"] -= 1


In [93]:
sentiment = preprocessing_dataset(sentiment)

In [94]:
sentiment.head()

Unnamed: 0,MessageID,ChannelID,issuerid,SentimentScore,DateAdded,DatePosted,MessageText,IsForward,CompanyId,CompanyName
0,241407,1203560567,153,2,2023-05-12 19:03:20,2023-05-12 19:02:42,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 20...,False,153,selg
1,33684,1136626166,230,4,2023-02-03 20:56:29,2023-02-03 16:46:34,Ozon продолжает развивать специализированные ф...,False,230,ozon
2,10090,1063908560,118,4,2023-06-02 19:18:37,2023-06-02 18:50:00,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,False,224,лукойл
2,10090,1063908560,118,4,2023-06-02 19:18:37,2023-06-02 18:50:00,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,False,99,втб
2,10090,1063908560,118,4,2023-06-02 19:18:37,2023-06-02 18:50:00,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,False,103,нмтп


In [95]:
grouped = sentiment[["issuerid", "CompanyId", "MessageText"]].groupby(by="MessageText").agg(lambda x: set(x))

In [85]:
def step1_accuracy():
    successful = 0
    for idx, row in sentiment.iterrows():
        if row["issuerid"] == row["CompanyId"]:
            successful += 1
    print(successful / sentiment.index.nunique())

In [97]:
step1_accuracy()

0.9228119280869846


In [21]:
from sklearn.metrics import f1_score

In [75]:
def f1score(y_true, y_pred, classes):
    sum = 0
    len = classes.shape[0]
    for cls in classes:
        mask_true = np.zeros(y_pred.shape[0])
        mask_pred = np.zeros(y_pred.shape[0])
        for i, pred in enumerate(y_pred):
            if cls in pred:
                mask_pred[i] = 1

        for i, true in enumerate(y_true):
            if cls in true:
                mask_true[i] = 1

        if np.sum(mask_true) == 0 and np.sum(mask_pred) != 0:
            len -= 1
        sum += f1_score(mask_pred, mask_true, zero_division=1.0)
    return sum / len

In [98]:
f1score(grouped['issuerid'].values, grouped['CompanyId'].values, issuers["issuerid"].values)

0.6802990123134848

## STEP 2: sentiment analisys

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, TrainingArguments, Trainer, pipeline
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from datasets import Dataset
import torch.nn.functional as F
from torch import nn
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

2024-04-13 16:09:34.905837: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-13 16:09:34.905941: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-13 16:09:35.026163: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device: cuda


In [None]:
df_step2 = sentiment[sentiment["issuerid"] == sentiment["CompanyId"]]

In [None]:
data = {
    'text': df_step2["MessageText"],
    'aspect': df_step2["CompanyName"],
    'label': df_step2["SentimentScore"]
}

df = pd.DataFrame(data)
df = df.reset_index()
df = df.drop(["index"], axis=1)
df.head()

Unnamed: 0,text,aspect,label
0,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 20...,selg,1
1,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,группа позитив,3
2,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,двмп,4
3,​​Windfall Tax — налог на сверхприбыль. Какие ...,газпром,1
4,​​Windfall Tax — налог на сверхприбыль. Какие ...,северсталь,1


In [None]:
dataset = Dataset.from_pandas(df,  preserve_index=False)
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'aspect', 'label'],
        num_rows: 5750
    })
    test: Dataset({
        features: ['text', 'aspect', 'label'],
        num_rows: 639
    })
})

In [None]:
class TransformerClassificationModel(nn.Module):
    """
    """
    def init(self, base_transformer_model: str, num_classes: int):
        super(TransformerClassificationModel, self).init()
        config = AutoConfig.from_pretrained(base_transformer_model)
        self.backbone = AutoModel.from_pretrained(base_transformer_model, config=config)
        self.classifier = nn.Linear(self.backbone.config.hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.classifier(pooled_output)
        probabilities = self.softmax(logits)
        return {'logits': logits, 'probabilities': probabilities, 'backbone outputs': outputs}


In [None]:
def freeze_backbone_function(model: TransformerClassificationModel, freeze=True):
    for param in model.backbone.parameters():
        param.requires_grad = not freeze
    return model

In [None]:
def wrap(text, aspect):
      """Функция которая оборачивает текст и аспект в формат, принимаемый токенайзером"""
      return f"[CLS] {text} [SEP] {aspect} [SEP]"

In [None]:
# Параметры модели
MAX_LENGTH = 256
BATCH_SIZE = 32
LR = 3e-4
NUM_EPOCHS = 3
FREEZE_BACKBONE = False

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import copy


class TransformerClassificationTrainer:
    def __init__(self, model, tokenizer, max_length=MAX_LENGTH, batch_size=BATCH_SIZE, lr=LR,
                 num_epochs=NUM_EPOCHS, freeze_backbone=FREEZE_BACKBONE):
        self.model = freeze_backbone_function(model, freeze_backbone)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.batch_size = batch_size
        self.lr = lr
        self.num_epochs = num_epochs

    def preprocess_data(self, texts, aspects, labels):
        #inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=self.max_length, return_tensors="pt")
        inputs = self.tokenizer([wrap(text, aspect) for text, aspect in zip(texts, aspects)],
                                max_length=self.max_length, truncation=True, padding=True, return_tensors="pt")
        labels = torch.tensor(labels)
        return inputs, labels

    def train(self, train_texts, train_aspects, train_labels):
        train_inputs, train_labels = self.preprocess_data(train_texts, train_aspects, train_labels)
        train_data = torch.utils.data.TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
        train_loader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True)

        optimizer = AdamW(self.model.parameters(), lr=self.lr)
        loss_fn = nn.CrossEntropyLoss()

        self.model.to(device)
        self.model.train()

        for epoch in range(self.num_epochs):
            total_loss = 0
            for batch in train_loader:
                batch = tuple(t.to(device) for t in batch)
                input_ids, attention_mask, labels = batch

                optimizer.zero_grad()
                outputs = self.model(input_ids, attention_mask)
                logits = outputs['logits']
                loss = loss_fn(logits, labels)
                total_loss += loss.item()

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()

            avg_train_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch+1}/{self.num_epochs}, Train Loss: {avg_train_loss:.4f}")

        return self.model

    def evaluate(self, test_texts, test_aspects, test_labels):
        self.model.eval()
        test_inputs, test_labels = self.preprocess_data(test_texts, test_aspects, test_labels)
        test_data = torch.utils.data.TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_labels)
        test_loader = DataLoader(test_data, batch_size=self.batch_size, shuffle=False)

        predictions = []
        true_labels = []

        with torch.no_grad():
            for batch in test_loader:
                batch = tuple(t.to(device) for t in batch)
                input_ids, attention_mask, labels = batch
                outputs = self.model(input_ids, attention_mask)
                logits = outputs['logits']
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                predictions.extend(preds)
                true_labels.extend(labels.cpu().numpy())

        accuracy = accuracy_score(true_labels, predictions)
        print(f"Accuracy: {accuracy:.4f}")
        f1 = f1_score(true_labels, predictions, average='macro')
        print(f"F1 score: {f1:.4f}")


In [None]:
torch.cuda.empty_cache()

In [None]:
rt2 = TransformerClassificationModel("cointegrated/rubert-tiny2", num_classes=5)
rt2_tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
rt2_trainer = TransformerClassificationTrainer(model=rt2,
                                               tokenizer=rt2_tokenizer,
                                               freeze_backbone=False,
                                               num_epochs=15)

rt2_trained = rt2_trainer.train(dataset['train']['text'], dataset['train']['aspect'], dataset['train']['label'])
rt2_trainer.evaluate(dataset['test']['text'], dataset['test']['aspect'], dataset['test']['label'])



Epoch 1/15, Train Loss: 1.0534
Epoch 2/15, Train Loss: 0.7417
Epoch 3/15, Train Loss: 0.4975
Epoch 4/15, Train Loss: 0.3916
Epoch 5/15, Train Loss: 0.3236
Epoch 6/15, Train Loss: 0.2640
Epoch 7/15, Train Loss: 0.2480
Epoch 8/15, Train Loss: 0.2159
Epoch 9/15, Train Loss: 0.2083
Epoch 10/15, Train Loss: 0.1980
Epoch 11/15, Train Loss: 0.1895
Epoch 12/15, Train Loss: 0.1819
Epoch 13/15, Train Loss: 0.1840
Epoch 14/15, Train Loss: 0.1722
Epoch 15/15, Train Loss: 0.1654
Accuracy: 0.5947
F1 score: 0.4454


In [None]:
def show_examples(model, tokenizer, num_examples = 10):
    model.eval()
    for i in range(num_examples):
        #idx = np.random.randint(0, len(dataset))
        idx = i
        example = dataset["test"][idx]
        text, aspect, label = example["text"], example["aspect"], example["label"]
        inputs = tokenizer([wrap(text, aspect)], truncation=True, padding=True, return_tensors="pt").to(device)

        with torch.no_grad():
            input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
            output = model(input_ids, attention_mask)
            logits = output['logits']
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            print(f"EXAMPLE {i+1}")
            print("TEXT: ", text)
            print("ASPECT: ", aspect)
            print("LABEL: ", label)
            print("PREDS: ", preds)