<a href="https://colab.research.google.com/github/goin2crazy/arabic-text-classification/blob/main/lenta_ru_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Model

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
from torch.utils.data import DataLoader

from transformers import (AutoTokenizer,
                          AutoModelForTokenClassification,
                          AutoModelForSequenceClassification,
                          pipeline
                         )

from tqdm import tqdm

# Register tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
tqdm.pandas()

# Run the models on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('/kaggle/input')
    for entry in entries
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
class Config:
    """Static settings for the notebook: dataset path and model checkpoint names."""

    # Path to the dataset CSV (a Kaggle input path; not read anywhere in the visible cells).
    datapath = '/kaggle/input/lenta-ru-private-dataset-for-tatar-hackathon/lenta_ru_news_2019_2023.csv'

    # Checkpoint name passed to `from_pretrained` by NERModel (tokenizer + token-classification model).
    ner_preset = "Davlan/distilbert-base-multilingual-cased-ner-hrl"
    # Checkpoint name passed to `from_pretrained` by SentimentModel (tokenizer + sequence-classification model).
    sentiment_preset = 'doublecringe123/lenta-ru-sentiments'

# Shared settings instance read by the model wrapper classes.
cfg = Config()

In [None]:
class NERModel:
    """Named-entity recognition wrapper around a Hugging Face "ner" pipeline.

    Loads the tokenizer and token-classification model named by
    `cfg.ner_preset`, moves the model to `device`, and exposes the resulting
    pipeline through `__call__`.
    """

    def __init__(self):
        checkpoint = cfg.ner_preset
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        token_classifier = AutoModelForTokenClassification.from_pretrained(checkpoint)
        self.model = token_classifier.to(device)
        # aggregation_strategy="max" asks the pipeline for grouped entities
        # instead of one prediction per sub-word token.
        self.nlp = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="max",
            device=device,
        )

    def __call__(self, batch):
        """Run the NER pipeline on `batch` and return its output unchanged."""
        entities = self.nlp(batch)
        return entities

class SentimentModel:
    """Sentiment scorer backed by a sequence-classification checkpoint.

    Loads the tokenizer and model named by `cfg.sentiment_preset` and moves the
    model to `device`.
    """

    def __init__(self):
        checkpoint = cfg.sentiment_preset
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        self.model = classifier.to(device)

    def get_sentiment(self, text, focus = ''):
        """Lazily yield one scalar sentiment score per row of the model's logits.

        This is a generator: tokenization and the forward pass only run once
        the result is iterated.

        Args:
            text: input handed to the tokenizer as-is (truncated and padded).
            focus: accepted but not used by this method.

        Yields:
            The dot product of the row's sigmoid probabilities with
            ``[-1, 0, 1]``: the first class counts negatively, the second is
            ignored, and the third counts positively.
            NOTE(review): presumably the classes are ordered
            negative / neutral / positive — confirm against the checkpoint's
            label mapping.
        """
        with torch.no_grad():
            encoded = self.tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                padding=True,
            ).to(device)
            logits = self.model(**encoded).logits
            probas = torch.sigmoid(logits).cpu().numpy()

        class_weights = [-1, 0, 1]
        yield from (row.dot(class_weights) for row in probas)

    def __call__(self, *args, **kwargs):
        """Shorthand for `get_sentiment`; forwards all arguments."""
        return self.get_sentiment(*args, **kwargs)

In [None]:
# Instantiate both model wrappers; each constructor loads its checkpoint named in `cfg`.
# NOTE(review): `Inference.__init__` builds its own NERModel/SentimentModel, and these two
# variables are not referenced again in the visible cells — confirm they are still needed.
ner_model = NERModel()
sentiment_model = SentimentModel()

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/928 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.1M [00:00<?, ?B/s]

In [None]:
class Inference:
    """Entity-level sentiment: detect named entities, then score each one.

    Combines `NERModel` (entity detection) with `SentimentModel` (scoring) and
    returns, for every input text, the NER entities extended with a
    ``"sentiment"`` value.
    """

    def __init__(self):
        self.ner_model = NERModel()
        self.sentiment_model = SentimentModel()

    def predict_sentiment(self, word, text):
        """Score `text` with respect to a focus word.

        Args:
          word: focus word, injected into the prompt as ``[focus: <word>]``
          text: text to analyze

        Returns:
          The mean of the scores yielded by the sentiment model for the prompt.
        """
        scores = list(self.sentiment_model(f"[focus: {word}] {text}"))

        return sum(scores) / len(scores)

    def add_sentiments(self, ner_text, text):
        """Attach a ``"sentiment"`` value to every entity found in `text`.

        Args:
          ner_text: list of entity dicts for one text; each must have a 'word' key
          text: the text the entities were extracted from

        Returns:
          A new list with one dict per entity, in the original order, each a
          copy of the entity plus a ``"sentiment"`` key.
        """
        # Score each distinct word once and look the result up per entity.
        # Pairing the entity list positionally with a word-keyed dict would
        # drop and misalign entries whenever the same word occurs twice.
        sentiment_by_word = {}
        for entity in ner_text:
            word = entity['word']
            if word not in sentiment_by_word:
                sentiment_by_word[word] = self.predict_sentiment(word, text)

        return [
            {**entity, "sentiment": sentiment_by_word[entity['word']]}
            for entity in ner_text
        ]

    def process(self, batch):
        """Run NER on the batch, then add sentiments to each text's entities.

        Arguments:
            batch: list of strs

        Returns:
            A list with one entry per text: that text's entity dicts, each
            including a ``"sentiment"`` key.
        """
        ner_batch = self.ner_model(batch)

        return [self.add_sentiments(ner_text, text) for ner_text, text in zip(ner_batch, batch)]

    def __call__(self, batch):
        """Accept a single string or a list of strings and run `process`."""
        # A bare string is wrapped so the rest of the code always sees a list.
        if isinstance(batch, str):
            batch = [batch]

        return self.process(batch)

# Install and use `Inference` from the `lenta_ru` repo


In [None]:
# Fetch the lenta_ru package sources; the clone is created as `lenta_ru` in the current working directory.
! git clone https://github.com/goin2crazy/lenta_ru.git

Cloning into 'lenta_ru'...
remote: Enumerating objects: 26, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 26 (delta 9), reused 23 (delta 8), pack-reused 0[K
Receiving objects: 100% (26/26), 10.87 KiB | 2.17 MiB/s, done.
Resolving deltas: 100% (9/9), done.


In [None]:
! pip install -q /content/lenta_ru

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for awesome-inference (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dep

In [None]:
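# Import the packaged Inference from lenta_ru.
# NOTE: this rebinds the name `Inference`, shadowing the class defined earlier in this notebook.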

from lenta_ru import Inference

In [None]:
# Build the inference object from the `Inference` name currently in scope
# (when cells run top to bottom, that is the one imported from `lenta_ru`).
inf = Inference()

tokenizer_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/928 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.1M [00:00<?, ?B/s]

In [None]:
# Smoke test on a single sentence. Each returned entity carries a "sentiment" value:
#   sentiment > 0 - positive
#   sentiment < 0 - negative

inf('I love working at Apple!')

[[{'entity_group': 'ORG',
   'score': 0.99978536,
   'word': 'Apple',
   'start': 18,
   'end': 23,
   'sentiment': 0.40757113695144653}]]