# import

In [None]:
import re
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from razdel import sentenize
from tqdm import tqdm
from tqdm import notebook
# Register tqdm's pandas integration so that `progress_apply` is available
# on Series/DataFrame objects (it is used further down when tagging docs).
tqdm.pandas()


# Download the NLTK "stopwords" corpus.
nltk.download("stopwords")

# read data

In [None]:
# Load the works table; a later cell reads its `texts` column.
df = pd.read_csv("data/se_spbu/works.csv")

In [None]:
def string_list_to_list(string):
    """Convert the textual form of a list of strings back into a list.

    The input is expected to look like ``"['first', 'second']"``, i.e. the
    ``repr`` of a Python list of strings.

    Args:
        string: Serialized list, e.g. ``"['a', 'b']"``.

    Returns:
        A list of ``str`` with the individual elements.
    """
    # Local import so this cell stays runnable on its own.
    import ast

    try:
        parsed = ast.literal_eval(string)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # Best-effort fallback for input that is not a valid Python literal:
    # drop the leading "['" and the trailing "']" (the previous slice only
    # dropped "]" and left a stray quote on the last element), then split on
    # the "', '" separator.
    return string[2:-2].split("', '")

In [None]:
def preprocessing_pipeline(text):
    """Turn one serialized list of text chunks into a list of clean sentences.

    Steps:
      1. Parse ``text`` with ``string_list_to_list`` and join every chunk
         except the first one with single spaces.
      2. Delete every occurrence of "- " (meant to undo line-break
         hyphenation) and the bullet characters.
      3. Split into sentences with ``razdel.sentenize``.
      4. Keep a sentence only if it mentions none of the table / page /
         figure fragments, is longer than 10 and shorter than 1500
         characters, and contains no formula-like character.

    Args:
        text: Serialized list of strings (one cell of the ``texts`` column).

    Returns:
        List of sentence strings that passed all filters.
    """
    formulas_characters = "@#^&*+_=<✓α>/≡≡Σ∈≤"
    chars = '●•'
    # Fragments without their first letter, so that both the capitalised and
    # the lower-case spelling of "table" / "page" / "figure" are matched.
    banned_fragments = ("аблица", "траница", "исунок")

    chunks = string_list_to_list(text)
    # The first chunk is skipped (the original note on this line read
    # "introduction").
    joined = " ".join(chunks[1:])
    joined = re.sub("- ", "", joined)
    joined = joined.translate(str.maketrans('', '', chars))

    def keep(sentence):
        if any(fragment in sentence for fragment in banned_fragments):
            return False
        if not 10 < len(sentence) < 1500:
            return False
        return not any(c in formulas_characters for c in sentence)

    return [piece.text for piece in sentenize(joined) if keep(piece.text)]

In [None]:
# Run the sentence-level cleaning on every row, then store how many
# sentences survived for each row.
df["preprocessed_texts"] = df["texts"].apply(preprocessing_pipeline)
df["preprocessed_len"] = df["preprocessed_texts"].apply(len)

In [None]:

# SECTION TITLES
# FIGURE

In [None]:
# Keep only the rows for which at least one sentence survived preprocessing.
df = df.loc[df["preprocessed_len"] > 0]

In [None]:
# Positional split: the last 10 rows form the validation set, everything
# before them is the training set.
train_texts = df.iloc[:-10]
val_texts = df.iloc[-10:]

In [None]:
# Flatten the per-row sentence lists into one Series with one sentence per
# entry (fresh 0..n-1 index), for the full set and for each split.
df_sentences = df["preprocessed_texts"].explode(ignore_index=True)
train_sentences = train_texts["preprocessed_texts"].explode(ignore_index=True)
val_sentences = val_texts["preprocessed_texts"].explode(ignore_index=True)

# sentiment

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

In [None]:
# Target device for the sentiment model.
# NOTE(review): the GPU index 4 is hard-coded; the model is only moved here
# when CUDA is available, but a machine with fewer than five GPUs would
# presumably still fail on that move — confirm / make configurable.
device = torch.device("cuda:4")

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint.
model_name = 'blanchefort/rubert-base-cased-sentiment-rusentiment'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
# Move the model to the configured GPU only when CUDA is available;
# otherwise it stays where from_pretrained placed it.
if torch.cuda.is_available():
    model.to(device)

In [None]:
# Class index -> human-readable label.
# NOTE(review): assumed to mirror the checkpoint's class order — confirm
# against model.config.id2label.
sent_dict = {
    0: "neutral",
    1: "positive",
    2: "negative",
}


def get_sentiment(text, return_type='label'):
    """Classify the sentiment of ``text`` with the loaded BERT model.

    Args:
        text: Input string. If a batch is passed, only the first item's
            result is used.
        return_type: ``'label'`` returns the name of the most likely class
            (a value of ``sent_dict``); ``'score'`` returns a single number,
            P(positive) - P(negative); any other value returns the full
            probability vector.

    Returns:
        ``str``, a numpy scalar, or a 1-D numpy array, depending on
        ``return_type``.
    """
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        logits = model(**inputs).logits
        # Convert logits to probabilities so that `proba` and the score are
        # what their names say (argmax is unaffected by softmax).
        proba = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    if return_type == 'label':
        return sent_dict[int(proba.argmax())]
    if return_type == 'score':
        # Weights follow sent_dict's order: neutral -> 0, positive -> +1,
        # negative -> -1.
        return proba.dot([0, 1, -1])
    return proba

In [None]:
# Quick smoke test of get_sentiment on a single sentence.
text = 'Какая гадость эта ваша заливная рыба!'
print(get_sentiment(text, 'label'))

In [None]:
# Print every sentence whose predicted label is not "neutral" and report
# how many such sentences there are.
num = 0
for sentence in df_sentences:
    sent = get_sentiment(sentence)
    if sent == "neutral":
        continue
    print(sent + ": " + sentence)
    num += 1
print("Всего: ", num)

## morphology

In [None]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)

import pymorphy2

In [None]:
# Instantiate the analysis components once so that later cells can reuse them.
# Only `segmenter` and `morph_tagger` are referenced in the cells shown here;
# the others are created but not used in this part of the notebook.
morph = pymorphy2.MorphAnalyzer()
segmenter = Segmenter()
morph_vocab = MorphVocab()

# The three taggers/parsers below are all built on the same embedding object.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

In [None]:
# Wrap every sentence in a natasha Doc.
docs = df_sentences.apply(Doc)
# The results of the two progress_apply calls are discarded, so these calls
# presumably rely on segment()/tag_morph() updating each Doc in place.
docs.progress_apply(lambda x: x.segment(segmenter))
# NOTE(review): the next cell calls tag_morph on every doc again inside a
# try/except, so this pass looks redundant — confirm before removing.
docs.progress_apply(lambda x: x.tag_morph(morph_tagger))

In [None]:
# Morph-tag every doc; a doc whose tagging fails is replaced by None so that
# positions stay aligned with the original sentence order.
docs_with_morph = []
for doc in docs:
    try:
        doc.tag_morph(morph_tagger)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop a long run.
        docs_with_morph.append(None)
    else:
        print(doc)
        docs_with_morph.append(doc)

docs = pd.Series(docs_with_morph)

In [None]:
# Count (and print) verbs in the first person singular across all tagged
# sentences.
num = 0
for sent in docs:
    # Docs whose morphological tagging failed are stored as None; skip them
    # instead of crashing on `None.morph`.
    if sent is None:
        continue
    for token in sent.morph.tokens:
        if token.pos != "VERB":
            continue
        if (
            "Number" in token.feats
            and "Person" in token.feats
            and token.feats['Person'] == "1"
            and token.feats["Number"] == "Sing"
        ):
            num += 1
            print("token : ", token.text)
            print("From text:", sent.text)
print("Всего :", num)