[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fEcLtceekuHdohmoGTR1e4F3Pu938Y88?usp=sharing)



In [None]:
!pip install keybert

/bin/bash: /root/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

# Cleaning

In [None]:
!pip install nltk pymorphy2

/bin/bash: /root/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

In [None]:
import nltk
from pymorphy2 import MorphAnalyzer

# "stopwords" supplies the Russian stop-word list used below.
# "punkt" / "punkt_tab" supply the tokenizer models that `word_tokenize` loads; without
# them a fresh kernel fails with a LookupError on the first tokenization call.
# Older NLTK releases use "punkt", newer ones look for "punkt_tab"; requesting a name the
# installed release does not know only logs a message and returns False.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Initialize the MorphAnalyzer and load the Russian stop words.
# Both objects live at module level and are reused by every call to
# normalize_and_remove_stop_words, so they are built only once.
morph = MorphAnalyzer(lang='ru')
stop_words = set(stopwords.words("russian"))  # a set, for fast membership checks

def normalize_and_remove_stop_words(text):
    """Lemmatize Russian text and drop stop words.

    The text is split with NLTK's ``word_tokenize`` (Russian models), every token is
    replaced by the normal form of its first pymorphy2 parse, and lemmas found in the
    module-level ``stop_words`` set are discarded.

    Args:
        text: Input string to normalize.

    Returns:
        The remaining lemmas joined by single spaces. Punctuation tokens produced by
        the tokenizer are not filtered out.
    """
    # Lazily map each token to the normal form of its top-ranked parse.
    lemmas = (
        morph.parse(token)[0].normal_form
        for token in word_tokenize(text, language="russian")
    )

    # Filtering is done on the lemma, not on the surface form.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

# Example usage: the printed result shows lemmatized tokens with stop words removed
# (punctuation tokens are kept as separate tokens).
text = "Был солнечный день, и птицы пели весело на ветках."
normalized_text = normalize_and_remove_stop_words(text)
print(normalized_text)


солнечный день , птица петь весело ветка .


# Get keywords

In [None]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# Sentence-embedding backbone for KeyBERT; loaded once and shared by all extraction calls.
# NOTE(review): the checkpoint name suggests English-Korean fine-tuning, while the texts
# processed in this notebook are Russian — confirm this is the intended model.
model = SentenceTransformer("xlm-r-large-en-ko-nli-ststb")
keyword_extractor = KeyBERT(model=model)

def extract_keywords_transformer(text, num_keywords=5, diversity=0.7):
    """Extract single-word keywords from a text with KeyBERT.

    The text is first passed through ``normalize_and_remove_stop_words`` and the
    result is handed to the module-level ``keyword_extractor``.

    Args:
        text: Raw input text. Non-string values (``None``, or the NaN pandas uses for
            missing cells) and blank strings produce an empty list instead of an error,
            so the function can be used directly with ``Series.apply``.
        num_keywords: Maximum number of keywords to return (KeyBERT ``top_n``).
        diversity: Diversity value forwarded to KeyBERT's MMR selection.

    Returns:
        A list of keyword strings; the scores returned by KeyBERT are dropped.
    """
    # Nothing to extract from missing or blank input; skip the model call entirely.
    if not isinstance(text, str) or not text.strip():
        return []

    text = normalize_and_remove_stop_words(text)
    keywords = keyword_extractor.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),  # unigrams only
        stop_words=None,               # stop words were already removed above
        top_n=num_keywords,
        # Only MMR is requested. `use_maxsum=True` used to be passed as well, but the two
        # strategies are alternatives and KeyBERT applies MMR when `use_mmr` is set, so the
        # extra flag had no effect (see the KeyBERT `extract_keywords` documentation).
        use_mmr=True,
        diversity=diversity,
    )
    # Each result is a (keyword, score) pair; keep the keyword only.
    return [keyword[0] for keyword in keywords]

2023-05-06 17:44:24.928354: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
text = '''
Один не смогу —
не снесу рояля
/тем более —
несгораемый шкаф/,
А если не шкаф,
не рояль,
то я ли
сердце снес бы, обратно взяв.
Банкиры знают:
«Богаты без края мы.
Карманов не хватит —
кладем в несгораемый».
Любовь
в тебя —
богатством в железо —
запрятал,
хожу
и радуюсь Крезом.
И разве,
если захочется очень,
улыбку возьму,
пол-улыбки
и мельче,
с другими кутя,
протрачу в полночи
рублей пятнадцать лирической мелочи.
'''
# Smoke-test the extractor on the poem stored in `text`, using the default keyword count.
keywords = extract_keywords_transformer(text)

print(keywords)

['богатство', 'лирический', 'снести', 'железо', 'пятнадцать']


# Form dataset

In [None]:
import pandas as pd
import tqdm


# Raw poems table; later cells read its 'poet_id' and 'content' columns.
dataset = pd.read_csv('poetry.csv')

In [None]:
# Distinct poet ids; the dataset-building loop processes one id at a time.
poet_list = dataset['poet_id'].unique()

In [None]:

# Build one frame per poet and concatenate once at the end. Growing `df` with
# `pd.concat` inside the loop re-copies all accumulated rows on every iteration.
poet_frames = []
for author in poet_list:
    print(author)  # coarse progress indicator: keyword extraction per poet is slow

    data_poet = dataset[dataset['poet_id'] == author]
    poet_frames.append(
        pd.DataFrame({
            'text': data_poet['content'],
            # The column name 'autor' (sic) is kept as-is: a later cell renames it to 'author'.
            'autor': author,
            'keywords': data_poet['content'].apply(extract_keywords_transformer),
        })
    )

if poet_frames:
    # Each poet used to be prepended to the accumulated frame, so the last poet processed
    # came first; reversing the list keeps that row order.
    df = pd.concat(poet_frames[::-1], ignore_index=True)
else:
    # No poets at all: keep the expected (empty) schema.
    df = pd.DataFrame(columns=['text', 'autor', 'keywords'])

pushkin
esenin
blok
tyutchev
mayakovskij


In [None]:
df

Unnamed: 0,text,autor,keywords
0,"Влас Прогулкин —\nмилый мальчик,\nспать ложилс...",mayakovskij,"[спать, журнальчик, заставить, мальчик, мать]"
1,"Засыпает на рассвете,\nскомкав\nёрзаньем\nкров...",mayakovskij,"[вставать, детвора, отец, засыпать, ухо]"
2,"Разошлись\nдругие\nв школы,–\nВлас\nу крана\nп...",mayakovskij,"[мочить, дрематься, выходить, школа, чай]"
3,Пошагал\nи встал разиней:\nвывеска на магазине...,mayakovskij,"[магазин, вывеска, прочесть, пошагать, симон]"
4,"С конца прочёл\nзнаток наук, —\nНомисвыходит\n...",mayakovskij,"[номисвыходить, наука, пять, прочесть, третий]"
...,...,...,...
7750,Чудный сон мне бог послал—\nС длинной белой бо...,pushkin,"[сон, старец, ангел, плаватель, готовить]"
7751,"Бедный пахарь утомленный,\nОтрешишь волов от п...",pushkin,"[утомлённый, сон, ждать, исповедовать, плуг]"
7752,"И страшуся и надеюсь,\nКазни вечныя страшуся,\...",pushkin,"[успокоить, надеяться, казнь, творец, вечный]"
7753,"О нет, мне жизнь не надоела,\nЯ жить люблю, я ...",pushkin,"[надоесть, охладеть, жизнь, хотеть, молодость]"


In [None]:
# Persist the keyword table to the working directory.
# NOTE(review): `to_csv` is called with defaults, so the row index is written as an extra
# unnamed column and the keyword lists are stored in their string form — confirm this is intended.
df.to_csv('poetry_keywords.csv')

# Fix name issues

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collec

In [None]:
from datasets import load_dataset

# Load the already-published dataset from the Hugging Face Hub
# (the same repo id is pushed to at the end of the notebook).
data = load_dataset('AnyaSchen/russian_poetry_with_keywords')

Downloading readme:   0%|          | 0.00/441 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/AnyaSchen___parquet/AnyaSchen--russian_poetry_with_keywords-8da6f9e7bc09fe8e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7755 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/AnyaSchen___parquet/AnyaSchen--russian_poetry_with_keywords-8da6f9e7bc09fe8e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Maps the transliterated poet ids stored in the dataset to display names in Russian.
# Every name must be written purely in Cyrillic: Latin look-alike letters such as
# "E", "c", "e" render identically but compare unequal, which breaks later lookups
# and filters by author name.
author2name = {
    'mayakovskij': 'Маяковский',
    'esenin': 'Есенин',
    'blok': 'Блок',
    'pushkin': 'Пушкин',
    'tyutchev': 'Тютчев',
}

In [None]:
# Rename the misspelled 'autor' column to 'author'.
# Guarded so that re-running the cell, or loading a Hub copy that already carries the
# fixed column name, does not raise. `data` is a DatasetDict, whose `column_names`
# maps each split name to its list of columns.
if any('autor' in columns for columns in data.column_names.values()):
    data = data.rename_column('autor', 'author')

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'author', 'keywords'],
        num_rows: 7755
    })
})

In [None]:
def replace_name(example, mapping=None):
    """Replace a transliterated poet id in ``example['author']`` with its display name.

    Args:
        example: A single dataset row (dict-like) with an ``'author'`` key.
        mapping: Optional id -> display-name dict; defaults to the module-level
            ``author2name``.

    Returns:
        The same ``example`` object with ``'author'`` replaced. Rows whose author is
        already one of the display names are returned unchanged, so the function can be
        applied again to an already-fixed dataset.

    Raises:
        KeyError: If the author is neither a known id nor a known display name.
    """
    if mapping is None:
        mapping = author2name

    author = example['author']
    # Already converted (cell re-run, or the Hub copy was fixed earlier): leave as is.
    if author not in mapping and author in mapping.values():
        return example

    example['author'] = mapping[author]  # unknown ids still fail loudly
    return example

# Apply replace_name to each example; `map` returns the transformed dataset,
# so `data` is rebound to the result.
data = data.map(replace_name)

Map:   0%|          | 0/7755 [00:00<?, ? examples/s]

In [None]:
data['train'][0]['author']

'Маяковский'

In [None]:
!pip install huggingface_hub
!huggingface-cli login --token {auth_token}

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Upload the corrected dataset to the same Hub repo id it was loaded from.
data.push_to_hub('AnyaSchen/russian_poetry_with_keywords')



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]