[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eGgi8kYolhlCN1aPXAYxk4_HJ7C-Kw4t#scrollTo=wJJnj6OVfnEE)


## Imports

In [None]:
import pandas as pd
import numpy as np

In [None]:
import nltk
# Fetch the 'punkt' tokenizer data. preprocess() below calls
# nltk.sent_tokenize, which is presumably what needs it.
nltk.download('punkt')

# Marker that wraps a newline in parentheses. split_text() substitutes it for
# every '\n' before sentence tokenization, and fix_new_line_symbol() turns it
# back into a plain '\n' afterwards.
bad_new_line_symbols = '(\n)'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Download the poems corpus from Dropbox; -O saves it as classic_poems.json
# in the current working directory.
!wget https://www.dropbox.com/s/zjw0mgyuctzmglc/classic_poems.json?dl=0 -O classic_poems.json

--2023-05-06 13:59:00--  https://www.dropbox.com/s/zjw0mgyuctzmglc/classic_poems.json?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/zjw0mgyuctzmglc/classic_poems.json [following]
--2023-05-06 13:59:01--  https://www.dropbox.com/s/raw/zjw0mgyuctzmglc/classic_poems.json
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uceeab1b781db57285ce2506a404.dl.dropboxusercontent.com/cd/0/inline/B7grj1ExW-5kWxZQY4r3SAPw3hSdPEwpnwuXL6actHxL5nCZ9JAIBRzK9ZJrEz3Tb87V66136miCJ7EaIWpQPw7mKgvMMYBTfnZQqpRUQjEKzRvRJ_2gjWRx1af0mhSLTbt6_613r-xMsvXxidQ8kNLBxFwkEmzYTY-b2-T5M1D3EQ/file# [following]
--2023-05-06 13:59:01--  https://uceeab1b781db57285ce2506a404.dl.dropboxusercontent.com/cd/0/inline/B7grj1ExW-5kWxZQY4r3SAPw3hSdPEwpnwuXL6actHxL5nCZ9JAIBRzK9

## Helper functions

In [None]:
import re

def clean_text(text):
    """Strip a poem down to Cyrillic letters, whitespace and basic punctuation.

    Steps, in order:
      1. Delete non-breaking spaces and the literal ``<?>`` token.
      2. Delete every character that is not a Cyrillic letter (including
         ё/Ё), whitespace, one of ``. , ! ? ; : « » — –`` or the spaces
         U+2003 / U+2004.
      3. Collapse each run of tabs/spaces into a single space.
      4. Collapse each run of two or more newlines into one newline and
         strip leading/trailing whitespace.

    Args:
        text: Raw text as a ``str``.

    Returns:
        The cleaned string.
    """
    # `<?>` has to go before the regex pass: the regex would drop the angle
    # brackets but keep the question mark.
    for junk in ('\xa0', '<?>'):
        text = text.replace(junk, '')

    kept = re.sub(r"[^а-яА-ЯёЁ\s\.,!?;:«»—\n–\u2003\u2004]", "", text)

    # Normalise horizontal whitespace first, then blank lines.
    single_spaced = re.sub(r"[\t ]+", " ", kept)
    return re.sub(r"\n[\n]+", "\n", single_spaced).strip()

# Example usage: characters outside clean_text's allow-list (Latin letters,
# digits, symbols such as @$%) are removed, runs of spaces collapse to one,
# and the trailing newline is stripped.
text = "Текст на русском языке c примерами 1234 и символами @$%!..\n Это предложение содержит много пунктуации!!!!\n"
cleaned_text = clean_text(text)
print(cleaned_text)

Текст на русском языке примерами и символами !..
 Это предложение содержит много пунктуации!!!!


In [None]:
def merge_lowercase(sentences):
    """Glue sentences that start with a lowercase letter onto the previous one.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list. Each sentence whose first character is lowercase is
        appended, after a single space, to the sentence before it. The very
        first sentence and empty strings are always kept as separate entries.
    """
    result = []
    for current in sentences:
        # Nothing to attach to, nothing to inspect, or not lowercase:
        # start a new entry.
        if not result or not current or not current[0].islower():
            result.append(current)
            continue
        result[-1] = f"{result[-1]} {current}"
    return result

In [None]:
def fix_new_line_symbol(sentences, bad_new_line_symbols = bad_new_line_symbols):
  """Turn newline markers inside tokenized sentences back into real newlines.

  A marker at the very start of a sentence is moved to the end of the
  previous sentence as a '\n'. Every other occurrence is replaced with '\n'
  where it stands.

  Args:
    sentences: List of sentence strings; it is modified in place.
    bad_new_line_symbols: The marker string to look for. Defaults to the
      module-level constant as it was when this function was defined.

  Returns:
    The same list object, with markers replaced.
  """
  marker_len = len(bad_new_line_symbols)
  for i, sentence in enumerate(sentences):
    if sentence.startswith(bad_new_line_symbols):
      # Only hand the newline to a preceding sentence when there is one.
      # For i == 0 the index i - 1 would be -1, i.e. the LAST sentence, so
      # the leading marker of the first sentence is simply dropped.
      if i > 0:
        sentences[i - 1] += '\n'
      sentence = sentence[marker_len:]
    sentences[i] = sentence.replace(bad_new_line_symbols, '\n')
  return sentences

In [None]:
def preprocess(document):
    """Split a document into sentences and tidy them up.

    The document is tokenized with ``nltk.sent_tokenize``, newline markers are
    restored by ``fix_new_line_symbol``, and sentences starting with a
    lowercase letter are merged into their predecessor by ``merge_lowercase``.

    NOTE(review): ``sent_tokenize`` is called without a ``language`` argument
    although the corpus is Russian; confirm the default is the intended one.

    Args:
        document: Text to split.

    Returns:
        List of sentence strings.
    """
    tokenized = nltk.sent_tokenize(document)
    return merge_lowercase(fix_new_line_symbol(tokenized))

In [None]:
def remove_non_rhyme_text(text, len_line = 15):
  """Drop lines that contain more than ``len_line`` whitespace-separated words.

  Args:
    text: Text whose lines are separated by '\n'.
    len_line: Maximum number of words a line may have and still be kept.

  Returns:
    The kept lines, each followed by '\n'. The result therefore always ends
    with a newline, and an empty input yields '\n'.
  """
  kept_lines = [
      line for line in text.split('\n')
      if len(line.split()) <= len_line
  ]
  return ''.join(line + '\n' for line in kept_lines)

In [None]:
# Value split_data() passes to split_text() as `min_len_lines` for rows whose
# poet_id is 'mayakovskij'.
mayak_len_lines = 8
# Read as a global inside split_text(): once the accumulated text spans at
# least this many lines it is emitted as a chunk.
len_lines = 12
# Value split_data() passes to split_text() as `min_len_lines` for every other
# poet: the minimum line count at which a single odd-length sentence is
# emitted as its own chunk.
min_len_lines = 4

In [None]:
def split_text(text, min_len_lines):
  """Cut a poem into chunks along sentence boundaries.

  The text is first passed through ``remove_non_rhyme_text``. Its line count
  is ``len(filtered.split('\n'))``, so the piece after a trailing newline
  counts too. What happens next depends on that count:

    * 4 or fewer: ``np.nan`` is returned.
    * 5 to 19: the filtered text is returned as a single-element list.
    * 20 or more: the text is split into sentences with ``preprocess`` and
      the sentences are grouped into chunks.

  Grouping keeps a running buffer of sentences. When the buffer spans at
  least the module-level ``len_lines`` lines it is emitted and cleared.
  Otherwise, if the current sentence has an odd number of lines and at least
  ``min_len_lines`` of them, that sentence alone is emitted and the buffer is
  cleared.

  NOTE(review): in the second case whatever had accumulated before the
  current sentence is discarded, and text still in the buffer when the loop
  ends is never returned. Confirm both are intended.

  Args:
    text: Poem text with '\n' line separators.
    min_len_lines: Minimum line count for a lone sentence to form a chunk.

  Returns:
    ``np.nan`` for very short texts, otherwise a list of chunk strings.
  """
  filtered = remove_non_rhyme_text(text)
  n_lines = len(filtered.split('\n'))

  if n_lines <= 4:
    return np.nan
  if n_lines < 20:
    return [filtered]

  # Mark the newlines so they can be restored after sentence tokenization.
  sentences = preprocess(filtered.replace('\n', bad_new_line_symbols))

  chunks = []
  buffer = ''
  for sentence in sentences:
    buffer += sentence
    sentence_lines = len(sentence.split('\n'))
    buffer_lines = len(buffer.split('\n'))

    if buffer_lines >= len_lines:
      chunks.append(buffer)
      buffer = ''
    elif sentence_lines % 2 != 0 and sentence_lines >= min_len_lines:
      chunks.append(sentence)
      buffer = ''
  return chunks

In [None]:
def split_data(row, mayak_len_lines, min_len_lines):
  """Split one row's poem, picking the threshold by poet.

  Args:
    row: Mapping-like row with 'poet_id' and 'content' entries.
    mayak_len_lines: Threshold used when poet_id is 'mayakovskij'.
    min_len_lines: Threshold used for every other poet.

  Returns:
    Whatever ``split_text`` returns for the row's content.
  """
  is_mayakovsky = row['poet_id'] == 'mayakovskij'
  threshold = mayak_len_lines if is_mayakovsky else min_len_lines
  return split_text(row['content'], threshold)

In [None]:
def make_clear_dataset(df:pd.DataFrame):
  """Build the one-chunk-per-row poetry dataset from the raw poems frame.

  The input frame is left untouched; all work happens on a filtered copy.

  Args:
    df: Frame with at least 'title', 'poet_id' and 'content' columns.

  Returns:
    A new DataFrame without the 'title' column in which every row holds one
    text chunk in 'content'. Rows with any missing value are dropped, which
    removes poems that ``split_text`` rejected or that produced no chunks.
  """
  # Get rid of the French-language collection. The explicit copy means the
  # column assignments below neither modify the caller's frame nor trigger
  # pandas' SettingWithCopyWarning.
  poems = df[df['title'] != 'Стихотворения, написанные на французском языке'].copy()

  # Clean the poem text.
  poems['content'] = poems['content'].apply(clean_text)

  # Split each poem into chunks along sentence boundaries.
  poems['content'] = poems.apply(split_data, axis = 1, args=(mayak_len_lines, min_len_lines))

  # Reshape so that each row holds a single chunk. The index is renumbered
  # before dropna(), so dropped rows leave gaps in it.
  unstacked_df = poems.explode('content').reset_index(drop=True).dropna()

  # Remove the title column.
  return unstacked_df.drop(columns='title')

# Main: build the dataset

In [None]:
# Load the poems corpus. The path is relative so the file is read from the
# working directory, which is where the wget cell saves it
# (`-O classic_poems.json`). The previous '/content/...' path only matched
# that location when the working directory was /content.
data = pd.read_json('classic_poems.json')

In [None]:
# NOTE(review): `dataset` is rebound to the result of make_clear_dataset(data)
# in the next cell, so this alias looks redundant. Confirm before removing.
dataset = data

In [None]:
# Build the chunked dataset, then show how many chunks each poet contributes.
dataset = make_clear_dataset(data)
dataset.groupby('poet_id').count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['content'] = df.apply(split_data, axis = 1, args=(mayak_len_lines, min_len_lines))


Unnamed: 0_level_0,content
poet_id,Unnamed: 1_level_1
blok,1963
esenin,1478
mayakovskij,1107
pushkin,2207
tyutchev,1000


In [None]:
# Save the chunked dataset to poetry.csv in the working directory. Since
# index=False is not passed, the DataFrame index is written as the first column.
dataset.to_csv('poetry.csv')