# Работа со строковыми значениями

Материалы:
* Макрушин С.В. Лекция 8: Работа со строковыми значениями
* https://pyformat.info/
* https://docs.python.org/3/library/re.html
* https://tproger.ru/translations/regular-expression-python/
* https://realpython.com/nltk-nlp-python/

In [53]:
import datetime as dt
import random
import re
import string
from pathlib import Path
from typing import Iterable

import nltk
import pandas as pd
from IPython.display import Pretty
from bs4 import BeautifulSoup

In [2]:
DATA_DIR = Path('data/')
OUTPUT_DIR = Path('output/')

In [3]:
def read_xml(path: str) -> BeautifulSoup:
    """Read the file at ``path`` as UTF-8 text and parse it with BeautifulSoup.

    The whole file content is passed to the ``'lxml'`` parser in one piece.
    """
    with open(path, encoding='utf-8') as source:
        markup = source.read()
    return BeautifulSoup(markup, 'lxml')

## Задачи для совместного разбора

1. Вывести на экран данные из словаря `obj` построчно в виде `k = v`, задав формат таким образом, чтобы знак равенства оказался на одной и той же позиции во всех строках. Строковые литералы обернуть в кавычки.

In [4]:
obj = {
    'home_page': 'https://github.com/pypa/sampleproject',
    'keywords': 'sample setuptools development',
    'license': 'MIT',
}
# Pad every key to the length of the longest one so that all '=' signs
# end up in the same column.
left_max = max(map(len, obj), default=0)
for k, v in obj.items():
    # repr() wraps string values in quotes and prints other value types bare,
    # which is what "quote the string literals" asks for.
    print(f'{k:{left_max}} = {v!r}')

home_page = 'https://github.com/pypa/sampleproject'
keywords  = 'sample setuptools development'
license   = 'MIT'


2. Дана строка 'aaa--bbb==ccc__ddd'. Написать регулярное выражение для разбивки строки на список ['aaa','bbb','ccc','ddd'].

In [5]:
s = 'aaa--bbb==ccc__ddd'
pattern = re.compile(r'[a-z]{3}')
pattern.findall(s)

['aaa', 'bbb', 'ccc', 'ddd']

3. Проверить корректность введенного E-mail

In [6]:
email = 'example@email.ru'
# Word characters, '@', a lowercase domain name, a dot and a 2-3 letter zone.
pattern = re.compile(r'\w+@[a-z]+\.[a-z]{2,3}')
# fullmatch() anchors both ends of the string; unlike '$' it does not
# tolerate a trailing newline after the address.
match = pattern.fullmatch(email)
# A failed match is None, so guard before reading attributes of the match.
match.string if match else None

'example@email.ru'

4. Разбейте текст формулировки задачи 1 на слова.

In [7]:
task_text = (
    'Вывести на экран данные из словаря `obj` построчно в виде `k = v`, '
    'задав формат таким образом, чтобы знак равенства оказался на одной и '
    'той же позиции во всех строках. Строковые литералы обернуть в кавычки.'
)
nltk.word_tokenize(task_text)[:10]

['Вывести',
 'на',
 'экран',
 'данные',
 'из',
 'словаря',
 '`',
 'obj',
 '`',
 'построчно']

## Лабораторная работа 8

### Форматирование строк

1.1 Загрузите данные из файла `recipes_sample_with_tags_ingredients.csv` (__ЛР5__) в таблицу `recipes`. При помощи форматирования строк выведите информацию об id рецепта и кол-ве ингредиентов 5 случайных рецептов в виде таблицы следующего вида:

    
    |    id     | n_in  |
    |-------------------|
    |  400894   |  13   |
    |   68588   |   8   |
    |  362081   |   6   |
    |   53408   |  12   |
    |  221203   |   4   |

In [8]:
recipes_df = pd.read_csv(DATA_DIR.joinpath('recipes_sample_with_tags_ingredients.csv'))
recipes_df.head()

Unnamed: 0,id,name,minutes,contributor_id,submitted,n_steps,description,n_tags,tags,n_ingredients,ingredients
0,44123,george s at the cove black bean soup,90,35193,2002-10-25,11,an original recipe created by chef scott meska...,25,weeknight;time-to-make;course;main-ingredient;...,18,unsalted butter*carrot*onion*celery*broccoli s...
1,67664,healthy for them yogurt popsicles,10,91970,2003-07-26,3,my children and their friends ask for my homem...,31,15-minutes-or-less;time-to-make;course;prepara...,3,milk*frozen juice concentrate*plain yogurt
2,38798,i can t believe it s spinach,30,1533,2002-08-29,5,"these were so go, it surprised even me.",17,30-minutes-or-less;time-to-make;course;main-in...,8,onion*frozen chopped spinach*eggs*garlic powde...
3,35173,italian gut busters,45,22724,2002-07-27,7,my sister-in-law made these for us at a family...,11,60-minutes-or-less;time-to-make;course;prepara...,9,sandwich bun*good seasonings italian salad dre...
4,84797,love is in the air beef fondue sauces,25,4470,2004-02-23,4,i think a fondue is a very romantic casual din...,19,30-minutes-or-less;time-to-make;course;main-in...,12,beef steaks*vegetable oil*spicy mustard*fresh ...


In [9]:
def pprint_df(df: pd.DataFrame, n: int = 5) -> None:
    """Print the text table that ``pformat_df`` builds for ``df`` and ``n``."""
    table = pformat_df(df, n=n)
    print(table)

In [10]:
def pformat_df(df: pd.DataFrame, n: int = 5) -> str:
    """Return a text table for ``df``: header line, dashed rule, body rows.

    The header comes from ``make_header`` and the body (``n`` rows) from
    ``make_body``; both are expected to end with a newline.
    """
    head_line = make_header(df)
    rows = make_body(df, n=n)
    # The rule is framed by two '|' and the header length includes its
    # trailing '\n', hence three dashes fewer than len(head_line).
    dash_count = len(head_line) - 3
    rule = '|' + '-' * dash_count + '|\n'
    return head_line + rule + rows

In [11]:
def make_header(df: pd.DataFrame) -> str:
    """Render the header line of the table from the column names of ``df``.

    Column widths come from ``get_max_width_content``; the template built by
    ``generate_header_template`` centres each name and cuts it to the width.
    """
    template = generate_header_template(get_max_width_content(df))
    column_names = list(df.columns)
    return template.format(*column_names)

In [12]:
def generate_header_template(width_max_content: Iterable) -> str:
    """Build the ``str.format`` template for the header line.

    Every width ``v`` becomes a centred field that is ``v + 2`` characters
    wide and truncated to the same length, followed by one space. Fields are
    joined with ``' | '`` and framed as ``'| ... |'`` plus a newline.
    """
    cells = []
    for width in width_max_content:
        padded = width + 2
        cells.append(f'{{:^{padded}.{padded}}} ')
    return '| ' + ' | '.join(cells) + ' |\n'

In [13]:
def make_body(df: pd.DataFrame, n: int = 5) -> str:
    """Render ``n`` randomly sampled rows of ``df`` as table lines.

    All cell values are converted to ``str`` before they are put into the row
    template, and the rendered lines are concatenated into one string.
    """
    line_template = generate_row_template(get_max_width_content(df))
    sampled = df.sample(n).astype(str)
    lines = []
    # Plain tuples of cell values, one per sampled row, without the index.
    for values in sampled.itertuples(index=False, name=None):
        lines.append(line_template.format(*values))
    return ''.join(lines)

In [14]:
def generate_row_template(width_max_content: Iterable) -> str:
    """Build the ``str.format`` template for one body line.

    Every width ``v`` becomes a right-aligned field that is ``v + 1``
    characters wide and truncated to the same length, followed by two spaces.
    Fields are joined with ``' | '`` and framed as ``'| ... |'`` plus a newline.
    """
    cells = []
    for width in width_max_content:
        padded = width + 1
        cells.append(f'{{:>{padded}.{padded}}}  ')
    return '| ' + ' | '.join(cells) + ' |\n'

In [15]:
def get_max_width_content(df: pd.DataFrame) -> Iterable:
    """Return, per column of ``df``, the length of its longest cell as text.

    Every cell is converted with ``str`` first and the lengths of those
    strings are compared. Taking the largest *value* instead would be wrong
    for negative numbers and for strings, where the lexicographically largest
    entry is not necessarily the longest one.

    Args:
        df: frame whose column widths are needed.

    Returns:
        An integer Series with one width per column, in column order; a
        column without rows gets width 0.
    """
    text_lengths = df.astype(str).apply(lambda column: column.str.len())
    # max() of a column without rows is NaN; report that as width 0.
    return text_lengths.max().fillna(0).astype(int)

In [16]:
id_n_ingredients_df = recipes_df[['id', 'n_ingredients']]
pprint_df(id_n_ingredients_df)

|    id     | n_in  |
|-------------------|
|  136011   |  14   |
|  329307   |   7   |
|  232046   |  11   |
|  485268   |   9   |
|  244477   |  13   |



In [17]:
example = """
|    id     | n_in  |
|-------------------|
|  400894   |  13   |
|   68588   |   8   |
|  362081   |   6   |
|   53408   |  12   |
|  221203   |   4   |
"""
result = pformat_df(id_n_ingredients_df)
Pretty(f'\n{example}\n\n{result}\n')



|    id     | n_in  |
|-------------------|
|  400894   |  13   |
|   68588   |   8   |
|  362081   |   6   |
|   53408   |  12   |
|  221203   |   4   |


|    id     | n_in  |
|-------------------|
|  396111   |  10   |
|  100371   |   4   |
|  450478   |   8   |
|  171248   |  11   |
|  298327   |  11   |



1.2 Напишите функцию `show_info`, которая для рецепта по его `id` создает строку (в смысле объекта python) с описанием следующего вида:

```
"Название"

1. Шаг 1.
2. Шаг 2.
----------
#тэг1 #тэг2
```

    
Данные для создания строки получите из файлов `recipes_sample_with_tags_ingredients.csv`, `steps_sample.xml` (__ЛР4__) и `tags_sample.csv` (__ЛР5__). 
Выведите созданную строку на экран.

In [18]:
steps_sample_soup = read_xml(DATA_DIR.joinpath('steps_sample.xml'))

In [19]:
# One entry per <recipe> element: its integer id and the texts of all its
# <step> elements joined with '|'.
recipe_nodes = steps_sample_soup.find_all('recipe')
steps_dict = {
    'id': [int(node.find('id').text) for node in recipe_nodes],
    'steps': ['|'.join(step.text for step in node.find_all('step')) for node in recipe_nodes],
}
steps_df = pd.DataFrame(steps_dict)
steps_df.head()

Unnamed: 0,id,steps
0,44123,"in 1 / 4 cup butter , saute carrots , onion , ..."
1,67664,mix all the ingredients using a blender|pour i...
2,38798,combine all ingredients in a large bowl and mi...
3,35173,lay out sandwich rolls on jelly roll pans / co...
4,84797,honey mustard sauce: whisk all the ingredients...


In [20]:
recipes_steps_df = recipes_df.merge(steps_df, on='id').set_index('id')
recipes_steps_df.head()

Unnamed: 0_level_0,name,minutes,contributor_id,submitted,n_steps,description,n_tags,tags,n_ingredients,ingredients,steps
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
44123,george s at the cove black bean soup,90,35193,2002-10-25,11,an original recipe created by chef scott meska...,25,weeknight;time-to-make;course;main-ingredient;...,18,unsalted butter*carrot*onion*celery*broccoli s...,"in 1 / 4 cup butter , saute carrots , onion , ..."
67664,healthy for them yogurt popsicles,10,91970,2003-07-26,3,my children and their friends ask for my homem...,31,15-minutes-or-less;time-to-make;course;prepara...,3,milk*frozen juice concentrate*plain yogurt,mix all the ingredients using a blender|pour i...
38798,i can t believe it s spinach,30,1533,2002-08-29,5,"these were so go, it surprised even me.",17,30-minutes-or-less;time-to-make;course;main-in...,8,onion*frozen chopped spinach*eggs*garlic powde...,combine all ingredients in a large bowl and mi...
35173,italian gut busters,45,22724,2002-07-27,7,my sister-in-law made these for us at a family...,11,60-minutes-or-less;time-to-make;course;prepara...,9,sandwich bun*good seasonings italian salad dre...,lay out sandwich rolls on jelly roll pans / co...
84797,love is in the air beef fondue sauces,25,4470,2004-02-23,4,i think a fondue is a very romantic casual din...,19,30-minutes-or-less;time-to-make;course;main-in...,12,beef steaks*vegetable oil*spicy mustard*fresh ...,honey mustard sauce: whisk all the ingredients...


In [21]:
def show_info(id_: int, df: pd.DataFrame) -> str:
    """Build a printable description of the recipe with index label ``id_``.

    Layout of the result: the quoted name with its first letter upper-cased,
    an empty line, the numbered steps (one per line), a dashed rule as long as
    the quoted name, and the tags as space-separated ``#tag`` items.

    Args:
        id_: index label of the recipe in ``df``.
        df: frame with the columns ``name``, ``steps`` (step texts separated
            by ``'|'``) and ``tags`` (tags separated by ``';'``).

    Returns:
        The description as one multi-line string.
    """
    # squeeze() leaves a Series untouched and turns a one-row frame into one.
    recipe = df.loc[id_, ['name', 'steps', 'tags']].squeeze()

    # Slices instead of [0] keep an empty name or step from raising IndexError.
    raw_name = recipe['name']
    name = f'"{raw_name[:1].upper()}{raw_name[1:]}"'

    sentence_end = ('.', '?', '!')
    # Empty fragments (e.g. after a trailing '|') are not steps: skip them so
    # they neither crash the formatting nor take up a step number.
    raw_steps = [step for step in recipe['steps'].split('|') if step]
    string_builder = []
    for number, step in enumerate(raw_steps, start=1):
        step = step[:1].upper() + step[1:]
        # Close the step with a full stop unless it already ends a sentence.
        closing = '' if step.endswith(sentence_end) else '.'
        string_builder.append(f'{number}. {step}{closing}')
    steps = '\n'.join(string_builder)

    sep = '-' * len(name)
    # Empty tags would otherwise show up as a bare '#'.
    tags = ' '.join(f'#{tag}' for tag in recipe['tags'].split(';') if tag)
    return '\n'.join([name, '', steps, sep, tags])

In [22]:
info = show_info(35173, recipes_steps_df)
info

'"Italian  gut busters"\n\n1. Lay out sandwich rolls on jelly roll pans / cookie sheets.\n2. Melt butter , mix in italian dressing mix.\n3. Using a pastry or bbq brush , graciously apply seasoned butter to the top of the"bottom bun" and the top of the top bun.\n4. Don\'t miss this step , i don\'t know why , but it does make a difference !\n5. Here is where i create an assembly line w / the bottoms of the buns.\n6. Layer each bun w / ham , then swiss cheese , turkey , then cheddar cheese , pepperoni , then mozzerella cheese.\n7. Place"lids" on buns and place in 425 degree oven for approximly 12-15 minutes , or until you see the tops start to turn golden brown.\n----------------------\n#60-minutes-or-less #time-to-make #course #preparation #lunch #main-dish #oven #easy #dietary #sandwiches #equipment'

In [23]:
print(info)

"Italian  gut busters"

1. Lay out sandwich rolls on jelly roll pans / cookie sheets.
2. Melt butter , mix in italian dressing mix.
3. Using a pastry or bbq brush , graciously apply seasoned butter to the top of the"bottom bun" and the top of the top bun.
4. Don't miss this step , i don't know why , but it does make a difference !
5. Here is where i create an assembly line w / the bottoms of the buns.
6. Layer each bun w / ham , then swiss cheese , turkey , then cheddar cheese , pepperoni , then mozzerella cheese.
7. Place"lids" on buns and place in 425 degree oven for approximly 12-15 minutes , or until you see the tops start to turn golden brown.
----------------------
#60-minutes-or-less #time-to-make #course #preparation #lunch #main-dish #oven #easy #dietary #sandwiches #equipment


## Работа с регулярными выражениями

В задачах данного блока подразумевается, что вы не будете использовать никакие строковые методы (`split`, `startswith` и т.д.). Все задачи необходимо решить при помощи регулярных выражений.

2.1 Посчитайте кол-во отзывов, в которых встречаются числа.

In [24]:
reviews_df = pd.read_csv(
    DATA_DIR.joinpath('reviews_sample.csv'),
    sep=',',
    index_col=0,
    parse_dates=['date']
)
reviews_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
370476,21752,57993,2003-05-01,5,Last week whole sides of frozen salmon fillet ...
624300,431813,142201,2007-09-16,5,So simple and so tasty! I used a yellow capsi...
187037,400708,252013,2008-01-10,4,"Very nice breakfast HH, easy to make and yummy..."
706134,2001852463,404716,2017-12-11,5,These are a favorite for the holidays and so e...
312179,95810,129396,2008-03-14,5,Excellent soup! The tomato flavor is just gre...


In [25]:
reviews = reviews_df['review'].fillna('')

In [26]:
# A review counts as soon as it contains at least one digit.
pattern = re.compile(r'\d')
has_digit = reviews.map(lambda text: bool(pattern.search(text)))
int(has_digit.sum())

49246

2.2 Найдите все смайлики в отзывах. Смайлик состоит из трех частей: глаза (символ `=` или `:`), нос (символ `-`), губы (символ `)` или `(`). Смайлик может иметь вид "глаза-нос-губы" или "губы-нос-глаза". Нос может отсутствовать.

In [27]:
# Two alternatives: "eyes, optional nose, lips" or the mirrored
# "lips, optional nose, eyes".
pattern = re.compile(r'([=:]-?[()])|([()]-?[=:])')
# The pattern has two capture groups, so findall() returns one
# (group 1, group 2) tuple per match; the group of the alternative that did
# not match is an empty string.
with_emoticons = reviews.apply(lambda x: pattern.findall(x))
# Keep only the reviews in which at least one emoticon was found.
emoticons = with_emoticons[with_emoticons.str.len() > 0]
emoticons

765972             [(:), )]
115270     [(:), ), (:), )]
756876             [(:), )]
467504            [(:-), )]
1017762            [(:), )]
                 ...       
678123             [(:), )]
248254             [(:), )]
935935             [(:), )]
895930             [(:), )]
112348            [(:-), )]
Name: review, Length: 7382, dtype: object

In [28]:
def flatten(l):
    """Lazily yield the leaf items of an arbitrarily nested iterable.

    Nested iterables are walked depth-first, left to right. ``str`` and
    ``bytes`` objects are treated as leaves and yielded whole.
    """
    for item in l:
        is_nested = isinstance(item, Iterable) and not isinstance(item, (str, bytes))
        if not is_nested:
            yield item
            continue
        yield from flatten(item)

In [29]:
emoticons_sample = [i for i in flatten(emoticons.values) if i]
random.sample(emoticons_sample, 5)

[':)', ':-)', ':)', ':)', ':)']

2.3 Проверьте, что все даты в датасете имеют вид "YYYY-MM-DD". Продемонстрируйте работу вашего решения, приведя пример из датасета и контрпример не из датасета.

In [30]:
def is_date(date: str, fmt: str = '%Y-%m-%d') -> bool:
    """Tell whether ``date`` is a valid calendar date written in format ``fmt``.

    For the default format the text must have exactly the shape
    ``YYYY-MM-DD`` (ASCII digits, zero-padded month and day); in every case
    the value must also denote a date that exists.

    Args:
        date: text to check.
        fmt: ``strptime`` format the text has to follow.

    Returns:
        ``True`` if ``date`` passes the checks, ``False`` otherwise.
    """
    # strptime() also accepts month/day without a leading zero ('2007-2-9'),
    # so the exact shape of the default format is checked with a regex first.
    if fmt == '%Y-%m-%d' and re.fullmatch(r'\d{4}-\d{2}-\d{2}', date, flags=re.ASCII) is None:
        return False
    try:
        dt.datetime.strptime(date, fmt)
    except ValueError:
        # Right shape but impossible value, e.g. 29 February of a common year.
        return False
    return True

In [31]:
dates = reviews_df['date']
assert len(dates[~dates.astype(str).apply(is_date)]) == 0

In [32]:
valid_date = str(dates.sample(1).iat[0]).split()[0]
print(f'is_date({valid_date}) -> {is_date(valid_date)}')

is_date(2009-12-26) -> True


In [33]:
invalid_date = '2007-02-29'
print(f'is_date({invalid_date}) -> {is_date(invalid_date)}')

is_date(2007-02-29) -> False


2.4 Используя строку-результат задачи 1.2, найдите первое слово каждого шага в рецепте

In [34]:
# With MULTILINE '^' matches at the start of every line: a step number, a dot
# and a space, then the lazy group takes everything up to the next space.
# NOTE(review): the group ends only at a space, so punctuation glued to the
# first word is captured together with it, and a step that consists of a
# single word (no space after it on the line) is not matched at all.
pattern = re.compile(r'^\d+\. (.*?) ', flags=re.MULTILINE)
print(pattern.findall(info))

['Lay', 'Melt', 'Using', "Don't", 'Here', 'Layer', 'Place"lids"']


2.5 Используя регулярные выражения, удалите из описаний все символы, кроме английских букв, цифр и пробелов. Сохраните предобработанные описания в файл `preprocessed_descriptions.csv`, содержащий 2 столбца: `name` и `preprocessed_descriptions`.

In [35]:
# Everything that is not an ASCII letter, a digit or a space is removed.
pattern = re.compile(r'[^a-z0-9 ]', re.ASCII | re.IGNORECASE)
# Runs of spaces (original ones or those left behind by removed characters)
# are collapsed to a single space instead of being deleted, so that the
# neighbouring words are not glued together.
spaces = re.compile(r' {2,}')
name_description = recipes_df.loc[:, ['name']]
name_description['preprocessed_descriptions'] = (
    recipes_df['description']
    .fillna('')
    .apply(lambda x: spaces.sub(' ', pattern.sub('', x)))
)
name_description

Unnamed: 0,name,preprocessed_descriptions
0,george s at the cove black bean soup,an original recipe created by chef scott meska...
1,healthy for them yogurt popsicles,my children and their friends ask for my homem...
2,i can t believe it s spinach,these were so go it surprised even me
3,italian gut busters,my sisterinlaw made these for us at a family g...
4,love is in the air beef fondue sauces,i think a fondue is a very romantic casual din...
...,...,...
29995,zurie s holey rustic olive and cheddar bread,this is based on a french recipe but i changed...
29996,zwetschgenkuchen bavarian plum cake,this is a traditional fresh plum cake thought ...
29997,zwiebelkuchen southwest german onion cake,this is a traditional late summer early fall s...
29998,zydeco soup,this is a delicious soup that i originally fou...


In [36]:
name_description.to_csv(OUTPUT_DIR.joinpath('preprocessed_descriptions.csv'), sep=',', index=False)

### Сегментация текста

3.1 Разбейте предобработанные отзывы из задания 2.5 на предложения, а предложения - на слова (используйте `sent_tokenize` и `word_tokenize` из `nltk`). Каждый отзыв представьте в виде списка списков: внешний список - предложения, вложенные списки - слова в предложении.

`'Предложение номер один. Предложение номер два.' => [['Предложение', 'номер', 'один', '.'], ['Предложение', 'номер', 'два', '.']]`

In [67]:
reviews = reviews_df['review'].fillna('')
reviews.head()

"Last week whole sides of frozen salmon fillet was on sale in my local supermarket, so I bought tons (okay, only 3, but total weight was over 10 pounds).  This recipe is perfect for salmon fillet, even though it calls for salmon steaks.  I cut up the salmon into individual portions and followed the instructions exactly.  I'm on one of those food combining diets, so I left out the white wine but added just a dash of white wine vinegar instead (just a little bit, not enough to change the taste of the dish).  Super yummy, and leftovers for lunch today (lucky me)!"

In [68]:
word_sentence_list = [[nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(review)] for review in reviews.tolist()]
word_sentence_list[0]

[['Last',
  'week',
  'whole',
  'sides',
  'of',
  'frozen',
  'salmon',
  'fillet',
  'was',
  'on',
  'sale',
  'in',
  'my',
  'local',
  'supermarket',
  ',',
  'so',
  'I',
  'bought',
  'tons',
  '(',
  'okay',
  ',',
  'only',
  '3',
  ',',
  'but',
  'total',
  'weight',
  'was',
  'over',
  '10',
  'pounds',
  ')',
  '.'],
 ['This',
  'recipe',
  'is',
  'perfect',
  'for',
  'salmon',
  'fillet',
  ',',
  'even',
  'though',
  'it',
  'calls',
  'for',
  'salmon',
  'steaks',
  '.'],
 ['I',
  'cut',
  'up',
  'the',
  'salmon',
  'into',
  'individual',
  'portions',
  'and',
  'followed',
  'the',
  'instructions',
  'exactly',
  '.'],
 ['I',
  "'m",
  'on',
  'one',
  'of',
  'those',
  'food',
  'combining',
  'diets',
  ',',
  'so',
  'I',
  'left',
  'out',
  'the',
  'white',
  'wine',
  'but',
  'added',
  'just',
  'a',
  'dash',
  'of',
  'white',
  'wine',
  'vinegar',
  'instead',
  '(',
  'just',
  'a',
  'little',
  'bit',
  ',',
  'not',
  'enough',
  'to',
  '

3.2 Посчитайте кол-во уникальных слов в датасете (без учета регистра).

In [79]:
# Flat list of the lower-cased tokens of all reviews.
data_set = []
for review in reviews.tolist():
    # NOTE(review): `i not in string.punctuation` is a substring test against
    # the punctuation string, so only tokens that occur in it verbatim are
    # dropped; a multi-character token such as '...' would be kept as a word.
    data_set.extend([i for i in nltk.word_tokenize(review.lower()) if i not in string.punctuation])

data_set[:30]

['last',
 'week',
 'whole',
 'sides',
 'of',
 'frozen',
 'salmon',
 'fillet',
 'was',
 'on',
 'sale',
 'in',
 'my',
 'local',
 'supermarket',
 'so',
 'i',
 'bought',
 'tons',
 'okay',
 'only',
 '3',
 'but',
 'total',
 'weight',
 'was',
 'over',
 '10',
 'pounds',
 'this']

In [80]:
# The token list is passed as is: copying it element by element through a
# comprehension only duplicates a very large list.
freq_dist = nltk.FreqDist(data_set)
freq_dist

FreqDist({'i': 326096, 'the': 294057, 'and': 223842, 'a': 172590, 'it': 164434, 'this': 135273, 'to': 133320, 'for': 125426, 'of': 109590, 'was': 93401, ...})

In [81]:
print(f'Всего слов: {freq_dist.N()}')
print(f'Уникальных слов: {freq_dist.B()}')

Всего слов: 6736309
Уникальных слов: 63783


3.3 Найдите 5 самых длинных (по количеству слов) отзывов в датасете и выведите их в порядке убывания длины.

In [93]:
reviews_len = reviews.apply(lambda x: len([
    word for word in nltk.word_tokenize(x)
    if word not in string.punctuation
]))

In [98]:
reviews[reviews_len.sort_values(ascending=False).iloc[:5].index]

873900     One of my sisters was in need of some comfort ...
1031544    I am in the process of prepping all of my ingr...
1012304    I made these for a Valentine's day dinner and ...
1110888    I made these, and was worried that they would ...
419558     Your vet has severly mislead you and this diet...
Name: review, dtype: object

3.4 Напишите функцию, которая для заданного предложения выводит информацию о частях речи слов, входящих в предложение в следующем виде:
```
PRP   VBD   DT      NNS     CC   VBD      NNS        RB   
 I  omitted the raspberries and added strawberries instead
``` 
Для определения части речи слова можно воспользоваться `nltk.pos_tag`.

Проверьте работоспособность функции на любом предложении из отзывов.

In [112]:
def gen_template(tagged_list: list[tuple[str, str]]) -> str:
    """Build a ``str.format`` template with one centred field per pair.

    Each field is as wide as the longer of the two strings in its pair, so
    both members of a pair fit into the same column. Fields are separated by
    a single space.
    """
    cells = []
    for word, tag in tagged_list:
        width = max(len(word), len(tag))
        cells.append(f'{{:^{width}}}')
    return ' '.join(cells)

In [113]:
def tag_info(sentence: str) -> str:
    """Return two aligned lines: part-of-speech tags above the tokens.

    The sentence is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; ``gen_template`` supplies one column per token, wide
    enough for both the token and its tag.

    Args:
        sentence: text of a single sentence.

    Returns:
        ``'<tags line>\\n<tokens line>'``.
    """
    tagged_list = nltk.pos_tag(nltk.word_tokenize(sentence))
    template = gen_template(tagged_list)
    # Plain comprehensions instead of unpacking zip(*...): with no tokens the
    # unpacking fails, while this yields two empty lines (assuming nltk
    # returns an empty list for such input - TODO confirm).
    word_list = [word for word, _ in tagged_list]
    tag_list = [tag for _, tag in tagged_list]
    return '\n'.join([template.format(*tag_list), template.format(*word_list)])

In [119]:
sentence = random.choice(nltk.sent_tokenize(reviews.sample(1).iat[0]))
print(tag_info(sentence))


 NNP    NN   .
Great method !
