# Извлечение данных из коллекции новостных текстов

In [148]:
from yargy import Parser, rule, or_, and_
from yargy.predicates import gram, dictionary, eq, is_capitalized, normalized, caseless
from yargy.interpretation import fact
from yargy.tokenizer import MorphTokenizer
from yargy.predicates import type as ytype
from yargy.predicates import gte, lte
from yargy.relations import gnc_relation
from yargy.pipelines import morph_pipeline
from dataclasses import dataclass
from typing import Optional

In [63]:
import os
current_directory = os.getcwd() 
file_path = os.path.join(current_directory, 'data', 'news.txt') 

In [35]:
Entry = fact(
    'Entry',
    ['name', 'birth_date', 'birth_place']
)

In [78]:
NameFact = fact(
    'NameFact',
    ['name']
)

NAME = rule(
    gram('Name')
)

SURNAME = rule(
    gram('Surn')
)

PERSON_NAME = rule(
    NAME,
    SURNAME
).interpretation(NameFact.name)

In [93]:
parser_name = Parser(PERSON_NAME)

text_names_found = []
with open(file_path, 'r', encoding='utf-8') as file:
    for text in file:
        names_found = []
        for match in parser_name.findall(text):
            name_fact = match.fact
            names_found.append(name_fact)
        text_names_found.append(names_found)
text_names_found

[[],
 ['Матс Сундин', 'Матс Сундин'],
 ['Владимир Филиппов', 'Ильгизу Фахриеву', 'Михаилом Прохоровым'],
 [],
 [],
 [],
 ['Бена Николсона', 'Анри Матисса', 'Пабло Пикассо', 'Жоржа Брака'],
 ['Владимиром Путиным'],
 [],
 ['Леонида Брежнева', 'Юрий Андропов'],
 [],
 ['Игоря Денисова', 'Игорь Денисов'],
 [],
 [],
 [],
 [],
 ['Ларри Гагосян', 'Виктор Пинчук', 'Марину Абрамович'],
 [],
 [],
 [],
 ['Энди Коулсон', 'Руперта Мердока', 'Энди Коулсона'],
 ['из еврокубковАрмейский', 'Леонида Слуцкого'],
 [],
 [],
 ['Юлия Александрова',
  'Елена Валюшкина',
  'Валентина Мазулина',
  'Андрея Першина'],
 [],
 [],
 [],
 [],
 [],
 ['Игорь Артемьев'],
 ['Эльвира Набиуллина', 'Михаил Сухов'],
 [],
 [],
 ['Владимиром Путиным'],
 [],
 [],
 ['Владимир Долинский', 'Марина Могилевская', 'Анна Терехова'],
 [],
 ['Юрий Чайка',
  'Александру Бастрыкину',
  'Дениса Вороненкова',
  'Отари Кобахидзе'],
 ['Майкла Джексона', 'Ева Польна', 'Майкл Джексон'],
 [],
 ['Хайо Зеппельт',
  'Ольга Скабеева',
  'матчМария Шар

In [129]:
Birth_Date = fact(
    'BirthFact',
    ['birth_date']
)

DAY = and_(
    gte(1),
    lte(31)
)
MONTH = and_(
    gte(1),
    lte(12)
)
YEAR = and_(
    gte(1),
    lte(2030)
)
DATE = rule(
    YEAR,
    '-',
    MONTH,
    '-',
    DAY
)

MONTH = morph_pipeline([
    'январь',
    'февраль',
    'март',
    'апрель',
    'мая',
    'июнь',
    'июль',
    'август',
    'сентябрь',
    'октябрь',
    'ноябрь',
    'декабрь'
])

YEAR_WORDS = or_(
    rule(caseless('г'), '.'),
    rule(normalized('год'))
)
DATE = or_(
    rule(
        YEAR,
        '-',
        MONTH,
        '-',
        DAY
    ),
    rule(
        DAY.optional(),
        MONTH.optional(),
        YEAR,
        YEAR_WORDS.optional()
    )
)

BORN_WORD = morph_pipeline([
    'родился'])

BIRTH_DATE = rule(
    BORN_WORD,
    dictionary({'в'}).optional(),
    gram("NOUN").optional().repeatable(),
    dictionary({'в'}).optional(),
    DATE.interpretation(Birth_Date.birth_date)
).interpretation(Birth_Date)

In [163]:
parser_birth_date = Parser(BIRTH_DATE)

text_birth_date_found = []
with open(file_path, 'r', encoding='utf-8') as file:
    for text in file:
        birth_dates_found = []
        for match in parser_birth_date.findall(text):
            birth_date_fact = match.fact
            birth_dates_found.append(birth_date_fact.birth_date)
            print(birth_dates_found)
        text_birth_date_found.append(birth_dates_found)
text_birth_date_found

['1926 году']
['1928 году']
['11 сентября 1865 года']
['25']
['2013 году']
['1935 году']
['5 марта 1992 года']
['1928 году']
['1938 году']
['1938 году', '1913 году']
['1928 году']
['1992 году']
['18 декабря 1937 года']
['7']
['1990 году']
['27 октября 1944 года']
['2010 году']
['1950 году']
['1941 году']
['24 ноября 1932 года']
['25 мая 1927 года']
['1984 году']
['1979 году']
['1841 году']
['22 июля 2013']
['13 марта 1988 года']
['13 марта 1988 года', '1967 году']
['1956 году']
['1953 году']
['1816 году']
['2012']
['1962 году']
['1979 году']
['1925 году']
['2007 году']
['2004 году']
['2012 года']
['1990 году']
['18 ноября 1927 года']
['1928 году']
['1928 году']
['1927 году']
['1928 году']
['1922 году']
['12 января 1936 года']
['1967 года']
['1834 году']
['июле 1904 года']
['2013']
['2013', '2012 году']
['12 декабря 1863 года']
['26 марта 2012 года']
['1917 году']
['17 апреля 1919 года']
['1932 году']
['1816 году']
['4 мая 1934 года']
['1930 году']
['1895 году']
['1938 году']
['мае 1984

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],


In [143]:
Birth_Place = fact(
    'BirthPlaceFact',
    ['birth_place']
)

PLACE_NAME = rule(
    gram('Geox').repeatable(max=2)
).interpretation(Birth_Place.birth_place)

PLACE_WORD = morph_pipeline([
    'город',
    'страна'
])

BIRTH_PLACE = rule(
    BORN_WORD,
    dictionary({'в'}).optional(),
    DATE.optional(),
    dictionary({'в'}),
    PLACE_WORD.optional(), 
    PLACE_NAME
).interpretation(Birth_Place)

In [164]:
parser_birth_place = Parser(BIRTH_PLACE)

text_birth_places_found = []
with open(file_path, 'r', encoding='utf-8') as file:
    for text in file:
        birth_places_found = []
        for match in parser_birth_place.findall(text):
            birth_place_fact = match.fact
            birth_places_found.append(birth_place_fact.birth_place)
            print(birth_places_found)
        text_birth_places_found.append(birth_places_found)
text_birth_places_found

['Сиэтле']
['США']
['Севастополе']
['Грузии']
['Польше']
['Польше']
['Калифорнии']
['Артемовске']
['Лондоне']
['США']
['Токио']
['Москве']
['Калининграде']
['Москве']
['Львове']
['Сочи']
['Массачусетсе']
['Ростове']
['США']
['Рочестере']
['Самаре']
['Баку']
['ЮАР']
['Лондоне']
['Риге']
['Каннах']
['Мексике']
['Ливане']
['США']
['Сальск']
['Ленинграде']
['США']
['Ленинграде']
['мае']
['Москве']
['Ленинграде']
['Фатеж']
['Копенгагене']
['Ленинграде']
['мае']
['Смоленске']
['России']
['Ливерпуле']
['Витебске']
['Риге']
['Риге', 'Братиславе']
['Саратове']
['Лондоне']
['Сарапуле']
['Кемерово']
['Индии']
['Косово']
['Москве']
['Грозном']
['Красноярске']
['Бостоне']
['Кирове']
['Москве']
['Омске']
['Индии']


[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Си

In [149]:
@dataclass
class Entry:
    name: str
    birth_date: Optional[str]
    birth_place: Optional[str]

extracted_entries = []

In [165]:
def align_to_first_size(main_list, *other_lists):
    target_size = len(main_list)
    aligned_lists = []
    
    for lst in other_lists:
        if len(lst) < target_size:
            aligned_lists.append(lst + [None] * (target_size - len(lst)))
        else:
            aligned_lists.append(lst[:target_size])
    return aligned_lists

In [None]:
text_names_found
text_birth_places_found
text_birth_date_found
for names, birth_places, birth_dates in zip(text_names_found, text_birth_places_found, text_birth_date_found):
    aligned_birth_places, aligned_birth_dates = align_to_first_size(names, birth_places, birth_dates)
    for name, birth_place, birth_dates in zip(names, aligned_birth_places, aligned_birth_dates):
        print(Entry(name, birth_dates, birth_place))

Entry(name='Матс Сундин', birth_date=None, birth_place=None)
Entry(name='Матс Сундин', birth_date=None, birth_place=None)
Entry(name='Владимир Филиппов', birth_date=None, birth_place=None)
Entry(name='Ильгизу Фахриеву', birth_date=None, birth_place=None)
Entry(name='Михаилом Прохоровым', birth_date=None, birth_place=None)
Entry(name='Бена Николсона', birth_date=None, birth_place=None)
Entry(name='Анри Матисса', birth_date=None, birth_place=None)
Entry(name='Пабло Пикассо', birth_date=None, birth_place=None)
Entry(name='Жоржа Брака', birth_date=None, birth_place=None)
Entry(name='Владимиром Путиным', birth_date=None, birth_place=None)
Entry(name='Леонида Брежнева', birth_date=None, birth_place=None)
Entry(name='Юрий Андропов', birth_date=None, birth_place=None)
Entry(name='Игоря Денисова', birth_date=None, birth_place=None)
Entry(name='Игорь Денисов', birth_date=None, birth_place=None)
Entry(name='Ларри Гагосян', birth_date=None, birth_place=None)
Entry(name='Виктор Пинчук', birth_date=