# FRIENDS CORPUS

Привет! Это протоверсия корпуса по сериалу «Друзья». 

Перед началом работы рекомендуем ознакомиться с [правилами работы корпуса](https://github.com/ancheveleva/friends_corpus/blob/main/README.md).

Чтобы воспользоваться поиском, нажми **Run all**.

## Реализация логики поиска

### Подгрузка базы данных, открытие соединения, инициализация языковой модели

In [None]:
import sqlite3

# Open the corpus database and create the single module-level cursor
# that every query helper in this notebook executes its SQL through.
con = sqlite3.connect('friends_corpus_new.db')
cursor = con.cursor()

In [None]:
import spacy
# spaCy English pipeline; token_to_tokens uses it to lemmatize query words.
model = spacy.load('en_core_web_sm')

### Самые низкоуровневые функции: запросы идентификаторов параметров запроса

In [None]:
from typing import List

def get_lemma_ids(lemma: str) -> List:
    '''
    Look up the ids stored in the Lemmas table for a lemma.

    :param lemma: lemma string, compared with SQL ``=``
    :return: list of matching lemma ids; empty if nothing matches
    '''
    rows = cursor.execute('''SELECT lemma_id FROM Lemmas
    WHERE lemma = ?''', (lemma,)).fetchall()
    return [lemma_id for (lemma_id,) in rows]


def get_token_ids(lemma_ids: list) -> List:
    '''
    Collect the ids of all tokens that belong to the given lemmas.

    One query is issued per lemma id and the results are concatenated
    in the order the lemma ids were supplied.

    :param lemma_ids: lemma ids whose tokens are wanted
    :return: list of token ids from the Tokens table
    '''
    sql = '''
    SELECT token_id FROM Tokens
    WHERE lemma_id = ?
    '''
    collected = []
    for current in lemma_ids:
        rows = cursor.execute(sql, (current,)).fetchall()
        collected += [row[0] for row in rows]
    return collected


def get_token_id(token: str) -> List:
    '''
    Look up the ids stored in the Tokens table for an exact word form.

    :param token: word form, compared with SQL ``=``
    :return: list of matching token ids; empty if nothing matches
    '''
    sql = '''
    SELECT token_id FROM Tokens
    WHERE token = ?
    '''
    rows = cursor.execute(sql, (token,)).fetchall()
    return [token_id for (token_id,) in rows]


def get_pos_lemma_ids(lemma: str, pos: str) -> List:
    '''
    Look up the ids stored in the Lemmas table for a lemma restricted
    to one part-of-speech tag.

    Every matching row contributes its id, and the ids are returned as a
    flat list (the same shape get_lemma_ids produces), so the result can
    be passed straight to get_token_ids.

    :param lemma: lemma string, compared with SQL ``=``
    :param pos: part-of-speech tag the lemma must carry
    :return: list of matching lemma ids; empty if nothing matches
    '''
    get_pos_lemma_query = '''
    SELECT lemma_id FROM Lemmas
    WHERE (lemma = ?) AND (pos = ?)
    '''
    cursor.execute(get_pos_lemma_query, (lemma, pos))
    # Unpack each one-column row; an empty result naturally yields [].
    return [row[0] for row in cursor.fetchall()]


def get_sentence_ids(token_ids: List) -> List:
    '''
    Collect the ids of sentences that contain any of the given tokens.

    One query against the Match table is issued per token id. Results
    are concatenated without de-duplication, so a sentence id appears
    once for every matching row.

    :param token_ids: token ids to look for
    :return: list of sentence ids containing at least one of the tokens
    '''
    sql = '''
    SELECT sentence_id FROM Match
    WHERE token_id = ?
    '''
    collected = []
    for current in token_ids:
        rows = cursor.execute(sql, (current,)).fetchall()
        collected += [row[0] for row in rows]
    return collected


### Чуть более высокоуровневый функционал: поиск идентификаторов подходящих предложений по определенному параметру запроса, использующий функции предыдущего раздела

In [None]:
def token_to_tokens(token: str, model=model) -> List:
    '''
    Find sentences that contain the given word in any of its forms.

    The word is lemmatized with the language model and the lemma is
    resolved to lemma ids, then token ids, then sentence ids.

    :param token: word to search for
    :param model: language model used to lemmatize the word
    :return: list of ids of sentences containing any form of the word
    '''
    lemma = model(token)[0].lemma_
    # A lemma wrapped in dashes is presumably a placeholder (e.g. spaCy's
    # '-PRON-') rather than a real lemma, so search by the word itself.
    wrapped_in_dashes = lemma.startswith('-') and lemma.endswith('-')
    search_key = token if wrapped_in_dashes else lemma
    lemma_ids = get_lemma_ids(search_key)
    token_ids = get_token_ids(lemma_ids)
    return get_sentence_ids(token_ids)


def token_to_token(token: str) -> List:
    '''
    Find sentences that contain the word in exactly the form given.

    :param token: exact word form to search for
    :return: list of ids of sentences containing that word form
    '''
    matching_token_ids = get_token_id(token)
    return get_sentence_ids(matching_token_ids)


def lemma_pos_to_token(token, pos):
    '''
    Find sentences that contain a form of the given lemma carrying the
    requested part-of-speech tag.

    :param token: lemma to search for (looked up in the Lemmas table)
    :param pos: part-of-speech tag the lemma must carry
    :return: list of ids of sentences containing such tokens
    '''
    lemma_ids = get_pos_lemma_ids(token, pos)
    token_ids = get_token_ids(lemma_ids)
    return get_sentence_ids(token_ids)


def pos_to_tokens(pos: str) -> List:
    '''
    Find sentences that contain at least one token of the requested
    part of speech.

    A single query joins Match, Tokens and Lemmas; a sentence id is
    returned once for every joined row, so duplicates are possible.

    :param pos: part-of-speech tag to search for
    :return: list of ids of sentences containing words with that tag
    '''
    sql = '''
    SELECT sentence_id FROM Match
    JOIN Tokens ON Match.token_id = Tokens.token_id
    JOIN Lemmas ON Tokens.lemma_id = Lemmas.lemma_id
    WHERE Lemmas.pos = ?
    '''
    rows = cursor.execute(sql, (pos,)).fetchall()
    return [sentence_id for (sentence_id,) in rows]

### Еще более высокоуровневая конструкция: функция, обрабатывающая цельный запрос, работающая на функциях, заданных выше

In [None]:
import re
from typing import Tuple

# Part-of-speech tags that execute_query recognises as stand-alone query terms.
tags = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET',
    'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN',
    'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
]

def execute_query(initial_query: str) -> Tuple[List, List]:
    '''
    Interpret a search query: collect the ids of candidate sentences and
    build the regular expressions later used to filter them.

    The query is split on whitespace and each term is interpreted as:

    * ``lemma+POS`` - a lemma with a part-of-speech tag;
    * a tag listed in ``tags`` - any word with that tag;
    * a term starting with a quote character - an exact word form
      (the first and last characters of the term are stripped);
    * anything else - a word in any of its forms.

    :param initial_query: query to interpret and execute
    :return: set of ids of sentences that contain every term (empty set
        for an empty query), and a list of compiled patterns, one per
        term, in query order
    '''
    # One "word" inside an annotated ``token+lemma+POS`` triple.
    word = r"[a-zA-Z'\-_]+"
    # None means "no term processed yet". An empty set is a real result
    # and must stay empty rather than be refilled by the next term.
    candidates = None
    patterns = []

    for query in initial_query.split():
        # User-supplied text is escaped so regex metacharacters in a
        # query are matched literally instead of breaking the pattern.
        if '+' in query:
            # Split on the first '+' only so that a malformed term with
            # several '+' signs yields no results instead of raising.
            lemma, pos = query.split('+', 1)
            found = lemma_pos_to_token(lemma, pos)
            regex = rf"{word}\+{re.escape(lemma)}\+{re.escape(pos)}"
        elif query in tags:
            found = pos_to_tokens(query)
            regex = rf"{word}\+{word}\+{query}"
        elif query.startswith(('"', "'")):
            token = query[1:-1]
            found = token_to_token(token)
            regex = rf"{re.escape(token)}\+{word}\+[A-Z]+"
        else:
            found = token_to_tokens(query)
            # NOTE(review): candidates are looked up by the lemmatized
            # form of the term, but this pattern requires the lemma slot
            # to equal the term as typed - confirm that bare terms are
            # expected to be lemmas.
            regex = rf"{word}\+{re.escape(query)}\+[A-Z]+"
        patterns.append(re.compile(regex))
        found = set(found)
        candidates = found if candidates is None else candidates & found

    if candidates is None:
        candidates = set()
    return candidates, patterns


### Вспомогательные функции для валидации предложений по критерию порядка и расстояния

In [None]:
def get_annotated(sents_id):
    '''
    Fetch the annotated form of the requested sentences.

    SQLite limits the number of bound parameters in one statement (999
    by default in older builds), so the ids are queried in chunks rather
    than in a single ``IN (...)`` list.

    :param sents_id: iterable of sentence ids to fetch
    :return: list of ``(sentence_id, annotated)`` rows from Sentences;
        empty if no ids are given
    '''
    select_template = '''
    SELECT sentence_id, annotated FROM Sentences
    WHERE sentence_id IN ({})
    '''
    # De-duplicate so an id split across two chunks is not returned twice.
    ids = list(dict.fromkeys(sents_id))
    chunk_size = 900
    rows = []
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:start + chunk_size]
        placeholders = ', '.join('?' * len(chunk))
        cursor.execute(select_template.format(placeholders), chunk)
        rows.extend(cursor.fetchall())
    return rows

In [None]:
def is_valid(observed, last, sentence):
    '''
    Check whether a match directly follows the previous one.

    :param observed: regex match object under consideration
    :param last: end offset of the previous match; a falsy value means
        there is no previous match and anything is accepted
    :param sentence: annotated sentence (not used by the check)
    :return: tuple ``(is_adjacent, end_offset_of_observed)``; the match is
        adjacent when it starts exactly one character after ``last``
    '''
    start, stop = observed.span()
    if last:
        return start == last + 1, stop
    return True, stop

def is_sentence_valid(sentence, patterns):
    '''
    Check that the patterns match adjacent stretches of the sentence, in
    the order given.

    A chain is valid when each pattern has a match that starts exactly
    one character after the end of the previous pattern's match. Every
    occurrence of the first pattern is considered as a possible start,
    so a sentence is accepted if any occurrence begins a complete chain.

    :param sentence: annotated sentence string
    :param patterns: regex patterns, one per query term, in query order
    :return: True if a complete chain exists (or no patterns are given),
        False otherwise
    '''
    # End offsets at which a valid partial chain may currently stop.
    reachable_ends = None
    for pattern in patterns:
        found = re.finditer(pattern, sentence)
        if reachable_ends is None:
            # First pattern: any occurrence may start a chain.
            reachable_ends = {m.end() for m in found}
        else:
            reachable_ends = {
                m.end() for m in found if m.start() - 1 in reachable_ends
            }
        if not reachable_ends:
            return False
    return True

## Самая главная функция, соединяющая все заданное выше в одну логику

In [None]:
import csv

In [None]:
def find(query: str) -> None:
    '''
    Run a search query end to end and report the outcome.

    The query is interpreted with execute_query, candidate sentences are
    filtered with is_sentence_valid, and the text and metadata of the
    remaining sentences are fetched. If anything is found the rows are
    written to ``./results.csv`` (overwriting it); in both cases a
    message is printed.

    :param query: query to execute
    '''
    sentences_ids, patterns = execute_query(query)
    annotated = get_annotated(sentences_ids)
    valid = [
        sentence_id
        for sentence_id, markup in annotated
        if is_sentence_valid(markup, patterns)
    ]

    select_template = '''
    SELECT sentence, character, episode, season, scriptwriters
    FROM Sentences
    JOIN Meta ON Sentences.sentence_id = Meta.sentence_id
    WHERE Sentences.sentence_id IN ({})
    '''
    # SQLite limits the number of bound parameters in one statement (999
    # by default in older builds), so fetch the rows in chunks.
    chunk_size = 900
    result = []
    for start in range(0, len(valid), chunk_size):
        chunk = valid[start:start + chunk_size]
        placeholders = ', '.join('?' * len(chunk))
        cursor.execute(select_template.format(placeholders), chunk)
        result.extend(cursor.fetchall())

    num_res = len(result)
    if num_res == 0:
        print("""К сожалению, по данному запросу ничего не найдено :(
Проверь, соответствует ли запрос правилам.""")
        return

    # Explicit encoding: the platform default may be unable to encode
    # some characters present in the corpus text or metadata.
    with open("./results.csv", 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["sentence", "character", "episode", "season", "scriptwriters"])
        wr.writerows(result)
    print(f"""Количество предложений, содержащих данный запрос: {num_res}
Они записаны в файл results.csv ;)
Не забудь переименовать этот файл перед следующим запросом!""")

## Интерфейс поиска

In [None]:
# Interactive entry point: show the prompt, read one query line from
# standard input and pass it to find().
print("Введи запрос (напоминаем о правилах запроса вот здесь https://github.com/ancheveleva/friends_corpus/blob/main/README.md ):")
find(input())

Введи запрос (напоминаем о правилах запроса вот здесь https://github.com/ancheveleva/friends_corpus/blob/main/README.md ):
do+AUX do+VERB
Количество предложений, содержащих данный запрос: 4
Они записаны в файл results.csv ;)
Не забудь переименовать этот файл перед следующим запросом!
