# FRIENDS CORPUS

Привет! Это протоверсия корпуса по сериалу «Друзья». Чтобы воспользоваться поиском, нажми **Run all**.

In [1]:
import sqlite3

# Open the corpus database file and create the one shared cursor that the
# query helpers defined below execute their SQL on.
con = sqlite3.connect('friends_corpus_new.db')
cursor = con.cursor()

In [2]:
import spacy
# spaCy pipeline used as the default lemmatizer for query words
# (see token_to_tokens, which reads `.lemma_` from its first token).
model = spacy.load('en_core_web_sm')

любая форма --> любая форма

1. лемматизировать инпут
2. найти айди леммы
3. найти айди токенов
4. найти предложения с токенами

In [3]:
def get_lemma_ids(lemma):
    """Return the ``lemma_id`` of every ``Lemmas`` row whose ``lemma`` equals *lemma*.

    The lookup runs on the module-level ``cursor``. The result is a plain
    list of ids and is empty when nothing matches.
    """
    lookup = '''SELECT lemma_id FROM Lemmas
    WHERE lemma = ?'''
    rows = cursor.execute(lookup, (lemma,)).fetchall()
    return [row[0] for row in rows]

def get_token_ids(lemma_ids):
    """Collect the ``token_id`` of every ``Tokens`` row attached to the given lemmas.

    One lookup is issued per element of *lemma_ids* on the module-level
    ``cursor``; all ids are returned as a single flat list, grouped in the
    order the lemma ids were supplied.
    """
    tokens_for_lemma = '''
    SELECT token_id FROM Tokens
    WHERE lemma_id = ?
    '''
    return [
        row[0]
        for lemma_id in lemma_ids
        for row in cursor.execute(tokens_for_lemma, (lemma_id,)).fetchall()
    ]

def get_sentence_ids(token_ids):
    """Collect the ``sentence_id`` of every ``Match`` row for the given tokens.

    One lookup is issued per element of *token_ids* on the module-level
    ``cursor``. The flat result list may contain the same sentence id more
    than once; nothing is de-duplicated here.
    """
    sentences_for_token = '''
    SELECT sentence_id FROM Match
    WHERE token_id = ?
    '''
    return [
        row[0]
        for token_id in token_ids
        for row in cursor.execute(sentences_for_token, (token_id,)).fetchall()
    ]

def token_to_tokens(token, model=model):
    """Find sentences containing any form of the lemma of *token*.

    *token* is run through *model* and the lemma of its first token is
    looked up. When that lemma both starts and ends with ``-`` the raw
    *token* is looked up instead (presumably to skip placeholder lemmas
    such as ``-PRON-`` -- confirm against the spaCy version in use).

    Returns the sentence ids produced by
    ``get_lemma_ids`` -> ``get_token_ids`` -> ``get_sentence_ids``.
    """
    candidate = model(token)[0].lemma_
    is_placeholder = candidate.startswith('-') and candidate.endswith('-')
    lookup_key = token if is_placeholder else candidate
    lemma_ids = get_lemma_ids(lookup_key)
    token_ids = get_token_ids(lemma_ids)
    return get_sentence_ids(token_ids)

#token_to_tokens('you')

In [4]:
#model('you')[0].lemma_

любая форма --> эта же форма

1. найти айди токена (если такого токена нет, то лемматизировать запрос и искать как в первом сценарии: любая форма --> любая форма)
2. найти айди предложений

In [5]:

def get_token_id(token):
    """Return the ``token_id`` of every ``Tokens`` row whose ``token`` equals *token*.

    The comparison is the exact-match ``token = ?`` done by the database on
    the module-level ``cursor``; the result is a list, empty if no row matches.
    """
    exact_form_lookup = '''
    SELECT token_id FROM Tokens
    WHERE token = ?
    '''
    rows = cursor.execute(exact_form_lookup, (token,)).fetchall()
    return [row[0] for row in rows]

def token_to_token(token):
    """Find sentences containing exactly the word form *token*.

    Resolves *token* to token ids with ``get_token_id`` and returns the
    sentence ids that ``get_sentence_ids`` reports for them.
    """
    token_ids = get_token_id(token)
    return get_sentence_ids(token_ids)
    
#token_to_token('snows')

лемма+пос --> любая форма

1. найти лемма айди
2. найти айди токенов
3. найти предложения

In [6]:
def get_pos_lemma_ids(lemma, pos):
    """Return the ``lemma_id`` of every ``Lemmas`` row matching *lemma* and *pos*.

    Args:
        lemma: Lemma text, compared with ``lemma = ?``.
        pos: Part-of-speech tag, compared with ``pos = ?``.

    Returns:
        A list of lemma ids (empty when nothing matches), the same shape
        that ``get_lemma_ids`` returns.

    The previous version returned the first result *row* (a 1-tuple). That
    only worked because callers iterate the value, and it silently dropped
    any further matching rows.
    """
    get_pos_lemma_query = '''
    SELECT lemma_id FROM Lemmas
    WHERE (lemma = ?) AND (pos = ?)
    '''
    cursor.execute(get_pos_lemma_query, (lemma, pos))
    return [row[0] for row in cursor.fetchall()]

def lemma_pos_to_token(token, pos):
    """Find sentences containing any form of the lemma *token* tagged *pos*.

    Chains ``get_pos_lemma_ids`` -> ``get_token_ids`` -> ``get_sentence_ids``
    and returns the resulting sentence ids.
    """
    lemma_ids = get_pos_lemma_ids(token, pos)
    token_ids = get_token_ids(lemma_ids)
    return get_sentence_ids(token_ids)

#lemma_pos_to_token('like', 'VERB')

пос --> токены
1. найти леммы айди
2. найти токены айди
3. найти предложения айди

In [7]:
def pos_to_tokens(pos):
    """Find sentences containing at least one token whose lemma is tagged *pos*.

    A single statement joins ``Match`` -> ``Tokens`` -> ``Lemmas`` on the
    module-level ``cursor`` and filters on ``Lemmas.pos``. The returned list
    holds one sentence id per joined row.
    """
    sentences_by_pos = '''
    SELECT sentence_id FROM Match
    JOIN Tokens ON Match.token_id = Tokens.token_id
    JOIN Lemmas ON Tokens.lemma_id = Lemmas.lemma_id
    WHERE Lemmas.pos = ?
    '''
    rows = cursor.execute(sentences_by_pos, (pos,)).fetchall()
    return [row[0] for row in rows]

#pos_to_tokens('PART')

In [8]:
import re

# Part-of-speech tags that execute_query accepts as a bare query term.
tags = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


def execute_query(initial_query):
    """Resolve a whitespace-separated search query into candidates and patterns.

    Each term is interpreted by the first rule that applies:

    * ``lemma+POS``     -- any form of the lemma with that tag
      (``lemma_pos_to_token``);
    * a bare tag from ``tags`` -- any token with that tag (``pos_to_tokens``);
    * ``"form"`` / ``'form'`` -- exactly that word form (``token_to_token``);
      the first and last character of the term are stripped;
    * anything else     -- any form of the word (``token_to_tokens``).

    Args:
        initial_query: The raw query string.

    Returns:
        A ``(sentence_ids, patterns)`` tuple. ``sentence_ids`` is the set of
        sentence ids that contain *every* term (always a set, empty for an
        empty query). ``patterns`` holds one compiled regex per term, in
        query order, written against ``token+lemma+POS`` annotations.
    """
    # One annotated field (token or lemma). Raw strings keep the regex
    # escapes intact instead of relying on invalid string escapes.
    word = r"[a-zA-Z'\-_]+"

    # None means "no term processed yet". An empty set must stay empty:
    # testing truthiness here would let a later term overwrite an empty
    # intersection and return sentences that miss an earlier term.
    sentence_ids = None
    patterns = []

    for query in initial_query.split():
        if '+' in query:
            # partition() tolerates extra '+' characters instead of raising
            # on unpacking; such a term simply matches nothing.
            lemma, _, pos = query.partition('+')
            new = set(lemma_pos_to_token(lemma, pos))
            pattern = rf"{word}\+{re.escape(lemma)}\+{re.escape(pos)}"
        elif query in tags:
            new = set(pos_to_tokens(query))
            pattern = rf"{word}\+{word}\+{query}"
        elif query.startswith(('"', "'")):
            form = query[1:-1]
            new = set(token_to_token(form))
            pattern = rf"{re.escape(form)}\+{word}\+[A-Z]+"
        else:
            new = set(token_to_tokens(query))
            # NOTE(review): the candidates come from the *lemmatized* query
            # (inside token_to_tokens) but this pattern puts the raw query in
            # the lemma slot, so an inflected term such as "guys" may never
            # match the annotation -- confirm and align with the lemma.
            pattern = rf"{word}\+{re.escape(query)}\+[A-Z]+"

        # User text is escaped above so regex metacharacters in a term are
        # matched literally rather than breaking or altering the pattern.
        patterns.append(re.compile(pattern))
        sentence_ids = new if sentence_ids is None else sentence_ids & new

    return (set() if sentence_ids is None else sentence_ids), patterns

#sents, patterns = execute_query("you 'need' ADV")

In [9]:
def get_annotated(sents_id, chunk_size=900):
    """Fetch ``(sentence_id, annotated)`` rows for the given sentence ids.

    Args:
        sents_id: Iterable of sentence ids (typically the set returned by
            ``execute_query``).
        chunk_size: Maximum number of ids bound to a single statement; must
            be positive. SQLite caps the number of ``?`` parameters per
            statement, so a broad query would otherwise fail with
            "too many SQL variables".

    Returns:
        A list of ``(sentence_id, annotated)`` tuples from ``Sentences``;
        empty when *sents_id* is empty (no statement is executed then).
    """
    ids = list(sents_id)
    rows = []
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:start + chunk_size]
        placeholders = ', '.join('?' * len(chunk))
        get_annotated_query = f'''
    SELECT sentence_id, annotated FROM Sentences
    WHERE sentence_id IN ({placeholders})
    '''
        cursor.execute(get_annotated_query, chunk)
        rows.extend(cursor.fetchall())
    return rows

In [10]:

def is_valid(observed, last, sentence):
    """Check whether the match *observed* continues the phrase that ended at *last*.

    Args:
        observed: A regex match object.
        last: End offset of the previous term's match; any falsy value
            means there is no previous term yet.
        sentence: Unused; kept for the existing call signature.

    Returns:
        ``(ok, end)`` where ``end`` is the end offset of *observed*. ``ok``
        is always True when *last* is falsy; otherwise it is True only when
        *observed* starts exactly one character after *last*.
    """
    start, stop = observed.span()
    if last:
        return start == last + 1, stop
    return True, stop

def is_sentence_valid(sentence, patterns):
    """Tell whether *patterns* match consecutive tokens of *sentence*, in order.

    Args:
        sentence: Annotated sentence text; consecutive tokens are expected
            to be separated by exactly one character.
        patterns: Regex patterns (compiled or strings), one per query term.

    Returns:
        True when some occurrence of the first pattern is followed, one
        separator character later, by an occurrence of the second pattern,
        and so on through the last pattern. An empty *patterns* list is
        trivially valid.

    Every occurrence of every pattern is considered. The earlier greedy
    version committed to the first acceptable occurrence, so a phrase such
    as "to you" was rejected whenever an unrelated "to" appeared earlier in
    the sentence.
    """
    # End offsets at which the phrase matched so far can finish; None until
    # the first pattern has been processed.
    reachable_ends = None

    for pattern in patterns:
        spans = [found.span() for found in re.finditer(pattern, sentence)]
        if reachable_ends is None:
            # The first term may start anywhere in the sentence.
            reachable_ends = {end for _, end in spans}
        else:
            # A later term must start right after the separator that
            # follows one of the previous term's possible end offsets.
            reachable_ends = {
                end for start, end in spans if start - 1 in reachable_ends
            }
        if not reachable_ends:
            return False
    return True

In [11]:
#is_sentence_valid('oh+oh+INTJ hey+hey+INTJ but+but+CCONJ ,+,+PUNCT before+before+ADP you+you+PRON guys+guy+NOUN do+do+VERB that+that+SCONJ i+I+PRON need+need+VERB to+to+PART talk+talk+VERB to+to+ADP you+you+PRON ,+,+PUNCT and+and+CCONJ ross+Ross+PROPN ,+,+PUNCT i+I+PRON need+need+VERB to+to+PART talk+talk+VERB to+to+ADP you+you+PRON .+.+PUNCT', patterns)

In [12]:
import csv

In [13]:
def find(query, output_path="./results.csv", chunk_size=900):
    """Run a corpus search and write the matching sentences to a CSV file.

    Steps: resolve *query* with ``execute_query``, load the candidates'
    annotations with ``get_annotated``, keep the sentences accepted by
    ``is_sentence_valid``, then fetch their text and metadata and write them
    out.

    Args:
        query: Search query string, as understood by ``execute_query``.
        output_path: Destination CSV file; it is overwritten.
        chunk_size: Maximum number of sentence ids bound to one statement;
            must be positive. SQLite caps the number of ``?`` parameters per
            statement, so large result sets are fetched in batches.

    Returns:
        None. The file gets a header row followed by one row per result
        with the columns sentence, character, episode, season, scriptwriters.
    """
    sentences_ids, patterns = execute_query(query)
    annotated = get_annotated(sentences_ids)
    # Each row is (sentence_id, annotated text).
    valid = [row[0] for row in annotated if is_sentence_valid(row[-1], patterns)]

    result = []
    for start in range(0, len(valid), chunk_size):
        chunk = valid[start:start + chunk_size]
        placeholders = ', '.join('?' * len(chunk))
        get_everything = f'''
    SELECT sentence, character, episode, season, scriptwriters
    FROM Sentences
    JOIN Meta ON Sentences.sentence_id = Meta.sentence_id
    WHERE Sentences.sentence_id IN ({placeholders})
    '''
        cursor.execute(get_everything, chunk)
        result.extend(cursor.fetchall())

    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (which can fail on non-ASCII text).
    with open(output_path, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["sentence", "character", "episode", "season", "scriptwriters"])
        wr.writerows(result)

In [14]:
# Example search: "do" tagged AUX directly followed by "do" tagged VERB.
# find() returns nothing; the hits are written to results.csv.
find('do+AUX do+VERB')