**Побудувати двослівний індекс і координатний інвертований індекс для колекції документів.**

**Реалізувати фразовий пошук та пошук з урахуванням відстані для кожного з них.**

In [None]:
import os
from functools import reduce
import codecs
import re
from itertools import islice

In [None]:
# Names of the entries found in ./books; each one is later indexed as a separate document.
books = os.listdir('./books')
books

['The White Company.txt',
 "At the Earth's Core.txt",
 "Montezuma's Daughter.txt",
 'Gone with the Wind.txt',
 'The House of the Seven Gables.txt',
 'The Pathfinder, or The Inland Sea.txt',
 'Oliver Twist.txt',
 'Rob Roy.txt',
 'Sister Carrie.txt',
 'The Antiquary.txt',
 'The Prairie.txt']

# Двослівний індекс


In [None]:
# Biword index: "first second" -> set of document names containing that adjacent pair.
two_word = {}


def set_two_word(doc_name, w):
    """Add every pair of neighbouring tokens of one document to the biword index.

    Args:
        doc_name: Identifier stored in the posting set of each pair.
        w: Token sequence of the document, in reading order.

    The module-level ``two_word`` dict is updated in place; nothing is returned.
    A sequence with fewer than two tokens contributes no pairs.
    """
    for left, right in zip(w, w[1:]):
        two_word.setdefault(f'{left} {right}', set()).add(doc_name)

# Координатний інвертований індекс 


In [None]:
# Positional inverted index: token -> {document name -> set of token offsets}.
inv = {}


def set_inverted_coordinate(doc_name, w):
    """Record the offset of every token of one document in the positional index.

    Args:
        doc_name: Identifier under which the offsets are stored.
        w: Token sequence of the document; a token's offset is its index in ``w``.

    The module-level ``inv`` dict is updated in place; nothing is returned.
    """
    for position, token in enumerate(w):
        inv.setdefault(token, {}).setdefault(doc_name, set()).add(position)

In [None]:
# A token is a run of ASCII letters, optionally followed by one apostrophe or
# hyphen and another run of letters ("don't", "well-known").  The inner part is
# an optional group so that one-letter words ("a", "i") are tokens too; the
# previous pattern required at least two letters and silently dropped them,
# which shifted every later position and made phrases containing such words
# impossible to find.
token_pattern = re.compile(r"[a-z]+(?:['-][a-z]+)?")

# Read each book, tokenize its lower-cased text and feed both indexes.
for b in books:
    book_path = os.path.join('./books', b)
    print(book_path)
    # "utf_8_sig" decodes UTF-8 and drops a leading byte-order mark if present.
    with open(book_path, "r", encoding="utf_8_sig") as fileObj:
        text = fileObj.read()
    w = token_pattern.findall(text.lower())
    set_two_word(b, w)
    set_inverted_coordinate(b, w)

./books/The White Company.txt
./books/At the Earth's Core.txt
./books/Montezuma's Daughter.txt
./books/Gone with the Wind.txt
./books/The House of the Seven Gables.txt
./books/The Pathfinder, or The Inland Sea.txt
./books/Oliver Twist.txt
./books/Rob Roy.txt
./books/Sister Carrie.txt
./books/The Antiquary.txt
./books/The Prairie.txt


In [None]:
# Show the first three entries of the biword index.
dict(islice(two_word.items(), 0, 3))

{'chapter how': {'Oliver Twist.txt', 'The White Company.txt'},
 'how the': {'Gone with the Wind.txt',
  "Montezuma's Daughter.txt",
  'Oliver Twist.txt',
  'Rob Roy.txt',
  'Sister Carrie.txt',
  'The Antiquary.txt',
  'The House of the Seven Gables.txt',
  'The Pathfinder, or The Inland Sea.txt',
  'The White Company.txt'},
 'the black': {"At the Earth's Core.txt",
  'Gone with the Wind.txt',
  "Montezuma's Daughter.txt",
  'Oliver Twist.txt',
  'Rob Roy.txt',
  'The Antiquary.txt',
  'The House of the Seven Gables.txt',
  'The Pathfinder, or The Inland Sea.txt',
  'The Prairie.txt',
  'The White Company.txt'}}

In [None]:
# Show two entries of the positional index (items 100 and 101 in insertion order).
dict(islice(inv.items(), 100, 102))

{'marl': {'The White Company.txt': {154}},
 'pits': {'The White Company.txt': {155},
  "At the Earth's Core.txt": {16075, 29872, 29906, 30246, 31443},
  'Gone with the Wind.txt': {36450,
   36651,
   39527,
   115173,
   116481,
   119080,
   119165,
   119208,
   119246,
   119259,
   123220,
   123610,
   123909,
   125653,
   128399,
   128668,
   129761,
   130612,
   133587},
  'Oliver Twist.txt': {53282},
  'The Antiquary.txt': {59415}}}

In [None]:
def search_in_two_word(two_word, text):
    """Find documents that contain every adjacent word pair of a query.

    The query is split on whitespace and each pair of neighbouring words is
    looked up in the biword index; the result is the intersection of the
    matching document sets.  Because pairs are matched independently, a
    returned document is only guaranteed to contain each pair somewhere, not
    necessarily the whole phrase contiguously.

    Args:
        two_word: Mapping from ``"first second"`` to a set of document names.
        text: Query phrase; words must be spelled exactly as the index keys.

    Returns:
        A new set of document names.  It is empty when the query has fewer
        than two words (no pair can be formed) or when any pair is absent
        from the index.
    """
    w = text.split()
    if len(w) < 2:
        # A biword index has no entry for a single word.
        return set()
    res = []
    for first, second in zip(w, w[1:]):
        docs = two_word.get(f'{first} {second}')
        if not docs:
            # One missing pair makes the whole intersection empty.
            return set()
        res.append(docs)
    # Copy the first posting set so callers can never mutate the index itself.
    return set(res[0]).intersection(*res[1:])

In [None]:
# Three-word query: intersects the document sets of 'among other' and 'other public'.
search_in_two_word(two_word, 'among other public')

{'Oliver Twist.txt'}

In [None]:
# The biword index only shows that each adjacent pair occurs somewhere in a
# document, so a returned document may not contain the exact phrase.
search_in_two_word(two_word, 'had to come home')

{'Gone with the Wind.txt',
 "Montezuma's Daughter.txt",
 'Oliver Twist.txt',
 'Sister Carrie.txt',
 'The White Company.txt'}

----------------

In [None]:
def search_in_inverted(inv, text, distance=1):
    """Phrase / proximity search over a positional inverted index.

    A document matches when the query words can be aligned to one chain of
    positions in which every word occurs between 1 and ``distance`` positions
    after the previous one.  With the default ``distance=1`` this is an exact
    phrase search.

    The chain is tracked across the whole query: the occurrence of a word
    that follows its predecessor is the same occurrence its successor has to
    follow.  Checking each adjacent pair independently would wrongly accept
    "a b ... b c" for the query "a b c".

    Args:
        inv: Mapping ``word -> {document name -> set of positions}``.
        text: Query; split on whitespace, words must match index keys exactly.
        distance: Maximum forward gap between consecutive query words.
            Values below 1 allow no gap, so multi-word queries match nothing.

    Returns:
        Sorted list of matching document names.  Empty for an empty query or
        when any query word is missing from the index.  A one-word query
        returns every document that contains the word.
    """
    w = text.split()
    if not w or any(term not in inv for term in w):
        return []

    # Only documents containing all query words can match.
    documents = set(inv[w[0]]).intersection(*(inv[term] for term in w[1:]))
    gaps = range(1, distance + 1)

    res = []
    # Sorted so the result does not depend on set iteration order.
    for d in sorted(documents):
        # Positions where a valid chain for the words seen so far ends.
        chain_ends = inv[w[0]][d]
        for term in w[1:]:
            chain_ends = {
                pos for pos in inv[term][d]
                if any(pos - gap in chain_ends for gap in gaps)
            }
            if not chain_ends:
                break
        else:
            # No break: every word extended at least one chain.
            res.append(d)
    return res

In [None]:
# Default distance=1: each query word must occur directly after an occurrence of the previous word.
search_in_inverted(inv, 'door which led to her own apartment')

['The Antiquary.txt', 'Rob Roy.txt']

In [None]:
# distance=5: each query word may occur 1 to 5 positions after an occurrence of the previous word.
search_in_inverted(inv, 'which led apartment', distance=5)

['Oliver Twist.txt', 'The Antiquary.txt', 'Rob Roy.txt']