In [1]:
import numpy as np
import codecs, string

with codecs.open("common-english-words.txt", "r", "utf-8") as f:
    common_words = f.read().split(",")


In [2]:
# Create and clean the text
text = "Intelligent behavior in people is a product of the mind. But the mind itself is more like what the human brain does."
def clean(text):
    # Remove stop words
    text = " ".join(list(filter(lambda x: x not in common_words, text.split(" "))))
    # Remove punctutation
    text = "".join(list(filter(lambda x: x not in string.punctuation+"\n\r\t\0", text)))
    # To lower case
    text = text.lower()
    return text
text = clean(text)
text

'intelligent behavior people product mind but mind itself more human brain does'

In [3]:
# Create inverted index
vocab = dict([(key, (1, text.count(key))) for key in set(text.split(" "))])
vocab

{'more': (1, 1),
 'people': (1, 1),
 'mind': (1, 2),
 'does': (1, 1),
 'intelligent': (1, 1),
 'product': (1, 1),
 'brain': (1, 1),
 'human': (1, 1),
 'but': (1, 1),
 'itself': (1, 1),
 'behavior': (1, 1)}

In [4]:
# Construct a (normal) inverted index
# For one document this is just a frequency list
def gen_idx(corpus):
    # Initiate the index as a dict('term', dict('doc', num_occ))
    idx_list = dict([(key, {}) for key in set(" ".join(corpus).split(" "))])
    for doc_idx, doc in enumerate(corpus, 1):
        # Increment number of occurrences for each occurrence
        for term in doc.split(" "):
            if doc_idx not in idx_list[term].keys():
                idx_list[term][doc_idx] = 0
            idx_list[term][doc_idx] += 1
    return idx_list
gen_idx([text])

{'more': {1: 1},
 'people': {1: 1},
 'mind': {1: 2},
 'does': {1: 1},
 'intelligent': {1: 1},
 'product': {1: 1},
 'brain': {1: 1},
 'human': {1: 1},
 'but': {1: 1},
 'itself': {1: 1},
 'behavior': {1: 1}}

In [5]:
def gen_idx_block(corpus, block_size=3):
    # Initiate the index as a dict('term', dict('doc', [block_ids]))
    idx_list = dict([(key, {}) for key in set(" ".join(corpus).split(" "))])
    corpus_blocks = []
    for doc_idx, doc in enumerate(corpus, 1):
        # Generate blocks
        blocks = [doc.split(" ")[block_size*i:block_size*i+block_size] for i in range(len(doc.split(" "))//block_size+1)]
        blocks = list(filter(lambda x: len(x)>0, blocks))
        corpus_blocks.append(blocks)
        # For each distinct term in the document
        for term in set(doc.split(" ")):
            if doc_idx not in idx_list[term].keys():
                idx_list[term][doc_idx] = []
            # Find occurrences and add block to block list:
            for block_idx, block in enumerate(blocks):
                if term in block:
                    idx_list[term][doc_idx].append(block_idx)

    return idx_list, corpus_blocks

In [6]:
# 'word': {doc_id: [block_indices]}
# Indexing using block addressing

print(text)
idx, blocks = gen_idx_block([text], block_size=3)
for idx, block in enumerate(blocks[0]):
    print(idx,block)
idx

intelligent behavior people product mind but mind itself more human brain does
0 ['intelligent', 'behavior', 'people']
1 ['product', 'mind', 'but']
2 ['mind', 'itself', 'more']
3 ['human', 'brain', 'does']


3

## Creating a Suffix Tree

In [7]:
# Vocabulary suffix trie
from typing import Any


class Node:
    def __init__(self, data=None, index=-1):
        self.children = {}
        self.data = data
        self.index = index

txt = "missing mississippi"
suffixes = [txt[i:] for i in range(len(txt))]
suffixes

class SuffixTrie:

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        print("test")

    def create_tree(self, txt):
        self.root = self.build_tree(txt)
        self.root = self.reduce(self.root, txt)

    def build_tree(self, txt):
        txt += "$"
        root = Node()
        current = root
        for i in range(len(txt)):
            current = root
            for j in range(i, len(txt)):
                c = txt[j]
                if c not in current.children:
                    newNode = Node(index=j-(j-i)+1, data=c if c != " " else "_")
                    current.children[c] = newNode
                current = current.children[c]
        return root

    def search(self, txt, count=0):
        current = root
        if len(txt) <= 0:
            if txt not in current.children:
                return False
            return current.children[txt].index
        for c in txt:
            if c in current.children:
                current = current.children[c]
            else:
                return self.search(txt[current.index:len(txt)+1], count+1)
        return current.index-1

    def reduce(self, current: Node, txt):
        idx = self.search()

        for child in current.children:
            self.reduce(child)

root = SuffixTrie()


In [8]:
txt = "missing mississippi"

def search(key, count=0):
    print(key)
    current = root
    if len(key) <= 0:
        if key not in current.children:
            return False
        print(current.children)
        return current.children[key].index
    for c in key:
        if c in current.children:
            current = current.children[c]
        else:
            return search(txt[current.index:len(key)+1], count+1)
    return current.index-1

# search("issing")

In [9]:
# from suffix_trees import STree

# # Suffix-Tree example.
# st = STree.STree(text)

In [10]:
# def print_tree(current, offset=0, c="*"):
#     # print(" "*offset*2 + "-" + (current.data if current.data else "*"),end="")
#     print(" "*offset*2 + "-" + c,end="")
#     print(current.idx if not current.transition_links else "")
#     for c, child in current.transition_links.items():
#         print_tree(child, offset+1, c)
# print_tree(st.root)

In [11]:
# def print_tree(current: Node, offset=0):
#     print(" "*offset*2 + "-" + (current.data if current.data else "*"),end="")
#     # print(" "*offset*2 + "-",end="")
#     print(current.index if not current.children else "")
#     for child in current.children.values():
#         print_tree(child, offset+1)
# print_tree(root)

## Listing on corpus

In [12]:
corpus = [
    "Although we know much more about the human brain than we did even",
    "ten years ago, the thinking it engages in remains pretty much a total",
    "mystery. It is like a big jigsaw puzzle where we can see many of the",
    "pieces, but cannot yet put them together. There is so much about us",
    "that we do not understand at all.",
]
corpus = [clean(text) for text in corpus]
corpus

['although know much more human brain even',
 'ten years ago thinking engages remains pretty much total',
 'mystery it big jigsaw puzzle see many',
 'pieces put together there much',
 'understand all']

In [13]:
# index, blocks = gen_idx_block(corpus, block_size=3)
index = gen_idx(corpus)
index

{'thinking': {2: 1},
 'know': {1: 1},
 'pieces': {4: 1},
 'puzzle': {3: 1},
 'even': {1: 1},
 'jigsaw': {3: 1},
 'ten': {2: 1},
 'there': {4: 1},
 'understand': {5: 1},
 'years': {2: 1},
 'remains': {2: 1},
 'total': {2: 1},
 'it': {3: 1},
 'pretty': {2: 1},
 'see': {3: 1},
 'together': {4: 1},
 'although': {1: 1},
 'all': {5: 1},
 'more': {1: 1},
 'many': {3: 1},
 'big': {3: 1},
 'much': {1: 1, 2: 1, 4: 1},
 'put': {4: 1},
 'brain': {1: 1},
 'human': {1: 1},
 'engages': {2: 1},
 'ago': {2: 1},
 'mystery': {3: 1}}

In [14]:
index1 = index['much']
index1

{1: 1, 2: 1, 4: 1}

In [15]:
tf = np.array(list(index1.values()))
df = len(index1)
idf = np.log2(len(corpus) / df)
tf*idf

array([0.73696559, 0.73696559, 0.73696559])

# Test

In [16]:
from nltk.stem.porter import PorterStemmer
import string

# This is used for preprocessing of both the corpus and queries
def preprocessing(text):
    # Initiate stemmer
    stemmer = PorterStemmer()

    # Define unwanted characters (punctuation)
    bad_chars = string.punctuation+"\n\r\t"

    # Clean, tokenize and stem text
    new_text = text = text.lower() # all lower case
    new_text = "".join(list(filter(lambda x: x not in bad_chars, new_text))) # remove unwanted chars
    new_text = new_text.split(" ") # tokenize (split into words)
    new_text = list(filter(lambda c: len(c) > 0, new_text)) # remove empty strings
    new_text = [stemmer.stem(word) for word in new_text] # perform stemming
    new_text = " ".join(new_text)
    return new_text

In [17]:
corpus = []
for i in range(1,7):
    with open(f"./DataAssignment4/Text{i}.txt") as f:
        corpus.append(f.read())
corpus = [preprocessing(doc) for doc in corpus]
index = gen_idx(corpus)
index

{'europ': {5: 7},
 'simplifi': {5: 7},
 'paradisemat': {4: 3},
 'boss': {2: 4},
 'sort': {2: 1},
 'prevent': {6: 3},
 'waxth': {3: 1},
 'tv': {3: 9},
 'fault': {6: 3},
 'let': {2: 1},
 'four': {2: 2},
 'flax': {3: 3},
 'too': {1: 3, 2: 1},
 'wouldnt': {2: 1},
 'possess': {1: 4},
 'earthquak': {3: 3},
 'luxuri': {2: 1},
 'fail': {6: 3},
 'began': {2: 1},
 'and': {1: 43, 2: 26, 3: 15, 4: 44, 5: 28, 6: 42},
 'strenuou': {2: 1},
 'plaid': {3: 3},
 'curs': {2: 1},
 'box': {3: 3},
 'some': {2: 1, 6: 6},
 'vermin': {2: 1},
 'thought': {2: 5},
 'given': {2: 1},
 'heart': {1: 4},
 'troubl': {2: 1, 6: 3},
 'take': {2: 1, 4: 3, 6: 3},
 'whole': {1: 4, 2: 1},
 'luck': {3: 3},
 'id': {2: 3},
 'ax': {3: 3},
 'shut': {2: 1},
 'drawer': {2: 1},
 'quartz': {3: 6},
 'fifteen': {2: 1},
 'undertak': {6: 3},
 'alarm': {2: 2},
 'slowli': {2: 1},
 'until': {4: 3},
 'accept': {2: 1, 6: 3},
 'bit': {2: 1},
 'cambridg': {5: 7},
 'wax': {3: 5},
 'fop': {3: 3},
 'how': {2: 1, 3: 6, 6: 6},
 'those': {6: 6},
 'stif

In [18]:
sorted([(k, list(v.keys())) for k, v in index.items()], key=lambda x: len(x[1]))[-10:]

[('that', [1, 2, 4, 5, 6]),
 ('it', [1, 2, 4, 5, 6]),
 ('be', [1, 2, 4, 5, 6]),
 ('and', [1, 2, 3, 4, 5, 6]),
 ('to', [1, 2, 3, 4, 5, 6]),
 ('of', [1, 2, 3, 4, 5, 6]),
 ('the', [1, 2, 3, 4, 5, 6]),
 ('in', [1, 2, 3, 4, 5, 6]),
 ('is', [1, 2, 3, 4, 5, 6]),
 ('a', [1, 2, 3, 4, 5, 6])]

In [19]:
docs = {}
for q in preprocessing("brown fox").split(" "):
    if q in index:
        docs[q] = index[q]
p = list([list(val.keys()) for val in docs.values()])
if len(p) > 1:
    for doc in p[0]:
        if doc in p[1:][0]:
            print(doc)

3


In [20]:
query = "brown AND fox -bear -bubble"
terms = [q.split(" ")[0] for q in query.split(" AND ")]
terms = [preprocessing(q) for q in terms]
negation_terms = [q.split(" ")[0] for q in query.split("-")[1:]]
negation_terms = [preprocessing(q) for q in negation_terms]
query, terms, negation_terms, "         "

('brown AND fox -bear -bubble',
 ['brown', 'fox'],
 ['bear', 'bubbl'],
 '         ')

In [21]:
def retrieve_raw_indexes(terms):
    indexes = []
    for term in terms:
        if term in index:
            indexes.append(index[term])
        else:
            indexes.append({})
    return indexes

def AND_filter(indexes):
    if not len(indexes):
        return indexes
    docs = []
    for key in indexes[0].keys():
        if all([key in idx.keys() for idx in indexes]):
            docs.append(key)
    return docs
indexes = retrieve_raw_indexes(terms)
AND_filter(indexes)

# def AND_filter(indexes, and_terms):
#     docs = []
#     for idx in indexes:
#         print(idx)
#         if all([idx in index[term].keys() for term in and_terms]):
#             docs.append(index)
#     return docs
# AND_filter(indexes, and_terms)

[3]

In [22]:
terms
index['brown']

{2: 1, 3: 7}

In [23]:
# def filter_AND(indexes, and_terms):
indexes, terms
and_term_indexes = retrieve_raw_indexes(terms)
filtered_indexes = []
for idx in indexes:
    pass
    print([key in [i.keys() for i in and_term_indexes] for key in idx.keys()])
and_term_indexes

[False, False]
[False]


[{2: 1, 3: 7}, {3: 22}]

In [24]:
def full_process_query(query):
    query = preprocessing(query).split(" ")
    return retrieve_raw_indexes(query)
index_list = full_process_query(query)
# index_list = filter_AND(index_list, and_terms)

# filter_NOT(index_list, not_terms)


In [25]:
query = "the"
neg_query = query.split(" -")[1:]
neg_query
# def neg_filter(indexes, negations):
docs = []
for key in indexes[0].keys():
    if all([key in idx.keys() for idx in indexes]):
        docs.append(key)


In [26]:
id1 = index['brown']
id2 = index['fox']
id3 = index['sort']
id1.keys()

dict_keys([2, 3])

In [27]:
def AND_filter(indexes):
    if not len(indexes):
        return indexes
    docs = []
    for key in indexes[0].keys(): # for all docs in first index
        if all([key in idx.keys() for idx in indexes]): # the doc is in all other indexes
            docs.append(key)
    return docs

In [28]:
"(term1 AND term2) AND -term3"

'(term1 AND term2) AND -term3'

In [50]:
id1 = [{2: 1, 3: 7}]
query = ["brown", "fox"]
def get_docs_by_intersection(terms):
    # Get the indexes of the AND terms
    indexes = retrieve_raw_indexes(terms)
    indexes = sorted(indexes, key=len) # sort for efficiency
    if not len(indexes):
        return []
    # Accumulate docs if they are in all other terms
    docs = []
    for key in indexes[0].keys(): # for all docs in first index
        if all([key in idx.keys() for idx in indexes]): # the doc is in all other indexes
            docs.append(key)
    return docs

get_docs_by_intersection(["brown", "fox", "bear"])

[]

In [48]:
indexes = retrieve_raw_indexes(["brown", "fox"])
sorted(indexes, key=len)


[{3: 22}, {2: 1, 3: 7}]

In [39]:
def get_docs_by_union(terms):
    # Get the indexes of the NOT terms
    indexes = retrieve_raw_indexes(terms)
    if not len(indexes):
        return []
    # Accumulate docs of the NOT terms
    docs = set()
    for index in indexes:
        docs.update(index.keys())
    return list(docs)

get_docs_by_union(["brown", "fox", "bear"])

[1, 2, 3]

In [31]:
s = set({1,2,3}).difference({2})
s.add(1)
s

{1, 3}