In [1]:
import numpy as np
import codecs, string

with codecs.open("common-english-words.txt", "r", "utf-8") as f:
    common_words = f.read().split(",")


In [2]:
# Create and clean the text
text = "Intelligent behavior in people is a product of the mind. But the mind itself is more like what the human brain does."
def clean(text):
    # Remove stop words
    text = " ".join(list(filter(lambda x: x not in common_words, text.split(" "))))
    # Remove punctutation
    text = "".join(list(filter(lambda x: x not in string.punctuation+"\n\r\t\0", text)))
    # To lower case
    text = text.lower()
    return text
text = clean(text)
text

'intelligent behavior people product mind but mind itself more human brain does'

In [3]:
# Create inverted index
vocab = dict([(key, (1, text.count(key))) for key in set(text.split(" "))])
vocab

{'itself': (1, 1),
 'more': (1, 1),
 'but': (1, 1),
 'does': (1, 1),
 'brain': (1, 1),
 'intelligent': (1, 1),
 'people': (1, 1),
 'human': (1, 1),
 'behavior': (1, 1),
 'product': (1, 1),
 'mind': (1, 2)}

In [4]:
# Construct a (normal) inverted index
# For one document this is just a frequency list
def gen_idx(corpus):
    # Initiate the index as a dict('term', dict('doc', num_occ))
    idx_list = dict([(key, {}) for key in set(" ".join(corpus).split(" "))])
    for doc_idx, doc in enumerate(corpus, 1):
        # Increment number of occurrences for each occurrence
        for term in doc.split(" "):
            if doc_idx not in idx_list[term].keys():
                idx_list[term][doc_idx] = 0
            idx_list[term][doc_idx] += 1
    return idx_list
gen_idx([text])

{'itself': {1: 1},
 'more': {1: 1},
 'but': {1: 1},
 'does': {1: 1},
 'brain': {1: 1},
 'intelligent': {1: 1},
 'people': {1: 1},
 'human': {1: 1},
 'behavior': {1: 1},
 'product': {1: 1},
 'mind': {1: 2}}

In [69]:
def gen_idx_block(corpus, block_size=3):
    # Initiate the index as a dict('term', dict('doc', [block_ids]))
    idx_list = dict([(key, {}) for key in set(" ".join(corpus).split(" "))])
    corpus_blocks = []
    for doc_idx, doc in enumerate(corpus, 1):
        # Generate blocks
        blocks = [doc.split(" ")[block_size*i:block_size*i+block_size] for i in range(len(doc.split(" "))//block_size+1)]
        blocks = list(filter(lambda x: len(x)>0, blocks))
        corpus_blocks.append(blocks)
        # For each distinct term in the document
        for term in set(doc.split(" ")):
            if doc_idx not in idx_list[term].keys():
                idx_list[term][doc_idx] = []
            # Find occurrences and add block to block list:
            for block_idx, block in enumerate(blocks):
                if term in block:
                    idx_list[term][doc_idx].append(block_idx)

    return idx_list, corpus_blocks

In [73]:
gen_idx_block([text])[0]

{'itself': {1: [2]},
 'more': {1: [2]},
 'but': {1: [1]},
 'does': {1: [3]},
 'brain': {1: [3]},
 'intelligent': {1: [0]},
 'people': {1: [0]},
 'human': {1: [3]},
 'behavior': {1: [0]},
 'product': {1: [1]},
 'mind': {1: [1, 2]}}

In [75]:
# 'word': {doc_id: [block_indices]}
# Indexing using block addressing

print(text)
idx, blocks = gen_idx_block([text], block_size=3)
for bidx, block in enumerate(blocks[0]):
    print(bidx,block)
idx

intelligent behavior people product mind but mind itself more human brain does
0 ['intelligent', 'behavior', 'people']
1 ['product', 'mind', 'but']
2 ['mind', 'itself', 'more']
3 ['human', 'brain', 'does']


{'itself': {1: [2]},
 'more': {1: [2]},
 'but': {1: [1]},
 'does': {1: [3]},
 'brain': {1: [3]},
 'intelligent': {1: [0]},
 'people': {1: [0]},
 'human': {1: [3]},
 'behavior': {1: [0]},
 'product': {1: [1]},
 'mind': {1: [1, 2]}}

## Creating a Suffix Tree

In [7]:
# Vocabulary suffix trie
from typing import Any


class Node:
    def __init__(self, data=None, index=-1):
        self.children = {}
        self.data = data
        self.index = index

txt = "missing mississippi"
suffixes = [txt[i:] for i in range(len(txt))]
suffixes

class SuffixTrie:

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        print("test")

    def create_tree(self, txt):
        self.root = self.build_tree(txt)
        self.root = self.reduce(self.root, txt)

    def build_tree(self, txt):
        txt += "$"
        root = Node()
        current = root
        for i in range(len(txt)):
            current = root
            for j in range(i, len(txt)):
                c = txt[j]
                if c not in current.children:
                    newNode = Node(index=j-(j-i)+1, data=c if c != " " else "_")
                    current.children[c] = newNode
                current = current.children[c]
        return root

    def search(self, txt, count=0):
        current = root
        if len(txt) <= 0:
            if txt not in current.children:
                return False
            return current.children[txt].index
        for c in txt:
            if c in current.children:
                current = current.children[c]
            else:
                return self.search(txt[current.index:len(txt)+1], count+1)
        return current.index-1

    def reduce(self, current: Node, txt):
        idx = self.search()

        for child in current.children:
            self.reduce(child)

root = SuffixTrie()


In [8]:
txt = "missing mississippi"

def search(key, count=0):
    print(key)
    current = root
    if len(key) <= 0:
        if key not in current.children:
            return False
        print(current.children)
        return current.children[key].index
    for c in key:
        if c in current.children:
            current = current.children[c]
        else:
            return search(txt[current.index:len(key)+1], count+1)
    return current.index-1

# search("issing")

## Listing on corpus

In [12]:
corpus = [
    "Although we know much more about the human brain than we did even",
    "ten years ago, the thinking it engages in remains pretty much a total",
    "mystery. It is like a big jigsaw puzzle where we can see many of the",
    "pieces, but cannot yet put them together. There is so much about us",
    "that we do not understand at all.",
]
corpus = [clean(text) for text in corpus]
corpus

['although know much more human brain even',
 'ten years ago thinking engages remains pretty much total',
 'mystery it big jigsaw puzzle see many',
 'pieces put together there much',
 'understand all']

In [13]:
# index, blocks = gen_idx_block(corpus, block_size=3)
index = gen_idx(corpus)
index

{'more': {1: 1},
 'pieces': {4: 1},
 'although': {1: 1},
 'know': {1: 1},
 'many': {3: 1},
 'mystery': {3: 1},
 'it': {3: 1},
 'see': {3: 1},
 'together': {4: 1},
 'there': {4: 1},
 'ten': {2: 1},
 'much': {1: 1, 2: 1, 4: 1},
 'total': {2: 1},
 'remains': {2: 1},
 'big': {3: 1},
 'brain': {1: 1},
 'puzzle': {3: 1},
 'thinking': {2: 1},
 'pretty': {2: 1},
 'all': {5: 1},
 'jigsaw': {3: 1},
 'understand': {5: 1},
 'put': {4: 1},
 'engages': {2: 1},
 'even': {1: 1},
 'years': {2: 1},
 'human': {1: 1},
 'ago': {2: 1}}

In [14]:
index1 = index['much']
index1

{1: 1, 2: 1, 4: 1}

In [15]:
tf = np.array(list(index1.values()))
df = len(index1)
idf = np.log2(len(corpus) / df)
tf*idf

array([0.73696559, 0.73696559, 0.73696559])

# Test

In [16]:
from nltk.stem.porter import PorterStemmer
import string

# This is used for preprocessing of both the corpus and queries
def preprocessing(text):
    # Initiate stemmer
    stemmer = PorterStemmer()

    # Define unwanted characters (punctuation)
    bad_chars = string.punctuation+"\n\r\t"

    # Clean, tokenize and stem text
    new_text = text = text.lower() # all lower case
    new_text = "".join(list(filter(lambda x: x not in bad_chars, new_text))) # remove unwanted chars
    new_text = new_text.split(" ") # tokenize (split into words)
    new_text = list(filter(lambda c: len(c) > 0, new_text)) # remove empty strings
    new_text = [stemmer.stem(word) for word in new_text] # perform stemming
    new_text = " ".join(new_text)
    return new_text

In [17]:
corpus = []
for i in range(1,7):
    with open(f"./DataAssignment4/Text{i}.txt") as f:
        corpus.append(f.read())
corpus = [preprocessing(doc) for doc in corpus]
index = gen_idx(corpus)
index

{'whenev': {2: 1},
 'heavi': {2: 1},
 'wonder': {1: 4},
 'nois': {2: 1},
 'belli': {2: 2},
 'sleep': {2: 4},
 'can': {2: 2, 6: 3},
 'samsa': {2: 2},
 'ital': {4: 3},
 'dwell': {1: 3},
 'coalesc': {5: 7},
 'gild': {2: 1},
 'piti': {2: 1},
 'convinc': {4: 3},
 'translat': {5: 7},
 'drew': {2: 1},
 'music': {5: 7},
 'sink': {1: 3},
 'gone': {2: 1},
 'deepli': {2: 1},
 'funni': {2: 1},
 'chest': {2: 1},
 'greater': {1: 3, 6: 2},
 'wall': {2: 1},
 'close': {1: 3, 2: 1},
 'fli': {1: 3, 4: 3},
 'ha': {1: 4, 4: 3, 6: 6},
 'quietli': {2: 2, 3: 3},
 'set': {2: 1},
 'tell': {2: 1},
 'pane': {2: 1},
 'how': {2: 1, 3: 6, 6: 6},
 'viewer': {2: 1},
 'eat': {2: 1},
 'littl': {1: 3, 2: 4, 4: 9},
 'hi': {1: 3, 2: 20},
 'impress': {1: 3},
 'let': {2: 1},
 'river': {4: 4},
 'from': {2: 4, 3: 6, 4: 10, 6: 6},
 'wikigirl': {3: 3},
 'power': {1: 3, 6: 3},
 'fifteen': {2: 1},
 'best': {2: 1, 6: 3},
 'she': {3: 3, 4: 18},
 'they': {4: 10, 6: 3},
 'were': {2: 3, 3: 3, 4: 3},
 'full': {1: 3},
 'put': {2: 1, 4: 3

In [18]:
sorted([(k, list(v.keys())) for k, v in index.items()], key=lambda x: len(x[1]))[-10:]

[('wa', [1, 2, 3, 4, 6]),
 ('for', [1, 2, 3, 4, 5]),
 ('it', [1, 2, 4, 5, 6]),
 ('is', [1, 2, 3, 4, 5, 6]),
 ('to', [1, 2, 3, 4, 5, 6]),
 ('and', [1, 2, 3, 4, 5, 6]),
 ('a', [1, 2, 3, 4, 5, 6]),
 ('the', [1, 2, 3, 4, 5, 6]),
 ('in', [1, 2, 3, 4, 5, 6]),
 ('of', [1, 2, 3, 4, 5, 6])]

In [67]:
def retrieve_raw_indexes(terms):
    indexes = []
    for term in terms:
        if term in index:
            indexes.append(index[term])
        else:
            indexes.append({})
    return indexes

In [107]:
query = "brown AND (fox bear) -(bubble cat)"
and_terms = [q.split(" ")[0] for q in query.split(" AND ")]
and_terms = [preprocessing(q) for q in and_terms]
negation_terms = [q.split(" ")[0] for q in query.split("-")[1:]]
negation_terms = [preprocessing(q) for q in negation_terms]
query, and_terms, negation_terms, "         "

('brown AND (fox bear) -(bubble cat)',
 ['brown', 'fox'],
 ['bubbl'],
 '         ')

In [103]:
import re
def parse_query(query):
    # Tokenize the query
    query = query.replace(" -", " NOT ")
    tokens = re.findall(r'(?:AND|OR|NOT|\(|\)|[a-zA-Z0-9]+)', query)
    return parse_expression(tokens)

def parse_expression(tokens):
    current_expression = []

    while tokens:
        token = tokens.pop(0)

        if token == "(":
            # Start a new nested expression
            nested_expression = parse_expression(tokens)
            current_expression.append(nested_expression)

        elif token == ")":
            # End the current expression
            break
        else:
            current_expression.append(token)

    return current_expression

# Example usage:
parsed_query = parse_query(query)
parsed_query

['brown', 'AND', ['fox', 'bear'], 'NOT', ['bubble', 'cat']]

In [104]:
def full_process_query(query):
    query = preprocessing(query).split(" ")
    return retrieve_raw_indexes(query)
index_list = full_process_query(query)

In [105]:
id1 = [{2: 1, 3: 7}]
query = ["brown", "fox"]
def get_docs_by_intersection(terms):
    # Get the indexes of the AND terms
    indexes = retrieve_raw_indexes(terms)
    indexes = sorted(indexes, key=len) # sort for efficiency
    if not len(indexes):
        return []
    # Accumulate docs if they are in all other terms
    docs = []
    for key in indexes[0].keys(): # for all docs in first index
        if all([key in idx.keys() for idx in indexes]): # the doc is in all other indexes
            docs.append(key)
    return docs

get_docs_by_intersection(["brown", "fox", "bear"])

[]

In [106]:
def get_docs_by_union(terms):
    # Get the indexes of the NOT terms
    indexes = retrieve_raw_indexes(terms)
    if not len(indexes):
        return []
    # Accumulate docs of the NOT terms
    docs = set()
    for index in indexes:
        docs.update(index.keys())
    return list(docs)

get_docs_by_union(["brown", "fox", "bear"])

[1, 2, 3]

In [149]:
def retrieve_docs(term):
    term = preprocessing(term)
    if term in index:
        return set(index[term].keys())
    return set()

In [88]:
parsed_query = parse_query(query)
parsed_query

['brown', 'and', ['fox', 'bear'], '-', ['bubble', 'cat']]

In [153]:
# query = "brown AND (fox) -(bubble cat)"
query = "enjoy AND bear"
def search_recursive(query, doc_set:set=set()):
    current_context = set()
    operator = None

    for term in query:
        if isinstance(term, list):
            # Recursively process sub-expression
            sub_result = search_recursive(term, doc_set)
            if operator == 'AND':
                doc_set.intersection_update(sub_result)
            elif operator == 'NOT':
                doc_set.difference_update(sub_result)
            else: # default OR
                doc_set.update(sub_result)
        elif term in {'AND', 'OR', 'NOT', '-'}:
            # Set the current operator
            operator = term
        else:
            # Process term
            term_docs = retrieve_docs(term)  # Replace with your actual retrieval function
            if operator == 'AND':
                if not current_context:
                    current_context = term_docs
                else:
                    current_context.intersection_update(term_docs)
            elif operator == 'OR':
                current_context.update(term_docs)
            elif operator == 'NOT':
                current_context.difference_update(term_docs)
            else:
                current_context.update(term_docs)
    
    # After processing all terms and operators in the query, update the document set
    doc_set.update(current_context)
    
    return doc_set


def search(query):
    parsed_query = parse_query(query)
    return list(search_recursive(parsed_query))

search(query)


[1]