In [1]:
# %load DocSearch.py
import math


def dotProduct(D1, D2):
    """Return the dot product of two sparse vectors stored as mappings.

    Only keys present in both mappings contribute; a key missing from
    either side is treated as a zero component.

    Args:
        D1: Mapping from key to numeric weight.
        D2: Mapping from key to numeric weight.

    Returns:
        The sum of ``D1[k] * D2[k]`` over the shared keys, as a float
        (``0.0`` when no key is shared).
    """
    total = 0.0
    for term, weight in D1.items():
        if term not in D2:
            continue
        total += weight * D2[term]
    return total


def vector_angle(D1, D2):
    """Return the angle, in degrees, between two sparse vectors.

    Args:
        D1: Mapping from key to numeric weight.
        D2: Mapping from key to numeric weight.

    Returns:
        The angle between the vectors in degrees, rounded to 5 decimal
        places.

    Raises:
        ZeroDivisionError: If either vector has zero length; the angle is
            undefined in that case.
    """
    numerator = dotProduct(D1, D2)
    denominator = math.sqrt(dotProduct(D1, D1)) * math.sqrt(dotProduct(D2, D2))
    cosine = numerator / denominator
    # Floating-point rounding can leave the cosine a hair outside [-1, 1]
    # (e.g. two identical vectors give 1.0000000000000002), which would make
    # math.acos raise "math domain error". Clamp before taking the arccosine.
    cosine = max(-1.0, min(1.0, cosine))
    return round(math.degrees(math.acos(cosine)), 5)


def get_inverted_index(dictionary, docs):
    """Map every dictionary word to the 1-based ids of the documents using it.

    Args:
        dictionary: Iterable of words to index (e.g. a dict keyed by word).
        docs: Sequence of documents. A document given as a string is split
            on whitespace and matched word by word; any other container is
            queried with ``in`` as-is.

    Returns:
        A dict with one entry per dictionary word, in dictionary order,
        whose value is the ascending list of 1-based document ids that
        contain the word (an empty list when no document does).
    """
    # Tokenise string documents once. Testing ``word in raw_string`` would be
    # a substring match, so "rain" would wrongly hit a document that only
    # contains "rainbow".
    searchable = [
        set(doc.split()) if isinstance(doc, str) else doc
        for doc in docs
    ]
    return {
        word: [
            doc_id
            for doc_id, tokens in enumerate(searchable, start=1)
            if word in tokens
        ]
        for word in dictionary
    }


def readLines(fileName):
    """Read a text file and return its lines, or report the failure and exit.

    Args:
        fileName: Path of the file to read.

    Returns:
        The list of lines in the file, each keeping its trailing newline.

    Raises:
        SystemExit: With exit status 1 when the file is missing or cannot
            be read; a message naming the file is printed first.
    """
    try:
        # The context manager closes the handle even if reading fails.
        with open(fileName, 'r') as handle:
            return handle.readlines()
    except FileNotFoundError:
        print("File {} Not Found".format(fileName))
    except OSError as err:
        # Permission problems, directories, etc. are not "not found";
        # report what actually went wrong.
        print("File {} could not be read: {}".format(fileName, err))
    # Raise SystemExit directly: quit() is a helper installed by the site
    # module and is not guaranteed to exist. A non-zero status marks failure.
    raise SystemExit(1)


if __name__ == "__main__":
    # Imported here because only the script entry point needs it.
    from collections import Counter

    # Input files: one document per line and one query per line.
    docFileName = 'docs.txt'
    queryFileName = 'queries.txt'

    # Read all lines from both files.
    docs = readLines(docFileName)
    queries = readLines(queryFileName)

    # Term-frequency vector (word -> occurrence count) for every document.
    docVectors = [Counter(doc.split()) for doc in docs]

    # 1. Building the dictionary: total occurrences of each word in the corpus.
    dictionary = Counter()
    for docVector in docVectors:
        dictionary.update(docVector)
    print("Words in dictionary: ", len(dictionary))

    # 2. Building the inverted index from the word vectors rather than the
    # raw lines, so membership is tested per whole word and not per substring.
    invert_index = get_inverted_index(dictionary, docVectors)

    # 3. Document searching.
    for query in queries:
        # Drop surrounding whitespace and the trailing newline.
        query = query.strip()
        queryWords = query.split()

        # A blank line carries no query; intersecting zero posting lists
        # would raise a TypeError, so skip it.
        if not queryWords:
            continue

        print('Query: {}'.format(query))

        # Relevant documents contain every query word; sort the ids so the
        # listing (and tie order below) does not depend on set iteration.
        postings = [set(invert_index.get(word, [])) for word in queryWords]
        relevantDocuments = sorted(set.intersection(*postings))
        print('Relevant documents:', *relevantDocuments)

        # Binary query vector. Every relevant document contains all query
        # words, so the vector is the same for each of them and is built once.
        queryVector = {word: 1 for word in queryWords}

        # Pair each relevant document id with its angle to the query.
        angles = [
            (docId, vector_angle(docVectors[docId - 1], queryVector))
            for docId in relevantDocuments
        ]

        # Smallest angle (closest match) first.
        angles.sort(key=lambda pair: pair[1])

        for docId, angle in angles:
            print(docId, angle)

Words in dictionary:  16
Query: rainbow
Relevant documents: 1 3
3 46.50848
1 62.68827
Query: double rainbow
Relevant documents: 3
3 35.79576
Query: double size
Relevant documents: 2
2 52.23876
