# Document Distance
For each input file, a word-frequency vector is computed as follows:<br>
The paragraph is read in and converted into a list of alphanumeric "words". Here a "word" is a sequence of consecutive alphanumeric characters. Non-alphanumeric characters are treated as blanks and can be ignored in the process.<br>
For each word, its frequency of occurrence is determined, and the word/frequency lists are sorted alphabetically.<br><br>
The "distance" between two vectors is the angle between them.

In [13]:
import math

def get_Words(paragraph):
    """Split a paragraph into lowercase alphanumeric words.

    A word is a maximal run of consecutive alphanumeric characters. Every
    other character (punctuation, symbols, any kind of whitespace) acts as a
    separator, so "well-known" yields "well" and "known".

    Args:
        paragraph: The text to split.

    Returns:
        A list of lowercase words in order of appearance. Empty strings are
        never included; empty or separator-only text yields [].
    """
    # Turn every non-alphanumeric character into a blank so that it separates
    # words instead of silently gluing its neighbours together.
    normalized = ''.join(
        c.lower() if c.isalnum() else ' ' for c in paragraph
    )
    # split() with no argument collapses runs of blanks and drops the empty
    # strings that split(' ') would return for repeated/leading/trailing spaces.
    return normalized.split()

def count_Frequency(words):
    """Count how many times each word occurs.

    Args:
        words: An iterable of hashable words.

    Returns:
        A dict mapping each distinct word to its number of occurrences, with
        keys in order of first appearance.
    """
    tally = {}
    for token in words:
        # Unseen tokens start from zero.
        tally[token] = tally.get(token, 0) + 1
    return tally

def process_Text(text):
    """Build the word-frequency map for one piece of text.

    The text is split into words with get_Words and the words are then
    counted with count_Frequency.

    Args:
        text: The raw text to process.

    Returns:
        The mapping returned by count_Frequency for the words of `text`.
    """
    return count_Frequency(get_Words(text))

def scalar_Product(T1, T2):
    """Return the scalar (dot/inner) product of two frequency maps.

    Only words present in both maps contribute; each contributes the product
    of its two frequencies.

    Example:
        scalar_Product({"and": 3, "of": 2, "the": 5},
                       {"and": 4, "in": 1, "of": 1, "this": 2}) == 14

    Args:
        T1: Mapping from word to frequency.
        T2: Mapping from word to frequency.

    Returns:
        The sum of T1[w] * T2[w] over every word w found in both maps;
        0 when the maps share no word.
    """
    return sum(T1[key] * T2[key] for key in T1 if key in T2)

def vector_Angle(T1, T2):
    """Return the angle, in radians, between two word-frequency vectors.

    The angle is acos((T1 . T2) / (|T1| * |T2|)), with the products computed
    by scalar_Product.

    Args:
        T1: Mapping from word to frequency.
        T2: Mapping from word to frequency.

    Returns:
        The angle as a float in the range [0, pi].

    Raises:
        ZeroDivisionError: If either vector has zero length (for example an
            empty frequency map), because the angle is then undefined.
    """
    numerator = scalar_Product(T1, T2)
    denominator = math.sqrt(scalar_Product(T1, T1) * scalar_Product(T2, T2))
    if denominator == 0:
        # Same exception type a bare division would raise, but with a message
        # that explains the actual cause.
        raise ZeroDivisionError(
            'cannot compute the angle for an empty word-frequency vector'
        )
    # Floating-point rounding can leave the ratio marginally outside [-1, 1],
    # which would make math.acos raise a domain error; clamp it back.
    cosine = max(-1.0, min(1.0, numerator / denominator))
    return math.acos(cosine)

def get_Input():
    """Prompt on the console for two texts and return them.

    Returns:
        A (first_text, second_text) tuple of the strings entered, in the
        order they were requested.
    """
    first = input('Enter the first text : ')
    second = input('Enter the second text : ')
    return (first, second)

# Read the two texts, build a word-frequency map for each, and report the
# angle between the two frequency vectors.
text1, text2 = get_Input()
freq1, freq2 = process_Text(text1), process_Text(text2)
distance = vector_Angle(freq1, freq2)
print('The distance in radians is : %0.6f' % (distance,))

Enter the first text :  hi how are you
Enter the second text :  are you fine!


The distance in radians is : 0.955317
