In [12]:
import string
from collections import Counter, OrderedDict

In [13]:
def extractUniqueWords(words):
    """Return the distinct items of ``words`` in first-seen order.

    Args:
        words: Iterable of hashable items (this notebook passes the list of
            string tokens returned by ``cleanData``).

    Returns:
        list: Every distinct item exactly once, ordered by first occurrence.
    """
    # Dict keys are unique and keep insertion order, so this de-duplicates in
    # one pass instead of rescanning a growing list for every single word.
    return list(dict.fromkeys(words))

def cleanData(file_text):
    """Turn raw text into a list of lowercase, punctuation-free tokens.

    Args:
        file_text: The text to normalize.

    Returns:
        list: Whitespace-separated tokens of the lowercased text after every
        character in ``string.punctuation`` has been deleted.
    """
    # Translation table that maps nothing and deletes all ASCII punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return file_text.lower().translate(strip_punctuation).split()

def sortDictionary(dictionary):
    """Return the entries of ``dictionary`` ordered by value, largest first.

    Args:
        dictionary: Mapping whose values are mutually comparable.

    Returns:
        OrderedDict: The same key/value pairs in descending value order.
        Entries with equal values keep their original relative order.
    """
    def by_value(entry):
        # entry is a (key, value) pair; rank on the value only.
        return entry[1]

    ranked_entries = list(dictionary.items())
    ranked_entries.sort(key=by_value, reverse=True)
    return OrderedDict(ranked_entries)

In [14]:
# Load both documents and tokenize them.
# The files are only read, so they are opened in plain read mode: "r+" would
# also request write access and fail on read-only files for no benefit.
with open("TextFiles/assemblyX86.txt", "r") as file1:
    file_text = file1.read()

# words_1: every token of text_1 in document order (duplicates kept).
# unique_words_1: the distinct tokens of text_1 in first-seen order.
words_1 = cleanData(file_text)
unique_words_1 = extractUniqueWords(words_1)

with open("TextFiles/whyProgramming.txt", "r") as file2:
    file_text = file2.read()

# Same two views for text_2.
words_2 = cleanData(file_text)
unique_words_2 = extractUniqueWords(words_2)

In [15]:
### Approach-1: Not Optimized
# Every word of the shared vocabulary gets three frequencies: its count in
# text_1, its count in text_2, and the sum of both (the "main dictionary").
# The similarity scores are dot products of those frequency vectors,
# reported relative to the main dictionary's dot product with itself.

# Count each text once up front instead of calling list.count() per word.
word_counts_1 = Counter(words_1)
word_counts_2 = Counter(words_2)

# Shared vocabulary in first-seen order: text_1's words, then the words that
# only appear in text_2. dict.fromkeys drops the words present in both lists.
vocabulary = list(dict.fromkeys(unique_words_1 + unique_words_2))
if not vocabulary:
    # Without any words every score below would be a division by zero.
    raise ValueError("Both texts are empty after cleaning; similarity is undefined.")

# A Counter returns 0 for a missing key, so words absent from one text get
# an explicit 0 entry and all three dictionaries share the same key order.
dictionary_1 = {word: word_counts_1[word] for word in vocabulary}
dictionary_2 = {word: word_counts_2[word] for word in vocabulary}
dictionary = {word: dictionary_1[word] + dictionary_2[word] for word in vocabulary}

# Frequency vectors, aligned index-by-index through the shared key order.
dictionary_value = list(dictionary.values())
dictionary_1_value = list(dictionary_1.values())
dictionary_2_value = list(dictionary_2.values())

vocabulary_size = len(dictionary_value)

# Dot products of the frequency vectors, averaged over the vocabulary size.
dictionary_text1_similarity = sum(
    total * count_1 for total, count_1 in zip(dictionary_value, dictionary_1_value)
) / vocabulary_size
dictionary_text2_similarity = sum(
    total * count_2 for total, count_2 in zip(dictionary_value, dictionary_2_value)
) / vocabulary_size
text1_text2_similarity = sum(
    count_1 * count_2 for count_1, count_2 in zip(dictionary_1_value, dictionary_2_value)
) / vocabulary_size
dictionary_dictionary_similarity = sum(
    total * total for total in dictionary_value
) / vocabulary_size

print("Result of Approach-1: ")
print("Similarity of our main dictionary to itself: ", dictionary_dictionary_similarity / dictionary_dictionary_similarity * 100, "/ 100")
print("Similarity of text_1 to main dictionary: ", round(dictionary_text1_similarity / dictionary_dictionary_similarity * 100, 2), "/ 100")
print("Similarity of text_2 to main dictionary: ", round(dictionary_text2_similarity / dictionary_dictionary_similarity * 100, 2), "/ 100")
print("Similarity of text_1 to text_2 based on main dictionary: ", round(text1_text2_similarity / dictionary_dictionary_similarity * 100, 2), "/ 100")

Result of Approach-1: 
Similarity of our main dictionary to itself:  100.0 / 100
Similarity of text_1 to main dictionary:  42.97 / 100
Similarity of text_2 to main dictionary:  57.03 / 100
Similarity of text_1 to text_2 based on main dictionary:  20.97 / 100


In [16]:
### Approach-2: Optimized
## Remove the words with 0 frequency, then combine both texts with a
## two-pointer merge (the "combine" step of merge sort).

# Keep only the words that actually occur in text_1.
optimized_dictionary_1 = {key: value for key, value in dictionary_1.items() if value != 0}
optimized_dictionary_1_value = list(optimized_dictionary_1.values())
# Frequency-ordered view (most frequent first); not used by the merge below.
sorted_optimized_dictionary_1 = sortDictionary(optimized_dictionary_1)

# Keep only the words that actually occur in text_2.
optimized_dictionary_2 = {key: value for key, value in dictionary_2.items() if value != 0}
optimized_dictionary_2_value = list(optimized_dictionary_2.values())
sorted_optimized_dictionary_2 = sortDictionary(optimized_dictionary_2)

combined_dictionary = {}
dic_1_pointer = 0
dic_2_pointer = 0

# The merge compares keys, so both key lists must be in ascending key order.
# Walking the frequency-ordered dictionaries instead would reach a shared
# word at different times and overwrite its count rather than summing it.
# The keys are the string tokens from cleanData, so they compare directly.
keys_1 = sorted(optimized_dictionary_1)
keys_2 = sorted(optimized_dictionary_2)

# Advance through both lists until one of them is exhausted.
while dic_1_pointer < len(keys_1) and dic_2_pointer < len(keys_2):
    key_1 = keys_1[dic_1_pointer]
    key_2 = keys_2[dic_2_pointer]

    if key_1 < key_2:
        # Word only reachable from text_1 at this point.
        combined_dictionary[key_1] = optimized_dictionary_1[key_1]
        dic_1_pointer += 1
    elif key_1 > key_2:
        # Word only reachable from text_2 at this point.
        combined_dictionary[key_2] = optimized_dictionary_2[key_2]
        dic_2_pointer += 1
    else:
        # Same word in both texts: the combined frequency is the sum.
        combined_dictionary[key_1] = optimized_dictionary_1[key_1] + optimized_dictionary_2[key_2]
        dic_1_pointer += 1
        dic_2_pointer += 1

# Whatever is left in the first list has no counterpart in the second.
for key_1 in keys_1[dic_1_pointer:]:
    combined_dictionary[key_1] = optimized_dictionary_1[key_1]

# Whatever is left in the second list has no counterpart in the first.
for key_2 in keys_2[dic_2_pointer:]:
    combined_dictionary[key_2] = optimized_dictionary_2[key_2]

In [17]:
# Approach-2 score: dot product of the text_1 and text_2 frequencies, taken
# over the words of the combined dictionary and averaged over its size.
text1_text2_similarity = sum(
    dictionary_1[word] * dictionary_2[word]
    for word in combined_dictionary
    if word in dictionary_1 and word in dictionary_2
)
text1_text2_similarity /= len(combined_dictionary)

# Reported relative to the main dictionary's self-similarity from Approach-1.
print("Result of Approach-2: ")
print("Similarity of text_1 to text_2 based on combined dictionary: ", round(text1_text2_similarity / dictionary_dictionary_similarity * 100, 2), "/ 100")

Result of Approach-2: 
Similarity of text_1 to text_2 based on combined dictionary:  20.97 / 100
