In [None]:
import re
import math
from collections import Counter

def to_lower(sample):
    """Return a lowercase copy of ``sample``.

    Delegates to the ``lower()`` method of ``sample``.
    """
    lowered = sample.lower()
    return lowered

def remove_stop_words(sample, stop_words):
    """Drop every whitespace-separated token of ``sample`` found in ``stop_words``.

    The comparison is exact (case-sensitive, punctuation included). The
    surviving tokens are re-joined with single spaces, so the original
    whitespace layout is not preserved.
    """
    kept = []
    for token in sample.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def normalize_data(sample, stop_words):
    """Lowercase ``sample`` and then strip the stop words from it.

    Lowercasing happens first, so ``stop_words`` is matched against the
    lowercased tokens.
    """
    return remove_stop_words(to_lower(sample), stop_words)

def unique_words(sample):
    """Return the set of distinct whitespace-separated tokens in ``sample``."""
    return {token for token in sample.split()}

def bag_of_words(sample_list):
    """Build the vocabulary (distinct tokens) across all samples.

    Args:
        sample_list: Iterable of strings; each is split on whitespace.

    Returns:
        A list of the distinct tokens, sorted so that the order (and thus
        the column order of any matrix built from it) is identical on every
        run. Iterating a plain ``set`` of strings is not stable across
        interpreter runs because string hashing is randomized.
    """
    vocabulary = set()
    for sample in sample_list:
        vocabulary.update(sample.split())
    return sorted(vocabulary)

def tf(word, sample):
    """Return the term frequency of ``word`` in ``sample``.

    Args:
        word: Token to count (exact match against whitespace-split tokens).
        sample: Document text.

    Returns:
        Occurrences of ``word`` divided by the total number of tokens in
        ``sample``; ``0.0`` when ``sample`` contains no tokens.
    """
    words = sample.split()
    if not words:
        # An empty or whitespace-only document has no terms; avoid dividing by zero.
        return 0.0
    return words.count(word) / len(words)

def idf(word, sample_list):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``log(N / (1 + df))`` where ``N`` is the number of samples and
    ``df`` is the number of samples containing ``word`` as a whole
    whitespace-separated token. With this smoothing the value is negative
    when the word occurs in every sample.

    Args:
        word: Token to look up.
        sample_list: Sized, re-iterable collection of document strings.

    Returns:
        The IDF weight as a float; ``0.0`` when ``sample_list`` is empty.
    """
    if not sample_list:
        # log(0 / 1) is undefined; an empty corpus carries no information.
        return 0.0
    # Match whole tokens: a plain `word in sample` on a string is a substring
    # test, which would count "is" as present in a document containing "this".
    num_samples_with_word = sum(1 for sample in sample_list if word in sample.split())
    return math.log(len(sample_list) / (1 + num_samples_with_word))

def tf_idf(sample_list, bag_of_words):
    """Build the TF-IDF matrix for ``sample_list`` over the given vocabulary.

    Args:
        sample_list: Sequence of document strings.
        bag_of_words: Iterable of vocabulary tokens; its order defines the
            column order of the result.

    Returns:
        A list with one row per sample; each row is a list holding
        ``tf(word, sample) * idf(word, sample_list)`` for every vocabulary
        word. An empty ``sample_list`` yields an empty list.
    """
    if not sample_list:
        return []
    vocabulary = list(bag_of_words)
    # IDF depends only on the word and the corpus, so compute it once per
    # word rather than once per (sample, word) pair.
    idf_weights = [idf(word, sample_list) for word in vocabulary]
    return [
        [tf(word, sample) * weight for word, weight in zip(vocabulary, idf_weights)]
        for sample in sample_list
    ]

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Args:
        v1: Re-iterable sequence of numbers.
        v2: Re-iterable sequence of numbers; paired element-wise with ``v1``
            via ``zip``, so extra trailing elements of the longer vector are
            ignored in the dot product.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero magnitude (the cosine is undefined there, and a zero vector
        shares no terms with anything).
    """
    dot_product = sum(x * y for x, y in zip(v1, v2))
    magnitude_v1 = math.sqrt(sum(x ** 2 for x in v1))
    magnitude_v2 = math.sqrt(sum(y ** 2 for y in v2))
    denominator = magnitude_v1 * magnitude_v2
    if denominator == 0:
        # Avoid ZeroDivisionError for all-zero (or empty) vectors.
        return 0.0
    return dot_product / denominator

def rank(W_ij):
    """Score every row of ``W_ij`` after the first against the first row.

    Row 0 is used as the query vector. Each later row ``i`` is compared to
    it with ``cosine_similarity``, and the score, rounded to two decimals,
    is stored under the key ``"sample{i}"``.

    Returns:
        A dict of label -> rounded similarity, inserted in descending score
        order (ties keep their original row order).
    """
    query = W_ij[0]
    scores = {}
    for position, row in enumerate(W_ij[1:], start=1):
        scores[f"sample{position}"] = round(cosine_similarity(query, row), 2)
    ordered = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ordered)

# Demo driver: build a small corpus, vectorize it with TF-IDF, and print the
# cosine-similarity ranking produced by rank().

# Lowercase words removed by normalize_data() before vectorization.
stop_words = set(["i", "am", "to", "your", "for", "and", "it", "is", "a", "with", "or", "on", "can", "almost", "all", "by", "of", "its", "quite", "which"])

# Two longer passages.
# NOTE(review): these go into sample_list below as-is (original case, stop
# words and punctuation intact), unlike line1/line2 — confirm that is intended.
sample1 = "Emacs is the best IDE for almost all types of programming languages which is mostly used by idiots of world. probably they do not care about the productivity. Rather their focus could be on quality and its flexibility. Similarly Lisp is the quite geek language which is used by hackers( not tom, dick and harry )."
sample2 = "I am thrilled to see your enthusiasm for data science and analytics. It is a dynamic field with endless possibilities, and I am here to support your journey. Feel free to reach out for guidance or to discuss any projects you are working on. Together, we can explore the exciting world of data-driven insights and solutions."

# Two short one-sentence texts.
line1 = "Learning Lisp programming language is a time waste."
line2 = "Data science is subject of future"

# Only the short lines are lowercased and stripped of stop words.
line1_normalized = normalize_data(line1, stop_words)
line2_normalized = normalize_data(line2, stop_words)
# Row order matters: rank() treats row 0 (sample1 here) as the query and
# labels the remaining rows "sample1".."sample3" by their index, so those
# labels refer to sample2, line1_normalized and line2_normalized respectively.
# NOTE(review): presumably one of the short lines was meant to be the query — verify.
sample_list = [sample1, sample2, line1_normalized, line2_normalized]
bag_of_words_list = bag_of_words(sample_list)
tfidf_matrix = tf_idf(sample_list, bag_of_words_list)
ranked_documents = rank(tfidf_matrix)
print(ranked_documents)


{'sample1': 0.05, 'sample2': 0.02, 'sample3': 0.0}
