For more examples and intuition, see [here](https://www.kaggle.com/code/yassinehamdaoui1/creating-tf-idf-model-from-scratch), [here](https://www.capitalone.com/tech/machine-learning/understanding-tf-idf/), and [here](https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html).

In [1]:
import math

# Tokenized documents: each one is the list of its words in original order.
document_a = [
    "Hold", "fast", "to", "dreams", "for", "if", "dreams", "die",
    "life", "is", "a", "broken-winged", "bird", "that", "cannot", "fly",
]
document_b = [
    "No", "bird", "soars", "too", "high", "if",
    "he", "soars", "with", "his", "own", "wings",
]

# Shared vocabulary: every distinct word that occurs in either document.
total_corpus = set(document_a) | set(document_b)
total_corpus

{'Hold',
 'No',
 'a',
 'bird',
 'broken-winged',
 'cannot',
 'die',
 'dreams',
 'fast',
 'fly',
 'for',
 'he',
 'high',
 'his',
 'if',
 'is',
 'life',
 'own',
 'soars',
 'that',
 'to',
 'too',
 'wings',
 'with'}

In [2]:
# Count how often each vocabulary word occurs in each document; words that
# are absent from a document get an explicit count of 0.
import pandas as pd
dict_a = {word: document_a.count(word) for word in total_corpus}
dict_b = {word: document_b.count(word) for word in total_corpus}

# One row per document, one column per word; transposed for display so that
# words become rows.
frequency = pd.DataFrame(data=[dict_a, dict_b])
frequency.T

Unnamed: 0,0,1
No,0,1
he,0,1
high,0,1
too,0,1
that,1,0
fly,1,0
Hold,1,0
wings,0,1
to,1,0
own,0,1


In [3]:
def tf(doc_dict: dict[str, int], doc_elements: list[str]) -> dict[str, float]:
    """
    Compute the term frequency of every term for a single document.

    Each count in ``doc_dict`` is divided by the number of tokens in
    ``doc_elements``.

    Args:
        doc_dict: Mapping of term to its number of occurrences in the
            document.
        doc_elements: The tokens of the document; only its length is used.

    Returns:
        A new dict with the same keys as ``doc_dict``, each mapped to
        ``count / len(doc_elements)``. For an empty document every term
        maps to 0.0.
    """
    total_terms = len(doc_elements)

    # An empty document would otherwise raise ZeroDivisionError as soon as
    # doc_dict has at least one key (e.g. a vocabulary shared with other
    # documents).
    if total_terms == 0:
        return dict.fromkeys(doc_dict, 0.0)

    return {word: count / total_terms for word, count in doc_dict.items()}


def idf(doc_list: list[dict[str, int]]) -> dict[str, float]:
    """
    Compute a smoothed inverse document frequency for every term.

    For a term that occurs (count > 0) in ``df`` of the ``N`` documents the
    value is ``log10((N + 1) / (df + 1))``. The ``+ 1`` terms keep the ratio
    defined when ``df`` is 0; a term present in every document gets 0.0.

    Args:
        doc_list: One term-count mapping per document. The mappings do not
            need to share the same keys; a missing term counts as absent.

    Returns:
        A dict mapping every term seen in any document to its IDF value,
        in first-seen order. An empty ``doc_list`` yields an empty dict.
    """
    doc_total = len(doc_list)

    # dict.fromkeys de-duplicates the terms while keeping first-seen order,
    # so documents with identical keys give the same ordering as the first.
    vocabulary = dict.fromkeys(word for doc in doc_list for word in doc)

    idf_dict = {}
    for word in vocabulary:
        # Number of documents in which the term actually occurs.
        containing = sum(doc.get(word, 0) > 0 for doc in doc_list)
        idf_dict[word] = math.log10((doc_total + 1.0) / (containing + 1.0))

    return idf_dict

# Inverse document frequency of every vocabulary word across both documents.
idfs = idf(doc_list=[dict_a, dict_b])


def tfidf(
    doc_elements: dict[str, float], idfs: dict[str, float]
) -> dict[str, float]:
    """
    Compute the TF-IDF weight of every term of a single document.

    Args:
        doc_elements: Mapping of term to the value that is multiplied by
            the term's IDF (the term frequency of the document).
        idfs: Mapping of term to its inverse document frequency.

    Returns:
        A new dict with the same keys as ``doc_elements``, each mapped to
        ``doc_elements[term] * idfs[term]``.

    Raises:
        KeyError: If a term of ``doc_elements`` is missing from ``idfs``.
    """
    return {word: val * idfs[word] for word, val in doc_elements.items()}

# Term frequencies, computed separately for each document.
tf_a, tf_b = tf(dict_a, document_a), tf(dict_b, document_b)

# TF-IDF weights: each term frequency scaled by that term's IDF.
tfidf_a, tfidf_b = tfidf(tf_a, idfs), tfidf(tf_b, idfs)

# Weight of every word in every document: one row per document, one column
# per word; transposed for display so that words become rows.
document_tfidf = pd.DataFrame(data=[tfidf_a, tfidf_b])
document_tfidf.T

Unnamed: 0,0,1
No,0.0,0.014674
he,0.0,0.014674
high,0.0,0.014674
too,0.0,0.014674
that,0.011006,0.0
fly,0.011006,0.0
Hold,0.011006,0.0
wings,0.0,0.014674
to,0.011006,0.0
own,0.0,0.014674


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The same two quotes as raw strings; the vectorizer receives untokenized text.
str1 = ("Hold fast to dreams, for if dreams die, life is "
        "a broken-winged bird that cannot fly.")
str2 = "No bird soars too high if he soars with his own wings."

corpus = [str1, str2]

# Row labels for the result table, in the same order as `corpus`.
text_titles = ["quote_langstonhughes", "quote_william_blake"]

vectorizer = TfidfVectorizer()
# Fit on the corpus and transform it in one step: one row per document.
vector = vectorizer.fit_transform(corpus)

# Dense table of TF-IDF weights: documents as rows, learned terms as columns.
# (A dict(zip(...)) expression that used to precede this statement was
# removed: its result was neither stored nor displayed.)
tfidf_df = pd.DataFrame(vector.toarray(),
                        index=text_titles,
                        columns=vectorizer.get_feature_names_out())

In [5]:
# Per term, the number of documents with a positive weight. Only the document
# rows are counted, so re-running this cell does not count the previously
# appended '00_Document Frequency' row as if it were a document.
tfidf_df.loc['00_Document Frequency'] = (tfidf_df.loc[text_titles] > 0).sum()

In [6]:
# Show the table transposed: terms as rows, one column per row of tfidf_df.
tfidf_df.transpose()

Unnamed: 0,quote_langstonhughes,quote_william_blake,00_Document Frequency
bird,0.172503,0.197242,2.0
broken,0.242447,0.0,1.0
cannot,0.242447,0.0,1.0
die,0.242447,0.0,1.0
dreams,0.484893,0.0,1.0
fast,0.242447,0.0,1.0
fly,0.242447,0.0,1.0
for,0.242447,0.0,1.0
he,0.0,0.277217,1.0
high,0.0,0.277217,1.0


In [7]:
# Two sample vectors of equal length used to demonstrate cosine similarity.
v1, v2 = [0, 3, 4, 5, 6], [4, 5, 6, 7, 8]


def dot(v1, v2):
    """Return the dot product of v1 and v2 (the sum of pairwise products)."""
    # zip stops at the shorter input, so surplus elements are ignored.
    return sum(left * right for left, right in zip(v1, v2))


def cosine_similarity(v1, v2):
    """
    Return the cosine similarity of two vectors:

        (v1 . v2) / (||v1|| * ||v2||)

    Args:
        v1: First vector, an iterable of numbers that can be iterated
            more than once (e.g. a list).
        v2: Second vector, same requirements as ``v1``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms, or 0.0 when that product of norms is zero.
    """
    # Denominator of the formula: the product of the two Euclidean norms.
    denominator = (dot(v1, v1) ** 0.5) * (dot(v2, v2) ** 0.5)

    # A zero-norm vector has no direction and would make the division below
    # raise ZeroDivisionError; report "no similarity" instead.
    if denominator == 0:
        return 0.0

    # Numerator of the formula: the dot product of the two vectors.
    return dot(v1, v2) / denominator


# Print the similarity of the two sample vectors.
print(cosine_similarity(v1=v1, v2=v2))

0.9544074144996451


In [8]:
from sklearn.metrics import pairwise

# Same computation with scikit-learn: each vector is wrapped in a list so
# that it is passed as a single-row, two-dimensional input.
pairwise.cosine_similarity(X=[v1], Y=[v2])
# array([[0.95440741]])

array([[0.95440741]])