# TF-IDF weighting

In [1]:
import ipytest
import math
import pytest

# Apply ipytest's automatic notebook configuration; the %%run_pytest test
# cells further down presumably rely on it -- confirm against the ipytest docs.
ipytest.autoconfig()

Each row of the document-term matrix below is the document-term vector of one document: it contains the raw term frequencies (counts) for each term in that document.

In [2]:
# Rows are documents, columns are terms: entry [d][t] is the raw count of
# term t in document d (5 documents x 6 terms).
doc_term_matrix = [
    [0, 0, 3, 0, 0, 0],
    [1, 1, 2, 0, 0, 0],
    [0, 0, 2, 1, 1, 0],
    [0, 0, 0, 1, 1, 0],
    [1, 1, 1, 0, 1, 1]
]

## Task 1: TF weighting

Compute the L1-normalized term frequency vector for a given document.

The L1-normalized frequency of a single term in a document is given by:

$$tf_{t,d}=\frac{c_{t,d}}{|d|} ~,$$ 

where $c_{t,d}$ is the count of occurrences of term $t$ in document $d$ and $|d|$ is the document length (total number of terms).

In [8]:
# Scratch check: the built-in sum() adds up the elements of a list.
sum([5,5,5,5])

20

In [15]:
def get_tf_vector(doc_term_vector):
    """Computes the L1-normalized term frequency vector from a raw term-frequency vector.

    Args:
        doc_term_vector: Sequence of raw term counts for one document,
            one entry per term.

    Returns:
        A list of the same length in which each entry is the term's count
        divided by the sum of all counts (the document length). If the counts
        sum to zero (a document with no terms, or an empty vector), a vector
        of zeros is returned instead of raising ZeroDivisionError.
    """
    n_terms = sum(doc_term_vector)
    if n_terms == 0:
        # Nothing to normalize by: every term has frequency 0 in an empty document.
        return [0.0] * len(doc_term_vector)
    return [count / n_terms for count in doc_term_vector]

Tests.

In [16]:
%%run_pytest[clean]

def test_tf_doc0():
    """Document 0 contains a single distinct term, so that term has TF 1."""
    # TF values are floats produced by division; compare with a tolerance,
    # as the IDF and TF-IDF tests in this notebook do.
    assert get_tf_vector(doc_term_matrix[0]) == pytest.approx([0, 0, 1, 0, 0, 0])
    
def test_tf_doc1():
    """Document 1 has counts 1, 1, 2 over 4 terms in total."""
    # TF values are floats produced by division; compare with a tolerance,
    # as the IDF and TF-IDF tests in this notebook do.
    assert get_tf_vector(doc_term_matrix[1]) == pytest.approx([0.25, 0.25, 0.5, 0, 0, 0])

..                                                                       [100%]
2 passed in 0.02s


## Task 2: IDF weighting

Compute the IDF weight of a term, given by:

$$idf_{t}=\log \frac{N}{n_t} ~,$$ 

where $N$ is the total number of documents and $n_t$ is the number of documents that contain term $t$.
**Use base-10 logarithm in this exercise.**

In [25]:
def get_term_idf(doc_term_matrix, term_index):
    """Computes the IDF value of a term, given by its index, based on a document-term matrix.

    Args:
        doc_term_matrix: List of document-term vectors, one row per document.
        term_index: Column index of the term within each document vector.

    Returns:
        log10(N / n_t), where N is the number of documents and n_t is the
        number of documents with a positive count for the term. When no
        document contains the term (including an empty matrix), the ratio is
        undefined; 0.0 is returned so the term carries no weight instead of
        raising ZeroDivisionError.
    """
    n_documents = len(doc_term_matrix)
    n_documents_with_term = sum(1 for doc in doc_term_matrix if doc[term_index] > 0)
    if n_documents_with_term == 0:
        # N / 0 is undefined; treat a term that occurs nowhere as weightless.
        return 0.0
    return math.log10(n_documents / n_documents_with_term)

Tests.

In [26]:
%%run_pytest[clean]

def test_idf_term0():
    """Term 0 occurs in 2 of the 5 documents: log10(5/2) is about 0.3979."""
    expected = pytest.approx(0.3979, rel=1e-3)
    assert get_term_idf(doc_term_matrix, 0) == expected
    
def test_idf_term2():
    """Term 2 occurs in 4 of the 5 documents: log10(5/4) is about 0.0969."""
    expected = pytest.approx(0.0969, rel=1e-3)
    assert get_term_idf(doc_term_matrix, 2) == expected

..                                                                       [100%]
2 passed in 0.02s


## Task 3: TF-IDF weighting

Compute the TF-IDF vector for a given document, where the TF-IDF weight of a term in a document is given by:

$$ tfidf_{t,d} = tf_{t,d} \times idf_{t} ~.$$

In [41]:
def get_tfidf_vector(doc_term_matrix, doc_index):
    """Computes the TF-IDF vector of one document in a document-term matrix.

    Args:
        doc_term_matrix: List of document-term vectors (raw term counts),
            one row per document.
        doc_index: Row index of the document to weight.

    Returns:
        A list with one entry per term: the document's L1-normalized term
        frequency (from get_tf_vector) multiplied by the term's IDF over the
        whole matrix (from get_term_idf).
    """
    tf_vector = get_tf_vector(doc_term_matrix[doc_index])
    tfidf_vector = []
    for term_index, tf in enumerate(tf_vector):
        if tf == 0:
            # A term absent from this document has weight 0 whatever its IDF,
            # so the IDF lookup (a scan over every document) is skipped. This
            # also keeps the result well-defined for terms that occur in no
            # document at all.
            tfidf_vector.append(0.0)
        else:
            tfidf_vector.append(tf * get_term_idf(doc_term_matrix, term_index))
    return tfidf_vector

In [42]:
# Example: TF-IDF vector of the first document (row 0 of the matrix).
get_tfidf_vector(doc_term_matrix, 0)

[0.0, 0.0, 0.09691001300805642, 0.0, 0.0, 0.0]

Tests.

In [43]:
%%run_pytest[clean]

def test_tfidf_doc0():
    """Document 0 only contains term 2 (TF = 1), so its weight equals that term's IDF."""
    actual = get_tfidf_vector(doc_term_matrix, 0)
    assert actual == pytest.approx([0, 0, 0.0969, 0, 0, 0], rel=1e-3)

.                                                                        [100%]
1 passed in 0.01s
