In [1]:
import math
import pandas as pd
from collections import defaultdict

ModuleNotFoundError: No module named 'pandas'

## Inverted Index
Inverted index is a data structure used in information retrieval to index the content of documents. The inverted index stores a mapping from each term to the documents that contain it. This makes it possible to quickly look up all the documents that contain a given term.

Let's start by defining a simple example corpus.

In [6]:
corpus = {
    'doc1': 'apple banana apple',
    'doc2': 'banana cherry',
    'doc3': 'apple cherry'
}

To create an inverted index from this corpus, we can follow these steps:

Tokenize the documents into terms. In this example, we'll use whitespace as the delimiter. you can use a tokenizer from nltk of write your own tokenizer 
Create a dictionary to store the inverted index.
For each term in each document, add the document to the inverted index.

In [4]:
def create_inverted_index(corpus):
    inverted_index = defaultdict(list)
    for doc_id, doc_content in corpus.items():
        ## add some text processing , Tokenize, remove stop words ... to create a list of cleaned terms
        ## write your code here
        terms = word_tokenize(doc_content)
        for term in terms:
            inverted_index[term].append(doc_id)
    return dict(inverted_index)

inverted_index = create_inverted_index(corpus)
print(inverted_index)

In [9]:
# This should output the followings:
# {
#     'apple': ['doc1', 'doc3'],
#     'banana': ['doc1', 'doc2'],
#     'cherry': ['doc2', 'doc3']
# }

## Term Frequency
Term frequency (TF) is a measure of how often a term appears in a document. It is used in information retrieval to help rank documents that match a query. The basic idea is that documents that contain a term more frequently are more likely to be relevant to a query that contains that term.

To calculate the term frequency of each term in a document, we can follow these steps:

Tokenize the document into terms.
Count the number of occurrences of each term in the document.
Divide each term count by the total number of terms in the document.

In [2]:
def calculate_tf(document):
    tf = {}
    terms = document.split()
    term_count = len(terms)
    for term in terms:
        tf[term] =terms.count(term)/term_count
    return tf

tf_doc1 = calculate_tf(corpus['doc1'])
# print(tf_doc1)

NameError: name 'corpus' is not defined

## TF-IDF
Term frequency-inverse document frequency (TF-IDF) is a measure of the importance of a term in a document. It is used in information retrieval to help rank documents that match a query. The basic idea is that terms that appear more frequently in a document are more important, but terms that appear in many documents are less important.

To calculate the TF-IDF score of each term in a document, we can follow these steps:

Calculate the term frequency (TF) of each term in the document.
Calculate the inverse document frequency (IDF) of each term across all documents.
Multiply the TF and IDF of each term to get the TF-IDF score of each term in the document.

In [11]:
# write 
def calculate_idf(corpus):
    idf = {}
    n_docs = len(corpus)
    inverted_index = creat_inverted_index(corpus)  # create the inverted index for the whole document.

    for term, doc_ids in inverted_index.items():
        idf[term] = math.log10(n_docs / len(doc_ids)) # logarithm to base 10.

    return idf

def calculate_tfidf(document, corpus):
    tfidf = {}
    tf = calculate_tf
    idf = calculate_idf
    for term in tf:
        tfidf[term] = tf(document, term) * idf(corpus) # tf * idf.
    return tfidf

tfidf_doc1 = calculate_tfidf(corpus['doc1'], corpus)
print(tfidf_doc1)

{'apple': 0.0, 'banana': 0.13515503603605478}


# sklearn Library
now user sklearn library to calcuate tf and tfidf

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a simple example corpus
corpus = {
    'doc1': 'apple banana apple',
    'doc2': 'banana cherry',
    'doc3': 'apple cherry'
}

# Create a list of documents
documents = list(corpus.values())

# Create a TfidfVectorizer object
vectorizer = ?

# Fit the vectorizer to the documents
tfidf_matrix = ?

# Convert the TF-IDF matrix to a Pandas DataFrame
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names(), index=corpus.keys())

# Print the resulting TF-IDF scores
print(df)

         apple    banana    cherry
doc1  0.894427  0.447214  0.000000
doc2  0.000000  0.707107  0.707107
doc3  0.707107  0.000000  0.707107




In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Define a simple example corpus
corpus = {
    'doc1': 'apple banana apple',
    'doc2': 'banana cherry',
    'doc3': 'apple cherry'
}

# Create a list of documents
documents = list(corpus.values())

# Create a CountVectorizer object
vectorizer = ?

# Fit the vectorizer to the documents
tf_matrix = ?

# Convert the TF matrix to a Pandas DataFrame
df = pd.DataFrame(tf_matrix.toarray(), columns=vectorizer.get_feature_names(), index=corpus.keys())

# Normalize the TF scores by dividing each row by its sum
df = df.div(df.sum(axis=1), axis=0)

# Print the resulting TF scores
print(df)