In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus: a list of documents (each document is one string).
corpus = [
    "Data science is the study of data",
    "Data science involves statistics and machine learning",
    "Statistics is important for data science"
]

# The vectorizer is created with its default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the corpus and compute the TF-IDF matrix
# (one row per document, one column per term).
tfidf_matrix = vectorizer.fit_transform(corpus)

# Column labels: feature_names[j] is the term scored in column j.
feature_names = vectorizer.get_feature_names_out()

# Densify for display. toarray() yields a plain 2-D ndarray, whereas
# todense() yields the legacy numpy.matrix, whose rows stay 2-D and would
# need a tolist()[0] workaround before they can be iterated.
tfidf_dense = tfidf_matrix.toarray()

# Print every term with a non-zero score, document by document.
for doc_number, doc_scores in enumerate(tfidf_dense, start=1):
    print(f"Document {doc_number} TF-IDF Scores:")
    for term, score in zip(feature_names, doc_scores):
        if score > 0:
            print(f"{term}: {score:.4f}")
    # Deliberately prints "\n" plus print's own newline: two blank lines
    # separate consecutive documents in the output.
    print("\n")


Document 1 TF-IDF Scores:
data: 0.5120
is: 0.3297
of: 0.4335
science: 0.2560
study: 0.4335
the: 0.4335


Document 2 TF-IDF Scores:
and: 0.4354
data: 0.2571
involves: 0.4354
learning: 0.4354
machine: 0.4354
science: 0.2571
statistics: 0.3311


Document 3 TF-IDF Scores:
data: 0.3008
for: 0.5094
important: 0.5094
is: 0.3874
science: 0.3008
statistics: 0.3874




In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus: a list of documents (each document is one string).
corpus = [
    "TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure used in text mining",
    "information retrieval to evaluate the importance of a word in a document relative to a collection of documents",
    "It helps highlight significant words in a document that are not too common across the entire corpus"
]

# A fresh vectorizer with default settings; its vocabulary is learned from
# this corpus only.
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the corpus and compute the TF-IDF matrix
# (one row per document, one column per term).
tfidf_matrix = vectorizer.fit_transform(corpus)

# Column labels: feature_names[j] is the term scored in column j.
feature_names = vectorizer.get_feature_names_out()

# Densify for display. toarray() yields a plain 2-D ndarray, whereas
# todense() yields the legacy numpy.matrix, whose rows stay 2-D and would
# need a tolist()[0] workaround before they can be iterated.
tfidf_dense = tfidf_matrix.toarray()

# Print every term with a non-zero score, document by document.
for doc_number, doc_scores in enumerate(tfidf_dense, start=1):
    print(f"Document {doc_number} TF-IDF Scores:")
    for term, score in zip(feature_names, doc_scores):
        if score > 0:
            print(f"{term}: {score:.4f}")
    # Deliberately prints "\n" plus print's own newline: two blank lines
    # separate consecutive documents in the output.
    print("\n")


Document 1 TF-IDF Scores:
document: 0.1541
frequency: 0.5217
idf: 0.2608
in: 0.1541
inverse: 0.2608
is: 0.2608
measure: 0.2608
mining: 0.2608
statistical: 0.2608
term: 0.2608
text: 0.2608
tf: 0.2608
used: 0.2608


Document 2 TF-IDF Scores:
collection: 0.2406
document: 0.1421
documents: 0.2406
evaluate: 0.2406
importance: 0.2406
in: 0.1421
information: 0.2406
of: 0.4812
relative: 0.2406
retrieval: 0.2406
the: 0.1830
to: 0.4812
word: 0.2406


Document 3 TF-IDF Scores:
across: 0.2647
are: 0.2647
common: 0.2647
corpus: 0.2647
document: 0.1563
entire: 0.2647
helps: 0.2647
highlight: 0.2647
in: 0.1563
it: 0.2647
not: 0.2647
significant: 0.2647
that: 0.2647
the: 0.2013
too: 0.2647
words: 0.2647


