In [1]:
import nltk
import re
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
documents = [
    "Natural language processing is a field of artificial intelligence.",
    "Bag of words is a simple text representation technique.",
    "TF IDF gives importance to rare words in the document."
]

In [3]:
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return words

processed_docs = [preprocess(doc) for doc in documents]

print("Processed Documents:")
print(processed_docs)

Processed Documents:
[['natural', 'language', 'processing', 'field', 'artificial', 'intelligence'], ['bag', 'words', 'simple', 'text', 'representation', 'technique'], ['tf', 'idf', 'gives', 'importance', 'rare', 'words', 'document']]


In [4]:
vocab = set()
for doc in processed_docs:
    vocab.update(doc)

vocab = list(vocab)
print("\nVocabulary:")
print(vocab)


Vocabulary:
['text', 'rare', 'processing', 'language', 'words', 'bag', 'document', 'intelligence', 'artificial', 'gives', 'simple', 'idf', 'technique', 'natural', 'importance', 'tf', 'field', 'representation']


In [5]:
tf = []

for doc in processed_docs:
    freq = FreqDist(doc)
    doc_tf = []
    for word in vocab:
        doc_tf.append(freq[word] / len(doc))  # normalized TF
    tf.append(doc_tf)

print("\nTF Matrix:")
print(tf)


TF Matrix:
[[0.0, 0.0, 0.16666666666666666, 0.16666666666666666, 0.0, 0.0, 0.0, 0.16666666666666666, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.0, 0.0, 0.16666666666666666, 0.0], [0.16666666666666666, 0.0, 0.0, 0.0, 0.16666666666666666, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666], [0.0, 0.14285714285714285, 0.0, 0.0, 0.14285714285714285, 0.0, 0.14285714285714285, 0.0, 0.0, 0.14285714285714285, 0.0, 0.14285714285714285, 0.0, 0.0, 0.14285714285714285, 0.14285714285714285, 0.0, 0.0]]


In [6]:
idf = []

N = len(processed_docs)

for word in vocab:
    count = sum(1 for doc in processed_docs if word in doc)
    idf.append(math.log(N / (count + 1)))  # smoothing

print("\nIDF Values:")
print(idf)


IDF Values:
[0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.0, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644, 0.4054651081081644]


In [7]:
tfidf = []

for doc_tf in tf:
    doc_tfidf = []
    for i in range(len(vocab)):
        doc_tfidf.append(doc_tf[i] * idf[i])
    tfidf.append(doc_tfidf)

print("\nTF-IDF Matrix:")
for row in tfidf:
    print(row)


TF-IDF Matrix:
[0.0, 0.0, 0.06757751801802739, 0.06757751801802739, 0.0, 0.0, 0.0, 0.06757751801802739, 0.06757751801802739, 0.0, 0.0, 0.0, 0.0, 0.06757751801802739, 0.0, 0.0, 0.06757751801802739, 0.0]
[0.06757751801802739, 0.0, 0.0, 0.0, 0.0, 0.06757751801802739, 0.0, 0.0, 0.0, 0.0, 0.06757751801802739, 0.0, 0.06757751801802739, 0.0, 0.0, 0.0, 0.0, 0.06757751801802739]
[0.0, 0.05792358687259491, 0.0, 0.0, 0.0, 0.0, 0.05792358687259491, 0.0, 0.0, 0.05792358687259491, 0.0, 0.05792358687259491, 0.0, 0.0, 0.05792358687259491, 0.05792358687259491, 0.0, 0.0]
