In [1]:
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.util import ngrams
import pandas as pd

# English stop words (all lowercase) that preprocess_text filters out.
# Includes contraction fragments such as "s", "t" and "don".
STOP_WORDS = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
    "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they",
    "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am",
    "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did",
    "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
    "with", "about",
    "against", "between", "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up",
    "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
    "there", "when",
    "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will",
    "just", "don",
    "should", "now",
}

# Toy corpus: three short English sentences that the steps below operate on.
documents = [
    "Natural Language Processing is a fascinating field of AI.",
    "Machine Learning and NLP are closely related",
    "TF-IDF and N-Grams are essential techniques in NLP",
]

# Preprocessing function
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every preprocess_text call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(doc, stop_words=None):
    """Lowercase *doc*, strip ASCII punctuation and drop stop words.

    Punctuation is deleted rather than replaced by a space, so hyphenated
    terms collapse into a single token ("TF-IDF" becomes "tfidf").

    Args:
        doc: Text to clean.
        stop_words: Optional container of lowercase words to discard.
            Defaults to the module-level ``STOP_WORDS``.

    Returns:
        List of the remaining lowercase tokens, in their original order.
    """
    if stop_words is None:
        stop_words = STOP_WORDS
    cleaned = doc.lower().translate(_PUNCTUATION_TABLE)
    return [word for word in cleaned.split() if word not in stop_words]

# Cleaned corpus: every document is run through preprocess_text and its tokens
# are re-joined with single spaces, giving one string per document.
preprocessed_docs = list(map(" ".join, map(preprocess_text, documents)))

# 1. Tokenization
def tokenize_text(documents):
    """Split each document on whitespace.

    Args:
        documents: Iterable of strings.

    Returns:
        One list of tokens per document, in input order.
    """
    token_lists = []
    for text in documents:
        token_lists.append(text.split())
    return token_lists

# Show the token lists of the cleaned corpus, heading first.
print("Tokenized Documents:", tokenize_text(preprocessed_docs), sep="\n")

# 2. Bag of Words (BoW)
# Fit CountVectorizer on the cleaned corpus and transform it in one step.
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(preprocessed_docs)
# One row per document, one column per feature name reported by the vectorizer.
bow_df = pd.DataFrame(
    data=X_bow.toarray(),
    columns=vectorizer_bow.get_feature_names_out(),
)

print("\nBag of Words Representation:", bow_df, sep="\n")

# 3. TF-IDF
# Fit TfidfVectorizer on the cleaned corpus and transform it in one step.
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(preprocessed_docs)
# One row per document, one column per feature name reported by the vectorizer.
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
)

print("\nTF-IDF Representation:", tfidf_df, sep="\n")

# 4. N-Grams
def generate_ngrams(documents, n):
    """Build the n-grams of every document.

    Args:
        documents: Iterable of strings; each one is split on whitespace.
        n: Size argument forwarded to ``nltk.util.ngrams``.

    Returns:
        One list per document, in input order, holding whatever ``ngrams``
        yields for that document's tokens.
    """
    return [list(ngrams(text.split(), n)) for text in documents]

# Bi-grams (n=2) and tri-grams (n=3) of the cleaned corpus.
bi_grams = generate_ngrams(preprocessed_docs, n=2)
tri_grams = generate_ngrams(preprocessed_docs, n=3)

print("\nBi-Grams:", bi_grams, sep="\n")
print("\nTri-Grams:", tri_grams, sep="\n")



Tokenized Documents:
[['natural', 'language', 'processing', 'fascinating', 'field', 'ai'], ['machine', 'learning', 'nlp', 'closely', 'related'], ['tfidf', 'ngrams', 'essential', 'techniques', 'nlp']]

Bag of Words Representation:
   ai  closely  essential  fascinating  field  language  learning  machine  \
0   1        0          0            1      1         1         0        0   
1   0        1          0            0      0         0         1        1   
2   0        0          1            0      0         0         0        0   

   natural  ngrams  nlp  processing  related  techniques  tfidf  
0        1       0    0           1        0           0      0  
1        0       0    1           0        1           0      0  
2        0       1    1           0        0           1      1  

TF-IDF Representation:
         ai   closely  essential  fascinating     field  language  learning  \
0  0.408248  0.000000   0.000000     0.408248  0.408248  0.408248  0.000000   
1  0.000000