In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import string

# Sample corpus: each entry pairs a document with its (raw, not yet encoded)
# category label, so the two can never drift out of alignment.
_LABELLED_DOCUMENTS = (
    ("The quick brown fox jumps over the lazy dog.", 'animal'),
    ("This is a sample document for testing the bag of words function.", 'sample doc'),
    ("NLTK provides useful useful tools for text preprocessing.", 'NLP'),
    ("Bag of words is a simple and effective method for text representation.", 'NLP'),
    ("Machine learning algorithms often use bag of words as input features.", 'ML'),
)

# Parallel lists consumed by the rest of the script: documents[i] is labelled labels[i]
documents = [text for text, _ in _LABELLED_DOCUMENTS]
labels = [category for _, category in _LABELLED_DOCUMENTS]

# Preprocessing function and its private helpers

# Individual punctuation characters. A set gives per-character membership;
# testing `token in string.punctuation` directly would be a *substring* search.
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# Lazily filled cache so the stop-word list and the lemmatizer are built once
# rather than on every call to preprocess_text().
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def _is_punctuation(token):
    """Return True when every character of *token* is an ASCII punctuation mark."""
    return all(char in _PUNCTUATION_CHARS for char in token)


def preprocess_text(text):
    """Normalise a raw document into a space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens made
    up solely of punctuation and English stop words are dropped; the remaining
    tokens are lemmatized with ``WordNetLemmatizer`` and joined with single
    spaces.

    Args:
        text: The raw document string.

    Returns:
        The cleaned document as a single string (empty if nothing survives
        the filtering).

    Note:
        Nothing in this block downloads NLTK data; the tokenizer model, the
        English stop-word corpus and WordNet are assumed to be installed
        already.
    """
    stop_words, lemmatizer = _get_nlp_resources()

    # Tokenization (lower-cased first so the stop-word comparison matches)
    tokens = word_tokenize(text.lower())

    # Drop punctuation-only tokens (including multi-character ones such as
    # "..." or "--") and stop words, then lemmatize whatever remains.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if not _is_punctuation(token) and token not in stop_words
    ]

    # Join tokens back into text
    return ' '.join(lemmas)

# Preprocess documents
clean_documents = [preprocess_text(doc) for doc in documents]

# Label encoding: fit the encoder on the raw labels and get one integer id
# per document (same order as `documents`).
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Create TF-IDF representations
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(clean_documents)

# Convert TF-IDF matrix to DataFrame (one row per document, one column per term)
feature_names = tfidf_vectorizer.get_feature_names_out()
if 'label' in set(feature_names):
    # Assigning tfidf_df['label'] below would silently overwrite the TF-IDF
    # weights of the term "label"; fail loudly instead of corrupting the data.
    raise ValueError(
        "The vocabulary contains the term 'label', which collides with the "
        "label column added to the TF-IDF DataFrame."
    )
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Add labels to DataFrame
tfidf_df['label'] = encoded_labels

# Save outputs
tfidf_df.to_csv('tfidf_representation.csv', index=False)
# Build the mapping from the encoder's fitted classes so that every class
# appears exactly once, instead of once per document that carries it.
label_encoder_mapping = pd.DataFrame({
    'label': label_encoder.classes_,
    'encoded_label': label_encoder.transform(label_encoder.classes_),
})
label_encoder_mapping.to_csv('label_encoder_mapping.csv', index=False)

# Output the preprocessed documents
for doc, clean_doc in zip(documents, clean_documents):
    print(f"Original: {doc}")
    print(f"Cleaned: {clean_doc}")
    print()

# Display TF-IDF DataFrame
print("TF-IDF Representation:")
print(tfidf_df)

# Display Label Encoder Mapping
print("Label Encoder Mapping:")
print(label_encoder_mapping)


Original: The quick brown fox jumps over the lazy dog.
Cleaned: quick brown fox jump lazy dog

Original: This is a sample document for testing the bag of words function.
Cleaned: sample document testing bag word function

Original: NLTK provides useful useful tools for text preprocessing.
Cleaned: nltk provides useful useful tool text preprocessing

Original: Bag of words is a simple and effective method for text representation.
Cleaned: bag word simple effective method text representation

Original: Machine learning algorithms often use bag of words as input features.
Cleaned: machine learning algorithm often use bag word input feature

TF-IDF Representation:
   algorithm       bag     brown  document       dog  effective   feature  \
0   0.000000  0.000000  0.408248  0.000000  0.408248   0.000000  0.000000   
1   0.000000  0.302637  0.000000  0.451891  0.000000   0.000000  0.000000   
2   0.000000  0.000000  0.000000  0.000000  0.000000   0.000000  0.000000   
3   0.000000  0.284329 