**Perform text cleaning, lemmatization (any method), stop-word removal (any method), and label encoding. Create representations using TF-IDF. Save the outputs.**

In [None]:
import nltk
import string
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Fetch the NLTK data packages used below: the tokenizer models
# ('punkt' / 'punkt_tab'), the stop-word lists and the WordNet data.
# The last download is kept as a bare expression so the cell still echoes
# its return value.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Toy dataset: every entry pairs a sentence with its category, so a text and
# its label stay next to each other and cannot drift out of alignment.
_labelled_samples = [
    ("Natural Language Processing (NLP) is fun and exciting.", 'Technology'),
    ("Machine learning is a subfield of artificial intelligence.", 'Technology'),
    ("Deep learning is a part of machine learning, involving neural networks.", 'AI'),
    ("NLP techniques include tokenization, stemming, lemmatization, and stopword removal.", 'AI'),
]

# Sample text data
corpus = [sentence for sentence, _category in _labelled_samples]

# Sample labels for text classification (e.g., categories)
labels = [category for _sentence, category in _labelled_samples]


In [None]:
# Function to clean the text
def clean_text(text):
    """Lowercase ``text`` and delete every ASCII punctuation character.

    Punctuation is removed, not replaced by a space, so tokens that were
    joined only by punctuation end up fused
    (e.g. "state-of-the-art" -> "stateoftheart").

    Args:
        text: The raw string to normalise.

    Returns:
        The lowercased string with all ``string.punctuation`` characters
        stripped out.
    """
    # A translation table with an empty mapping and a "delete" set removes
    # the listed characters in a single pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punctuation)

# Run every raw document through clean_text, keeping the original order
cleaned_corpus = list(map(clean_text, corpus))
print("\nCleaned Text Corpus:", cleaned_corpus)



Cleaned Text Corpus: ['natural language processing nlp is fun and exciting', 'machine learning is a subfield of artificial intelligence', 'deep learning is a part of machine learning involving neural networks', 'nlp techniques include tokenization stemming lemmatization and stopword removal']


In [None]:
# One shared WordNet lemmatizer instance, reused for every document
lemmatizer = nltk.WordNetLemmatizer()

# Function to lemmatize the text
def lemmatize_text(text):
    """Lemmatize every token of ``text`` and re-join them with single spaces.

    The text is split with ``word_tokenize`` and each token is passed to the
    module-level ``lemmatizer``.

    NOTE(review): no part-of-speech is passed to ``lemmatize``, so the
    library default applies. In the recorded output plurals are reduced
    ("networks" -> "network") while verb forms such as "involving" are left
    unchanged — confirm that noun-only lemmatization is what is wanted.

    Args:
        text: A cleaned (lowercased, punctuation-free) document.

    Returns:
        The lemmatized tokens joined by a single space.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

# Apply lemmatize_text to each cleaned document, keeping the original order
lemmatized_corpus = list(map(lemmatize_text, cleaned_corpus))
print("\nLemmatized Text Corpus:", lemmatized_corpus)



Lemmatized Text Corpus: ['natural language processing nlp is fun and exciting', 'machine learning is a subfield of artificial intelligence', 'deep learning is a part of machine learning involving neural network', 'nlp technique include tokenization stemming lemmatization and stopword removal']


In [None]:
# English stop words from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Function to remove stop words
def remove_stopwords(text):
    """Drop every token of ``text`` that appears in ``stop_words``.

    The text is split with ``word_tokenize``; matching is exact
    (case-sensitive) set membership, so the input should already be
    normalised the same way as the stop-word list.

    Args:
        text: The document to filter.

    Returns:
        The surviving tokens joined by a single space (an empty string when
        every token is a stop word).
    """
    return ' '.join(
        token for token in word_tokenize(text) if token not in stop_words
    )

# Filter stop words out of each lemmatized document, keeping the original order.
# NOTE(review): because lemmatization runs first, any stop word the lemmatizer
# altered would no longer match the list — confirm this ordering is intended.
cleaned_no_stopwords = list(map(remove_stopwords, lemmatized_corpus))
print("\nText without Stopwords:", cleaned_no_stopwords)



Text without Stopwords: ['natural language processing nlp fun exciting', 'machine learning subfield artificial intelligence', 'deep learning part machine learning involving neural network', 'nlp technique include tokenization stemming lemmatization stopword removal']


In [None]:
# Initialize the LabelEncoder. The fitted instance is kept at module level
# because it is pickled when the outputs are saved.
label_encoder = LabelEncoder()

# Learn the set of distinct labels and encode `labels` as integer ids in one
# call. For the sample labels the recorded output is [1 1 0 0], i.e.
# 'AI' -> 0 and 'Technology' -> 1.
encoded_labels = label_encoder.fit_transform(labels)
print("\nEncoded Labels:", encoded_labels)



Encoded Labels: [1 1 0 0]


In [None]:
# Initialize TfidfVectorizer with its default settings. The fitted vectorizer
# is reused later to label the columns of the saved CSV.
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the stop-word-free documents and transform them into a
# TF-IDF representation in one call.
# NOTE(review): X_tfidf is presumably a SciPy sparse matrix (it exposes
# .toarray()) — confirm before relying on sparse-only operations.
X_tfidf = tfidf_vectorizer.fit_transform(cleaned_no_stopwords)

# Convert to a dense array for easier visualization: one row per document and
# one column per vocabulary term (4 x 23 in the recorded output).
tfidf_matrix = X_tfidf.toarray()
print("\nTF-IDF Matrix:\n", tfidf_matrix)

# Display the feature names (words); the same names are used as the column
# headers when the matrix is written to CSV.
print("\nTF-IDF Feature Names (Words):", tfidf_vectorizer.get_feature_names_out())



TF-IDF Matrix:
 [[0.         0.         0.42176478 0.42176478 0.         0.
  0.         0.42176478 0.         0.         0.         0.42176478
  0.         0.         0.3325242  0.         0.42176478 0.
  0.         0.         0.         0.         0.        ]
 [0.48546061 0.         0.         0.         0.         0.48546061
  0.         0.         0.38274272 0.         0.38274272 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.48546061 0.         0.        ]
 [0.         0.35119159 0.         0.         0.         0.
  0.35119159 0.         0.55376697 0.         0.27688349 0.
  0.35119159 0.35119159 0.         0.35119159 0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.36222393 0.
  0.         0.         0.         0.36222393 0.         0.
  0.         0.         0.2855815  0.         0.         0.36222393
  0.36222393 0.36222393 0.         0.36222393 0.36222393]]

TF-ID

In [None]:
def _write_lines(path, lines):
    """Write each string in ``lines`` to ``path``, one per line, as UTF-8.

    Args:
        path: Destination file path; an existing file is overwritten.
        lines: Iterable of strings, written in order with a trailing newline.
    """
    # Explicit encoding: without it the platform default is used, which can
    # fail on (or garble) non-ASCII text on some systems.
    with open(path, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in lines)


# Save each stage of the text pipeline to its own file, one document per line:
# cleaned text, lemmatized text, and text without stopwords.
_write_lines('cleaned_corpus.txt', cleaned_corpus)
_write_lines('lemmatized_corpus.txt', lemmatized_corpus)
_write_lines('no_stopwords_corpus.txt', cleaned_no_stopwords)

# Save TF-IDF matrix to a CSV file using pandas; columns are labelled with the
# vectorizer's feature names and the row index is omitted.
tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.to_csv('tfidf_matrix.csv', index=False)

# Save the fitted label encoder using pickle (binary mode, so no text
# encoding applies). Only unpickle this file from a trusted source.
with open('label_encoder.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)

print("\nOutputs have been saved!")



Outputs have been saved!
