# NLP Assignment 3: Text Cleaning, Lemmatization, Stopwords, Encoding & TF-IDF

This notebook demonstrates a small text-preprocessing and feature-extraction pipeline (cleaning, lemmatization, stopword removal, label encoding, and TF-IDF) using NLTK and scikit-learn.

In [7]:

import nltk

# NLTK resources used by the later cells: tokenizer data ('punkt', 'punkt_tab'),
# the stopword corpus, and the WordNet data behind the lemmatizer.
# 'punkt_tab' is fetched here, ahead of the first word_tokenize call, so the
# notebook runs top to bottom on a fresh kernel (Restart & Run All).
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Sample Dataset

In [8]:

# Each sample sentence is kept next to its sentiment label so the two lists
# derived below cannot drift out of alignment.
labelled_samples = [
    ("I love Natural Language Processing!", "positive"),
    ("Text cleaning is an important step in NLP.", "neutral"),
    ("Machine learning models need clean data.", "neutral"),
]
texts = [sentence for sentence, _ in labelled_samples]
labels = [tag for _, tag in labelled_samples]

print(texts)
print(labels)


['I love Natural Language Processing!', 'Text cleaning is an important step in NLP.', 'Machine learning models need clean data.']
['positive', 'neutral', 'neutral']


## Text Cleaning

In [9]:

import re

def clean_text(text):
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    Args:
        text: The input string.

    Returns:
        The lowercased text in which every character other than ``a``-``z``
        or whitespace has been replaced by a space, with whitespace runs
        collapsed to one space and leading/trailing whitespace stripped.
    """
    text = text.lower()
    # Substitute a space (not an empty string) so that words joined only by
    # punctuation, e.g. "clean,data", are not fused into a single token.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # The substitution can leave runs of spaces; collapse and trim them.
    return re.sub(r'\s+', ' ', text).strip()

# Run every raw sentence through clean_text, preserving order.
cleaned_texts = list(map(clean_text, texts))
print(cleaned_texts)


['i love natural language processing', 'text cleaning is an important step in nlp', 'machine learning models need clean data']


## Lemmatization

In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

# Tokenize each cleaned sentence, lemmatize token by token, and re-join with
# single spaces. lemmatize() is called without a POS argument here.
lemmatized_texts = [
    " ".join(lemmatizer.lemmatize(tok) for tok in word_tokenize(sentence))
    for sentence in cleaned_texts
]

print(lemmatized_texts)

['i love natural language processing', 'text cleaning is an important step in nlp', 'machine learning model need clean data']


In [11]:
# Fetch the 'punkt_tab' tokenizer resource.
# NOTE(review): word_tokenize is already called in the lemmatization cell
# above, so on a fresh kernel this download presumably has to happen before
# that cell. Confirm, and keep it with the other nltk.download calls.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Stopword Removal

In [12]:

from nltk.corpus import stopwords

# English stopword vocabulary, held in a set for membership tests.
stop_words = set(stopwords.words("english"))

# Keep only the whitespace-separated words that are not stopwords, then
# rebuild each sentence with single spaces.
filtered_texts = [
    " ".join(word for word in sentence.split() if word not in stop_words)
    for sentence in lemmatized_texts
]

print(filtered_texts)


['love natural language processing', 'text cleaning important step nlp', 'machine learning model need clean data']


## Label Encoding

In [13]:

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the label strings, then map each label to its integer id.
encoder = LabelEncoder().fit(labels)
encoded_labels = encoder.transform(labels)

print("Original Labels:", labels)
print("Encoded Labels:", encoded_labels)


Original Labels: ['positive', 'neutral', 'neutral']
Encoded Labels: [1 0 0]


## TF-IDF Representation

In [14]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the stopword-filtered sentences and produce one
# TF-IDF row per sentence.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(filtered_texts)

vocabulary = vectorizer.get_feature_names_out()
dense_tfidf = tfidf_matrix.toarray()  # array form, used only for display

print("TF-IDF Feature Names:", vocabulary)
print("TF-IDF Matrix:\n", dense_tfidf)


TF-IDF Feature Names: ['clean' 'cleaning' 'data' 'important' 'language' 'learning' 'love'
 'machine' 'model' 'natural' 'need' 'nlp' 'processing' 'step' 'text']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.5        0.
  0.5        0.         0.         0.5        0.         0.
  0.5        0.         0.        ]
 [0.         0.4472136  0.         0.4472136  0.         0.
  0.         0.         0.         0.         0.         0.4472136
  0.         0.4472136  0.4472136 ]
 [0.40824829 0.         0.40824829 0.         0.         0.40824829
  0.         0.40824829 0.40824829 0.         0.40824829 0.
  0.         0.         0.        ]]


## Saving Outputs

In [15]:

import pandas as pd

# Build a table with one column per TF-IDF term plus the encoded label, then
# write it to CSV without the index column.
feature_columns = vectorizer.get_feature_names_out()
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_columns).assign(label=encoded_labels)

df.to_csv("tfidf_output.csv", index=False)
print("TF-IDF output saved as tfidf_output.csv")


TF-IDF output saved as tfidf_output.csv
