In [1]:
!pip install nltk scikit-learn pandas



In [2]:
import pandas as pd
import numpy as np
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
data = {
    "text": [
        "I love Natural Language Processing!",
        "NLP is used in chatbots and search engines.",
        "Text cleaning is an important step in NLP.",
        "Machine learning models need clean data.",
        "I hate dealing with noisy and unstructured text.",
        "This model performs very poorly on messy data.",
        "The results are disappointing and inaccurate."
    ],
    "label": [
        "positive",
        "positive",
        "neutral",
        "neutral",
        "negative",
        "negative",
        "negative"
    ]
}

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,I love Natural Language Processing!,positive
1,NLP is used in chatbots and search engines.,positive
2,Text cleaning is an important step in NLP.,neutral
3,Machine learning models need clean data.,neutral
4,I hate dealing with noisy and unstructured text.,negative
5,This model performs very poorly on messy data.,negative
6,The results are disappointing and inaccurate.,negative


In [10]:
# Text Cleaning Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

In [11]:
# Lemmatization & Stopword Removal
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [12]:
# Apply Text Cleaning + Lemmatization
df["cleaned_text"] = df["text"].apply(clean_text)
df["processed_text"] = df["cleaned_text"].apply(preprocess_text)

df

Unnamed: 0,text,label,cleaned_text,processed_text
0,I love Natural Language Processing!,positive,i love natural language processing,love natural language processing
1,NLP is used in chatbots and search engines.,positive,nlp is used in chatbots and search engines,nlp used chatbots search engine
2,Text cleaning is an important step in NLP.,neutral,text cleaning is an important step in nlp,text cleaning important step nlp
3,Machine learning models need clean data.,neutral,machine learning models need clean data,machine learning model need clean data
4,I hate dealing with noisy and unstructured text.,negative,i hate dealing with noisy and unstructured text,hate dealing noisy unstructured text
5,This model performs very poorly on messy data.,negative,this model performs very poorly on messy data,model performs poorly messy data
6,The results are disappointing and inaccurate.,negative,the results are disappointing and inaccurate,result disappointing inaccurate


In [13]:
# Label Encoding
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label"])

df

Unnamed: 0,text,label,cleaned_text,processed_text,label_encoded
0,I love Natural Language Processing!,positive,i love natural language processing,love natural language processing,2
1,NLP is used in chatbots and search engines.,positive,nlp is used in chatbots and search engines,nlp used chatbots search engine,2
2,Text cleaning is an important step in NLP.,neutral,text cleaning is an important step in nlp,text cleaning important step nlp,1
3,Machine learning models need clean data.,neutral,machine learning models need clean data,machine learning model need clean data,1
4,I hate dealing with noisy and unstructured text.,negative,i hate dealing with noisy and unstructured text,hate dealing noisy unstructured text,0
5,This model performs very poorly on messy data.,negative,this model performs very poorly on messy data,model performs poorly messy data,0
6,The results are disappointing and inaccurate.,negative,the results are disappointing and inaccurate,result disappointing inaccurate,0


In [14]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer()

X_tfidf = tfidf.fit_transform(df["processed_text"])

X_tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf.get_feature_names_out()
)

X_tfidf_df

Unnamed: 0,chatbots,clean,cleaning,data,dealing,disappointing,engine,hate,important,inaccurate,...,noisy,performs,poorly,processing,result,search,step,text,unstructured,used
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
1,0.461804,0.0,0.0,0.0,0.0,0.0,0.461804,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.461804,0.0,0.0,0.0,0.461804
2,0.0,0.0,0.477923,0.0,0.0,0.0,0.0,0.0,0.477923,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.477923,0.396717,0.0,0.0
3,0.0,0.431207,0.0,0.357939,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.461804,0.0,0.0,0.461804,0.0,0.0,...,0.461804,0.0,0.0,0.0,0.0,0.0,0.0,0.383337,0.461804,0.0
5,0.0,0.0,0.0,0.396717,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.477923,0.477923,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,...,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0


In [15]:
# Save Outputs (CSV Files)
df.to_csv("processed_text_data.csv", index=False)
X_tfidf_df.to_csv("tfidf_features.csv", index=False)

print("Files saved successfully!")

Files saved successfully!
