In [1]:
import pandas as pd

# Toy corpus: three short sentences, each paired with a sentiment label.
data = dict(
    text=[
        "I love Natural Language Processing!",
        "NLP helps computers understand human language.",
        "Text cleaning is an important step in NLP.",
    ],
    label=["positive", "positive", "neutral"],
)

# One row per sentence; later cells append derived columns to this frame.
df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,I love Natural Language Processing!,positive
1,NLP helps computers understand human language.,positive
2,Text cleaning is an important step in NLP.,neutral


In [2]:
import nltk
import re

# Fetch every NLTK resource the notebook needs in this one setup cell, so a
# fresh top-to-bottom run has them all before any tokenizing happens.
nltk.download('punkt')
# A later cell has to fetch 'punkt_tab' before it can call word_tokenize;
# requesting it here as well means the tokenizer data is present from the start.
nltk.download('punkt_tab')
nltk.download('stopwords')   # English stop-word list
nltk.download('wordnet')     # data behind WordNetLemmatizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
def clean_text(text):
    """Normalise raw text to lowercase ASCII letters separated by single spaces.

    Every character other than ``a``-``z`` and whitespace is deleted outright
    (it is not replaced by a space), so digits, punctuation and non-ASCII
    letters disappear and the characters on either side of them are joined.

    Args:
        text: The raw string. Any non-string value (for example ``None`` or a
            float ``NaN`` coming from a missing DataFrame cell) is treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned string with runs of whitespace collapsed to one space and
        no leading/trailing whitespace; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # Missing cells have no .lower(); map them to empty text.
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation & numbers
    text = re.sub(r'\s+', ' ', text)      # remove extra spaces
    return text.strip()

# Run clean_text over each raw sentence and keep the result as a new column.
df["cleaned_text"] = df["text"].map(clean_text)
df


Unnamed: 0,text,label,cleaned_text
0,I love Natural Language Processing!,positive,i love natural language processing
1,NLP helps computers understand human language.,positive,nlp helps computers understand human language
2,Text cleaning is an important step in NLP.,neutral,text cleaning is an important step in nlp


In [5]:
import nltk
# Tokenizer data fetched here, ahead of the first word_tokenize call in this notebook.
nltk.download('punkt_tab')

# English stop-word list held as a set for the membership test below.
stop_words = {*stopwords.words("english")}

def remove_stopwords(text):
    """Return *text* with every token found in ``stop_words`` removed.

    The text is split with ``word_tokenize``; surviving tokens keep their
    original order and are re-joined with single spaces. Matching is an exact
    membership test, so it is case-sensitive.
    """
    kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
    return " ".join(kept_tokens)

# Strip stop words from the already-cleaned text and store the result alongside it.
df["no_stopwords"] = df["cleaned_text"].map(remove_stopwords)
df

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,text,label,cleaned_text,no_stopwords
0,I love Natural Language Processing!,positive,i love natural language processing,love natural language processing
1,NLP helps computers understand human language.,positive,nlp helps computers understand human language,nlp helps computers understand human language
2,Text cleaning is an important step in NLP.,neutral,text cleaning is an important step in nlp,text cleaning important step nlp


In [6]:
# One lemmatizer instance, shared by every call to lemmatize_text.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each token of *text* and return the results joined by spaces.

    The text is split with ``word_tokenize`` and every token is passed to
    ``lemmatizer.lemmatize`` on its own.

    NOTE(review): ``lemmatize`` is called without a part-of-speech argument,
    so whatever default it uses applies to every token - confirm that is
    the intended behaviour.
    """
    return " ".join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Lemmatize the stop-word-free text; this column is what the TF-IDF step reads.
df["lemmatized_text"] = df["no_stopwords"].map(lemmatize_text)
df


Unnamed: 0,text,label,cleaned_text,no_stopwords,lemmatized_text
0,I love Natural Language Processing!,positive,i love natural language processing,love natural language processing,love natural language processing
1,NLP helps computers understand human language.,positive,nlp helps computers understand human language,nlp helps computers understand human language,nlp help computer understand human language
2,Text cleaning is an important step in NLP.,neutral,text cleaning is an important step in nlp,text cleaning important step nlp,text cleaning important step nlp


In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the string labels and convert them to integer codes in one call.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["label"])

# Keep the integer codes next to the original labels.
df["label_encoded"] = encoded_labels
df


Unnamed: 0,text,label,cleaned_text,no_stopwords,lemmatized_text,label_encoded
0,I love Natural Language Processing!,positive,i love natural language processing,love natural language processing,love natural language processing,1
1,NLP helps computers understand human language.,positive,nlp helps computers understand human language,nlp helps computers understand human language,nlp help computer understand human language,1
2,Text cleaning is an important step in NLP.,neutral,text cleaning is an important step in nlp,text cleaning important step nlp,text cleaning important step nlp,0


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary from the lemmatized sentences and score them in one pass.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["lemmatized_text"])

# Expand the matrix into a dense table: one row per sentence, one column per term.
dense_scores = tfidf_matrix.toarray()
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

tfidf_df


Unnamed: 0,cleaning,computer,help,human,important,language,love,natural,nlp,processing,step,text,understand
0,0.0,0.0,0.0,0.0,0.0,0.40204,0.528635,0.528635,0.0,0.528635,0.0,0.0,0.0
1,0.0,0.440362,0.440362,0.440362,0.0,0.334907,0.0,0.0,0.334907,0.0,0.0,0.0,0.440362
2,0.467351,0.0,0.0,0.0,0.467351,0.0,0.0,0.0,0.355432,0.0,0.467351,0.467351,0.0


In [9]:
# Persist the frame with all derived text columns; the row index is not written.
df.to_csv(path_or_buf="cleaned_text_data.csv", index=False)


In [10]:
# Persist the TF-IDF feature table; the row index is not written.
tfidf_df.to_csv(path_or_buf="tfidf_features.csv", index=False)
