In [1]:
pip install pandas scikit-learn nltk



In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# 1. Setup & Downloads
# Fetch each NLTK resource in a fixed order. The cell ends on the status
# returned by the final download so that value is what the cell displays.
NLTK_RESOURCES = ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab')
download_results = [nltk.download(resource) for resource in NLTK_RESOURCES]
download_results[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Sample Dataset: five short sentences, each paired positionally with the
# category label at the same index.
sample_texts = [
    "The cats are running around the house!",
    "Dogs love playing in the park with a ball.",
    "I am eating a delicious apple for breakfast.",
    "The software engineers are building great programs.",
    "Apples and oranges are healthy fruits.",
]
sample_categories = ['animal', 'animal', 'food', 'tech', 'food']

# Column name -> column values, in the order the columns should appear.
data = {'text': sample_texts, 'category': sample_categories}

df = pd.DataFrame(data)

In [5]:
# 2. Text Cleaning, Stopword Removal, and Lemmatization
# Module-level helpers shared by the cleaning function: one lemmatizer
# instance and the English stop-word list held as a set for membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps, in order: replace every character that is not an ASCII letter or
    whitespace with a space, lowercase, tokenize with ``nltk.word_tokenize``,
    drop tokens found in the module-level ``stop_words`` set, and pass the
    remaining tokens through the module-level ``lemmatizer``.

    Args:
        text: The raw text to clean. Anything that is not a ``str`` (for
            example ``None`` or a NaN from a missing DataFrame cell) is
            treated as having no text.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when the input is
        not a string or when no tokens survive filtering.
    """
    # Missing cells reach this function as None/NaN when it is applied to a
    # column; return an empty document instead of letting re.sub raise.
    if not isinstance(text, str):
        return ""
    # Substitute a space (rather than deleting) so that words separated only
    # by punctuation or digits, e.g. "well-known" or "end.Start", are not
    # fused into a single token.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Lowercase before filtering so the comparison against stop_words matches
    # regardless of the original casing.
    text = text.lower()
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stop words and lemmatize
    cleaned_words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(cleaned_words)

# Run the cleaner over every entry of the raw text column and store the
# result alongside it in a new column.
df['cleaned_text'] = df['text'].map(clean_text)

In [6]:
# 3. Label Encoding (Categorical to Numerical)
# Fit the encoder on the category column first, then use the fitted encoder
# to turn that same column into its numeric codes.
label_encoder = LabelEncoder().fit(df['category'])
df['label_encoded'] = label_encoder.transform(df['category'])

In [7]:
# 4. TF-IDF Representation
# Fit the vectorizer on the cleaned text and keep the resulting matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Create a DataFrame for TF-IDF output: one row per document, one column per
# feature name reported by the fitted vectorizer.
tfidf_scores = tfidf_matrix.toarray()
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_scores, columns=tfidf_feature_names)

In [8]:
# 5. Display and Save Outputs
# Show the raw text next to its cleaned form and numeric label.
summary_columns = ['text', 'cleaned_text', 'label_encoded']
print("--- Processed Data ---")
print(df[summary_columns])

# Only the first five feature columns are printed to keep the preview narrow.
print("\n--- TF-IDF Matrix (Snippet) ---")
tfidf_preview = tfidf_df.iloc[:, :5]
print(tfidf_preview)

# Saving outputs: each frame goes to its own CSV, without the index column.
for frame, csv_path in (
    (df, "processed_text_data.csv"),
    (tfidf_df, "tfidf_features.csv"),
):
    frame.to_csv(csv_path, index=False)

print("\nSuccess: Outputs saved to 'processed_text_data.csv' and 'tfidf_features.csv'")

--- Processed Data ---
                                                text  \
0             The cats are running around the house!   
1         Dogs love playing in the park with a ball.   
2       I am eating a delicious apple for breakfast.   
3  The software engineers are building great prog...   
4             Apples and oranges are healthy fruits.   

                               cleaned_text  label_encoded  
0                  cat running around house              0  
1                dog love playing park ball              0  
2          eating delicious apple breakfast              1  
3  software engineer building great program              2  
4                apple orange healthy fruit              1  

--- TF-IDF Matrix (Snippet) ---
      apple  around      ball  breakfast  building
0  0.000000     0.5  0.000000   0.000000  0.000000
1  0.000000     0.0  0.447214   0.000000  0.000000
2  0.422242     0.0  0.000000   0.523358  0.000000
3  0.000000     0.0  0.000000   0.000