In [7]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages this notebook relies on, one download call
# per package id, in a fixed order.
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

print("Libraries imported and NLTK data downloaded.")

Libraries imported and NLTK data downloaded.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
# Toy corpus: five short sentences, each paired with a topic label.
# The two lists are positionally aligned (text[i] belongs to category[i]).
data = dict(
    text=[
        "The cat is sitting on the mat.",
        "Dogs are great pets and very loyal.",
        "I love coding in Python everyday.",
        "It is raining cats and dogs outside!",
        "The quick brown fox jumps over the lazy dog.",
    ],
    category=[
        "Animal",
        "Animal",
        "Tech",
        "Weather",
        "Animal",
    ],
)

# One row per sentence, columns 'text' and 'category'.
df = pd.DataFrame(data)

print("--- Original Data ---")
display(df)

--- Original Data ---


Unnamed: 0,text,category
0,The cat is sitting on the mat.,Animal
1,Dogs are great pets and very loyal.,Animal
2,I love coding in Python everyday.,Tech
3,It is raining cats and dogs outside!,Weather
4,The quick brown fox jumps over the lazy dog.,Animal


In [9]:
# Module-level resources read by preprocess_text:
#   stop_words - NLTK's English stop-word list, held as a set
#   lemmatizer - a single WordNetLemmatizer instance reused for every token
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw document into a space-joined string of lemmas.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, tokenize with ``word_tokenize``, drop the
    tokens found in ``stop_words``, and lemmatize what remains.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or a float NaN coming from a missing cell) is treated
            as an empty document instead of raising.

    Returns:
        str: The surviving lemmas joined by single spaces, or ``""`` when
        the input is not a string or no token survives the filtering.
    """
    # Missing values have no .lower(); map them to an empty document so the
    # function can be applied to a column that contains gaps.
    if not isinstance(text, str):
        return ""

    # 1. Text Cleaning: Lowercase
    text = text.lower()

    # 2. Text Cleaning: Remove non-alphabetic characters (keep spaces).
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 3. Tokenization
    tokens = word_tokenize(text)

    # 4. Remove Stop Words and Perform Lemmatization.
    # lemmatize() is called without a part-of-speech argument, so its
    # default applies (in the sample data "sitting" and "raining" are left
    # unchanged while plurals such as "dogs" are reduced).
    clean_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]

    # Join back into a string
    return " ".join(clean_tokens)

# Run the cleaner over every row of 'text' and store the result alongside
# the original in a new 'processed_text' column.
df['processed_text'] = df['text'].map(preprocess_text)

print("--- Data after Preprocessing ---")
display(df.loc[:, ['text', 'processed_text']])

--- Data after Preprocessing ---


Unnamed: 0,text,processed_text
0,The cat is sitting on the mat.,cat sitting mat
1,Dogs are great pets and very loyal.,dog great pet loyal
2,I love coding in Python everyday.,love coding python everyday
3,It is raining cats and dogs outside!,raining cat dog outside
4,The quick brown fox jumps over the lazy dog.,quick brown fox jump lazy dog


In [10]:
# Initialize Label Encoder
label_encoder = LabelEncoder()

# Fit on the 'category' column and store the integer codes in a new column
df['category_encoded'] = label_encoder.fit_transform(df['category'])

print("--- Data with Label Encoding ---")
display(df[['category', 'category_encoded']])

# Display the mapping to understand which number equals which label.
# .tolist() turns the NumPy scalars into plain Python values, so the dict
# prints as {'Animal': 0, ...} instead of {'Animal': np.int64(0), ...}.
mapping = dict(zip(
    label_encoder.classes_.tolist(),
    label_encoder.transform(label_encoder.classes_).tolist(),
))
print("\nLabel Mapping:", mapping)

--- Data with Label Encoding ---


Unnamed: 0,category,category_encoded
0,Animal,0
1,Animal,0
2,Tech,1
3,Weather,2
4,Animal,0



Label Mapping: {'Animal': np.int64(0), 'Tech': np.int64(1), 'Weather': np.int64(2)}


In [11]:
# Create a TF-IDF vectorizer with its default settings, then learn the
# vocabulary from the cleaned text and compute the weighted matrix in one call.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

# Densify the sparse result with .toarray() and label each column with its
# vocabulary term so the scores can be displayed and written to CSV.
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print("--- TF-IDF Matrix (first 5 rows) ---")
display(df_tfidf.head(5))

--- TF-IDF Matrix (first 5 rows) ---


Unnamed: 0,brown,cat,coding,dog,everyday,fox,great,jump,lazy,love,loyal,mat,outside,pet,python,quick,raining,sitting
0,0.0,0.495524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.614189,0.0,0.0,0.0,0.0,0.0,0.614189
1,0.0,0.0,0.0,0.360638,0.0,0.0,0.538498,0.0,0.0,0.0,0.538498,0.0,0.0,0.538498,0.0,0.0,0.0,0.0
2,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
3,0.0,0.45827,0.0,0.380406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.568014,0.0,0.0,0.0,0.568014,0.0
4,0.428411,0.0,0.0,0.286912,0.0,0.428411,0.0,0.428411,0.428411,0.0,0.0,0.0,0.0,0.0,0.0,0.428411,0.0,0.0


In [12]:
# Persist both artefacts as CSV, omitting the row index:
#   processed_dataset.csv - the full df (text, labels, encoded labels)
#   tfidf_features.csv    - the TF-IDF feature table
df.to_csv("processed_dataset.csv", index=False)
df_tfidf.to_csv("tfidf_features.csv", index=False)

# Single print call; sep="\n" puts each message on its own line.
print(
    "Files saved successfully:",
    "1. processed_dataset.csv",
    "2. tfidf_features.csv",
    sep="\n",
)

Files saved successfully:
1. processed_dataset.csv
2. tfidf_features.csv
