In [None]:
!pip install nltk scikit-learn

import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Fetch the NLTK data packages into the local nltk_data directory.
# 'punkt_tab': tokenizer models.
# NOTE(review): no cell visible here calls an NLTK tokenizer (clean_text splits
# on whitespace) - confirm this package is still needed.
nltk.download('punkt_tab')
# 'wordnet': corpus used by the WordNetLemmatizer created later in the notebook.
nltk.download('wordnet')
# 'stopwords': word lists read later via stopwords.words('english').
# As the cell's last expression, this call's return value is what the cell displays.
nltk.download('stopwords')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Toy corpus: three example sentences, each paired with a category label.
texts = [
    "I love programming in Python! It's amazing, isn't it?",
    "Natural Language Processing is a complex field, but so interesting.",
    "Data cleaning, lemmatization, and vectorization are essential steps in NLP.",
]
labels = ['positive', 'neutral', 'positive']

# Column order (text, category) follows the insertion order of the dict.
data = {'text': texts, 'category': labels}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,text,category
0,"I love programming in Python! It's amazing, is...",positive
1,Natural Language Processing is a complex field...,neutral
2,"Data cleaning, lemmatization, and vectorizatio...",positive


In [None]:
# Module-level NLP resources shared by clean_text(); built once here so they are
# not re-created for every row.
stop_words = set(stopwords.words('english'))  # set -> fast membership checks
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """
    Clean and preprocess a single text value.

    Steps:
    - Convert the text to lowercase
    - Remove URLs, @mentions and #hashtags
    - Turn apostrophes into spaces so contractions split into pieces
      ("isn't" -> "isn t") that can be matched against the stop-word set,
      instead of collapsing into non-words such as "isnt"
    - Remove every remaining character that is not a-z or whitespace
      (punctuation, digits, non-ASCII characters)
    - Split on whitespace and drop tokens found in ``stop_words``
    - Lemmatize the remaining tokens and join them with single spaces

    Args:
        text: The raw text. Non-string values (e.g. NaN/None from a pandas
            column with missing entries) are treated as empty text.

    Returns:
        str: The cleaned, space-separated tokens; "" when nothing survives.
    """
    # Missing values in a DataFrame column arrive as float NaN / None and have
    # no .lower(); treat them as empty documents instead of raising.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Strip URLs, mentions and hashtags up to the next whitespace.
    text = re.sub(r'http\S+|www\S+|@\S+|#\S+', '', text)

    # Split contractions/possessives at the apostrophe (straight, typographic
    # or backtick) BEFORE punctuation is deleted, so the fragments stay
    # separate tokens for the stop-word filter below.
    text = re.sub(r"['\u2018\u2019`]", ' ', text)

    # Drop everything that is not a lowercase ASCII letter or whitespace.
    text = re.sub(r'[^a-z\s]', '', text)

    # Whitespace tokenization; stop words are filtered before lemmatizing.
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    ]

    return " ".join(cleaned_tokens)

# Run every raw document through clean_text and store the result next to the original.
df['cleaned_text'] = df['text'].map(clean_text)

# Preview raw vs. cleaned text side by side.
df.loc[:, ['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,"I love programming in Python! It's amazing, is...",love programming python amazing isnt
1,Natural Language Processing is a complex field...,natural language processing complex field inte...
2,"Data cleaning, lemmatization, and vectorizatio...",data cleaning lemmatization vectorization esse...


In [None]:
# Fit the encoder on the category column and store the integer codes next to it.
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Build a label -> code lookup from plain Python values. .tolist() unwraps the
# NumPy scalars, so the printed mapping reads {'neutral': 0, ...} regardless of
# the NumPy version (NumPy 2 prints np.int64(0) for raw scalars) and the dict
# can be serialized to JSON.
class_labels = label_encoder.classes_
class_codes = label_encoder.transform(class_labels)
label_mapping = dict(zip(class_labels.tolist(), class_codes.tolist()))
print("Label Mapping:", label_mapping)

df[['category', 'category_encoded']].head()

Label Mapping: {'neutral': 0, 'positive': 1}


Unnamed: 0,category,category_encoded
0,positive,1
1,neutral,0
2,positive,1


In [None]:
# Fit a TF-IDF vectorizer on the cleaned documents; the fitted object is kept
# because it is pickled later in the notebook.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Densify the result and label its columns with the fitted feature names so
# the weights can be inspected as a regular DataFrame.
feature_names = tfidf_vectorizer.get_feature_names_out()
dense_weights = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(data=dense_weights, columns=feature_names)
tfidf_df.head()

TF-IDF matrix shape: (3, 18)


Unnamed: 0,amazing,cleaning,complex,data,essential,field,interesting,isnt,language,lemmatization,love,natural,nlp,processing,programming,python,step,vectorization
0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0,0.0,0.0,0.447214,0.447214,0.0,0.0
1,0.0,0.0,0.408248,0.0,0.0,0.408248,0.408248,0.0,0.408248,0.0,0.0,0.408248,0.0,0.408248,0.0,0.0,0.0,0.0
2,0.0,0.377964,0.0,0.377964,0.377964,0.0,0.0,0.0,0.0,0.377964,0.0,0.0,0.377964,0.0,0.0,0.0,0.377964,0.377964


In [None]:
# Persist the processed frame (without the index column) as CSV.
df.to_csv('cleaned_data.csv', index=False)

# Pickle the fitted transformers, one file per object.
fitted_artifacts = (
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('label_encoder.pkl', label_encoder),
)
for pickle_path, fitted_obj in fitted_artifacts:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_obj, handle)

print("Outputs have been saved successfully!")

Outputs have been saved successfully!
