In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [8]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aakan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aakan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aakan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aakan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
data = {
    'Text': [
        '  Natural Language Processing is fun',
        np.nan,
        '   ',
        'I love AI and machine learning',
        '',
        'Deep Learning is a subset of AI',
        '  NLP applications are amazing    '
    ]
}

df = pd.DataFrame(data)
print(df)


                                   Text
0    Natural Language Processing is fun
1                                   NaN
2                                      
3        I love AI and machine learning
4                                      
5       Deep Learning is a subset of AI
6      NLP applications are amazing    


In [10]:
df['Text'] = df['Text'].replace(r'^\s*$', np.nan, regex=True)
df.dropna(inplace=True)
df['Text'] = df['Text'].str.strip()
df = df[df['Text'] != '']
print(df)

                                 Text
0  Natural Language Processing is fun
3      I love AI and machine learning
5     Deep Learning is a subset of AI
6        NLP applications are amazing


In [11]:
# Tokenization
df['Tokens'] = df['Text'].apply(word_tokenize)
print("\nTokens (after Tokenization):")
print(df[['Text', 'Tokens']])


Tokens (after Tokenization):
                                 Text  \
0  Natural Language Processing is fun   
3      I love AI and machine learning   
5     Deep Learning is a subset of AI   
6        NLP applications are amazing   

                                     Tokens  
0  [Natural, Language, Processing, is, fun]  
3     [I, love, AI, and, machine, learning]  
5   [Deep, Learning, is, a, subset, of, AI]  
6         [NLP, applications, are, amazing]  


In [12]:
# POS Tagging
df['POS_Tags'] = df['Tokens'].apply(pos_tag)

# Display POS tags
print("\nPOS Tags:")
print(df[['Text', 'POS_Tags']])

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - 'C:\\Users\\aakan/nltk_data'
    - 'C:\\Program Files\\Python312\\nltk_data'
    - 'C:\\Program Files\\Python312\\share\\nltk_data'
    - 'C:\\Program Files\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\aakan\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [14]:
# Stop Words Removal
stop_words = set(stopwords.words('english'))
df['Filtered_Tokens'] = df['Tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Display filtered tokens (after stopwords removal)
print("\nFiltered Tokens (after Stopwords Removal):")
print(df[['Text', 'Filtered_Tokens']])


Filtered Tokens (after Stopwords Removal):
                                 Text                       Filtered_Tokens
0  Natural Language Processing is fun  [Natural, Language, Processing, fun]
3      I love AI and machine learning         [love, AI, machine, learning]
5     Deep Learning is a subset of AI          [Deep, Learning, subset, AI]
6        NLP applications are amazing          [NLP, applications, amazing]


In [15]:
# Stemming
stemmer = PorterStemmer()
df['Stemmed_Tokens'] = df['Filtered_Tokens'].apply(lambda x: [stemmer.stem(word) for word in x])

# Display stemmed tokens
print("\nStemmed Tokens:")
print(df[['Text', 'Stemmed_Tokens']])



Stemmed Tokens:
                                 Text                  Stemmed_Tokens
0  Natural Language Processing is fun  [natur, languag, process, fun]
3      I love AI and machine learning       [love, ai, machin, learn]
5     Deep Learning is a subset of AI       [deep, learn, subset, ai]
6        NLP applications are amazing             [nlp, applic, amaz]


In [16]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
df['Lemmatized_Tokens'] = df['Filtered_Tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Display lemmatized tokens
print("\nLemmatized Tokens:")
print(df[['Text', 'Lemmatized_Tokens']])


Lemmatized Tokens:
                                 Text                     Lemmatized_Tokens
0  Natural Language Processing is fun  [Natural, Language, Processing, fun]
3      I love AI and machine learning         [love, AI, machine, learning]
5     Deep Learning is a subset of AI          [Deep, Learning, subset, AI]
6        NLP applications are amazing           [NLP, application, amazing]


In [17]:
# TF-IDF Representation
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Text'])
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Display TF-IDF Matrix
print("\nTF-IDF Matrix:")
print(tfidf_df)


TF-IDF Matrix:
         ai  amazing       and  applications  are      deep       fun  \
0  0.000000      0.0  0.000000           0.0  0.0  0.000000  0.465162   
1  0.382743      0.0  0.485461           0.0  0.0  0.000000  0.000000   
2  0.357455      0.0  0.000000           0.0  0.0  0.453386  0.000000   
3  0.000000      0.5  0.000000           0.5  0.5  0.000000  0.000000   

         is  language  learning      love   machine   natural  nlp        of  \
0  0.366739  0.465162  0.000000  0.000000  0.000000  0.465162  0.0  0.000000   
1  0.000000  0.000000  0.382743  0.485461  0.485461  0.000000  0.0  0.000000   
2  0.357455  0.000000  0.357455  0.000000  0.000000  0.000000  0.0  0.453386   
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.5  0.000000   

   processing    subset  
0    0.465162  0.000000  
1    0.000000  0.000000  
2    0.000000  0.453386  
3    0.000000  0.000000  
