In [29]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



# Lower Casing

In [30]:
# Example sentence with mixed case, punctuation and digits; the later cells
# clean it up step by step.
text = "I AM Loving the NLP Class, but sometimes it feels confusing!!! 123"

# Fold everything to lower case so that case variants compare equal downstream.
lower_case = str.lower(text)
print(lower_case)

i am loving the nlp class, but sometimes it feels confusing!!! 123


# Punctuation and Digit Removal

In [31]:
# Keep only ASCII letters and whitespace: every other character (punctuation
# and also digits) is deleted. Whitespace is left untouched, so the space that
# preceded the removed "123" remains at the end of the string.
Text_clean = re.sub(
    r'[^a-zA-Z\s]',  # anything that is NOT a letter or whitespace
    '',
    lower_case,
)
print(Text_clean)

i am loving the nlp class but sometimes it feels confusing 


# Tokenization 

In [32]:
# Split the cleaned sentence into a list of word-level tokens using NLTK's
# word tokenizer.
tokens = word_tokenize(Text_clean)
print(tokens)

['i', 'am', 'loving', 'the', 'nlp', 'class', 'but', 'sometimes', 'it', 'feels', 'confusing']


# Stopword Removal

In [34]:
# Build the English stopword list once as a set so membership tests are cheap.
stopword = set(stopwords.words('english'))

# Drop every token that is a stopword. NOTE: `tokens` is rebound here, so the
# cells below operate on the filtered list rather than the raw tokenizer
# output. Re-running this cell is harmless because filtering an already
# filtered list yields the same list.
tokens = [tok for tok in tokens if tok not in stopword]
print(tokens)

['loving', 'nlp', 'class', 'sometimes', 'feels', 'confusing']


# Stemming

In [36]:
# Reduce each stopword-filtered token to its Porter stem. Stems are not
# necessarily dictionary words (see 'sometim' and 'confus' in the output).
stemmer = PorterStemmer()
sw = list(map(stemmer.stem, tokens))
sw

['love', 'nlp', 'class', 'sometim', 'feel', 'confus']

# Lemmatization

In [38]:
# Map each stopword-filtered token to its WordNet lemma.
# NOTE(review): no part-of-speech argument is passed to lemmatize(); in the
# output 'feels' becomes 'feel' while 'loving' and 'confusing' are left
# unchanged -- presumably because tokens are treated as nouns by default.
# Confirm against the NLTK docs before relying on these lemmas for verbs.
lem = WordNetLemmatizer()
lemmatizer_l = list(map(lem.lemmatize, tokens))
lemmatizer_l

['loving', 'nlp', 'class', 'sometimes', 'feel', 'confusing']

# Part-of-Speech (POS) Tagging

In [42]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs. The tagger is given the stopword-filtered tokens, not
# the original sentence.
ptag = nltk.pos_tag(tokens)
ptag

[('loving', 'VBG'),
 ('nlp', 'JJ'),
 ('class', 'NN'),
 ('sometimes', 'RB'),
 ('feels', 'VBZ'),
 ('confusing', 'VBG')]

# Vocabulary and Bag of Words

In [46]:
# Two-document toy corpus used for both the bag-of-words and TF-IDF examples.
corpus = [
    "I am loving the NLP class, but sometimes it feels confusing!!!",
    "NLP is a fascinating field it deals with text, speech, and language understanding.",
]

# Learn the vocabulary from the corpus and count term occurrences per document
# in one call; the result is a sparse document-term matrix (one row per
# document, one column per vocabulary term).
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(corpus)
bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 22 stored elements and shape (2, 20)>

In [51]:
# Vocabulary terms learned by the fitted CountVectorizer, listed in the same
# order as the columns of `bow`.
feature_name = vectorizer.get_feature_names_out()
feature_name

array(['am', 'and', 'but', 'class', 'confusing', 'deals', 'fascinating',
       'feels', 'field', 'is', 'it', 'language', 'loving', 'nlp',
       'sometimes', 'speech', 'text', 'the', 'understanding', 'with'],
      dtype=object)

In [52]:
# Show the term -> column-index mapping learned by the fitted CountVectorizer.
print(vectorizer.vocabulary_)

{'am': 0, 'loving': 12, 'the': 17, 'nlp': 13, 'class': 3, 'but': 2, 'sometimes': 14, 'it': 10, 'feels': 7, 'confusing': 4, 'is': 9, 'fascinating': 6, 'field': 8, 'deals': 5, 'with': 19, 'text': 16, 'speech': 15, 'and': 1, 'language': 11, 'understanding': 18}


In [58]:
# Convert the sparse count matrix to a dense array so every cell is visible
# (acceptable here because the matrix is only 2 x 20).
bow.toarray()

array([[1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1]])

# TF-IDF

In [61]:
# Fit a TF-IDF vectorizer on the same corpus. `tfidf` is the fitted
# vectorizer object; `tf_idf` is the resulting sparse matrix of weights
# (one row per document, one column per vocabulary term).
tfidf = TfidfVectorizer()
tf_idf = tfidf.fit_transform(corpus)
tf_idf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 22 stored elements and shape (2, 20)>

In [66]:

# Vocabulary terms of the fitted TF-IDF vectorizer; reused below as the
# column labels of the TF-IDF table.
ti = tfidf.get_feature_names_out()
ti

array(['am', 'and', 'but', 'class', 'confusing', 'deals', 'fascinating',
       'feels', 'field', 'is', 'it', 'language', 'loving', 'nlp',
       'sometimes', 'speech', 'text', 'the', 'understanding', 'with'],
      dtype=object)

In [71]:
# Present the TF-IDF weights as a labelled table: one row per document in
# `corpus`, one column per vocabulary term. The sparse matrix is densified
# first, which is fine at this size (2 x 20).
# pandas (`pd`) is imported in the notebook's import cell at the top, so all
# dependencies are declared in one place.
data = pd.DataFrame(tf_idf.toarray(), columns=ti)
data

Unnamed: 0,am,and,but,class,confusing,deals,fascinating,feels,field,is,it,language,loving,nlp,sometimes,speech,text,the,understanding,with
0,0.333102,0.0,0.333102,0.333102,0.333102,0.0,0.0,0.333102,0.0,0.0,0.237005,0.0,0.333102,0.237005,0.333102,0.0,0.0,0.333102,0.0,0.0
1,0.0,0.30134,0.0,0.0,0.0,0.30134,0.30134,0.0,0.30134,0.30134,0.214406,0.30134,0.0,0.214406,0.0,0.30134,0.30134,0.0,0.30134,0.30134
