# Question No 1: Text Pre-processing

## Import libraries

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

## Download required resources

In [4]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/abdullah-
[nltk_data]     azhar/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/abdullah-
[nltk_data]     azhar/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/abdullah-azhar/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /home/abdullah-
[nltk_data]     azhar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Raw text

In [16]:
text = "I AM Loving the NLP Class, but sometimes it feels confusing!!! 123"

### 1. Lowercasing

In [17]:
text_lower=text.lower()

In [18]:
print("Lowercased Text:",text_lower)

Lowercased Text: i am loving the nlp class, but sometimes it feels confusing!!! 123


### 2. Remove numbers and punctuation

In [19]:
import re

In [20]:
text_clean=re.sub(r'[^a-zA-Z\s]','',text_lower)

In [21]:
print("Removed Numbers & Punctuation:",text_clean)

Removed Numbers & Punctuation: i am loving the nlp class but sometimes it feels confusing 


### 3. Tokenization

In [22]:
tokens=word_tokenize(text_clean)

In [23]:
print("Tokens:",tokens)

Tokens: ['i', 'am', 'loving', 'the', 'nlp', 'class', 'but', 'sometimes', 'it', 'feels', 'confusing']


### 4. Stopword removal

In [24]:
stop_words=set(stopwords.words('english'))

In [25]:
tokens=[word for word in tokens if word not in stop_words]

In [26]:
print("After Stopword Removal:",tokens)

After Stopword Removal: ['loving', 'nlp', 'class', 'sometimes', 'feels', 'confusing']


### 5. Stemming

In [30]:
stemmer=PorterStemmer()

In [31]:
words = [stemmer.stem(word) for word in tokens]

In [32]:
print("After Stemming:",words)

After Stemming: ['love', 'nlp', 'class', 'sometim', 'feel', 'confus']


### 6. Lemmatization

In [33]:
lemmatizer=WordNetLemmatizer()

In [34]:
words=[lemmatizer.lemmatize(word) for word in tokens]

In [35]:
print("After Lemmatization:",words)

After Lemmatization: ['loving', 'nlp', 'class', 'sometimes', 'feel', 'confusing']


### 7. POS Tagging

In [36]:
tags=nltk.pos_tag(tokens)

In [37]:
print("POS Tags:",tags)

POS Tags: [('loving', 'VBG'), ('nlp', 'JJ'), ('class', 'NN'), ('sometimes', 'RB'), ('feels', 'VBZ'), ('confusing', 'VBG')]


# Question No 2: Generating the vocabulary and BOW 

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

## Input corpus

In [39]:
corpus = [ "I am loving the NLP class, but sometimes it feels confusing!!!",
    "NLP is a fascinating field it deals with text, speech, and language understanding."]

## Create vocabulary and BOW

In [42]:
vectorizer=CountVectorizer()

In [43]:
x=vectorizer.fit_transform(corpus)

## Vocabulary

In [46]:
print("Vocabulary:",vectorizer.vocabulary_)

Vocabulary: {'am': 0, 'loving': 12, 'the': 17, 'nlp': 13, 'class': 3, 'but': 2, 'sometimes': 14, 'it': 10, 'feels': 7, 'confusing': 4, 'is': 9, 'fascinating': 6, 'field': 8, 'deals': 5, 'with': 19, 'text': 16, 'speech': 15, 'and': 1, 'language': 11, 'understanding': 18}


## Bag of Words Representation

In [47]:
print("BOW Matrix:\n",x.toarray())

BOW Matrix:
 [[1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0]
 [0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1]]


# Importing necessary libraries

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents 

In [51]:
documents=["I love programming in Python",
    "Python is great for data science",
    "Machine learning is a part of data science"]

# Create a TF-IDF Vectorizer

In [52]:
vectorizer=TfidfVectorizer()

# Fit and transform the documents into TF-IDF matrix

In [53]:
tfidf_matrix=vectorizer.fit_transform(documents)

# Get feature names (words)

In [55]:
feature_names=vectorizer.get_feature_names_out()
feature_names

array(['data', 'for', 'great', 'in', 'is', 'learning', 'love', 'machine',
       'of', 'part', 'programming', 'python', 'science'], dtype=object)

# Convert the TF-IDF result into a readable DataFrame

In [56]:
import pandas as pd
df=pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [57]:
print("TF-IDF Matrix:")
print(df)

TF-IDF Matrix:
      data       for     great        in       is  learning      love  \
0  0.00000  0.000000  0.000000  0.528635  0.00000  0.000000  0.528635   
1  0.36618  0.481482  0.481482  0.000000  0.36618  0.000000  0.000000   
2  0.31757  0.000000  0.000000  0.000000  0.31757  0.417567  0.000000   

    machine        of      part  programming   python  science  
0  0.000000  0.000000  0.000000     0.528635  0.40204  0.00000  
1  0.000000  0.000000  0.000000     0.000000  0.36618  0.36618  
2  0.417567  0.417567  0.417567     0.000000  0.00000  0.31757  
