### Importing Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

### Sample Document

In [2]:
# Sample sentence used by every preprocessing step below; it is written as
# adjacent string pieces that Python joins into one literal.
sample_doc = (
    "Life is like a camera. "
    "Focus on what's important, capture the good times, "
    "develop from the negatives, "
    "and if things don't work out, take another shot."
)

### Tokenization

In [3]:
# Split the sample sentence into tokens. As the output below shows, punctuation
# becomes its own token and contractions are split ("what's" -> 'what', "'s").
tokenized_words = word_tokenize(sample_doc)
# Show the raw text next to its tokenized form for comparison.
print("Original Doc/text :\n",sample_doc)
print("\n\nTokenized Text :\n",tokenized_words)

Original Doc/text :
 Life is like a camera. Focus on what's important, capture the good times, develop from the negatives, and if things don't work out, take another shot.


Tokenized Text :
 ['Life', 'is', 'like', 'a', 'camera', '.', 'Focus', 'on', 'what', "'s", 'important', ',', 'capture', 'the', 'good', 'times', ',', 'develop', 'from', 'the', 'negatives', ',', 'and', 'if', 'things', 'do', "n't", 'work', 'out', ',', 'take', 'another', 'shot', '.']


### Stop Words Removal

In [4]:
#Printing the stopwords
# Load NLTK's English stop-word list; the printed entries are lowercase strings.
# The list is used below to filter the tokenized sentence.
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# Drop stop words while keeping each surviving token's original casing
# (the POS-tagging step further down consumes `cleaned_token` as-is).
#
# The NLTK English stop-word list is entirely lowercase, so each token is
# lowercased for the membership test only; a case-sensitive test would let
# capitalised stop words such as "The" or "If" slip through.
# A set gives constant-time lookups instead of scanning the list per token.
stop_word_lookup = set(stop_words)
cleaned_token = [word for word in tokenized_words if word.lower() not in stop_word_lookup]

# Show the token list before and after filtering.
print("Tokenized sentence without stopword removal :\n",tokenized_words)
print("\n\nTokenized sentence with stopword removal :\n",cleaned_token)

Tokenized sentence without stopword removal :
 ['Life', 'is', 'like', 'a', 'camera', '.', 'Focus', 'on', 'what', "'s", 'important', ',', 'capture', 'the', 'good', 'times', ',', 'develop', 'from', 'the', 'negatives', ',', 'and', 'if', 'things', 'do', "n't", 'work', 'out', ',', 'take', 'another', 'shot', '.']


Tokenized sentence with stopword removal :
 ['Life', 'like', 'camera', '.', 'Focus', "'s", 'important', ',', 'capture', 'good', 'times', ',', 'develop', 'negatives', ',', 'things', "n't", 'work', ',', 'take', 'another', 'shot', '.']


In [6]:
#Cleaning
# Keep only purely alphabetic tokens (this drops punctuation and contraction
# fragments such as "'s" and "n't") and lowercase what remains.
# The loop variable is called `token` so that it does not shadow the
# `cleaned_token` list being iterated over.
words = [token.lower() for token in cleaned_token if token.isalpha()]
print(words)

['life', 'like', 'camera', 'focus', 'important', 'capture', 'good', 'times', 'develop', 'negatives', 'things', 'work', 'take', 'another', 'shot']


### Stemming

In [7]:
# Reduce every cleaned word to its Porter stem. Stems are truncated forms and
# need not be dictionary words (see 'focu', 'captur', 'anoth' in the output).
stemmer = PorterStemmer()
# `word` (singular) is the loop variable so the `words` list is not shadowed.
port_stemmer_output = [stemmer.stem(word) for word in words]
print(port_stemmer_output)

['life', 'like', 'camera', 'focu', 'import', 'captur', 'good', 'time', 'develop', 'neg', 'thing', 'work', 'take', 'anoth', 'shot']


### Lemmatization

In [8]:
# Map every cleaned word to its WordNet lemma. Unlike stemming, the results
# are dictionary forms ('negatives' -> 'negative', 'times' -> 'time').
# No part of speech is passed, so the lemmatizer's default applies
# (noun, per NLTK's documented `lemmatize(word, pos='n')` signature).
lemmatizer = WordNetLemmatizer()
# `word` (singular) is the loop variable so the `words` list is not shadowed.
lemmatizer_output = [lemmatizer.lemmatize(word) for word in words]
print(lemmatizer_output)

['life', 'like', 'camera', 'focus', 'important', 'capture', 'good', 'time', 'develop', 'negative', 'thing', 'work', 'take', 'another', 'shot']


### POS Tagging

In [9]:
# Tag each token with its part of speech, producing (token, tag) pairs.
# The input is `cleaned_token` (stop words removed, original casing and
# punctuation kept), not the lowercased alphabetic `words` list.
# NOTE(review): tagging after stop-word removal gives the tagger less sentence
# context than the full token list would — confirm this is intended.
tagged = pos_tag(cleaned_token)
print(tagged)

[('Life', 'NNP'), ('like', 'IN'), ('camera', 'NN'), ('.', '.'), ('Focus', 'NNP'), ("'s", 'POS'), ('important', 'JJ'), (',', ','), ('capture', 'NN'), ('good', 'JJ'), ('times', 'NNS'), (',', ','), ('develop', 'VB'), ('negatives', 'NNS'), (',', ','), ('things', 'NNS'), ("n't", 'RB'), ('work', 'VB'), (',', ','), ('take', 'VB'), ('another', 'DT'), ('shot', 'NN'), ('.', '.')]


### Calculation of Term Frequency and Inverse Document Frequency

In [10]:
# Toy corpus for the TF-IDF demonstration: five short sentences that share
# several terms ("brown" appears in every one).
doc = [
    "The quick brown fox jumps over the lazy dog",
    "The lazy cat sleeps on the brown rug",
    "Brown bears are common in this area",
    "The quick fox runs faster than the brown dog",
    "The lazy dog lies down under the brown tree",
]

In [11]:
# Word-level TF-IDF with smoothed IDF and no row normalisation (norm=None),
# so the resulting weights are left unscaled.
vectorizer = TfidfVectorizer(
    analyzer="word",
    norm=None,
    use_idf=True,
    smooth_idf=True,
)

# fit() learns the vocabulary and IDF weights and returns the fitted
# vectorizer itself, so `Mat` refers to the same object as `vectorizer`
# (it is not a matrix).
Mat = vectorizer.fit(doc)
vocabulary = Mat.vocabulary_

# vocabulary_ maps each term to its column index in the TF-IDF matrix.
for term, column_index in vocabulary.items():
    print(f"{term}: {column_index}")

the: 21
quick: 16
brown: 3
fox: 9
jumps: 11
over: 15
lazy: 12
dog: 6
cat: 4
sleeps: 19
on: 14
rug: 17
bears: 2
are: 0
common: 5
in: 10
this: 22
area: 1
runs: 18
faster: 8
than: 20
lies: 13
down: 7
under: 24
tree: 23


In [12]:
# fit_transform() fits the vectorizer on `doc` again and returns the TF-IDF
# matrix. Printing it lists the stored entries as
# "(document row, term column)  weight".
tfidfMat = vectorizer.fit_transform(doc)
print(tfidfMat)

  (0, 6)	1.4054651081081644
  (0, 12)	1.4054651081081644
  (0, 15)	2.09861228866811
  (0, 11)	2.09861228866811
  (0, 9)	1.6931471805599454
  (0, 3)	1.0
  (0, 16)	1.6931471805599454
  (0, 21)	2.3646431135879094
  (1, 17)	2.09861228866811
  (1, 14)	2.09861228866811
  (1, 19)	2.09861228866811
  (1, 4)	2.09861228866811
  (1, 12)	1.4054651081081644
  (1, 3)	1.0
  (1, 21)	2.3646431135879094
  (2, 1)	2.09861228866811
  (2, 22)	2.09861228866811
  (2, 10)	2.09861228866811
  (2, 5)	2.09861228866811
  (2, 0)	2.09861228866811
  (2, 2)	2.09861228866811
  (2, 3)	1.0
  (3, 20)	2.09861228866811
  (3, 8)	2.09861228866811
  (3, 18)	2.09861228866811
  (3, 6)	1.4054651081081644
  (3, 9)	1.6931471805599454
  (3, 3)	1.0
  (3, 16)	1.6931471805599454
  (3, 21)	2.3646431135879094
  (4, 23)	2.09861228866811
  (4, 24)	2.09861228866811
  (4, 7)	2.09861228866811
  (4, 13)	2.09861228866811
  (4, 6)	1.4054651081081644
  (4, 12)	1.4054651081081644
  (4, 3)	1.0
  (4, 21)	2.3646431135879094


In [13]:
# Terms in column order: feature_names[j] is the term for column j of tfidfMat
# (the indices match the vocabulary mapping printed earlier).
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['are' 'area' 'bears' 'brown' 'cat' 'common' 'dog' 'down' 'faster' 'fox'
 'in' 'jumps' 'lazy' 'lies' 'on' 'over' 'quick' 'rug' 'runs' 'sleeps'
 'than' 'the' 'this' 'tree' 'under']


In [14]:
# Expand the TF-IDF matrix into a dense one, then convert it to nested Python
# lists (one inner list per document) ready for building a DataFrame.
dense = tfidfMat.todense()
denselist = dense.tolist()

In [15]:
# Create a pandas DataFrame of the feature names and their TF-IDF values:
# one row per document, one column per term.
df = pd.DataFrame(denselist,columns = feature_names)
df

Unnamed: 0,are,area,bears,brown,cat,common,dog,down,faster,fox,...,over,quick,rug,runs,sleeps,than,the,this,tree,under
0,0.0,0.0,0.0,1.0,0.0,0.0,1.405465,0.0,0.0,1.693147,...,2.098612,1.693147,0.0,0.0,0.0,0.0,2.364643,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,2.098612,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.098612,0.0,2.098612,0.0,2.364643,0.0,0.0,0.0
2,2.098612,2.098612,2.098612,1.0,0.0,2.098612,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.098612,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.405465,0.0,2.098612,1.693147,...,0.0,1.693147,0.0,2.098612,0.0,2.098612,2.364643,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.405465,2.098612,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.364643,0.0,2.098612,2.098612
