## nltk means natural language tool kit which is a library for working with human language data
punkt is used to split text into sentences or words

In [2]:
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
 from nltk import word_tokenize, sent_tokenize     ## imports two functions
 sent = "Sachin is considered to be one of the greatest cricket players. Virat is the captain of the Indian cricket team"
 print(word_tokenize(sent))  ## splits sentences into words
 print(sent_tokenize(sent))  ## splits the string into sentences

['Sachin', 'is', 'considered', 'to', 'be', 'one', 'of', 'the', 'greatest', 'cricket', 'players', '.', 'Virat', 'is', 'the', 'captain', 'of', 'the', 'Indian', 'cricket', 'team']
['Sachin is considered to be one of the greatest cricket players.', 'Virat is the captain of the Indian cricket team']


In [4]:
 from nltk.corpus import stopwords  ## import common stop words from nltk
 import nltk
 nltk.download('stopwords')     ## 179 stop words are downloaded
 stop_words = stopwords.words('english')
 print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
 token = word_tokenize(sent) ## splits the sentences into words
 cleaned_token = []         ## creates a list to store the words after stop words removal
 for word in token:
  if word not in stop_words:  ## keeps the word which is not in stop word
    cleaned_token.append(word)
 print("This is the unclean version : ",token)
 print("This is the cleaned version : ",cleaned_token)

This is the unclean version :  ['Sachin', 'is', 'considered', 'to', 'be', 'one', 'of', 'the', 'greatest', 'cricket', 'players', '.', 'Virat', 'is', 'the', 'captain', 'of', 'the', 'Indian', 'cricket', 'team']
This is the cleaned version :  ['Sachin', 'considered', 'one', 'greatest', 'cricket', 'players', '.', 'Virat', 'captain', 'Indian', 'cricket', 'team']


In [6]:
 words = [cleaned_token.lower() for cleaned_token in cleaned_token if  ## lower converts into lower case
cleaned_token.isalpha()] ## alpha keeps the alphanumeric words and ignores the punctuation
 print(words)

['sachin', 'considered', 'one', 'greatest', 'cricket', 'players', 'virat', 'captain', 'indian', 'cricket', 'team']


## stemming reduces the words into base form like running is converted to run

In [8]:
 from nltk.stem import PorterStemmer  ## porter stemmer is a popular stemming algorithm
 stemmer = PorterStemmer()            ## stemmer object is created
 port_stemmer_output = [stemmer.stem(words) for words in words]
 print(port_stemmer_output)

['sachin', 'consid', 'one', 'greatest', 'cricket', 'player', 'virat', 'captain', 'indian', 'cricket', 'team']


## lemmatiztion is same as stemming but more accurate

In [9]:
 from nltk.stem import WordNetLemmatizer
 nltk.download('wordnet')
 lemmatizer = WordNetLemmatizer()
 lemmatizer_output = [lemmatizer.lemmatize(words) for words in words]
 print(lemmatizer_output)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['sachin', 'considered', 'one', 'greatest', 'cricket', 'player', 'virat', 'captain', 'indian', 'cricket', 'team']


## perform pos tagging i.e part of speech tagging i.e whether it is verb, noun,pronoun, etc

In [13]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords

# Download required resources
nltk.download('punkt')  # for word_tokenize
nltk.download('stopwords')  # for stopword removal
nltk.download('averaged_perceptron_tagger_eng')  ## model required to pos of each word


# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Tokenize the sentence
token = word_tokenize(sent)

# Remove stopwords
cleaned_token = [word for word in token if word.lower() not in stop_words]

# POS tagging
tagged = pos_tag(cleaned_token, lang='eng')

# Output the result
print("POS Tagged Tokens:")
print(tagged)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...


POS Tagged Tokens:
[('Sachin', 'NNP'), ('considered', 'VBD'), ('one', 'CD'), ('greatest', 'JJS'), ('cricket', 'NN'), ('players', 'NNS'), ('.', '.'), ('Virat', 'NNP'), ('captain', 'NN'), ('Indian', 'JJ'), ('cricket', 'NN'), ('team', 'NN')]


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


## tf means term frequency i.e no. of times the words appear in a document
idf means inverse document fre. which measures how importnat the word is in the entire set of documents

at last tf is multiplied with idf

more unique words have higher tf-idf

In [10]:
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import pandas as pd
 docs = [ "Sachin is considered to be one of the greatest cricket players",
 "Federer is considered one of the greatest tennis players",
 "Nadal is considered one of the greatest tennis players",
 "Virat is the captain of the Indian cricket team"]
 vectorizer = TfidfVectorizer(analyzer = "word", norm = None , use_idf 
= True , smooth_idf=True)
 Mat = vectorizer.fit(docs)
 print(Mat.vocabulary_)

{'sachin': 12, 'is': 7, 'considered': 2, 'to': 16, 'be': 0, 'one': 10, 'of': 9, 'the': 15, 'greatest': 5, 'cricket': 3, 'players': 11, 'federer': 4, 'tennis': 14, 'nadal': 8, 'virat': 17, 'captain': 1, 'indian': 6, 'team': 13}


## 'sachin' is at index 15 in the vector.

'cricket' is at index 3.

In [11]:
 tfidfMat = vectorizer.fit_transform(docs)
 print(tfidfMat)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 37 stored elements and shape (4, 18)>
  Coords	Values
  (0, 11)	1.2231435513142097
  (0, 3)	1.5108256237659907
  (0, 5)	1.2231435513142097
  (0, 15)	1.0
  (0, 9)	1.0
  (0, 10)	1.2231435513142097
  (0, 0)	1.916290731874155
  (0, 16)	1.916290731874155
  (0, 2)	1.2231435513142097
  (0, 7)	1.0
  (0, 12)	1.916290731874155
  (1, 14)	1.5108256237659907
  (1, 4)	1.916290731874155
  (1, 11)	1.2231435513142097
  (1, 5)	1.2231435513142097
  (1, 15)	1.0
  (1, 9)	1.0
  (1, 10)	1.2231435513142097
  (1, 2)	1.2231435513142097
  (1, 7)	1.0
  (2, 8)	1.916290731874155
  (2, 14)	1.5108256237659907
  (2, 11)	1.2231435513142097
  (2, 5)	1.2231435513142097
  (2, 15)	1.0
  (2, 9)	1.0
  (2, 10)	1.2231435513142097
  (2, 2)	1.2231435513142097
  (2, 7)	1.0
  (3, 13)	1.916290731874155
  (3, 6)	1.916290731874155
  (3, 1)	1.916290731874155
  (3, 17)	1.916290731874155
  (3, 3)	1.5108256237659907
  (3, 15)	2.0
  (3, 9)	1.0
  (3, 7)	1.0


## (3, 12) refers to the word at index 12 having a TF-IDF score of 0.447213 in the fourth document.



## all the important words are printed

In [12]:
features_names = vectorizer.get_feature_names_out()
print(features_names)

['be' 'captain' 'considered' 'cricket' 'federer' 'greatest' 'indian' 'is'
 'nadal' 'of' 'one' 'players' 'sachin' 'team' 'tennis' 'the' 'to' 'virat']


## converted into dense matrix
each row represents document and values are the tf idf values

In [13]:
 dense = tfidfMat.todense()
 denselist = dense.tolist()
 df = pd.DataFrame(denselist , columns = features_names)
 df

Unnamed: 0,be,captain,considered,cricket,federer,greatest,indian,is,nadal,of,one,players,sachin,team,tennis,the,to,virat
0,1.916291,0.0,1.223144,1.510826,0.0,1.223144,0.0,1.0,0.0,1.0,1.223144,1.223144,1.916291,0.0,0.0,1.0,1.916291,0.0
1,0.0,0.0,1.223144,0.0,1.916291,1.223144,0.0,1.0,0.0,1.0,1.223144,1.223144,0.0,0.0,1.510826,1.0,0.0,0.0
2,0.0,0.0,1.223144,0.0,0.0,1.223144,0.0,1.0,1.916291,1.0,1.223144,1.223144,0.0,0.0,1.510826,1.0,0.0,0.0
3,0.0,1.916291,0.0,1.510826,0.0,0.0,1.916291,1.0,0.0,1.0,0.0,0.0,0.0,1.916291,0.0,2.0,0.0,1.916291


In [15]:
features_names = sorted(vectorizer.get_feature_names_out()) ## sorts in alphabetical order


In [22]:
 docList = ['Doc 1','Doc 2','Doc 3','Doc 4']
 skDocsIfIdfdf = pd.DataFrame(tfidfMat.todense(),index = 
sorted(docList), columns=features_names)
 print(skDocsIfIdfdf)

             be   captain  considered   cricket   federer  greatest    indian  \
Doc 1  1.916291  0.000000    1.223144  1.510826  0.000000  1.223144  0.000000   
Doc 2  0.000000  0.000000    1.223144  0.000000  1.916291  1.223144  0.000000   
Doc 3  0.000000  0.000000    1.223144  0.000000  0.000000  1.223144  0.000000   
Doc 4  0.000000  1.916291    0.000000  1.510826  0.000000  0.000000  1.916291   

        is     nadal   of       one   players    sachin      team    tennis  \
Doc 1  1.0  0.000000  1.0  1.223144  1.223144  1.916291  0.000000  0.000000   
Doc 2  1.0  0.000000  1.0  1.223144  1.223144  0.000000  0.000000  1.510826   
Doc 3  1.0  1.916291  1.0  1.223144  1.223144  0.000000  0.000000  1.510826   
Doc 4  1.0  0.000000  1.0  0.000000  0.000000  0.000000  1.916291  0.000000   

       the        to     virat  
Doc 1  1.0  1.916291  0.000000  
Doc 2  1.0  0.000000  0.000000  
Doc 3  1.0  0.000000  0.000000  
Doc 4  2.0  0.000000  1.916291  


## This calculates the cosine similarity between each pair of documents in the TF-IDF matrix.

The cosine similarity is a measure of how similar two documents are, based on their TF-IDF vectors. The values range from 0 to 1, where:

1 means the documents are identical in terms of word usage.

0 means the documents are completely different.

In [26]:
csim = cosine_similarity(tfidfMat,tfidfMat)
csimDf = pd.DataFrame(csim,index=sorted(docList),columns=sorted(docList))
print(csimDf)

          Doc 1     Doc 2     Doc 3     Doc 4
Doc 1  1.000000  0.492416  0.492416  0.277687
Doc 2  0.492416  1.000000  0.754190  0.215926
Doc 3  0.492416  0.754190  1.000000  0.215926
Doc 4  0.277687  0.215926  0.215926  1.000000
