In [1]:
text = '''The lion (Panthera leo) is a large cat of the genus Panthera,
It is a social species, forming groups called prides.
A lion's pride consists of a few adult males, related females, and cubs.
Groups of female lions usually hunt together, preying mostly on large ungulates.
The lion inhabits grasslands, savannahs, and shrublands.
It is usually more diurnal than other wild cats, but when persecuted,
It has been listed as Vulnerable on the IUCN Red List since 1996
Lion populations are untenable outside designated protected areas.
Although the cause of the decline is not fully understood,
One of the most widely recognised animal symbols in human culture,
the lion has been extensively depicted in sculptures and paintings,
on national flags, and in literature and films.
native to Africa and India. It has a muscular, broad-chested body; a short, rounded head; round ears; and a hairy tuft at the end of its tail.
it adapts to being active at night and at twilight.
because populations in African countries have declined by about 43%
since the early 1990s. habitat loss and conflicts with humans are the greatest causes for concern.
'''

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import math

In [3]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
tokens = word_tokenize(text)
tokens

['The',
 'lion',
 '(',
 'Panthera',
 'leo',
 ')',
 'is',
 'a',
 'large',
 'cat',
 'of',
 'the',
 'genus',
 'Panthera',
 ',',
 'It',
 'is',
 'a',
 'social',
 'species',
 ',',
 'forming',
 'groups',
 'called',
 'prides',
 '.',
 'A',
 'lion',
 "'s",
 'pride',
 'consists',
 'of',
 'a',
 'few',
 'adult',
 'males',
 ',',
 'related',
 'females',
 ',',
 'and',
 'cubs',
 '.',
 'Groups',
 'of',
 'female',
 'lions',
 'usually',
 'hunt',
 'together',
 ',',
 'preying',
 'mostly',
 'on',
 'large',
 'ungulates',
 '.',
 'The',
 'lion',
 'inhabits',
 'grasslands',
 ',',
 'savannahs',
 ',',
 'and',
 'shrublands',
 '.',
 'It',
 'is',
 'usually',
 'more',
 'diurnal',
 'than',
 'other',
 'wild',
 'cats',
 ',',
 'but',
 'when',
 'persecuted',
 ',',
 'It',
 'has',
 'been',
 'listed',
 'as',
 'Vulnerable',
 'on',
 'the',
 'IUCN',
 'Red',
 'List',
 'since',
 '1996',
 'Lion',
 'populations',
 'are',
 'untenable',
 'outside',
 'designated',
 'protected',
 'areas',
 '.',
 'Although',
 'the',
 'cause',
 'of',
 'th

In [5]:
pos_tags = nltk.pos_tag(tokens)
pos_tags

[('The', 'DT'),
 ('lion', 'NN'),
 ('(', '('),
 ('Panthera', 'NNP'),
 ('leo', 'NN'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('large', 'JJ'),
 ('cat', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('genus', 'NN'),
 ('Panthera', 'NNP'),
 (',', ','),
 ('It', 'PRP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('social', 'JJ'),
 ('species', 'NNS'),
 (',', ','),
 ('forming', 'VBG'),
 ('groups', 'NNS'),
 ('called', 'VBD'),
 ('prides', 'NNS'),
 ('.', '.'),
 ('A', 'DT'),
 ('lion', 'NN'),
 ("'s", 'POS'),
 ('pride', 'NN'),
 ('consists', 'VBZ'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('few', 'JJ'),
 ('adult', 'NN'),
 ('males', 'NNS'),
 (',', ','),
 ('related', 'JJ'),
 ('females', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('cubs', 'NNS'),
 ('.', '.'),
 ('Groups', 'NNP'),
 ('of', 'IN'),
 ('female', 'JJ'),
 ('lions', 'NNS'),
 ('usually', 'RB'),
 ('hunt', 'VBP'),
 ('together', 'RB'),
 (',', ','),
 ('preying', 'VBG'),
 ('mostly', 'RB'),
 ('on', 'IN'),
 ('large', 'JJ'),
 ('ungulates', 'NNS'),
 ('.', '.'),
 ('The', 'DT'),
 ('lion',

In [6]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [7]:
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
filtered_tokens

['lion',
 '(',
 'Panthera',
 'leo',
 ')',
 'large',
 'cat',
 'genus',
 'Panthera',
 ',',
 'social',
 'species',
 ',',
 'forming',
 'groups',
 'called',
 'prides',
 '.',
 'lion',
 "'s",
 'pride',
 'consists',
 'adult',
 'males',
 ',',
 'related',
 'females',
 ',',
 'cubs',
 '.',
 'Groups',
 'female',
 'lions',
 'usually',
 'hunt',
 'together',
 ',',
 'preying',
 'mostly',
 'large',
 'ungulates',
 '.',
 'lion',
 'inhabits',
 'grasslands',
 ',',
 'savannahs',
 ',',
 'shrublands',
 '.',
 'usually',
 'diurnal',
 'wild',
 'cats',
 ',',
 'persecuted',
 ',',
 'listed',
 'Vulnerable',
 'IUCN',
 'Red',
 'List',
 'since',
 '1996',
 'Lion',
 'populations',
 'untenable',
 'outside',
 'designated',
 'protected',
 'areas',
 '.',
 'Although',
 'cause',
 'decline',
 'fully',
 'understood',
 ',',
 'One',
 'widely',
 'recognised',
 'animal',
 'symbols',
 'human',
 'culture',
 ',',
 'lion',
 'extensively',
 'depicted',
 'sculptures',
 'paintings',
 ',',
 'national',
 'flags',
 ',',
 'literature',
 'films'

In [8]:
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
stemmed_tokens

['lion',
 '(',
 'panthera',
 'leo',
 ')',
 'larg',
 'cat',
 'genu',
 'panthera',
 ',',
 'social',
 'speci',
 ',',
 'form',
 'group',
 'call',
 'pride',
 '.',
 'lion',
 "'s",
 'pride',
 'consist',
 'adult',
 'male',
 ',',
 'relat',
 'femal',
 ',',
 'cub',
 '.',
 'group',
 'femal',
 'lion',
 'usual',
 'hunt',
 'togeth',
 ',',
 'prey',
 'mostli',
 'larg',
 'ungul',
 '.',
 'lion',
 'inhabit',
 'grassland',
 ',',
 'savannah',
 ',',
 'shrubland',
 '.',
 'usual',
 'diurnal',
 'wild',
 'cat',
 ',',
 'persecut',
 ',',
 'list',
 'vulner',
 'iucn',
 'red',
 'list',
 'sinc',
 '1996',
 'lion',
 'popul',
 'unten',
 'outsid',
 'design',
 'protect',
 'area',
 '.',
 'although',
 'caus',
 'declin',
 'fulli',
 'understood',
 ',',
 'one',
 'wide',
 'recognis',
 'anim',
 'symbol',
 'human',
 'cultur',
 ',',
 'lion',
 'extens',
 'depict',
 'sculptur',
 'paint',
 ',',
 'nation',
 'flag',
 ',',
 'literatur',
 'film',
 '.',
 'nativ',
 'africa',
 'india',
 '.',
 'muscular',
 ',',
 'broad-chest',
 'bodi',
 ';',


In [9]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
# lemmatizer = WordNetLemmatizer()
# lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
# lemmatized_tokens

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
sentences = nltk.sent_tokenize(text)
sentences

['The lion (Panthera leo) is a large cat of the genus Panthera,\nIt is a social species, forming groups called prides.',
 "A lion's pride consists of a few adult males, related females, and cubs.",
 'Groups of female lions usually hunt together, preying mostly on large ungulates.',
 'The lion inhabits grasslands, savannahs, and shrublands.',
 'It is usually more diurnal than other wild cats, but when persecuted,\nIt has been listed as Vulnerable on the IUCN Red List since 1996\nLion populations are untenable outside designated protected areas.',
 'Although the cause of the decline is not fully understood,\nOne of the most widely recognised animal symbols in human culture,\nthe lion has been extensively depicted in sculptures and paintings,\non national flags, and in literature and films.',
 'native to Africa and India.',
 'It has a muscular, broad-chested body; a short, rounded head; round ears; and a hairy tuft at the end of its tail.',
 'it adapts to being active at night and at twil

In [11]:
tfidf_vectorizer = TfidfVectorizer()
# Fit and transform the sentences
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()
# Calculate TF-IDF for each word
tfidf_scores = {}
for i, sentence in enumerate(sentences):
    tokens = nltk.word_tokenize(sentence)
    token_counter = Counter(tokens)
    total_tokens = len(tokens)
    for token, count in token_counter.items():
        tf = count / total_tokens
        idf = math.log(len(sentences) / sum([1 for s in sentences if token in s]))
        tfidf_scores[token] = tf * idf
print()
print("TF-IDF Scores:")
for word, score in tfidf_scores.items():
    print(f"{word}: {score}")


TF-IDF Scores:
The: 0.17047480922384253
lion: 0.017919485462824326
(: 0.0922267412614758
Panthera: 0.1844534825229516
leo: 0.0922267412614758
): 0.0922267412614758
is: 0.022990929810874545
a: 0.0
large: 0.12176772087417323
cat: 0.06556723431686251
of: 0.02815919144158108
the: 0.034768086441773635
genus: 0.0922267412614758
,: 0.032284651695932656
It: 0.04640296371893789
social: 0.0922267412614758
species: 0.0922267412614758
forming: 0.0922267412614758
groups: 0.0922267412614758
called: 0.0922267412614758
prides: 0.0922267412614758
.: 0.0
A: 0.05950593598108705
's: 0.1410526631057865
pride: 0.10027929954343678
consists: 0.1410526631057865
few: 0.1410526631057865
adult: 0.1410526631057865
males: 0.1410526631057865
related: 0.1410526631057865
females: 0.1410526631057865
and: 0.034768086441773635
cubs: 0.1410526631057865
Groups: 0.17127823377131218
female: 0.12176772087417323
lions: 0.17127823377131218
usually: 0.04735411367328959
hunt: 0.17127823377131218
together: 0.17127823377131218
pre