# (2) Word Embedding

Teniendo en cuenta que no tengo el archivo requirements.txt, hice ingeniería inversa: le pregunté a Copilot cuáles librerías debía importar según los requerimientos indicados en la [página](https://github.com/PacktPublishing/The-Handbook-of-NLP-with-Gensim/blob/main/Chapter02/02_Word_Embedding.ipynb).

Entonces, en la carpeta main creé un archivo con el comando
`nano requirements.txt` 

y luego, dentro de nano copié:
```
gensim==4.3.1
nltk==3.8.1
numpy==1.23.5
scipy==1.10.1
session_info==1.0.0
scikit-learn==1.2.2
```
Terminé con ``control+O`` para grabar el archivo y ``control+X`` para cerrar el editor.

Ahora instalo el contenido de `requirements.txt` con:

In [3]:
%pip install -r ../requirements.txt
#!pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [4]:
#!pip freeze > ../requirements.txt

In [5]:
#!pip install session-info

#import session_info
#session_info.show()

In [6]:
#import session_info
#session_info.show()

### (A.1) Bag of words with Gensim

In [7]:
import gensim
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
import pprint

Primero se trabaja con una lista de cadenas (str)

In [8]:
# Sample corpus: one string per sentence.
doc_list = [
    "Start spreading the news",
    "You're leaving today (tell him friend)",
    "I want to be a part of it, New York, New York",
    "Your vagabond shoes, they are longing to stray",
    "And steps around the heart of it, New York, New York",
]

In [9]:
print(doc_list)

['Start spreading the news', "You're leaving today (tell him friend)", 'I want to be a part of it, New York, New York', 'Your vagabond shoes, they are longing to stray', 'And steps around the heart of it, New York, New York']


In [10]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(doc_list)

[   'Start spreading the news',
    "You're leaving today (tell him friend)",
    'I want to be a part of it, New York, New York',
    'Your vagabond shoes, they are longing to stray',
    'And steps around the heart of it, New York, New York']


In [11]:
doc_tokenized = [simple_preprocess(doc) for doc in doc_list]
doc_tokenized

[['start', 'spreading', 'the', 'news'],
 ['you', 're', 'leaving', 'today', 'tell', 'him', 'friend'],
 ['want', 'to', 'be', 'part', 'of', 'it', 'new', 'york', 'new', 'york'],
 ['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray'],
 ['and',
  'steps',
  'around',
  'the',
  'heart',
  'of',
  'it',
  'new',
  'york',
  'new',
  'york']]

In [12]:
# Explicit-loop version of the tokenization; it builds the same nested list
# of tokens as the list comprehension in the previous cell.
doc_tokenized = []
for doc in doc_list:
    doc_tokenized.append(simple_preprocess(doc))
doc_tokenized

[['start', 'spreading', 'the', 'news'],
 ['you', 're', 'leaving', 'today', 'tell', 'him', 'friend'],
 ['want', 'to', 'be', 'part', 'of', 'it', 'new', 'york', 'new', 'york'],
 ['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray'],
 ['and',
  'steps',
  'around',
  'the',
  'heart',
  'of',
  'it',
  'new',
  'york',
  'new',
  'york']]

In [13]:
dictionary = Dictionary()
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f94f044db40>

In [14]:
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]
BoW_corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2)],
 [(16, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(3, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (18, 2),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)]]

In [15]:
print(BoW_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2)], [(16, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(3, 1), (12, 1), (13, 2), (14, 1), (18, 2), (26, 1), (27, 1), (28, 1), (29, 1)]]


In [16]:
# Replace each token id in the bag-of-words vectors with the word it stands
# for, keeping the per-sentence counts. The loop variable is named token_id
# so that it does not shadow the builtin id().
id_words = [
    [(dictionary[token_id], count) for token_id, count in line]
    for line in BoW_corpus
]
print(id_words)

[[('news', 1), ('spreading', 1), ('start', 1), ('the', 1)], [('friend', 1), ('him', 1), ('leaving', 1), ('re', 1), ('tell', 1), ('today', 1), ('you', 1)], [('be', 1), ('it', 1), ('new', 2), ('of', 1), ('part', 1), ('to', 1), ('want', 1), ('york', 2)], [('to', 1), ('are', 1), ('longing', 1), ('shoes', 1), ('stray', 1), ('they', 1), ('vagabond', 1), ('your', 1)], [('the', 1), ('it', 1), ('new', 2), ('of', 1), ('york', 2), ('and', 1), ('around', 1), ('heart', 1), ('steps', 1)]]


Ahora se realiza el procedimiento anterior, pero no con una lista de cadenas (una por cada frase o sentencia), sino con una sola cadena que contiene todo el documento.
De este modo se obtiene un resultado diferente: con estos 2 procesos se diferencian los valores por cada frase de los valores del documento completo.

In [17]:
doc_list_long = "Start spreading the news, You're leaving today (tell him friend), I want to be a part of it, New York, New York, Your vagabond shoes, they are longing to stray, And steps around the heart of it, New York, New York"
doc_list_tokenized_long = simple_preprocess(doc_list_long)
doc_list_tokenized_long

['start',
 'spreading',
 'the',
 'news',
 'you',
 're',
 'leaving',
 'today',
 'tell',
 'him',
 'friend',
 'want',
 'to',
 'be',
 'part',
 'of',
 'it',
 'new',
 'york',
 'new',
 'york',
 'your',
 'vagabond',
 'shoes',
 'they',
 'are',
 'longing',
 'to',
 'stray',
 'and',
 'steps',
 'around',
 'the',
 'heart',
 'of',
 'it',
 'new',
 'york',
 'new',
 'york']

In [18]:
BoW_corpus_long = dictionary.doc2bow(doc_list_tokenized_long)
BoW_corpus_long

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 2),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 2),
 (13, 4),
 (14, 2),
 (15, 1),
 (16, 2),
 (17, 1),
 (18, 4),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1)]

In [19]:
# Map every token id of the whole-document bag-of-words vector back to its
# word. The loop variable is named token_id so that it does not shadow the
# builtin id().
id_words_long = [
    (dictionary[token_id], count) for token_id, count in BoW_corpus_long
]
pp.pprint(id_words_long)

[   ('news', 1),
    ('spreading', 1),
    ('start', 1),
    ('the', 2),
    ('friend', 1),
    ('him', 1),
    ('leaving', 1),
    ('re', 1),
    ('tell', 1),
    ('today', 1),
    ('you', 1),
    ('be', 1),
    ('it', 2),
    ('new', 4),
    ('of', 2),
    ('part', 1),
    ('to', 2),
    ('want', 1),
    ('york', 4),
    ('are', 1),
    ('longing', 1),
    ('shoes', 1),
    ('stray', 1),
    ('they', 1),
    ('vagabond', 1),
    ('your', 1),
    ('and', 1),
    ('around', 1),
    ('heart', 1),
    ('steps', 1)]


### (A.2) Bag of Words with scikit-learn

In [20]:
# Sample corpus: one string per sentence.
doc_list = [
    "Start spreading the news",
    "You're leaving today (tell him friend)",
    "I want to be a part of it, New York, New York",
    "Your vagabond shoes, they are longing to stray",
    "And steps around the heart of it, New York, New York",
]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Document-term count matrix: one row per sentence, one column per word.
cv_fit = cv.fit_transform(doc_list)
# Vocabulary words, paired below with the per-column totals.
word_list = cv.get_feature_names_out()
# Summing over the rows (axis=0) gives each word's total count in the corpus.
count_list = cv_fit.toarray().sum(axis=0)
pp.pprint( dict(zip(word_list,count_list)) )

{   'and': 1,
    'are': 1,
    'around': 1,
    'be': 1,
    'friend': 1,
    'heart': 1,
    'him': 1,
    'it': 2,
    'leaving': 1,
    'longing': 1,
    'new': 4,
    'news': 1,
    'of': 2,
    'part': 1,
    're': 1,
    'shoes': 1,
    'spreading': 1,
    'start': 1,
    'steps': 1,
    'stray': 1,
    'tell': 1,
    'the': 2,
    'they': 1,
    'to': 2,
    'today': 1,
    'vagabond': 1,
    'want': 1,
    'york': 4,
    'you': 1,
    'your': 1}


In [22]:
word_list

array(['and', 'are', 'around', 'be', 'friend', 'heart', 'him', 'it',
       'leaving', 'longing', 'new', 'news', 'of', 'part', 're', 'shoes',
       'spreading', 'start', 'steps', 'stray', 'tell', 'the', 'they',
       'to', 'today', 'vagabond', 'want', 'york', 'you', 'your'],
      dtype=object)

In [23]:
cv_fit.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 2, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 1],
       [1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 2, 0, 0]])

In [24]:
count_list = cv_fit.toarray().sum(axis=0)

In [25]:
pp.pprint( dict(zip(word_list,count_list)) )

{   'and': 1,
    'are': 1,
    'around': 1,
    'be': 1,
    'friend': 1,
    'heart': 1,
    'him': 1,
    'it': 2,
    'leaving': 1,
    'longing': 1,
    'new': 4,
    'news': 1,
    'of': 2,
    'part': 1,
    're': 1,
    'shoes': 1,
    'spreading': 1,
    'start': 1,
    'steps': 1,
    'stray': 1,
    'tell': 1,
    'the': 2,
    'they': 1,
    'to': 2,
    'today': 1,
    'vagabond': 1,
    'want': 1,
    'york': 4,
    'you': 1,
    'your': 1}


### (B.1)  Bag of N-grams with Gensim

In [26]:
doc_list_Ngrams = [
"Start spreading the news",
"You're leaving today",
"I want to be a part of it, New York, New York",
"Your vagabond shoes, they are longing to stray",
"And steps around the heart of it, New York, New York",
"Come and visit us",
"Come and visit the city",
]
doc_tokenized_Ngrams = [simple_preprocess(doc) for doc in doc_list_Ngrams]
doc_tokenized_Ngrams

[['start', 'spreading', 'the', 'news'],
 ['you', 're', 'leaving', 'today'],
 ['want', 'to', 'be', 'part', 'of', 'it', 'new', 'york', 'new', 'york'],
 ['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray'],
 ['and',
  'steps',
  'around',
  'the',
  'heart',
  'of',
  'it',
  'new',
  'york',
  'new',
  'york'],
 ['come', 'and', 'visit', 'us'],
 ['come', 'and', 'visit', 'the', 'city']]

In [27]:
#type(doc_tokenized_Ngrams[0][0])

In [28]:
from gensim.models.phrases import Phrases
mydict = Dictionary()
mycorpus_Ngrams = [mydict.doc2bow(doc, allow_update=True) for doc in doc_tokenized_Ngrams]
mycorpus_Ngrams

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2)],
 [(13, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)],
 [(3, 1),
  (9, 1),
  (10, 2),
  (11, 1),
  (15, 2),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1)],
 [(23, 1), (27, 1), (28, 1), (29, 1)],
 [(3, 1), (23, 1), (27, 1), (29, 1), (30, 1)]]

In [29]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Build the bigram model. A single model is trained: the earlier
# min_count=3 / threshold=10 model was overwritten before being used, so it
# has been dropped. min_count=1 and threshold=2 are low settings (the corpus
# here is only seven short sentences); delimiter=' ' joins a detected pair
# into one space-separated token.
bigram = Phrases(doc_tokenized_Ngrams, min_count=1, threshold=2, delimiter=' ')
bigram_phraser = Phraser(bigram)
print(bigram_phraser)

# Apply the phraser to every sentence and show the merged tokens.
for sent in doc_tokenized_Ngrams:
    tokens_ = bigram_phraser[sent]
    print(tokens_)

FrozenPhrases<6 phrases, min_count=1, threshold=2>
['start', 'spreading', 'the', 'news']
['you', 're', 'leaving', 'today']
['want', 'to', 'be', 'part', 'of it', 'new york', 'new york']
['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray']
['and', 'steps', 'around', 'the', 'heart', 'of it', 'new york', 'new york']
['come and', 'visit', 'us']
['come and', 'visit', 'the', 'city']


In [30]:
from gensim.models import Phrases

# Train a bigram model on the tokenized sentences, then train a second
# Phrases model on the bigram-merged sentences so it can join a bigram with
# a neighbouring token.
bigram = Phrases(doc_tokenized_Ngrams, min_count=1, delimiter=' ')
trigram = Phrases(bigram[doc_tokenized_Ngrams], min_count=1, delimiter=' ')

for sent in doc_tokenized_Ngrams:
    # A merged token containing exactly two delimiters is made of three words.
    # (The per-sentence bigram list was never used, so it is no longer built.)
    trigrams_ = [t for t in trigram[bigram[sent]] if t.count(' ') == 2]

    print(trigrams_)

[]
[]
[]
[]
[]
['come and visit']
['come and visit']


### (B.2) Bag of N-grams with scikit-learn

In [31]:
# Sample corpus: one string per sentence.
doc_list = [
    "Start spreading the news",
    "You're leaving today (tell him friend)",
    "I want to be a part of it, New York, New York",
    "Your vagabond shoes, they are longing to stray",
    "And steps around the heart of it, New York, New York",
]

In [32]:
# look at sequences of tokens of minimum length 2 and maximum length 2
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_vectorizer.fit(doc_list)

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2))
ngram_fit = ngram_vectorizer.fit_transform(doc_list)
word_list = ngram_vectorizer.get_feature_names_out()
count_list = ngram_fit.toarray().sum(axis=0)
pp.pprint( dict(zip(word_list,count_list)) )

{   'and steps': 1,
    'are longing': 1,
    'around the': 1,
    'be part': 1,
    'heart of': 1,
    'him friend': 1,
    'it new': 2,
    'leaving today': 1,
    'longing to': 1,
    'new york': 4,
    'of it': 2,
    'part of': 1,
    're leaving': 1,
    'shoes they': 1,
    'spreading the': 1,
    'start spreading': 1,
    'steps around': 1,
    'tell him': 1,
    'the heart': 1,
    'the news': 1,
    'they are': 1,
    'to be': 1,
    'to stray': 1,
    'today tell': 1,
    'vagabond shoes': 1,
    'want to': 1,
    'york new': 2,
    'you re': 1,
    'your vagabond': 1}


In [34]:
from sklearn.feature_extraction.text import CountVectorizer
ngram_vectorizer = CountVectorizer(ngram_range=(3, 3))
ngram_fit = ngram_vectorizer.fit_transform(doc_list)
word_list = ngram_vectorizer.get_feature_names_out()
count_list = ngram_fit.toarray().sum(axis=0)
pp.pprint( dict(zip(word_list,count_list)) )

{   'and steps around': 1,
    'are longing to': 1,
    'around the heart': 1,
    'be part of': 1,
    'heart of it': 1,
    'it new york': 2,
    'leaving today tell': 1,
    'longing to stray': 1,
    'new york new': 2,
    'of it new': 2,
    'part of it': 1,
    're leaving today': 1,
    'shoes they are': 1,
    'spreading the news': 1,
    'start spreading the': 1,
    'steps around the': 1,
    'tell him friend': 1,
    'the heart of': 1,
    'they are longing': 1,
    'to be part': 1,
    'today tell him': 1,
    'vagabond shoes they': 1,
    'want to be': 1,
    'york new york': 2,
    'you re leaving': 1,
    'your vagabond shoes': 1}


### (B.3) Bag of N-grams with NLTK

In [35]:
!pip install nltk



In [36]:
from itertools import chain

from nltk.util import bigrams, trigrams

# Flatten the per-sentence token lists into a single token sequence.
flat_list = list(chain.from_iterable(doc_tokenized))
flat_list

['start',
 'spreading',
 'the',
 'news',
 'you',
 're',
 'leaving',
 'today',
 'tell',
 'him',
 'friend',
 'want',
 'to',
 'be',
 'part',
 'of',
 'it',
 'new',
 'york',
 'new',
 'york',
 'your',
 'vagabond',
 'shoes',
 'they',
 'are',
 'longing',
 'to',
 'stray',
 'and',
 'steps',
 'around',
 'the',
 'heart',
 'of',
 'it',
 'new',
 'york',
 'new',
 'york']

In [37]:
from itertools import chain

from nltk.util import bigrams, trigrams

# Flatten the per-sentence token lists into a single token sequence.
# NOTE: sentence boundaries are dropped here, so the bigrams below include
# pairs that span two sentences (the last word of one sentence followed by
# the first word of the next).
flat_list = list(chain.from_iterable(doc_tokenized))

nltk_bigrams = list(bigrams(flat_list))
nltk_bigrams

[('start', 'spreading'),
 ('spreading', 'the'),
 ('the', 'news'),
 ('news', 'you'),
 ('you', 're'),
 ('re', 'leaving'),
 ('leaving', 'today'),
 ('today', 'tell'),
 ('tell', 'him'),
 ('him', 'friend'),
 ('friend', 'want'),
 ('want', 'to'),
 ('to', 'be'),
 ('be', 'part'),
 ('part', 'of'),
 ('of', 'it'),
 ('it', 'new'),
 ('new', 'york'),
 ('york', 'new'),
 ('new', 'york'),
 ('york', 'your'),
 ('your', 'vagabond'),
 ('vagabond', 'shoes'),
 ('shoes', 'they'),
 ('they', 'are'),
 ('are', 'longing'),
 ('longing', 'to'),
 ('to', 'stray'),
 ('stray', 'and'),
 ('and', 'steps'),
 ('steps', 'around'),
 ('around', 'the'),
 ('the', 'heart'),
 ('heart', 'of'),
 ('of', 'it'),
 ('it', 'new'),
 ('new', 'york'),
 ('york', 'new'),
 ('new', 'york')]

In [38]:
nltk_trigrams = list(trigrams(flat_list))
nltk_trigrams

[('start', 'spreading', 'the'),
 ('spreading', 'the', 'news'),
 ('the', 'news', 'you'),
 ('news', 'you', 're'),
 ('you', 're', 'leaving'),
 ('re', 'leaving', 'today'),
 ('leaving', 'today', 'tell'),
 ('today', 'tell', 'him'),
 ('tell', 'him', 'friend'),
 ('him', 'friend', 'want'),
 ('friend', 'want', 'to'),
 ('want', 'to', 'be'),
 ('to', 'be', 'part'),
 ('be', 'part', 'of'),
 ('part', 'of', 'it'),
 ('of', 'it', 'new'),
 ('it', 'new', 'york'),
 ('new', 'york', 'new'),
 ('york', 'new', 'york'),
 ('new', 'york', 'your'),
 ('york', 'your', 'vagabond'),
 ('your', 'vagabond', 'shoes'),
 ('vagabond', 'shoes', 'they'),
 ('shoes', 'they', 'are'),
 ('they', 'are', 'longing'),
 ('are', 'longing', 'to'),
 ('longing', 'to', 'stray'),
 ('to', 'stray', 'and'),
 ('stray', 'and', 'steps'),
 ('and', 'steps', 'around'),
 ('steps', 'around', 'the'),
 ('around', 'the', 'heart'),
 ('the', 'heart', 'of'),
 ('heart', 'of', 'it'),
 ('of', 'it', 'new'),
 ('it', 'new', 'york'),
 ('new', 'york', 'new'),
 ('york', 'new', 'york')]

### (C.1) TF-IDF with Gensim

In [39]:
import gensim
import pprint
from gensim.models import TfidfModel
from gensim import corpora
from gensim.utils import simple_preprocess

In [40]:
BoW_corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2)],
 [(16, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(3, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (18, 2),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)]]

In [41]:
import numpy as np

# Rebuild the tokenized corpus, the dictionary and the bag-of-words vectors
# from doc_list so this section starts from a fresh Dictionary.
doc_tokenized = [simple_preprocess(doc) for doc in doc_list]
dictionary = corpora.Dictionary()
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]

# Show every sentence as [word, frequency] pairs. The loop variable is named
# token_id so that it does not shadow the builtin id().
for doc in BoW_corpus:
    print([[dictionary[token_id], freq] for token_id, freq in doc])

# smartirs='ntc' in SMART notation: raw term frequency (n), idf weighting (t)
# and cosine normalization (c).
tfidf = TfidfModel(BoW_corpus, smartirs='ntc')

[['news', 1], ['spreading', 1], ['start', 1], ['the', 1]]
[['friend', 1], ['him', 1], ['leaving', 1], ['re', 1], ['tell', 1], ['today', 1], ['you', 1]]
[['be', 1], ['it', 1], ['new', 2], ['of', 1], ['part', 1], ['to', 1], ['want', 1], ['york', 2]]
[['to', 1], ['are', 1], ['longing', 1], ['shoes', 1], ['stray', 1], ['they', 1], ['vagabond', 1], ['your', 1]]
[['the', 1], ['it', 1], ['new', 2], ['of', 1], ['york', 2], ['and', 1], ['around', 1], ['heart', 1], ['steps', 1]]


In [42]:
dictionary?

Type:           Dictionary
String form:    Dictionary<30 unique tokens: ['news', 'spreading', 'start', 'the', 'friend']...>
Length:         30
File:           ~/.python/current/lib/python3.10/site-packages/gensim/corpora/dictionary.py
Docstring:
Dictionary encapsulates the mapping between normalized words and their integer ids.

Notable instance attributes:

Attributes
----------
token2id : dict of (str, int)
    token -> token_id. I.e. the reverse mapping to `self[token_id]`.
cfs : dict of (int, int)
    Collection frequencies: token_id -> how many instances of this token are contained in the documents.
dfs : dict of (int, int)
    Document frequencies: token_id -> how many documents contain this token.
num_docs : int
    Number of documents processed.
num_pos : int
    Total number of corpus positions (number of processed words).
num_nnz : int
    Total number of non-zeroes in the BOW matrix (sum of the number of unique
    words per document over the entire corpus).

In [43]:
dictionary.itervalues()

ValuesView(<gensim.corpora.dictionary.Dictionary object at 0x7f94bcdcbb80>)

In [44]:
# All tokens stored in the dictionary, as a plain list.
list(dictionary.values())

['news',
 'spreading',
 'start',
 'the',
 'friend',
 'him',
 'leaving',
 're',
 'tell',
 'today',
 'you',
 'be',
 'it',
 'new',
 'of',
 'part',
 'to',
 'want',
 'york',
 'are',
 'longing',
 'shoes',
 'stray',
 'they',
 'vagabond',
 'your',
 'and',
 'around',
 'heart',
 'steps']

In [45]:
BoW_corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2)],
 [(16, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(3, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (18, 2),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)]]

In [46]:
# Get the tfidf transformed corpus,
# then the vector of the fifth sentence (index 4).
tfidf[BoW_corpus][4]

[(3, 0.21496814396163463),
 (12, 0.21496814396163463),
 (13, 0.42993628792326927),
 (14, 0.21496814396163463),
 (18, 0.42993628792326927),
 (26, 0.35059794205706235),
 (27, 0.35059794205706235),
 (28, 0.35059794205706235),
 (29, 0.35059794205706235)]

### (C.2) TF-IDF with Scikit-learn

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(doc_list)

In [48]:
import numpy as np
np.set_printoptions(precision=2)

print(tfidf_vectorizer.transform(doc_list).toarray())

[[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.52 0.   0.
  0.   0.   0.52 0.52 0.   0.   0.   0.42 0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.38 0.   0.38 0.   0.38 0.   0.   0.   0.   0.
  0.38 0.   0.   0.   0.   0.   0.38 0.   0.   0.   0.38 0.   0.   0.
  0.38 0.  ]
 [0.   0.   0.   0.31 0.   0.   0.   0.25 0.   0.   0.51 0.   0.25 0.31
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.25 0.   0.   0.31 0.51
  0.   0.  ]
 [0.   0.36 0.   0.   0.   0.   0.   0.   0.   0.36 0.   0.   0.   0.
  0.   0.36 0.   0.   0.   0.36 0.   0.   0.36 0.29 0.   0.36 0.   0.
  0.   0.36]
 [0.3  0.   0.3  0.   0.   0.3  0.   0.24 0.   0.   0.48 0.   0.24 0.
  0.   0.   0.   0.   0.3  0.   0.   0.24 0.   0.   0.   0.   0.   0.48
  0.   0.  ]]


In [49]:
tfidf_vectorizer.get_feature_names_out()

array(['and', 'are', 'around', 'be', 'friend', 'heart', 'him', 'it',
       'leaving', 'longing', 'new', 'news', 'of', 'part', 're', 'shoes',
       'spreading', 'start', 'steps', 'stray', 'tell', 'the', 'they',
       'to', 'today', 'vagabond', 'want', 'york', 'you', 'your'],
      dtype=object)

In [51]:
tfidf_vectorizer.transform(doc_list).toarray()[4]

array([0.3 , 0.  , 0.3 , 0.  , 0.  , 0.3 , 0.  , 0.24, 0.  , 0.  , 0.48,
       0.  , 0.24, 0.  , 0.  , 0.  , 0.  , 0.  , 0.3 , 0.  , 0.  , 0.24,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.48, 0.  , 0.  ])