In [1]:
import spacy

from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
corpus = [
  "Red Bull drops hint on F1 engine.",
  "Honda exits F1, leaving F1 partner Red Bull.",
  "Hamilton eyes record eighth F1 title.",
  "Aston Martin announces sponsor."
]

In [3]:
vectorizer = CountVectorizer()

In [4]:
bow = vectorizer.fit_transform(corpus)

In [5]:
print(vectorizer.get_feature_names_out())
print(vectorizer.vocabulary_)

['announces' 'aston' 'bull' 'drops' 'eighth' 'engine' 'exits' 'eyes' 'f1'
 'hamilton' 'hint' 'honda' 'leaving' 'martin' 'on' 'partner' 'record'
 'red' 'sponsor' 'title']
{'red': 17, 'bull': 2, 'drops': 3, 'hint': 10, 'on': 14, 'f1': 8, 'engine': 5, 'honda': 11, 'exits': 6, 'leaving': 12, 'partner': 15, 'hamilton': 9, 'eyes': 7, 'record': 16, 'eighth': 4, 'title': 19, 'aston': 1, 'martin': 13, 'announces': 0, 'sponsor': 18}


In [6]:
print(type(bow))

<class 'scipy.sparse.csr.csr_matrix'>


In [7]:
print(bow) #(doc, word_index)

  (0, 17)	1
  (0, 2)	1
  (0, 3)	1
  (0, 10)	1
  (0, 14)	1
  (0, 8)	1
  (0, 5)	1
  (1, 17)	1
  (1, 2)	1
  (1, 8)	2
  (1, 11)	1
  (1, 6)	1
  (1, 12)	1
  (1, 15)	1
  (2, 8)	1
  (2, 9)	1
  (2, 7)	1
  (2, 16)	1
  (2, 4)	1
  (2, 19)	1
  (3, 1)	1
  (3, 13)	1
  (3, 0)	1
  (3, 18)	1


In [8]:
nlp = spacy.load('en_core_web_sm')

#Tokenizer callback using spacy
def spacy_tokenizer(doc):
    return [t.text for t in nlp(doc) if not t.is_punct]

In [9]:
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, lowercase=False, binary=True)
bow = vectorizer.fit_transform(corpus)

In [10]:
print(vectorizer.get_feature_names_out())
print(vectorizer.vocabulary_)

['Aston' 'Bull' 'F1' 'Hamilton' 'Honda' 'Martin' 'Red' 'announces' 'drops'
 'eighth' 'engine' 'exits' 'eyes' 'hint' 'leaving' 'on' 'partner' 'record'
 'sponsor' 'title']
{'Red': 6, 'Bull': 1, 'drops': 8, 'hint': 13, 'on': 15, 'F1': 2, 'engine': 10, 'Honda': 4, 'exits': 11, 'leaving': 14, 'partner': 16, 'Hamilton': 3, 'eyes': 12, 'record': 17, 'eighth': 9, 'title': 19, 'Aston': 0, 'Martin': 5, 'announces': 7, 'sponsor': 18}


In [11]:
print("A dense representation")
print(bow.toarray())
print()
print("Indexing and slicing")
print(bow[0])
print()
print(bow[0:2])

A dense representation
[[0 1 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0]
 [0 1 1 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0]
 [0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1]
 [1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0]]

Indexing and slicing
  (0, 6)	1
  (0, 1)	1
  (0, 8)	1
  (0, 13)	1
  (0, 15)	1
  (0, 2)	1
  (0, 10)	1

  (0, 6)	1
  (0, 1)	1
  (0, 8)	1
  (0, 13)	1
  (0, 15)	1
  (0, 2)	1
  (0, 10)	1
  (1, 6)	1
  (1, 1)	1
  (1, 2)	1
  (1, 4)	1
  (1, 11)	1
  (1, 14)	1
  (1, 16)	1


## Cosine similarity

In [13]:
doc1_vs_doc2 = 1 - spatial.distance.cosine(bow[0].toarray(), bow[1].toarray())
doc1_vs_doc3 = 1 - spatial.distance.cosine(bow[0].toarray(), bow[2].toarray())
doc1_vs_doc4 = 1 - spatial.distance.cosine(bow[0].toarray(), bow[3].toarray())
print(corpus)
print(f"Doc 1 vs Doc 2: {doc1_vs_doc2}")
print(f"Doc 1 vs Doc 3: {doc1_vs_doc3}")
print(f"Doc 1 vs Doc 4: {doc1_vs_doc4}")

['Red Bull drops hint on F1 engine.', 'Honda exits F1, leaving F1 partner Red Bull.', 'Hamilton eyes record eighth F1 title.', 'Aston Martin announces sponsor.']
Doc 1 vs Doc 2: 0.4285714285714286
Doc 1 vs Doc 3: 0.15430334996209194
Doc 1 vs Doc 4: 0.0


In [14]:
print(cosine_similarity(bow))

[[1.         0.42857143 0.15430335 0.        ]
 [0.42857143 1.         0.15430335 0.        ]
 [0.15430335 0.15430335 1.         0.        ]
 [0.         0.         0.         1.        ]]


## N-grams

In [16]:
#uni-gram and bi-gram
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, lowercase=False, binary=True, ngram_range=(1,2))
bigrams = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print("Number of features: {}".format(len(vectorizer.get_feature_names_out())))
print(vectorizer.vocabulary_)

['Aston' 'Aston Martin' 'Bull' 'Bull drops' 'F1' 'F1 engine' 'F1 leaving'
 'F1 partner' 'F1 title' 'Hamilton' 'Hamilton eyes' 'Honda' 'Honda exits'
 'Martin' 'Martin announces' 'Red' 'Red Bull' 'announces'
 'announces sponsor' 'drops' 'drops hint' 'eighth' 'eighth F1' 'engine'
 'exits' 'exits F1' 'eyes' 'eyes record' 'hint' 'hint on' 'leaving'
 'leaving F1' 'on' 'on F1' 'partner' 'partner Red' 'record'
 'record eighth' 'sponsor' 'title']
Number of features: 40
{'Red': 15, 'Bull': 2, 'drops': 19, 'hint': 28, 'on': 32, 'F1': 4, 'engine': 23, 'Red Bull': 16, 'Bull drops': 3, 'drops hint': 20, 'hint on': 29, 'on F1': 33, 'F1 engine': 5, 'Honda': 11, 'exits': 24, 'leaving': 30, 'partner': 34, 'Honda exits': 12, 'exits F1': 25, 'F1 leaving': 6, 'leaving F1': 31, 'F1 partner': 7, 'partner Red': 35, 'Hamilton': 9, 'eyes': 26, 'record': 36, 'eighth': 21, 'title': 39, 'Hamilton eyes': 10, 'eyes record': 27, 'record eighth': 37, 'eighth F1': 22, 'F1 title': 8, 'Aston': 0, 'Martin': 13, 'announces

In [17]:
#bi-gram only
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, lowercase=False, binary=True, ngram_range=(2,2))
bigrams = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(vectorizer.vocabulary_)

['Aston Martin' 'Bull drops' 'F1 engine' 'F1 leaving' 'F1 partner'
 'F1 title' 'Hamilton eyes' 'Honda exits' 'Martin announces' 'Red Bull'
 'announces sponsor' 'drops hint' 'eighth F1' 'exits F1' 'eyes record'
 'hint on' 'leaving F1' 'on F1' 'partner Red' 'record eighth']
{'Red Bull': 9, 'Bull drops': 1, 'drops hint': 11, 'hint on': 15, 'on F1': 17, 'F1 engine': 2, 'Honda exits': 7, 'exits F1': 13, 'F1 leaving': 3, 'leaving F1': 16, 'F1 partner': 4, 'partner Red': 18, 'Hamilton eyes': 6, 'eyes record': 14, 'record eighth': 19, 'eighth F1': 12, 'F1 title': 5, 'Aston Martin': 0, 'Martin announces': 8, 'announces sponsor': 10}


## Exercise

In [29]:
corpus = [
  "Students use their GPS-enabled cellphones to take birdview photographs of a land in order to find specific danger points such as rubbish heaps.",
  "Teenagers are enthusiastic about taking aerial photograph in order to study their neighbourhood.",
  "Aerial photography is a great way to identify terrestrial features that aren’t visible from the ground level, such as lake contours or river paths.",
  "During the early days of digital SLRs, Canon was pretty much the undisputed leader in CMOS image sensor technology.",
  "Syrian President Bashar al-Assad tells the US it will 'pay the price' if it strikes against Syria."
]

nlp = spacy.load('en_core_web_sm')

def spacy_tokenizer(doc):
  return [t.text for t in nlp(doc) if not t.is_punct]

In [30]:
doc = nlp(corpus[4])
print(spacy_tokenizer(doc))

['Syrian', 'President', 'Bashar', 'al', 'Assad', 'tells', 'the', 'US', 'it', 'will', 'pay', 'the', 'price', 'if', 'it', 'strikes', 'against', 'Syria']


In [31]:
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, lowercase=False, binary=True)

In [32]:
bow = vectorizer.fit_transform(corpus)

In [34]:
s = ["Teenagers take aerial shots of their neighbourhood using digital cameras sitting in old bottles which are launched via kites - a common toy for children living in the favelas. They then use GPS-enabled smartphones to take pictures of specific danger points - such as rubbish heaps, which can become a breeding ground for mosquitoes carrying dengue fever."]
new_bow = vectorizer.transform(s)

In [43]:
doc1_vs_sentence = 1 - spatial.distance.cosine(bow[0].toarray(), new_bow[0].toarray())
doc2_vs_sentence = 1 - spatial.distance.cosine(bow[1].toarray(), new_bow[0].toarray())
doc3_vs_sentence = 1 - spatial.distance.cosine(bow[2].toarray(), new_bow[0].toarray())
doc4_vs_sentence = 1 - spatial.distance.cosine(bow[3].toarray(), new_bow[0].toarray())
doc5_vs_sentence = 1 - spatial.distance.cosine(bow[4].toarray(), new_bow[0].toarray())
print(corpus[0])
print(s[0])
print(doc1_vs_sentence)
print()
print(corpus[1])
print(s[0])
print(doc2_vs_sentence)
print()
print(corpus[2])
print(s[0])
print(doc3_vs_sentence)
print()
print(corpus[3])
print(s[0])
print(doc4_vs_sentence)
print()
print(corpus[4])
print(s[0])
print(doc5_vs_sentence)


Students use their GPS-enabled cellphones to take birdview photographs of a land in order to find specific danger points such as rubbish heaps.
Teenagers take aerial shots of their neighbourhood using digital cameras sitting in old bottles which are launched via kites - a common toy for children living in the favelas. They then use GPS-enabled smartphones to take pictures of specific danger points - such as rubbish heaps, which can become a breeding ground for mosquitoes carrying dengue fever.
0.6956521739130435

Teenagers are enthusiastic about taking aerial photograph in order to study their neighbourhood.
Teenagers take aerial shots of their neighbourhood using digital cameras sitting in old bottles which are launched via kites - a common toy for children living in the favelas. They then use GPS-enabled smartphones to take pictures of specific danger points - such as rubbish heaps, which can become a breeding ground for mosquitoes carrying dengue fever.
0.4048204523763681

Aerial ph

In [45]:
def spacy_tokenizer(doc):
  return [t.lemma_ for t in nlp(doc) if not (t.is_punct and t.is_stop)]

vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, lowercase=False, binary=True)
bow = vectorizer.fit_transform(corpus)
s = ["Teenagers take aerial shots of their neighbourhood using digital cameras sitting in old bottles which are launched via kites - a common toy for children living in the favelas. They then use GPS-enabled smartphones to take pictures of specific danger points - such as rubbish heaps, which can become a breeding ground for mosquitoes carrying dengue fever."]
new_bow = vectorizer.transform(s)
doc1_vs_sentence = 1 - spatial.distance.cosine(bow[0].toarray(), new_bow[0].toarray())
doc2_vs_sentence = 1 - spatial.distance.cosine(bow[1].toarray(), new_bow[0].toarray())
doc3_vs_sentence = 1 - spatial.distance.cosine(bow[2].toarray(), new_bow[0].toarray())
doc4_vs_sentence = 1 - spatial.distance.cosine(bow[3].toarray(), new_bow[0].toarray())
doc5_vs_sentence = 1 - spatial.distance.cosine(bow[4].toarray(), new_bow[0].toarray())
print(corpus[0])
print(s[0])
print(doc1_vs_sentence)
print()
print(corpus[1])
print(s[0])
print(doc2_vs_sentence)
print()
print(corpus[2])
print(s[0])
print(doc3_vs_sentence)
print()
print(corpus[3])
print(s[0])
print(doc4_vs_sentence)
print()
print(corpus[4])
print(s[0])
print(doc5_vs_sentence)


Students use their GPS-enabled cellphones to take birdview photographs of a land in order to find specific danger points such as rubbish heaps.
Teenagers take aerial shots of their neighbourhood using digital cameras sitting in old bottles which are launched via kites - a common toy for children living in the favelas. They then use GPS-enabled smartphones to take pictures of specific danger points - such as rubbish heaps, which can become a breeding ground for mosquitoes carrying dengue fever.
0.7060180864974626

Teenagers are enthusiastic about taking aerial photograph in order to study their neighbourhood.
Teenagers take aerial shots of their neighbourhood using digital cameras sitting in old bottles which are launched via kites - a common toy for children living in the favelas. They then use GPS-enabled smartphones to take pictures of specific danger points - such as rubbish heaps, which can become a breeding ground for mosquitoes carrying dengue fever.
0.4717281765248632

Aerial ph

## TF-IDF

In [1]:
import spacy

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
corpus = fetch_20newsgroups(categories=['sci.space'],
                            remove=('header','footers','quotes'))

In [4]:
print(type(corpus))

<class 'sklearn.utils.Bunch'>


In [5]:
len(corpus.data)

593

In [6]:
corpus.data[:2]

["From: henry@zoo.toronto.edu (Henry Spencer)\nSubject: Re: japanese moon landing?\nOrganization: U of Toronto Zoology\nLines: 21\n\n\nAny lunar satellite needs fuel to do regular orbit corrections, and when\nits fuel runs out it will crash within months.  The orbits of the Apollo\nmotherships changed noticeably during lunar missions lasting only a few\ndays.  It is *possible* that there are stable orbits here and there --\nthe Moon's gravitational field is poorly mapped -- but we know of none.\n\nPerturbations from Sun and Earth are relatively minor issues at low\naltitudes.  The big problem is that the Moon's own gravitational field\nis quite lumpy due to the irregular distribution of mass within the Moon.",
 'From: henry@zoo.toronto.edu (Henry Spencer)\nSubject: Re: Space Station Redesign, JSC Alternative #4\nOrganization: U of Toronto Zoology\nLines: 10\n\n\nGlad to see Griffin is spending his time on engineering rather than on\nritual purification of the language.  Pity he got stu

In [7]:
nlp = spacy.load('en_core_web_sm')
unwanted_pipes = ['ner', 'parser'] # we don't need ner and parser for this example

def spacy_tokenizer(doc):
    with nlp.disable_pipes(*unwanted_pipes):
        return [t.lemma_ for t in nlp(doc) if \
                not t.is_punct and \
                not t.is_space and \
                t.is_alpha]

In [8]:
%%time
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer)
features = vectorizer.fit_transform(corpus.data)

CPU times: total: 17.1 s
Wall time: 17.1 s


In [10]:
print(len(vectorizer.get_feature_names_out()))

9776


In [11]:
print(features.shape)

(593, 9776)


In [12]:
print(features[0]) #this will show what the encoding looks like

  (0, 5245)	0.10083250649270274
  (0, 2423)	0.05760094669728994
  (0, 4472)	0.1478972728003204
  (0, 2536)	0.10478449464464447
  (0, 5090)	0.1649813701620663
  (0, 6950)	0.09588654725767391
  (0, 6209)	0.09823567084500094
  (0, 6759)	0.08156582092411671
  (0, 917)	0.08420911679771867
  (0, 323)	0.10702668902936702
  (0, 5067)	0.0795608833533368
  (0, 642)	0.047502285137254206
  (0, 4502)	0.09907124722154151
  (0, 5467)	0.12791016016973922
  (0, 7159)	0.12081961912328393
  (0, 2573)	0.07115801716796445
  (0, 8395)	0.0887960062112186
  (0, 6414)	0.1341043427064035
  (0, 5868)	0.10478449464464447
  (0, 4739)	0.061347711827251336
  (0, 9481)	0.05873248964938554
  (0, 1179)	0.04729340979781867
  (0, 5203)	0.11884679509781822
  (0, 6593)	0.1478972728003204
  (0, 3160)	0.19177309451534783
  :	:
  (0, 379)	0.10189617730449169
  (0, 1904)	0.13081317543857449
  (0, 6108)	0.19282118869189258
  (0, 7135)	0.13081317543857449
  (0, 2444)	0.041897775284087835
  (0, 8792)	0.06000438474467328
  (0, 340

In [13]:
#Query the data
query = ['lunar orbit']
query_tfidf = vectorizer.transform(query)

In [18]:
cosine_similarities = cosine_similarity(features, query_tfidf).flatten()

In [24]:
import numpy as np

def top_k(arr, k):
    kth_largest = (k+1) * -1
    return np.argsort(arr)[:kth_largest:-1]

In [26]:
top_related_indices = top_k(cosine_similarities, 5)
print(top_related_indices)
print(cosine_similarities[top_related_indices])

[249 108   0 312 509]
[0.44656382 0.38482292 0.24800997 0.24041302 0.20820115]


In [27]:
print(corpus.data[top_related_indices[0]])

From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: japanese moon landing?
Organization: U of Toronto Zoology
Lines: 22


Actually, Hiten wasn't originally intended to go into lunar orbit at all,
so it indeed didn't have much fuel on hand.  The lunar-orbit mission was
an afterthought, after Hagoromo (a tiny subsatellite deployed by Hiten
during a lunar flyby) had a transmitter failure and its proper insertion
into lunar orbit couldn't be positively confirmed.

It should be noted that the technique does have disadvantages.  It takes
a long time, and you end up with a relatively inconvenient lunar orbit.
If you want something useful like a low circular polar orbit, you do have
to plan to expend a certain amount of fuel, although it is reduced from
what you'd need for the brute-force approach.


In [28]:
print(corpus.data[top_related_indices[1]])

From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: japanese moon landing?
Organization: U of Toronto Zoology
Lines: 14


Their Hiten engineering-test mission spent a while in a highly eccentric
Earth orbit doing lunar flybys, and then was inserted into lunar orbit
using some very tricky gravity-assist-like maneuvering.  This meant that
it would crash on the Moon eventually, since there is no such thing as
a stable lunar orbit (as far as anyone knows), and I believe I recall
hearing recently that it was about to happen.


In [30]:
query = ['satellite']
query_tfidf = vectorizer.transform(query)
cosine_similarities = cosine_similarity(features, query_tfidf).flatten()
top_related_indices = top_k(cosine_similarities, 5)
print(top_related_indices)
print(cosine_similarities[top_related_indices])

[378 248 138 539  61]
[0.36921296 0.3255952  0.29483236 0.2614343  0.25347882]


In [31]:
print(corpus.data[top_related_indices[0]])

From: jim@inqmind.bison.mb.ca (jim jaworski)
Subject: Re: How many read sci.space?
Organization: The Inquiring Mind BBS  1 204 488-1607
Lines: 36



As an Amateur Radio operator (VHF 2metres) I like to keep up with what is 
going up (and for that matter what is coming down too).
 
In about 30 days I have learned ALOT about satellites current, future and 
past all the way back to Vanguard series and up to Astro D observatory 
(space).  I borrowed a book from the library called Weater Satellites (I 
think, it has a photo of the earth with a TIROS type satellite on it.)
 
I would like to build a model or have a large color poster of one of the 
TIROS satellites I think there are places in the USA that sell them.
ITOS is my favorite looking satellite, followed by AmSat-OSCAR 13 
(AO-13).
 
TTYL
73
Jim
