In [1]:
import gensim
from gensim import corpora
from pprint import pprint




In [2]:
# A one-document corpus: a single multi-line passage about AI.
text = [
    """In computer science, artificial intelligence (AI),
             sometimes called machine intelligence, is intelligence
             demonstrated by machines, in contrast to the natural intelligence
             displayed by humans and animals. Computer science defines
             AI research as the study of intelligent agents: any device that
             perceives its environment and takes actions that maximize its chance
             of successfully achieving its goals."""
]

# Whitespace tokenisation only: str.split() keeps case and punctuation, so
# e.g. 'intelligence' and 'intelligence,' end up as distinct tokens.
tokens = [document.split() for document in text]


In [3]:
# Display the tokenised corpus: one inner list of tokens per document.
tokens

[['In',
  'computer',
  'science,',
  'artificial',
  'intelligence',
  '(AI),',
  'sometimes',
  'called',
  'machine',
  'intelligence,',
  'is',
  'intelligence',
  'demonstrated',
  'by',
  'machines,',
  'in',
  'contrast',
  'to',
  'the',
  'natural',
  'intelligence',
  'displayed',
  'by',
  'humans',
  'and',
  'animals.',
  'Computer',
  'science',
  'defines',
  'AI',
  'research',
  'as',
  'the',
  'study',
  'of',
  'intelligent',
  'agents:',
  'any',
  'device',
  'that',
  'perceives',
  'its',
  'environment',
  'and',
  'takes',
  'actions',
  'that',
  'maximize',
  'its',
  'chance',
  'of',
  'successfully',
  'achieving',
  'its',
  'goals.']]

In [4]:
# Build a gensim Dictionary from the tokenised documents; its token2id
# attribute maps each distinct token to an integer id.
gensim_dictionary = corpora.Dictionary(tokens)
# Bare expression: shows the object's repr, not the vocabulary itself.
gensim_dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f8388948190>

In [5]:
# Number of distinct tokens held by the dictionary.
print(len(gensim_dictionary))

46


In [8]:
# Tabulate the vocabulary: token left-aligned in 15 columns, followed by
# its integer id right-aligned in 10 columns.
for word, word_id in gensim_dictionary.token2id.items():
    print(f'{word:15} {word_id:10}')

(AI),                    0
AI                       1
Computer                 2
In                       3
achieving                4
actions                  5
agents:                  6
and                      7
animals.                 8
any                      9
artificial              10
as                      11
by                      12
called                  13
chance                  14
computer                15
contrast                16
defines                 17
demonstrated            18
device                  19
displayed               20
environment             21
goals.                  22
humans                  23
in                      24
intelligence            25
intelligence,           26
intelligent             27
is                      28
its                     29
machine                 30
machines,               31
maximize                32
natural                 33
of                      34
perceives               35
research                36
s

In [9]:
# Reverse lookup (id -> token). A gensim Dictionary can be indexed by token
# id directly, so there is no need to copy the keys and values of token2id
# into two lists and search them linearly.
print(gensim_dictionary[40])

study


In [10]:
# A second one-document corpus, used to extend the existing dictionary.
# NOTE: the literal is terminated by the first """ it meets, so the passage
# ends with `"problem solving` (no closing quote) and the final token is
# 'solving'.
text = [
    """Colloquially, the term "artificial intelligence" is used to
           describe machines that mimic "cognitive" functions that humans
           associate with other human minds, such as "learning" and "problem solving"""
]

# Whitespace tokenisation only; quotes and commas stay attached to the words.
tokens = [document.split() for document in text]
# Extend the existing dictionary in place with the new document's tokens.
# Tokens that were already known keep their ids; unseen tokens are given
# new ids after the existing ones (visible in the printed mapping).
gensim_dictionary.add_documents(tokens)

print(f"The dictionary has: {len(gensim_dictionary)} tokens")
print(gensim_dictionary.token2id)

The dictionary has: 65 tokens
{'(AI),': 0, 'AI': 1, 'Computer': 2, 'In': 3, 'achieving': 4, 'actions': 5, 'agents:': 6, 'and': 7, 'animals.': 8, 'any': 9, 'artificial': 10, 'as': 11, 'by': 12, 'called': 13, 'chance': 14, 'computer': 15, 'contrast': 16, 'defines': 17, 'demonstrated': 18, 'device': 19, 'displayed': 20, 'environment': 21, 'goals.': 22, 'humans': 23, 'in': 24, 'intelligence': 25, 'intelligence,': 26, 'intelligent': 27, 'is': 28, 'its': 29, 'machine': 30, 'machines,': 31, 'maximize': 32, 'natural': 33, 'of': 34, 'perceives': 35, 'research': 36, 'science': 37, 'science,': 38, 'sometimes': 39, 'study': 40, 'successfully': 41, 'takes': 42, 'that': 43, 'the': 44, 'to': 45, '"artificial': 46, '"cognitive"': 47, '"learning"': 48, '"problem': 49, 'Colloquially,': 50, 'associate': 51, 'describe': 52, 'functions': 53, 'human': 54, 'intelligence"': 55, 'machines': 56, 'mimic': 57, 'minds,': 58, 'other': 59, 'solving': 60, 'such': 61, 'term': 62, 'used': 63, 'with': 64}


### Creating a Bag-of-Words Corpus from In-Memory Objects


In [11]:
# Start from an empty dictionary and let doc2bow populate it: with
# allow_update=True, tokens not seen before are added while each document is
# converted to its bag-of-words form, a list of (token_id, count) pairs.
gensim_dictionary = corpora.Dictionary()
gensim_corpus = [
    gensim_dictionary.doc2bow(document, allow_update=True)
    for document in tokens
]

In [12]:
# Display the bag-of-words corpus: one list of (token_id, count) pairs per document.
gensim_corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 2),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1)]]

The first tuple, (0, 1), means that the word with ID 0 occurred once in the document. Likewise, (21, 2) means that the word with ID 21 occurred twice in the document.




In [13]:
# Translate every (token_id, count) pair back into (token, count) by
# indexing the dictionary with the id, keeping one inner list per document.
word_freq = [
    [(gensim_dictionary[token_id], count) for token_id, count in bow]
    for bow in gensim_corpus
]
print(word_freq)

[[('"artificial', 1), ('"cognitive"', 1), ('"learning"', 1), ('"problem', 1), ('Colloquially,', 1), ('and', 1), ('as', 1), ('associate', 1), ('describe', 1), ('functions', 1), ('human', 1), ('humans', 1), ('intelligence"', 1), ('is', 1), ('machines', 1), ('mimic', 1), ('minds,', 1), ('other', 1), ('solving', 1), ('such', 1), ('term', 1), ('that', 2), ('the', 1), ('to', 1), ('used', 1), ('with', 1)]]


### Creating TF-IDF Corpus
Term Frequency - Inverse Document Frequency

In [14]:
# Three short documents for the TF-IDF example.
text = [
    "I like to play Football",
    "Football is the best game",
    "Which game do you like to play ?",
]

# Whitespace tokenisation; the stand-alone "?" becomes a token of its own.
tokens = [document.split() for document in text]

# Fresh dictionary for this example; doc2bow with allow_update=True adds
# each document's unseen tokens to it while producing that document's
# bag-of-words list of (token_id, count) pairs.
gensim_dictionary = corpora.Dictionary()
gensim_corpus = [
    gensim_dictionary.doc2bow(document, allow_update=True)
    for document in tokens
]

In [15]:
from gensim import models
import numpy as np

In [16]:
# smartirs='ntc' (SMART notation, per the gensim TfidfModel documentation):
# raw term frequency, zero-corrected idf, cosine normalisation per document.
tfidf = models.TfidfModel(gensim_corpus, smartirs='ntc')
for doc_weights in tfidf[gensim_corpus]:
    # Show each term next to its TF-IDF weight, rounded to two decimals.
    print([
        [gensim_dictionary[token_id], np.around(weight, decimals=2)]
        for token_id, weight in doc_weights
    ])

[['Football', 0.35], ['I', 0.71], ['like', 0.35], ['play', 0.35], ['to', 0.35]]
[['Football', 0.27], ['best', 0.53], ['game', 0.27], ['is', 0.53], ['the', 0.53]]
[['like', 0.22], ['play', 0.22], ['to', 0.22], ['game', 0.22], ['?', 0.45], ['Which', 0.45], ['do', 0.45], ['you', 0.45]]


## Downloading Built-in gensim Models and Datasets


In [17]:
import gensim.downloader as api
# Load pretrained word vectors through gensim's downloader API.
# NOTE(review): despite the `w2v_` prefix, the model id names GloVe vectors
# (Wikipedia + Gigaword, 100-dimensional going by the id), not word2vec.
w2v_embedding = api.load('glove-wiki-gigaword-100')



In [18]:
# Nearest neighbours of 'toyota' in the embedding space, returned as
# (word, similarity score) pairs.
w2v_embedding.most_similar('toyota')

[('honda', 0.8739858865737915),
 ('nissan', 0.8108116388320923),
 ('automaker', 0.7918164134025574),
 ('mazda', 0.7687168717384338),
 ('bmw', 0.7616022825241089),
 ('ford', 0.7547588348388672),
 ('motors', 0.7539199590682983),
 ('volkswagen', 0.7176680564880371),
 ('prius', 0.7156581878662109),
 ('chrysler', 0.7085398435592651)]