In [1]:
pip install --upgrade gensim

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import gensim
from gensim import corpora

# A one-document corpus: the whole paragraph is a single string in the list.
text1 = ["""Gensim is a free open-source Python library for representing documents as semantic vectors,
           as efficiently and painlessly as possible. Gensim is designed 
           to process raw, unstructured digital texts using unsupervised machine learning algorithms."""]

# Whitespace tokenization only: punctuation stays attached to the words
# (e.g. "vectors," and "algorithms." are kept as-is).
tokens1 = [line.split() for line in text1]

# Map every distinct token to an integer id.
g_dict1 = corpora.Dictionary(tokens1)

print(f"The dictionary has: {len(g_dict1)} tokens\n")
print(g_dict1.token2id)

The dictionary has: 29 tokens

{'Gensim': 0, 'Python': 1, 'a': 2, 'algorithms.': 3, 'and': 4, 'as': 5, 'designed': 6, 'digital': 7, 'documents': 8, 'efficiently': 9, 'for': 10, 'free': 11, 'is': 12, 'learning': 13, 'library': 14, 'machine': 15, 'open-source': 16, 'painlessly': 17, 'possible.': 18, 'process': 19, 'raw,': 20, 'representing': 21, 'semantic': 22, 'texts': 23, 'to': 24, 'unstructured': 25, 'unsupervised': 26, 'using': 27, 'vectors,': 28}


In [3]:
# Convert each tokenized document into its bag-of-words form: a list of
# (token_id, count) pairs.
# The dictionary was already built from `tokens1`, so it is used read-only
# here. Passing allow_update=True would register the same document a second
# time and change the dictionary's corpus statistics on every run of this cell.
g_bow = [g_dict1.doc2bow(doc_tokens) for doc_tokens in tokens1]
print("Bag of Words : ", g_bow)

Bag of Words :  [[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]]


In [8]:
from gensim import corpora, models
from gensim.utils import simple_preprocess
import numpy as np

# Sample corpus: three short restaurant reviews, one document per string.
text = ["The food is excellent but the service can be better",
        "The food is always delicious and loved the service",
        "The food was mediocre and the service was terrible"]

# Tokenize every document once and reuse the tokens for both the dictionary
# and the bag-of-words (BoW) corpus, instead of preprocessing each line twice.
tokenized_docs = [simple_preprocess(line) for line in text]
g_dict = corpora.Dictionary(tokenized_docs)
g_bow = [g_dict.doc2bow(doc_tokens) for doc_tokens in tokenized_docs]

# Print the BoW representation as [token, count] pairs, one list per document.
print("Dictionary : ")
for doc_bow in g_bow:
    # `token_id` rather than `id`, so the builtin id() is not shadowed.
    print([[g_dict[token_id], count] for token_id, count in doc_bow])

# Fit a TF-IDF model on the BoW corpus; the weighting scheme is selected by
# the SMART notation string passed as `smartirs`.
g_tfidf = models.TfidfModel(g_bow, smartirs='ntc')

# Print the TF-IDF representation as [token, weight] pairs, with weights
# rounded to two decimals for readability.
print("TF-IDF Vector:")
for doc_tfidf in g_tfidf[g_bow]:
    print([[g_dict[token_id], np.around(weight, decimals=2)] for token_id, weight in doc_tfidf])


Dictionary : 
[['be', 1], ['better', 1], ['but', 1], ['can', 1], ['excellent', 1], ['food', 1], ['is', 1], ['service', 1], ['the', 2]]
[['food', 1], ['is', 1], ['service', 1], ['the', 2], ['always', 1], ['and', 1], ['delicious', 1], ['loved', 1]]
[['food', 1], ['service', 1], ['the', 2], ['and', 1], ['mediocre', 1], ['terrible', 1], ['was', 2]]
TF-IDF Vector:
[['be', 0.43], ['better', 0.43], ['but', 0.43], ['can', 0.43], ['excellent', 0.43], ['food', 0.09], ['is', 0.21], ['service', 0.09], ['the', 0.18]]
[['food', 0.11], ['is', 0.26], ['service', 0.11], ['the', 0.21], ['always', 0.52], ['and', 0.26], ['delicious', 0.52], ['loved', 0.52]]
[['food', 0.08], ['service', 0.08], ['the', 0.16], ['and', 0.2], ['mediocre', 0.39], ['terrible', 0.39], ['was', 0.78]]


In [6]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

# Load the "text8" dataset through the gensim downloader and materialize
# every item of it into a list.
dataset = api.load("text8")
words = list(dataset)

# Train Word2Vec on the first 1000 items only. min_count=0 keeps every token
# regardless of frequency; one worker thread is requested per CPU core.
data1 = words[:1000]
w2v_model = Word2Vec(sentences=data1, min_count=0, workers=cpu_count())



In [18]:
word = 'service'

# Print the embedding only when the word is present in the trained model's
# vocabulary; nothing is printed for an unknown word.
keyed_vectors = w2v_model.wv
if word in keyed_vectors:
    print(f"\nWord vector for '{word}':\n", keyed_vectors[word])


Word vector for 'service':
 [ 2.6918826  -0.426363    3.4098058  -2.225435    0.01576584 -1.657481
 -0.25096825 -1.2883489   1.7208221   0.4164066   1.8191155  -1.9158213
  1.5740644   0.8696585   0.45638767 -0.60900885  0.6489568   1.072156
  0.4573284  -1.9745895   0.7708862  -0.05540381 -0.9771591   0.23771837
 -0.14038394  1.6806232   1.5502543   0.45755035 -0.4022828   1.7237673
 -0.6433264   0.84633946 -2.8816407  -0.3603856  -1.5032923  -2.4225917
  1.4155889  -2.4237328  -0.3548736   0.5125505  -3.4372883   0.28157535
 -2.0293      1.643747   -2.551914   -1.3040798  -0.21132892  1.2344356
  1.4983875   2.723901   -1.5391455  -1.5445918   1.0350138   0.5357209
 -0.39968297 -1.0089308   1.9309181   1.7833318  -2.2138474   0.43255952
  0.27062684 -0.45434377  0.53598696  0.7331825  -0.43786472  0.23419012
 -1.487662    0.5992024  -2.1934562   0.49401972  2.0896049   2.1591113
  0.19851218  1.8651133   0.6176218   1.2261645   0.9219802  -0.71965295
  0.4876182   0.7274301   0.0514