In [None]:
# !pip install gensim

In [None]:
# Gensim

#   What it is: A Python library for topic modeling, similarity retrieval, and vector space modeling.

#Popular models inside it:

#   Word2Vec

#   Doc2Vec

#   FastText (yes, gensim also ships its own implementation of FastText)

#   LDA (Latent Dirichlet Allocation) for topic modeling

#   Main Uses:

#   Build word embeddings (Word2Vec, FastText)

#   Document embeddings (Doc2Vec)

#   Topic modeling (LDA, LSI, HDP)

#   Similarity queries (e.g., "most similar words")

#   Key Features:

#   Lightweight, memory-efficient

#   Easy to integrate with NLP pipelines

#   Handles large text corpora using streaming & efficient implementations

#   🔹 FastText

#   What it is: A specific model created by Facebook AI Research (FAIR) for word embeddings and text classification.

#   Unique Approach:

#   Unlike Word2Vec, it represents a word as a bag of character n-grams.

#   Example: "playing" → "play", "lay", "ying", etc.

#   This helps it understand morphology (word structure).

#  Main Uses:

#   Word embeddings (handles out-of-vocabulary (OOV) words better than Word2Vec)

#   Text classification (very fast, works with large datasets)

#   Key Features:

#   Great at handling rare words / OOV words (like "playyyy" or "plaiing")

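#   Very fast training for text classification (embedding training is typically slower than Word2Vec because of the extra n-gram vectors)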

#   Pretrained embeddings available in many languages

In [None]:
# Step 1: Import library
from gensim.models import Word2Vec

# Step 2: Create a small dataset (tokenized sentences)
sentences = [
    ["i", "love", "natural", "language", "processing"],
    ["word2vec", "creates", "word", "embeddings"],
    ["machine", "learning", "is", "fun"],
    ["i", "love", "machine", "learning"]
]

# Step 3: Train Word2Vec model (sg=1 selects skip-gram; min_count=1 keeps every word)
# Training is stochastic. The seed is stated explicitly (1 is gensim's default) and
# training is limited to a single worker thread, which gensim documents as required
# for a deterministic run; otherwise thread scheduling can change the vectors.
model = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# Step 4: Get vector for a word (a 50-dimensional numpy array)
print("Vector for 'love':")
print(model.wv['love'])

# Step 5: Find most similar words (cosine similarity, top 10 by default)
print("\nMost similar to 'machine':")
print(model.wv.most_similar('machine'))


Vector for 'love':
[-0.01723938  0.00733148  0.01037977  0.01148388  0.01493384 -0.01233535
  0.00221123  0.01209456 -0.0056801  -0.01234705 -0.00082045 -0.0167379
 -0.01120002  0.01420908  0.00670508  0.01445134  0.01360049  0.01506148
 -0.00757831 -0.00112361  0.00469675 -0.00903806  0.01677746 -0.01971633
  0.01352928  0.00582883 -0.00986566  0.00879638 -0.00347915  0.01342277
  0.0199297  -0.00872489 -0.00119868 -0.01139127  0.00770164  0.00557325
  0.01378215  0.01220219  0.01907699  0.01854683  0.01579614 -0.01397901
 -0.01831173 -0.00071151 -0.00619968  0.01578863  0.01187715 -0.00309133
  0.00302193  0.00358008]

Most similar to 'machine':
[('fun', 0.12486250698566437), ('natural', 0.08055731654167175), ('embeddings', 0.07399576157331467), ('learning', 0.04237300902605057), ('is', 0.018277151510119438), ('love', 0.011071980930864811), ('word2vec', 0.0013571369927376509), ('language', -0.1094222441315651), ('creates', -0.11910455673933029), ('i', -0.17424818873405457)]


In [None]:
# Step 1: Import FastText
from gensim.models import FastText

# Step 2: Create dataset
sentences = [
    ["i", "love", "natural", "language", "processing"],
    ["fasttext", "handles", "rare", "words"],
    ["machine", "learning", "is", "fun"],
    ["i", "love", "machine", "learning"]
]

# Step 3: Train FastText model
# The seed is stated explicitly (1 is gensim's default) and training is limited to
# one worker thread so that re-running the cell yields the same vectors; with
# several workers, thread scheduling can change the result.
ft_model = FastText(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    seed=1,
    workers=1,
)

# Step 4: Get vector for a word
print("Vector for 'love':")
print(ft_model.wv['love'])

# Step 5: Handle unseen word (OOV)
# 'lovely' is not in the training sentences; FastText still returns a vector
# because it composes one from the word's character n-grams.
print("\nVector for unseen word 'lovely':")
print(ft_model.wv['lovely'])

# Step 6: Find most similar words
print("\nMost similar to 'machine':")
print(ft_model.wv.most_similar('machine'))


Vector for 'love':
[ 0.0001016   0.0022412  -0.00336715 -0.00033085 -0.00387935 -0.00419755
 -0.00151996 -0.00087366  0.00622636 -0.00660398 -0.00371136 -0.0065105
 -0.00377819  0.0010506  -0.00381672 -0.00034082  0.00088526  0.00218162
 -0.0022804   0.00038835  0.00569094  0.00153093  0.00135464 -0.00223462
 -0.00252768  0.00366243 -0.00188376 -0.0065851   0.00077982  0.00158725
 -0.00045243 -0.00435207 -0.00471068  0.00685046 -0.00293258 -0.00189113
 -0.0010784  -0.00111529  0.00011267 -0.00160898 -0.00335323  0.00489591
  0.00278897  0.00365879 -0.00198696 -0.00063323 -0.00035158 -0.00079444
 -0.00180542  0.00361628]

Vector for unseen word 'lovely':
[ 1.0996094e-03 -1.4329599e-03  6.4024061e-04 -4.6484624e-03
 -3.2252874e-03 -3.2272032e-03 -5.2106446e-03  7.3700161e-05
 -2.5065436e-03 -4.3827746e-04 -8.6720131e-04 -2.0282581e-03
 -5.2141231e-03 -1.7155211e-03 -2.9054082e-03 -4.3657396e-04
  1.4546837e-03 -5.4090080e-04  3.5679985e-03  1.3004243e-03
  5.2754241e-03  2.4784237e-04 -1

In [None]:


# Small text corpus
sentences = [
    ["dog", "barks", "at", "the", "cat"],
    ["cat", "meows", "at", "the", "dog"],
    ["birds", "are", "flying", "in", "the", "sky"],
    ["fish", "swims", "in", "water"]
]

# Train Word2Vec (skip-gram). Word2Vec must already be imported from gensim.models.
# Explicit seed (gensim's default value) plus a single worker thread keeps the
# similarity scores below reproducible across re-runs of this cell.
w2v_model = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# Word similarity (cosine similarity between the two word vectors)
print("Similarity between 'dog' and 'cat':", w2v_model.wv.similarity("dog", "cat"))
print("Similarity between 'dog' and 'fish':", w2v_model.wv.similarity("dog", "fish"))

# Most similar words
print("\nMost similar to 'cat':")
print(w2v_model.wv.most_similar("cat"))


Similarity between 'dog' and 'cat': 0.16562882
Similarity between 'dog' and 'fish': 0.13665016

Most similar to 'cat':
[('dog', 0.1656288504600525), ('birds', 0.1551763415336609), ('sky', 0.13940520584583282), ('the', 0.12668493390083313), ('are', 0.12119622528553009), ('swims', 0.08871668577194214), ('barks', 0.02048538811504841), ('in', 0.011042577214539051), ('water', -0.027841337025165558), ('meows', -0.03341934457421303)]


In [None]:


# Same corpus
sentences = [
    ["dog", "barks", "at", "the", "cat"],
    ["cat", "meows", "at", "the", "dog"],
    ["birds", "are", "flying", "in", "the", "sky"],
    ["fish", "swims", "in", "water"]
]

# Train FastText. FastText must already be imported from gensim.models.
# Explicit seed (gensim's default value) plus a single worker thread keeps the
# printed vectors reproducible across re-runs of this cell.
ft_model = FastText(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    seed=1,
    workers=1,
)

# Vector for seen word
print("Vector for 'dog':")
print(ft_model.wv['dog'][:10])  # show only first 10 dims

# Vector for unseen word (OOV): built from the character n-grams of 'dogggg'
print("\nVector for unseen word 'dogggg':")
print(ft_model.wv['dogggg'][:10])

# Most similar words
print("\nMost similar to 'dog':")
print(ft_model.wv.most_similar("dog"))


Vector for 'dog':
[-0.00512784 -0.00174533  0.00035479  0.0010277  -0.0009202  -0.00480705
  0.00024738 -0.00019025 -0.00260254  0.00054966]

Vector for unseen word 'dogggg':
[-0.00745251  0.00190354  0.00191593  0.00187805 -0.00046492 -0.00053973
  0.00265135 -0.00101074  0.00162682  0.00532739]

Most similar to 'dog':
[('are', 0.19231237471103668), ('the', 0.13459865748882294), ('water', 0.11051099747419357), ('at', 0.09520871192216873), ('swims', 0.07707151025533676), ('fish', 0.04277683421969414), ('cat', 0.03962136059999466), ('in', 0.017470519989728928), ('sky', 0.01520441472530365), ('meows', -0.07853716611862183)]


In [None]:


# Custom corpus for analogy
sentences = [
    ["king", "queen", "man", "woman"],
    ["paris", "france", "rome", "italy"],
    ["delhi", "india", "tokyo", "japan"],
    ["dog", "puppy", "cat", "kitten"]
]

# Train model (skip-gram). Word2Vec must already be imported from gensim.models.
# Explicit seed (gensim's default value) plus a single worker thread keeps the
# ranking below reproducible across re-runs of this cell.
model = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# Analogy: king - man + woman ≈ queen
# most_similar adds the `positive` vectors, subtracts the `negative` ones and ranks
# the remaining vocabulary by cosine similarity to the result.
# NOTE: with only four training sentences the embeddings are barely trained, so
# "queen" is not expected to rank first here; this cell only demonstrates the API.
result = model.wv.most_similar(positive=["king", "woman"], negative=["man"])
print("Analogy: king - man + woman =", result)


Analogy: king - man + woman = [('italy', 0.14145417511463165), ('dog', 0.06958038359880447), ('puppy', 0.04945472627878189), ('cat', 0.03198548033833504), ('queen', 0.012669281102716923), ('delhi', 0.0028484025970101357), ('paris', -0.030324876308441162), ('kitten', -0.0685611367225647), ('france', -0.07234222441911697), ('japan', -0.0752946138381958)]


In [None]:


# Morphologically related word families, to show FastText's sub-word behaviour
sentences = [
    ["play", "playing", "played", "player"],
    ["run", "running", "runner", "ran"],
    ["study", "studying", "studied", "student"]
]

# Train FastText. FastText must already be imported from gensim.models.
# Explicit seed (gensim's default value) plus a single worker thread keeps the
# similarity rankings reproducible across re-runs of this cell.
ft_model = FastText(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    seed=1,
    workers=1,
)

# Similar words
print("Most similar to 'playing':", ft_model.wv.most_similar("playing"))
print("Most similar to 'runner':", ft_model.wv.most_similar("runner"))

# Test unseen but related word (vector is composed from its character n-grams)
print("\nVector for unseen word 'playyyyyy':")
print(ft_model.wv['playyyyyy'][:10])


Most similar to 'playing': [('play', 0.43791663646698), ('runner', 0.28430938720703125), ('played', 0.28360700607299805), ('run', 0.18218275904655457), ('running', 0.1550898253917694), ('player', 0.14196591079235077), ('studied', 0.01624378375709057), ('ran', -0.06368699669837952), ('student', -0.11292298883199692), ('studying', -0.137682244181633)]
Most similar to 'runner': [('run', 0.5079447031021118), ('running', 0.47139981389045715), ('playing', 0.284309446811676), ('played', 0.27585023641586304), ('player', 0.20748484134674072), ('play', 0.17613056302070618), ('ran', 0.12243297696113586), ('studied', 0.03735663741827011), ('study', -0.0896022692322731), ('studying', -0.1674293577671051)]

Vector for unseen word 'playyyyyy':
[-2.3022487e-03 -3.2724833e-04 -1.9040132e-03  5.2393098e-05
  2.7455350e-03 -4.2905905e-03  3.1240331e-03 -2.0504657e-03
 -2.5947457e-03 -2.2385521e-03]
