## Lab 3- Embeddings
By Aryan Iyappan (2023115021)

### One-hot encoding

In [4]:
words = ["cat", "dog", "cat", "fish"]

vocab = list(set(words))

one_hot = []

for word in words:
    vector = [1 if v == word else 0 for v in vocab]
    one_hot.append(vector)

print("Vocabulary: ", vocab)
print("One-hot encoded words: ")
for w, v in zip(words, one_hot):
    print(w, "->", v)

Vocabulary:  ['cat', 'fish', 'dog']
One-hot encoded words: 
cat -> [1, 0, 0]
dog -> [0, 0, 1]
cat -> [1, 0, 0]
fish -> [0, 1, 0]


### One-hot encoding using pandas

In [5]:
import pandas as pd

data = pd.Series(["cat", "dog", "fish", "cat"])

print(pd.get_dummies(data))

     cat    dog   fish
0   True  False  False
1  False   True  False
2  False  False   True
3   True  False  False


### One-hot encoding using scikit-learn

In [6]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

data = np.array([["cat"], ["dog"], ["fish"], ["cat"]])

encoder = OneHotEncoder(sparse_output=False)

encoded = encoder.fit_transform(data)

print("Categories: ", encoder.categories_)

print("One hot encoded output: ")
print(encoded)

Categories:  [array(['cat', 'dog', 'fish'], dtype='<U4')]
One hot encoded output: 
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


### Word2Vec implementation using gensim

In [7]:
from gensim.models import Word2Vec

sentences = [
    "artificial intelligence is powerful".split(" "),
    "machine learning is a part of artificial intelligence".split(" "),
    "deep learning uses neural networks".split(" ")
]

model = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
)

vector = model.wv["intelligence"]

print("Vector for intelligence: ")
print(vector)

similar_words = model.wv.most_similar("learning")
print("similar words to learning: ", similar_words)

Vector for intelligence: 
[-0.01723938  0.00733148  0.01037977  0.01148388  0.01493384 -0.01233535
  0.00221123  0.01209456 -0.0056801  -0.01234705 -0.00082045 -0.0167379
 -0.01120002  0.01420908  0.00670508  0.01445134  0.01360049  0.01506148
 -0.00757831 -0.00112361  0.00469675 -0.00903806  0.01677746 -0.01971633
  0.01352928  0.00582883 -0.00986566  0.00879638 -0.00347915  0.01342277
  0.0199297  -0.00872489 -0.00119868 -0.01139127  0.00770164  0.00557325
  0.01378215  0.01220219  0.01907699  0.01854683  0.01579614 -0.01397901
 -0.01831173 -0.00071151 -0.00619968  0.01578863  0.01187715 -0.00309133
  0.00302193  0.00358008]
similar words to learning:  [('machine', 0.16704076528549194), ('deep', 0.13204392790794373), ('intelligence', 0.1267007291316986), ('part', 0.0998455286026001), ('is', 0.042373016476631165), ('powerful', 0.04067763686180115), ('neural', 0.012442179024219513), ('a', -0.01259106956422329), ('artificial', -0.01447527389973402), ('of', -0.0560765340924263)]


### GloVe implementation using gensim

In [8]:
import gensim.downloader as api

glove_model = api.load("glove-wiki-gigaword-100")

vector = glove_model["intelligence"]

print("Vector for intelligence: ")
print(vector)

similar_words = glove_model.most_similar("learning")
for word, score in similar_words:
    print(f"{word}: {score}")

Vector for intelligence: 
[-0.31101   -0.43291    0.77734   -0.31115    0.052934  -0.8502
 -0.35372   -0.70531    0.084464   0.88768    0.83527   -0.41641
  0.36703    0.60834    0.0085214  0.94293    0.5314    -0.75322
 -0.86764    0.34833   -0.29865   -0.43442    0.3514    -1.1228
 -1.2564    -0.094171   0.29402    0.31994    0.086692   0.31915
  0.56067    0.032952  -0.94379   -0.58112    0.11274    0.006062
 -0.79353    0.70368    0.59687    0.60501   -0.22855   -0.26469
  0.045172   0.58118    0.26756   -0.47237    0.29358   -0.28342
 -0.22823   -0.59532    1.0845     0.21541    0.5789     1.5825
  0.15322   -1.3246     0.42594   -0.24834    1.3285     0.48737
  0.17115    0.73042    0.51749   -0.50172    0.23246   -0.33179
 -0.31772    0.34714    0.95887    1.5972     0.76459   -0.1559
 -0.13554   -0.97654   -0.29545    0.097254  -0.17109    0.17695
 -1.1941     0.41086    1.0578     0.55551    0.034317  -0.18596
 -1.7366     0.22696    1.0213     0.80212   -0.017432  -0.45574
 -

### FastText embedding implmentation (using gensim)

In [9]:
from gensim.models import FastText

sentences = [
    "natural language processing is a fascinating field".split(" "),
    "word embeddings capture semantic meaning".split(" "),
    "fasttext is an extension of word2vec".split(" ")
]

model = FastText(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
)

vector = model.wv["intelligence"]
print("Vector for intelligence: ")
print(vector)

similar_words = model.wv.most_similar("learning")
for word, score in similar_words:
    print(f"{word}: {score}")

Vector for intelligence: 
[ 1.2442525e-03  6.1205524e-04 -7.8747980e-04  1.6670383e-04
 -2.2123381e-03 -1.2123872e-03 -1.2194589e-03 -3.8544473e-03
  2.5588896e-03  1.5041081e-03  5.0475444e-03  1.2285195e-03
  8.7324705e-04 -1.1611626e-04 -7.7292963e-04 -2.4581761e-03
 -1.2628294e-03 -1.8496130e-03  4.2046177e-05  2.7340525e-03
  1.2192813e-03  1.2515471e-03  2.6138644e-03 -5.6116347e-04
  1.2510451e-03  8.1061118e-04  2.9067504e-03  3.9396860e-04
 -5.7816796e-04 -2.8444370e-03  2.1801500e-03 -2.8956099e-04
 -2.9883529e-03 -3.0058046e-04 -7.7108195e-04  6.8529876e-04
 -1.3268661e-03  1.2676803e-03 -8.3737250e-04  9.6935441e-04
  1.9220284e-03 -2.5679730e-03 -1.1573596e-03  1.0904556e-03
  1.4968399e-03  1.2666686e-03  1.1835489e-03  7.3763513e-04
 -2.3163224e-04 -1.6384197e-03]
processing: 0.2987723648548126
extension: 0.23967039585113525
embeddings: 0.2296343594789505
an: 0.2120790332555771
capture: 0.2046079933643341
language: 0.18014957010746002
fascinating: 0.1578020453453064
mean

## Try it yourself problems

Problem 1: Word2Vec Semantic Similarity 

Create a small text corpus related to artificial intelligence and train a Word2Vec model using the Skip-Gram approach. Extract the vector representation of a given word and compute the top five most similar words. Observe whether semantically related words appear closer in the embedding space. 

In [14]:
from gensim.models import Word2Vec

sentences = [
    "artificial intelligence is the electricity of the 21st century".split(" "),
    "machine learning is a subset of artificial intelligence".split(" "),
    "artificial intelligence has evolved from simple algorithms to complex neural networks".split(" ")
]

model = Word2Vec(sentences, vector_size=50, window=3, min_count=1, sg=1)

vector = model.wv["artificial"]
print("Vector for artificial: ")
print(vector)

similar_words = model.wv.most_similar("artificial")
for word, score in similar_words[5:]:
    print(f"{word}: {score}")

Vector for artificial: 
[-0.01631689  0.00898098 -0.00828151  0.00164911  0.01698853 -0.00893017
  0.00903413 -0.01358104 -0.00710486  0.01878957 -0.00315465  0.00063616
 -0.00827822 -0.01536349 -0.00301752  0.00494668 -0.00178007  0.01106599
 -0.00549287  0.00451755  0.01090922  0.01669611 -0.0029087  -0.01840873
  0.00874684  0.00113891  0.01488712 -0.00161985 -0.00528167 -0.01750022
 -0.00170869  0.00564818  0.01079729  0.01410526 -0.01140808  0.00371395
  0.01218884 -0.00960119 -0.00622252  0.01359342  0.00326335  0.00037754
  0.00694279  0.00043278  0.01923393  0.01012419 -0.0178245  -0.01409015
  0.00179851  0.01278906]
neural: 0.07401131093502045
intelligence: 0.04244506359100342
networks: 0.018417924642562866
subset: 0.011702703312039375
of: 0.011397400870919228


Problem 2: Using Pretrained GloVe Embeddings 

Load pretrained GloVe embeddings and retrieve vectors for selected words such as “learning,” “education,” and “knowledge.” Compute cosine similarity between pairs of words and analyze which pairs show stronger semantic similarity based on the similarity score. 

In [11]:
import gensim.downloader as api

glove_model = api.load("glove-wiki-gigaword-100")

selected_words = ["learning", "education", "knowledge"]

for word in selected_words:
    vector = glove_model[word]
    print(f"Vector for {word}: ")
    print(vector)

print("Computing cosine similarities:")
for i, word1 in enumerate(selected_words):
    for word2 in selected_words[i + 1:]:
        similarity = glove_model.similarity(word1, word2)
        print(f"Cosine similarity between {word1} and {word2}: {similarity}")

Vector for learning: 
[ 0.64812    0.69878   -0.39947    0.77634   -0.13132    0.2024
 -0.33399   -0.0066588  0.061684   0.1885    -0.10559   -0.31316
 -0.082495  -0.080517   0.3858    -0.10302    0.049431   0.17216
 -0.59079    0.77068   -1.2768    -0.25187    0.2195    -0.20176
 -0.30581   -0.18518    0.010889  -0.07529   -0.34732    0.61998
 -0.99703    1.0516    -0.42071   -0.39635    0.32607   -0.40061
 -0.46462    0.69904    0.29567   -0.35309   -0.59074    0.28999
 -0.25732   -0.1317    -0.69798    0.49818    0.41503    0.1487
  0.083347  -0.43543   -0.093969  -0.3543     0.014998   0.63593
  0.54564   -1.8439     0.78842   -0.19836    1.5707     0.25988
  0.20875    0.7521    -0.085488  -0.70717    0.094104   0.44485
  0.087818  -0.34779    0.57148    0.18662   -0.29435    0.42928
  0.28392   -0.61614   -0.34108    0.58192   -0.16388   -0.0081997
 -0.27162   -0.27112   -0.21471    0.37376   -0.5352    -0.060945
 -1.6317     0.85144    0.056035  -0.53861   -0.58383   -0.19612
 -

Problem 3: FastText and Out-of-Vocabulary Words 

Train a FastText model on a small corpus and generate embeddings for both seen and unseen words. Compare how FastText produces meaningful vectors for unseen words due to its character n-gram representation, unlike Word2Vec. 

In [13]:
from gensim.models import FastText

sentences = [
    "artificial intelligence is powerful".split(" "),
    "machine learning is a part of artificial intelligence".split(" "),
    "deep learning uses neural networks".split(" ")
]

model = FastText(sentences, vector_size=50, window=3, min_count=1, sg=1)

# seen words
seen_words = ["artificial", "intelligence", "learning"]
for word in seen_words:
    vector = model.wv[word]
    print(f"Vector for seen word {word}: ")
    print(vector)

# unseen words
unseen_words = ["automation", "robotics"]
for word in unseen_words:
    vector = model.wv[word]
    print(f"Vector for unseen word {word}: ")
    print(vector)

# unseen words by word2vec model
word2vec_model = Word2Vec(sentences, vector_size=50, window=3, min_count=1, sg=1)

for word in unseen_words:
    try:
        vector = word2vec_model.wv[word]
        print(f"Vector for unseen word {word} in Word2Vec: ")
        print(vector)
    except KeyError:
        print(f"Word '{word}' not found in Word2Vec vocabulary.")

Vector for seen word artificial: 
[-2.1826662e-03  6.0296047e-04 -1.4207925e-03 -8.9026283e-04
 -2.4187756e-03  2.9780034e-03  1.7206641e-03  2.7639512e-03
 -1.9971149e-03  4.0762232e-05  8.7178743e-04 -2.3102832e-03
  3.2494627e-03 -5.3905376e-04  2.5887699e-03  2.1144887e-03
  6.7115214e-04  3.0500221e-03  3.8534440e-03  1.6691759e-03
  5.5586832e-04 -1.1562546e-03  6.8903202e-04  1.9583234e-03
 -1.5651557e-03 -7.0948462e-04  1.9376080e-04  2.8539493e-04
 -3.3578160e-04 -2.6936035e-03  1.3029205e-03 -3.2696521e-03
  4.5152735e-03 -1.5075026e-04 -2.1355576e-03 -1.2526751e-03
  1.1347395e-03  1.1043815e-03 -1.3895440e-03 -1.0036826e-04
  3.4054377e-04 -4.3776690e-04  1.8515515e-03 -3.6265829e-03
  3.4345223e-03 -2.0397338e-04 -1.0703121e-03  1.1549455e-03
 -3.8344195e-04  9.8625629e-04]
Vector for seen word intelligence: 
[ 0.0008144   0.00076832 -0.00052778  0.00042989 -0.00181359 -0.00147106
 -0.00113968 -0.00348354  0.00236729  0.00118199  0.00491108  0.0008107
  0.00059247  0.00021