## Lab 3 - Embeddings
By Aryan Iyappan (2023115021)

### One-hot encoding

In [4]:
words = ["cat", "dog", "cat", "fish"]

vocab = list(set(words))

one_hot = []

for word in words:
    vector = [1 if v == word else 0 for v in vocab]
    one_hot.append(vector)

print("Vocabulary: ", vocab)
print("One-hot encoded words: ")
for w, v in zip(words, one_hot):
    print(w, "->", v)

Vocabulary:  ['cat', 'fish', 'dog']
One-hot encoded words: 
cat -> [1, 0, 0]
dog -> [0, 0, 1]
cat -> [1, 0, 0]
fish -> [0, 1, 0]


### One-hot encoding using pandas

In [5]:
import pandas as pd

# Hold the labels in a Series so pandas can expand them into indicator columns.
animals = ["cat", "dog", "fish", "cat"]
data = pd.Series(animals)

# get_dummies returns a frame with one indicator column per distinct label
# and one row per entry of the Series.
indicator_frame = pd.get_dummies(data)
print(indicator_frame)

     cat    dog   fish
0   True  False  False
1  False   True  False
2  False  False   True
3   True  False  False


### One-hot encoding using scikit-learn

In [6]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# The encoder is fed a 2-D array, so reshape the flat label list into a
# single column with one row per sample.
data = np.array(["cat", "dog", "fish", "cat"]).reshape(-1, 1)

# sparse_output=False requests a dense NumPy array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

# Learn the categories first, then encode the same samples.
encoder.fit(data)
encoded = encoder.transform(data)

print("Categories: ", encoder.categories_)

print("One hot encoded output: ")
print(encoded)

Categories:  [array(['cat', 'dog', 'fish'], dtype='<U4')]
One hot encoded output: 
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


### Word2Vec implementation using gensim

In [7]:
from gensim.models import Word2Vec

# Toy corpus: every sentence is pre-tokenised into a list of words.
# str.split() with no argument splits on any whitespace run, so stray double
# spaces cannot introduce empty-string tokens into the vocabulary.
sentences = [
    "artificial intelligence is powerful".split(),
    "machine learning is a part of artificial intelligence".split(),
    "deep learning uses neural networks".split(),
]

# Skip-gram (sg=1) Word2Vec with 50-dimensional vectors and a context window
# of 3; min_count=1 keeps every word, which this tiny corpus needs.
# Training is stochastic: the seed is spelled out (1 is gensim's default) and
# the model is limited to one worker thread, since multi-threaded training
# introduces ordering jitter that makes results differ between runs.
model = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# Look up the learned embedding of a single in-vocabulary word.
vector = model.wv["intelligence"]

print("Vector for intelligence: ")
print(vector)

# Nearest neighbours by cosine similarity. With a corpus this small the
# scores are close to noise and should not be read as real semantics.
similar_words = model.wv.most_similar("learning")
print("similar words to learning: ", similar_words)

Vector for intelligence: 
[-0.01723938  0.00733148  0.01037977  0.01148388  0.01493384 -0.01233535
  0.00221123  0.01209456 -0.0056801  -0.01234705 -0.00082045 -0.0167379
 -0.01120002  0.01420908  0.00670508  0.01445134  0.01360049  0.01506148
 -0.00757831 -0.00112361  0.00469675 -0.00903806  0.01677746 -0.01971633
  0.01352928  0.00582883 -0.00986566  0.00879638 -0.00347915  0.01342277
  0.0199297  -0.00872489 -0.00119868 -0.01139127  0.00770164  0.00557325
  0.01378215  0.01220219  0.01907699  0.01854683  0.01579614 -0.01397901
 -0.01831173 -0.00071151 -0.00619968  0.01578863  0.01187715 -0.00309133
  0.00302193  0.00358008]
similar words to learning:  [('machine', 0.16704076528549194), ('deep', 0.13204392790794373), ('intelligence', 0.1267007291316986), ('part', 0.0998455286026001), ('is', 0.042373016476631165), ('powerful', 0.04067763686180115), ('neural', 0.012442179024219513), ('a', -0.01259106956422329), ('artificial', -0.01447527389973402), ('of', -0.0560765340924263)]


### GloVe implementation using gensim

In [8]:
import gensim.downloader as api

# Load the pretrained "glove-wiki-gigaword-100" vectors through gensim's
# downloader API.
glove_model = api.load("glove-wiki-gigaword-100")

# Embedding of a single word, printed under its heading.
vector = glove_model["intelligence"]
print("Vector for intelligence: ", vector, sep="\n")

# Print each neighbour of "learning" with its similarity score, one per line.
similar_words = glove_model.most_similar("learning")
for neighbour, similarity in similar_words:
    print(f"{neighbour}: {similarity}")

Vector for intelligence: 
[-0.31101   -0.43291    0.77734   -0.31115    0.052934  -0.8502
 -0.35372   -0.70531    0.084464   0.88768    0.83527   -0.41641
  0.36703    0.60834    0.0085214  0.94293    0.5314    -0.75322
 -0.86764    0.34833   -0.29865   -0.43442    0.3514    -1.1228
 -1.2564    -0.094171   0.29402    0.31994    0.086692   0.31915
  0.56067    0.032952  -0.94379   -0.58112    0.11274    0.006062
 -0.79353    0.70368    0.59687    0.60501   -0.22855   -0.26469
  0.045172   0.58118    0.26756   -0.47237    0.29358   -0.28342
 -0.22823   -0.59532    1.0845     0.21541    0.5789     1.5825
  0.15322   -1.3246     0.42594   -0.24834    1.3285     0.48737
  0.17115    0.73042    0.51749   -0.50172    0.23246   -0.33179
 -0.31772    0.34714    0.95887    1.5972     0.76459   -0.1559
 -0.13554   -0.97654   -0.29545    0.097254  -0.17109    0.17695
 -1.1941     0.41086    1.0578     0.55551    0.034317  -0.18596
 -1.7366     0.22696    1.0213     0.80212   -0.017432  -0.45574
 -

### FastText embedding implementation (using gensim)

In [9]:
from gensim.models import FastText

# Toy corpus: every sentence is pre-tokenised into a list of words.
# str.split() with no argument splits on any whitespace run, so stray double
# spaces cannot introduce empty-string tokens into the vocabulary.
sentences = [
    "natural language processing is a fascinating field".split(),
    "word embeddings capture semantic meaning".split(),
    "fasttext is an extension of word2vec".split(),
]

# Skip-gram (sg=1) FastText with 50-dimensional vectors and a context window
# of 3; min_count=1 keeps every word of this tiny corpus.
# Training is stochastic: the seed is spelled out (1 is gensim's default) and
# the model is limited to one worker thread so repeated runs give the same
# vectors.
# NOTE: this rebinds `model`, replacing the Word2Vec model from the earlier cell.
model = FastText(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# "intelligence" does not occur in the corpus above. The lookup still returns
# a vector because FastText composes embeddings from character n-grams, so
# this line demonstrates out-of-vocabulary handling.
vector = model.wv["intelligence"]
print("Vector for intelligence: ")
print(vector)

# "learning" is also out-of-vocabulary; its neighbours are the corpus words
# closest to its n-gram-based vector.
# NOTE(review): both query words look copied from the Word2Vec cell; confirm
# the out-of-vocabulary lookups are intentional.
similar_words = model.wv.most_similar("learning")
for word, score in similar_words:
    print(f"{word}: {score}")

Vector for intelligence: 
[ 1.2442525e-03  6.1205524e-04 -7.8747980e-04  1.6670383e-04
 -2.2123381e-03 -1.2123872e-03 -1.2194589e-03 -3.8544473e-03
  2.5588896e-03  1.5041081e-03  5.0475444e-03  1.2285195e-03
  8.7324705e-04 -1.1611626e-04 -7.7292963e-04 -2.4581761e-03
 -1.2628294e-03 -1.8496130e-03  4.2046177e-05  2.7340525e-03
  1.2192813e-03  1.2515471e-03  2.6138644e-03 -5.6116347e-04
  1.2510451e-03  8.1061118e-04  2.9067504e-03  3.9396860e-04
 -5.7816796e-04 -2.8444370e-03  2.1801500e-03 -2.8956099e-04
 -2.9883529e-03 -3.0058046e-04 -7.7108195e-04  6.8529876e-04
 -1.3268661e-03  1.2676803e-03 -8.3737250e-04  9.6935441e-04
  1.9220284e-03 -2.5679730e-03 -1.1573596e-03  1.0904556e-03
  1.4968399e-03  1.2666686e-03  1.1835489e-03  7.3763513e-04
 -2.3163224e-04 -1.6384197e-03]
processing: 0.2987723648548126
extension: 0.23967039585113525
embeddings: 0.2296343594789505
an: 0.2120790332555771
capture: 0.2046079933643341
language: 0.18014957010746002
fascinating: 0.1578020453453064
mean