In [1]:
!pip install -q scikit-learn

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy data: a two-sentence corpus and a single query sentence.
# The query contains 'quarkcoins', a token that never occurs in the corpus (OOV).
corpus = [
    "I deposited money in the bank",
    "I went to the river bank",
]
query = ["I deposited quarkcoins in the bank"]

# ------------------------------
# 1. Bag-of-Words (BoW)
# ------------------------------
# The custom token pattern keeps one-character tokens such as 'I'.
bow_vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

# The vocabulary is fitted on the corpus only; the query is transformed with
# that fitted vocabulary (transform, not fit).
X_corpus_bow = bow_vectorizer.fit_transform(corpus)
X_query_bow = bow_vectorizer.transform(query)

# Cosine similarity of the query against every corpus sentence
cos_sim_bow = cosine_similarity(X_query_bow, X_corpus_bow)

print("BoW Vocabulary:", bow_vectorizer.get_feature_names_out())
print("BoW Corpus vectors:\n", X_corpus_bow.toarray())
print("BoW Query vector:\n", X_query_bow.toarray())
print("BoW Cosine similarity with query:\n", cos_sim_bow)

BoW Vocabulary: ['bank' 'deposited' 'i' 'in' 'money' 'river' 'the' 'to' 'went']
BoW Corpus vectors:
 [[1 1 1 1 1 0 1 0 0]
 [1 0 1 0 0 1 1 1 1]]
BoW Query vector:
 [[1 1 1 1 0 0 1 0 0]]
BoW Cosine similarity with query:
 [[0.91287093 0.54772256]]


In [3]:
# TF-IDF weighting; the token pattern keeps one-character tokens such as 'I'.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")

# Fit on the corpus, then project the query with the fitted vectorizer.
X_corpus_tfidf = tfidf_vectorizer.fit_transform(corpus)
X_query_tfidf = tfidf_vectorizer.transform(query)

# Cosine similarity of the query against every corpus sentence
cos_sim_tfidf = cosine_similarity(X_query_tfidf, X_corpus_tfidf)

# (label, value) pairs, printed in order
tfidf_report = (
    ("\nTF-IDF Vocabulary:", tfidf_vectorizer.get_feature_names_out()),
    ("TF-IDF Corpus vectors:\n", X_corpus_tfidf.toarray()),
    ("TF-IDF Query vector:\n", X_query_tfidf.toarray()),
    ("TF-IDF Cosine similarity with query:\n", cos_sim_tfidf),
)
for heading, payload in tfidf_report:
    print(heading, payload)


TF-IDF Vocabulary: ['bank' 'deposited' 'i' 'in' 'money' 'river' 'the' 'to' 'went']
TF-IDF Corpus vectors:
 [[0.33471228 0.47042643 0.33471228 0.47042643 0.47042643 0.
  0.33471228 0.         0.        ]
 [0.33471228 0.         0.33471228 0.         0.         0.47042643
  0.33471228 0.47042643 0.47042643]]
TF-IDF Query vector:
 [[0.37930349 0.53309782 0.37930349 0.53309782 0.         0.
  0.37930349 0.         0.        ]]
TF-IDF Cosine similarity with query:
 [[0.88243922 0.38087261]]


In [4]:
# Gensim for Word2Vec and FastText
!pip install -q gensim

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mPreparing metadata [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[54 lines of output][0m
  [31m   [0m [36m[1m+ meson setup /private/var/folders/y3/l1qzr2852s51qbvnph6b1qf80000gp/T/pip-install-ofjtj18z/scipy_f0af16d766894d60bed687581b8a18ee /private/var/folders/y3/l1qzr2852s51qbvnph6b1qf80000gp/T/pip-install-ofjtj18z/scipy_f0af16d766894d60bed687581b8a18ee/.mesonpy-dru4dvfa -Dbuildtype=release -Db_ndebug=if-release -Db_vscrt=md --native-file=/private/var/folders/y3/l1qzr2852s51qbvnph6b1qf80000gp/T/pip-install-ofjtj18z/scipy_f0af16d766894d60bed687581b8a18ee/.mesonpy-dru4dvfa/meson-python-native-file.ini[0m
  [31m   [0m The Meson build system
  [31m   [0m Version: 1.9.1
  [31m   [0m Source dir: /private/var/folders/y3/l1qzr2852s51qbvnph6b1qf80000gp/T/pip-install-ofjtj18z/scipy_f0af16d766894d60bed687581b8a18ee
  [

In [5]:
from gensim.models import Word2Vec

# Lower-cased, whitespace-split sentences used as training input.
tokenized_corpus = [s.lower().split() for s in corpus]

# Train a small Word2Vec model on the toy corpus.
# An explicit seed together with a single worker thread keeps training
# repeatable between runs (multi-threaded training introduces ordering jitter).
# min_count=1 keeps every token, since words in this corpus occur only once or twice.
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=50,
    min_count=1,
    seed=1,
    workers=1,
)

# In-vocabulary word
vec_money = w2v_model.wv['money']
# OOV word: there is no vector for a token absent from the training vocabulary,
# so fall back to None. The lookup is the same raw (un-normalised) one used for
# 'money', so both printed vectors are on the same scale.
vec_quarkcoins = w2v_model.wv['quarkcoins'] if 'quarkcoins' in w2v_model.wv else None

print("Word2Vec embedding for 'money':", vec_money[:50])
print("Word2Vec embedding for 'quarkcoins':", vec_quarkcoins)


ModuleNotFoundError: No module named 'gensim'

In [None]:
import numpy as np

In [None]:
# ------------------------------
# 1. Download GloVe embeddings
# ------------------------------
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

# Check the files
!ls glove.6B*


glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip
glove.6B.200d.txt  glove.6B.50d.txt


In [None]:
# Read glove.6B.50d.txt into a {token: vector} dictionary.
# Every line is split on whitespace: the first field is the token, the
# remaining fields are parsed as float32 coefficients.
glove_model = {}
with open('glove.6B.50d.txt', 'r', encoding='utf8') as glove_file:
    for fields in map(str.split, glove_file):
        glove_model[fields[0]] = np.array(fields[1:], dtype='float32')

# 'money' is indexed directly (KeyError if absent); the unseen word goes
# through .get() so a missing entry yields None instead of raising.
vec_money = glove_model['money']
vec_quarkcoins = glove_model.get('quarkcoins', None)

print("GloVe 'money':", vec_money[:50])
print("GloVe 'quarkcoins':", vec_quarkcoins)


GloVe 'money': [ 0.59784  -0.057026  0.97746  -0.58504   0.37386   0.036373 -0.67548
 -0.090134  0.33473   0.4612   -0.70586   0.88032  -0.1532   -0.59041
  1.0221   -0.20335   0.80479   0.23907   0.51985  -0.34106   1.1747
 -0.44956   0.19799  -0.25137  -0.59436  -2.2372   -0.15901  -0.39896
  0.4188   -0.67741   3.3433    0.98779  -0.023405  0.14755  -0.46205
  0.34545  -0.77937   0.32595   0.6553   -1.0528   -0.19255   0.22296
  0.2518    0.71563  -0.47951  -0.95866  -0.79283   0.20869   0.16084
  0.24745 ]
GloVe 'quarkcoins': None


In [None]:
# FastText splits 'quarkcoins' into character n-grams, so it can still produce a vector for a word it never saw during training.

In [None]:
from gensim.models import FastText

# Train FastText on the same tokenized sentences as the Word2Vec cell
# (tokenized_corpus is defined there, so that cell must have run first).
# An explicit seed together with a single worker thread keeps training
# repeatable between runs.
ft_model = FastText(
    sentences=tokenized_corpus,
    vector_size=50,
    window=5,
    min_count=1,
    seed=1,
    workers=1,
)
vec_quarkcoins = ft_model.wv['quarkcoins']  # generates vector even if not seen

print("FastText 'quarkcoins' embedding (first 50 values):", vec_quarkcoins[:50])


FastText 'quarkcoins' embedding (first 50 values): [-5.8522855e-04  9.7089051e-04 -2.8751694e-04 -5.0666467e-03
 -2.4688602e-03  2.0737904e-03  2.4688407e-04 -6.5799445e-06
 -1.2381976e-03 -3.0854793e-04  2.8960016e-03  1.1711261e-03
 -1.0972954e-03 -3.5702419e-03  3.5900727e-03  3.1766349e-03
 -1.7390212e-03  7.8540581e-04 -1.9342663e-03  2.7224340e-03
 -8.0930954e-04 -1.4312990e-03 -1.5794317e-03 -4.7960444e-04
  1.8669844e-03  4.4668582e-03 -3.0146188e-03  1.2886367e-03
  3.2853460e-04  1.0617605e-03 -2.3801127e-03 -1.7158007e-03
 -2.1681935e-03  1.0601060e-03  2.6434993e-03 -1.5054455e-04
 -2.4826103e-04  3.7209634e-04 -3.4450609e-04 -9.9421560e-04
 -3.3948894e-04  9.5636561e-04  8.2225079e-04  3.1005465e-03
 -2.6694741e-03 -1.3809204e-03  2.6692760e-03 -7.6122349e-04
  5.4074463e-04  7.1721303e-04]


In [3]:
# Transformers and Torch for backend support (needed by Flair and SBERT)
! pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting Pillow (from sentence-transformers)
  Using cached pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.0 kB)
Downloading sentence_transformers-5.1.1-py3-none-any.whl (486 kB)
Using cached pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl (4.7 MB)
Installing collected packages: Pillow, sentence-transformers
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [sentence-transformers]ence-transformers]
[1A[2KSuccessfully installed Pillow-11.3.0 sentence-transformers-5.1.1


In [None]:
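# Subword tokenization: an unseen word such as 'quarkcoins' is split into smaller pieces, e.g. ['qu', '##ark', '##coin', '##s'].

# Each piece has its own embedding; the Transformer layers and a pooling step combine them into one sentence vector.

# So even unseen words yield a valid embedding in context.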

In [4]:
from sentence_transformers import SentenceTransformer



In [5]:
# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sentence made of ordinary words
vec_normal = model.encode("I deposited money in the bank")
# Same sentence with 'money' replaced by a made-up token (OOV)
vec_oov = model.encode("I deposited sfsdmlsadkjl in the bank")

# Only the first 10 components are printed; the labels state that count so
# they match the [:10] slice.
print("BERT sentence embedding normal (first 10 values):", vec_normal[:10])
print("BERT sentence embedding with OOV (first 10 values):", vec_oov[:10])


README.md: 0.00B [00:00, ?B/s]

BERT sentence embedding normal (first 50 values): [ 0.05013232  0.04963814 -0.01344959  0.0345262   0.03906994 -0.07721663
  0.13875812  0.02018296  0.03256781  0.00444347]
BERT sentence embedding with OOV (first 50 values): [ 0.02826977  0.03689677 -0.03470108  0.0193714   0.05298067 -0.0254625
  0.06700128  0.01436979  0.04815311  0.01550889]
