# Embeddings


## fastText


In [1]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz && gzip -dv cc.en.300.bin.gz

--2024-02-28 16:34:50--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.14, 3.163.189.96, 3.163.189.51, ...

Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.14|:443... connected.

HTTP request sent, awaiting response... 200 OK

Length: 4503593528 (4.2G) [application/octet-stream]

Saving to: 'cc.en.300.bin.gz'






2024-02-28 16:35:10 (213 MB/s) - 'cc.en.300.bin.gz' saved [4503593528/4503593528]



cc.en.300.bin.gz:	 37.8% -- replaced with cc.en.300.bin


In [2]:
import fasttext

model = fasttext.load_model("cc.en.300.bin")



In [3]:
vector = model.get_word_vector("hello")
print(vector.shape)

(300,)


In [4]:
nearest_neighbors = model.get_nearest_neighbors("hello")
print(nearest_neighbors)

[(0.7143728733062744, 'hellow'), (0.7095366716384888, 'hello.'), (0.703833818435669, 'hi'), (0.6944323182106018, 'hullo'), (0.6912142038345337, 'hello-'), (0.663975715637207, 'Hello'), (0.6563249230384827, 'hello.I'), (0.6529381275177002, 'howdy'), (0.6283847689628601, 'hellooo'), (0.6235803365707397, 'hellooooo')]


In [5]:
analogy_result = model.get_analogies("king", "man", "woman", k=1)
print("Analogy relationship: king - man + woman =", analogy_result)

Analogy relationship: king - man + woman = [(0.7554811835289001, 'queen')]


## GloVe


In [6]:
import gensim.downloader as api

# Load the pre-trained GloVe model
glove_model = api.load("glove-wiki-gigaword-100")


# Find similarity between two words
similarity = glove_model.similarity("cat", "dog")
print("Similarity between 'cat' and 'dog':", similarity)

# Find analogy relationship
analogy = glove_model.most_similar(positive=["king", "woman"], negative=["man"])
print("Analogy relationship: king - man + woman =", analogy)


Similarity between 'cat' and 'dog': 0.8798075

Analogy relationship: king - man + woman = [('queen', 0.7698540687561035), ('monarch', 0.6843381524085999), ('throne', 0.6755736470222473), ('daughter', 0.6594556570053101), ('princess', 0.6520534157752991), ('prince', 0.6517034769058228), ('elizabeth', 0.6464517712593079), ('mother', 0.631171703338623), ('emperor', 0.6106470823287964), ('wife', 0.6098655462265015)]


## Vector databases


In [7]:
%%capture
!pip install chromadb

In [8]:
import chromadb

chroma_client = chromadb.Client()

In [9]:
collection = chroma_client.create_collection(name="test_collection")

In [10]:
documents = [
    "The latest iPhone model comes with impressive features and a powerful camera.",
    "Exploring the beautiful beaches and vibrant culture of Bali is a dream for many travelers.",
    "Einstein's theory of relativity revolutionized our understanding of space and time.",
    "Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.",
    "The American Revolution had a profound impact on the birth of the United States as a nation.",
    "Regular exercise and a balanced diet are essential for maintaining good physical health.",
    "Leonardo da Vinci's Mona Lisa is considered one of the most iconic paintings in art history.",
    "Climate change poses a significant threat to the planet's ecosystems and biodiversity.",
    "Startup companies often face challenges in securing funding and scaling their operations.",
    "Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'",
]

genres = [
    "technology",
    "travel",
    "science",
    "food",
    "history",
    "fitness",
    "art",
    "climate change",
    "business",
    "music",
]

collection.add(
    documents=documents,
    ids=[f"id{i}" for i in range(len(documents))],
    metadatas=[{"genre": g} for g in genres],
)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 32.6MiB/s]


In [11]:
results = collection.query(query_texts=["I'm hungry"], n_results=1)
results

{'ids': [['id3']],
 'distances': [[1.7825171947479248]],
 'metadatas': [[{'genre': 'food'}]],
 'embeddings': None,
 'documents': [['Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.']],
 'uris': None,
 'data': None}

In [12]:
results = collection.query(query_texts=["sport"], n_results=1)
results

{'ids': [['id5']],
 'distances': [[1.5580840110778809]],
 'metadatas': [[{'genre': 'fitness'}]],
 'embeddings': None,
 'documents': [['Regular exercise and a balanced diet are essential for maintaining good physical health.']],
 'uris': None,
 'data': None}

## Task

Task is to classify descriptions into categories with use of `fasttext`


In [13]:
import pandas as pd

train_df = pd.read_csv("train.csv")
train_df.head()

Unnamed: 0,class,text
0,12,Rules Changed Up is the debut studio album by...
1,14,Back is a novel written by British writer Hen...
2,14,Love and Glory (ISBN 0-385-29261-9) is a 1983...
3,13,Max Manus: Man of War is a 2008 Norwegian bio...
4,7,The former Ahavas Sholem Synagogue building w...


In [14]:
import re


def clean_text(text: str):
    replacements = {
        ",": " ",
        '"': "",
        "'": " ' ",
        ".": " . ",
        "(": " ( ",
        ")": " ) ",
        "!": " ! ",
        "?": " ? ",
        ":": " ",
        ";": " ",
    }

    # Convert the text to lowercase
    text = text.lower()

    # Use a regular expression to replace multiple characters in a single pass
    for char, replacement in replacements.items():
        text = re.sub(f"\\{char}", replacement, text)

    return text

In [15]:
def preprocess_df(data, label_prefix="__label__", train: bool = True):
    # Defining the new data
    df = data[["text"]].copy(deep=True)

    if train:
        df["class"] = label_prefix + data["class"].astype(str) + " "

    df["text"] = df["text"].apply(lambda x: clean_text(x))

    df.sample(frac=1).reset_index(drop=True)

    return df

In [16]:
df_train_preprocessed = preprocess_df(train_df)
df_train_preprocessed.head()

Unnamed: 0,text,class
0,rules changed up is the debut studio album by...,__label__12
1,back is a novel written by british writer hen...,__label__14
2,love and glory ( isbn 0-385-29261-9 ) is a ...,__label__14
3,max manus man of war is a 2008 norwegian bio...,__label__13
4,the former ahavas sholem synagogue building w...,__label__7


In [17]:
df_train_preprocessed.to_csv(
    "processed_train.csv", header=None, index=False, columns=["class", "text"]
)

In [18]:
from fasttext import train_supervised

model = train_supervised("processed_train.csv", epoch=1, dim=20, thread=2, verbose=100)

Read 5M words

Number of words:  250712

Number of labels: 14

Progress: 100.0% words/sec/thread: 3040075 lr:  0.000000 avg.loss:  0.710001 ETA:   0h 0m 0s


In [19]:
cool_model = train_supervised(
    "processed_train.csv",
    lr=1.0,
    epoch=75,
    loss="ova",
    wordNgrams=2,
    dim=200,
    thread=2,
    verbose=100,
)

Read 5M words

Number of words:  250712

Number of labels: 14

Progress: 100.0% words/sec/thread:  639845 lr:  0.000000 avg.loss:  0.009417 ETA:   0h 0m 0s


In [20]:
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,text
0,0,The Valea Cândii River is a tributary of the ...
1,1,Etiene Pires de Medeiros (born May 24 1991 in...
2,2,Bromelia balansae is a species of the genus B...
3,3,Pilsbryspira loxospira is a species of sea sn...
4,4,Wellington Management Company is one of the l...


In [21]:
df_test_preprocessed = preprocess_df(test_df, train=False)
cool_df_test_preprocessed = preprocess_df(test_df, train=False)

df_test_preprocessed.head()

Unnamed: 0,text
0,the valea cândii river is a tributary of the ...
1,etiene pires de medeiros ( born may 24 1991 ...
2,bromelia balansae is a species of the genus b...
3,pilsbryspira loxospira is a species of sea sn...
4,wellington management company is one of the l...


In [22]:
df_test_preprocessed["class_id"] = df_test_preprocessed["text"].apply(
    lambda x: model.predict(x, k=1)[0][0][9:]
)
cool_df_test_preprocessed["class_id"] = cool_df_test_preprocessed["text"].apply(
    lambda x: cool_model.predict(x, k=1)[0][0][9:]
)