# Fasttext Word embeddings

We plan to use fastText word embeddings as input to the final classifier because fastText can produce embeddings for unknown (out-of-vocabulary) words from their character n-grams. This also helps us handle misspelled words.

In [5]:
# Install the library
!pip install fasttext

# Download and extract the model
#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
#!gunzip cc.en.300.bin.gz

# Load the FastText model and verify
import fasttext
import fasttext.util

model = fasttext.load_model('cc.en.300.bin')
print("Vector Dimension:", model.get_dimension())

Collecting fasttext
  Using cached fasttext-0.9.3.tar.gz (73 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296185 sha256=749f89d22040e04dffbdfbaf570d081f80ae7bc32f833ea1fabdd039c7aaf45e
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58e02cec2ddb20ce3e59fad8d3c92a
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.3 pybind11-2.13.6
Vector Dimension: 300


In [6]:
# Sanity check of the loaded model: request the k=5 nearest neighbours of "Cell".
model.get_nearest_neighbors("Cell", k=5)

[(0.6876197457313538, 'cell'),
 (0.6766916513442993, 'Cells'),
 (0.6637295484542847, '-Cell'),
 (0.6585685610771179, 'Cell-'),
 (0.6313833594322205, 'Cellular')]

In [7]:
import numpy as np
import string

def get_words(text):
    """Tokenise ``text`` into words.

    Every character in ``string.punctuation`` is treated as a separator, the
    result is split on whitespace, and tokens for which ``str.isnumeric()`` is
    true are discarded. Mixed tokens such as ``IL2`` are kept.

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        The remaining tokens, in their original order and original case.
    """
    # One translation table turns each punctuation character into a space.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    cleaned = text.translate(punctuation_to_space)
    return [token for token in cleaned.split() if not token.isnumeric()]

def text_to_vector(text, model):
    """Embed ``text`` as the element-wise sum of the vectors of its words.

    The text is tokenised with ``get_words`` and each token is looked up with
    ``model.get_word_vector``; the per-word vectors are summed (not averaged),
    so the magnitude of the result grows with the number of tokens.

    Parameters
    ----------
    text : str
        Raw text to embed.
    model : object
        Embedding model exposing ``get_word_vector(word)`` and
        ``get_dimension()`` (here, a loaded fastText model).

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.get_dimension()``. When tokenisation
        yields no words, the offending text is printed and a float32 zero
        vector is returned.
    """
    words = get_words(text)

    if not words:
        # No tokens survived cleaning (empty text, or only punctuation/numbers).
        # This is not an out-of-vocabulary case: the model is still asked for a
        # vector for every token that does survive.
        print(text)
        # float32 matches the vectors fastText returns, so every row of the
        # resulting column has the same dtype.
        return np.zeros(model.get_dimension(), dtype=np.float32)

    word_vectors = [model.get_word_vector(word) for word in words]
    return np.sum(word_vectors, axis=0)


### Test Data

In [9]:
import pandas as pd

# Read the test set; missing cells become empty strings so the text columns
# can be concatenated without NaN handling.
data = pd.read_csv("bms_data.csv").fillna("")
data.head()

Unnamed: 0,Analysis,Attribute
0,D_250475,IL2 INHIBITION ASSAY
1,D_95007196,PH
2,D_M00003744,ABATACEPT MAJOR BAND (REDUCED)
3,Y_SM_95011468_R,BIOASSAY
4,250684_CE_SDS_REDUC,SUM HEAVY AND LIGHT CHAIN


In [11]:
# Classifier input text for each row: "<Analysis> <Attribute>", built once
# and reused for both derived columns.
combined_text = data['Analysis'] + ' ' + data['Attribute']

data = data.assign(
    words=combined_text.map(get_words),
    Embeddings=combined_text.map(lambda text: text_to_vector(text, model)),
)
data.head()

Unnamed: 0,Analysis,Attribute,words,Embeddings
0,D_250475,IL2 INHIBITION ASSAY,"[D, IL2, INHIBITION, ASSAY]","[0.7127228, -0.7752601, -0.24844645, -0.213777..."
1,D_95007196,PH,"[D, PH]","[0.7130991, -0.6000244, -0.5627817, -0.4912354..."
2,D_M00003744,ABATACEPT MAJOR BAND (REDUCED),"[D, M00003744, ABATACEPT, MAJOR, BAND, REDUCED]","[0.06129151, -0.85993564, -0.29559693, -0.1129..."
3,Y_SM_95011468_R,BIOASSAY,"[Y, SM, R, BIOASSAY]","[0.54968286, -0.5839823, 0.56449264, -0.605069..."
4,250684_CE_SDS_REDUC,SUM HEAVY AND LIGHT CHAIN,"[CE, SDS, REDUC, SUM, HEAVY, AND, LIGHT, CHAIN]","[0.60163784, -0.55069363, 0.16281101, -0.67703..."


In [13]:
# Persist the test frame (with its 'words' and 'Embeddings' columns) as a pickle file.
data.to_pickle("bms_data_word_embedding.pkl")

### Training data

In [14]:
# Read the labelled training set; missing cells become empty strings so the
# text columns can be concatenated without NaN handling.
data = pd.read_csv("augmented_labelled_data.csv").fillna("")
data.head()

Unnamed: 0,Analysis,Attribute,Standard names
0,HPLC,AEX-HPLC TOTAL ACIDIC PEAKS,AEX ACIDIC PEAKS
1,BIO_ICIEF,AEX HPLC-ACIDIC PEAKS,AEX ACIDIC PEAKS
2,HPLC,AEX TOTAL ACIDIC PEAKS,AEX ACIDIC PEAKS
3,HPLC,AEX-HPLC ACIDIC PEAKS,AEX ACIDIC PEAKS
4,BIO_ICIEF,AEX-HPLC TOTAL ACIDIC PEAKS,AEX ACIDIC PEAKS


In [15]:
# Classifier input text for each row: "<Analysis> <Attribute>", built once
# and reused for both derived columns.
combined_text = data['Analysis'] + ' ' + data['Attribute']

data = data.assign(
    words=combined_text.map(get_words),
    Embeddings=combined_text.map(lambda text: text_to_vector(text, model)),
)
data.head()

80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80
80 80


Unnamed: 0,Analysis,Attribute,Standard names,words,Embeddings
0,HPLC,AEX-HPLC TOTAL ACIDIC PEAKS,AEX ACIDIC PEAKS,"[HPLC, AEX, HPLC, TOTAL, ACIDIC, PEAKS]","[0.20632643, 0.010347083, -0.101710916, -0.326..."
1,BIO_ICIEF,AEX HPLC-ACIDIC PEAKS,AEX ACIDIC PEAKS,"[BIO, ICIEF, AEX, HPLC, ACIDIC, PEAKS]","[0.09764487, -0.10927658, -0.45782584, -0.3632..."
2,HPLC,AEX TOTAL ACIDIC PEAKS,AEX ACIDIC PEAKS,"[HPLC, AEX, TOTAL, ACIDIC, PEAKS]","[0.07056949, -0.14527115, -0.17276011, -0.3101..."
3,HPLC,AEX-HPLC ACIDIC PEAKS,AEX ACIDIC PEAKS,"[HPLC, AEX, HPLC, ACIDIC, PEAKS]","[0.22171792, 0.09704448, -0.108197734, -0.2888..."
4,BIO_ICIEF,AEX-HPLC TOTAL ACIDIC PEAKS,AEX ACIDIC PEAKS,"[BIO, ICIEF, AEX, HPLC, TOTAL, ACIDIC, PEAKS]","[0.082253374, -0.19597399, -0.45133898, -0.401..."


In [16]:
# Persist the training frame (with its 'words' and 'Embeddings' columns) as a pickle file.
data.to_pickle("augmented_labelled_data_word_embedding.pkl")