# Installing Packages

In [None]:
!pip install xmltodict pandas fasttext hazm nltk
!pip install -U sentence-transformers

Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hazm
  Downloading hazm-0.9.4-py3-none-any.whl (371 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m371.7/371.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m996.2 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=39eacbcee22ee08599f6cc86e4d33877d26eb2322a24d27a48ba583db0bff2da
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-t

# Importing Packages

In [None]:
import fasttext.util
from google.colab import drive
import pandas as pd
from hazm import word_tokenize
import string
from hazm import Normalizer as HazmNormalizer
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from gdown import download
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go

In [None]:
drive.mount('/content/drive')
!cp drive/MyDrive/cc.fa.300.bin ./

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Loading Farsi drug data

In [None]:
# Fetch the drug dataset (shared via Google Drive) into the working directory,
# then read it into the DataFrame used by the rest of the notebook.
drug_csv_name = "farsi_drug_data.csv"
download("https://drive.google.com/uc?id=1NZxxgmBTtMpJbSDe6-h4bHzg28fFZUcD", drug_csv_name)
df = pd.read_csv(drug_csv_name)

Downloading...
From: https://drive.google.com/uc?id=1NZxxgmBTtMpJbSDe6-h4bHzg28fFZUcD
To: /content/farsi_drug_data.csv
100%|██████████| 471k/471k [00:00<00:00, 75.2MB/s]


## Normalization

In [None]:
!wget https://github.com/language-ml/course-nlp-ir-1-text-exploring/blob/main/notebooks/Farsi/stopwords.txt

--2024-01-08 15:58:19--  https://github.com/language-ml/course-nlp-ir-1-text-exploring/blob/main/notebooks/Farsi/stopwords.txt
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11106 (11K) [text/plain]
Saving to: ‘stopwords.txt’


2024-01-08 15:58:19 (446 KB/s) - ‘stopwords.txt’ saved [11106/11106]



In [None]:
# Load the stop-word list (one entry per line) into a set used by filter_stops.
stopword_path = "./stopwords.txt"
# The file holds Persian text, so decode it explicitly as UTF-8 instead of relying
# on the platform's default encoding; open the configured path rather than a
# second hard-coded file name.
with open(stopword_path, encoding="utf-8") as f:
    stop_words = {line.strip() for line in f}

In [None]:
# Single shared hazm normalizer instance; farsi_normalization calls its .normalize().
hazm_normalizer = HazmNormalizer()

In [None]:
def filter_stops(tokens):
    """Return the tokens that are not in the notebook-level ``stop_words`` set.

    The relative order of the surviving tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def delete_punc(tokens):
  """Drop tokens that consist solely of punctuation characters.

  Both ASCII punctuation (``string.punctuation``) and common Persian
  punctuation marks are treated as punctuation. Tokens that merely contain a
  punctuation character (e.g. ``"a.b"``) are kept.

  Args:
    tokens: iterable of string tokens.

  Returns:
    A list with the non-punctuation tokens, in their original order.
  """
  # Use a set of single characters: testing `word in string.punctuation` is a
  # substring check on a str, which misses runs such as "..." and every
  # Persian mark.
  punctuations = set(string.punctuation) | set("،؛؟«»…")
  # all() over an empty token is True, so empty strings are dropped as well.
  return [word for word in tokens if not all(ch in punctuations for ch in word)]

def farsi_normalization(input):
  """Turn raw Persian text into a list of cleaned tokens.

  The text is normalized with the shared hazm normalizer and tokenized with
  hazm's ``word_tokenize``; stop words are removed first and punctuation
  tokens second.

  Note: the parameter name ``input`` shadows the builtin inside this function.
  """
  tokens = word_tokenize(hazm_normalizer.normalize(input))
  return delete_punc(filter_stops(tokens))

In [None]:
# Apply farsi_normalization to every value of 'mavared_masraf'; each cell of the
# new 'normal_data' column holds the token list that function returns.
df['normal_data'] = df['mavared_masraf'].apply(farsi_normalization)

# Fasttext Model

## Downloading the Farsi fastText model

In [None]:
# Fetch the Persian ('fa') fastText model. if_exists='ignore' is passed so that a
# copy already present in the working directory (e.g. the one copied from Drive)
# is presumably reused instead of downloaded again — confirm against fasttext.util.
fasttext.util.download_model('fa', if_exists='ignore')

In [None]:
# Load the fastText binary 'cc.fa.300.bin' from the working directory; `ft` is the
# model object passed to get_fasttext_embedding.
ft = fasttext.load_model('cc.fa.300.bin')



## Applying fastText to the normalized drug descriptions

In [None]:
# Generate a sentence embedding as the average embedding of all words in it
def get_fasttext_embedding(data, model):
  """Average the word vectors of ``data`` into a single sentence vector.

  Args:
    data: iterable of tokens (strings).
    model: object exposing ``get_word_vector(token)`` and ``get_dimension()``
      (e.g. a loaded fastText model).

  Returns:
    A 1-D numpy array: the element-wise mean of the token vectors, or an
    all-zero vector of the model's dimension when ``data`` has no tokens.
  """
  vectors = [model.get_word_vector(token) for token in data]
  if not vectors:
    # np.mean over an empty array yields a scalar NaN (with a RuntimeWarning),
    # which later breaks np.stack / cosine similarity on the embedding column.
    return np.zeros(model.get_dimension(), dtype=np.float32)
  return np.mean(np.array(vectors), axis=0)

In [None]:
# Compute one averaged fastText vector per row from its token list in 'normal_data';
# `model=ft` is forwarded by .apply to get_fasttext_embedding as a keyword argument.
df['fasttext_embedding'] = df['normal_data'].apply(get_fasttext_embedding, model=ft) # add the fastText embedding column to the dataframe

# Paraphrase Model

Here we use a paraphrase model (`paraphrase-multilingual-MiniLM-L12-v2`), a model for sentence similarity. We treat each drug description as a single sentence and expect that reordering the words in a description still preserves its overall similarity.

In [None]:
def transformers_embeddings(description, model):
    """Encode a tokenized description as one sentence.

    The tokens in ``description`` are joined with single spaces and the
    resulting string is handed to ``model.encode``; its result is returned
    unchanged.
    """
    return model.encode(' '.join(description))

# Instantiate the sentence-transformers model used for the paraphrase embeddings;
# `model` is passed to transformers_embeddings, which calls its .encode().
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
# Encode each row's token list from 'normal_data' as one sentence;
# `model=model` is forwarded by .apply to transformers_embeddings.
df['paraphrase_embedding'] = df['normal_data'].apply(transformers_embeddings, model=model) # add the paraphrase embedding column to the dataframe

# Evaluation

In [None]:
# function to get the top_k (default 3) drugs whose embeddings are most similar to the input
def search_over_embeds(df, input, embed_key: str, compute_embedding, model, top_k: int = 3):
  """Rank the drugs in ``df`` by cosine similarity to a free-text query.

  Args:
    df: DataFrame with a 'name_tejary' column and an ``embed_key`` column whose
      cells are 1-D embedding arrays.
    input: raw query text; it is passed through ``farsi_normalization`` first.
      (The name shadows the builtin inside this function.)
    embed_key: name of the column holding the stored embeddings.
    compute_embedding: callable ``(tokens, model) -> 1-D array`` used to embed
      the normalized query.
    model: model object forwarded to ``compute_embedding``.
    top_k: how many best matches to return (default 3).

  Returns:
    A tuple ``(top_names, similarity_scores, input_embed)``: the 'name_tejary'
    values of the best matches (best first), the list of similarity scores in
    row order, and the query embedding.
  """
  normal_input = farsi_normalization(input)
  input_embed = compute_embedding(normal_input, model)
  if len(df) == 0:
    # np.stack cannot stack zero arrays; nothing to rank anyway.
    return [], [], input_embed

  # One vectorized similarity call instead of a Python-level loop over rows.
  drug_matrix = np.stack(df[embed_key].to_numpy())
  similarity_scores = cosine_similarity(drug_matrix, input_embed.reshape(1, -1))[:, 0].tolist()

  # sorted() is stable, so equally scored drugs keep their row order.
  top_indices = sorted(range(len(similarity_scores)), key=similarity_scores.__getitem__, reverse=True)[:top_k]
  # The indices are row positions, so use .iloc; plain [] would do a label
  # lookup and pick the wrong rows when the index is not 0..n-1.
  names = df['name_tejary']
  top_names = [names.iloc[i] for i in top_indices]
  return top_names, similarity_scores, input_embed

In [114]:
# Run one query through both embedding spaces and print each model's suggestions.
query = "نارسایی حاد قلبی و شوک قلبی"

# Top 3 based on Fasttext
top3_fasttext, scores_fasttext, embed_fasttext = search_over_embeds(
    df, query, "fasttext_embedding", get_fasttext_embedding, model=ft
)
print('Fasttext Drug Suggestion:', top3_fasttext)

# Top 3 based on Paraphrase
top3_paraphrase, scores_paraphrase, embed_paraphrase = search_over_embeds(
    df, query, "paraphrase_embedding", transformers_embeddings, model=model
)
print('Paraphrase Drug Suggestion:', top3_paraphrase)

Fasttext Drug Suggestion: ['Dobutrex', 'Isordil', 'Nipride']
Paraphrase Drug Suggestion: ['Apresoline', 'Lanoxine', 'Nipride']


## Plotting
For better understanding, we plot the embedding of the user's description and compare it with the embeddings of 2000 randomly chosen drugs. Points are colored by the cosine similarity between each drug and the given description.

### Fasttext Plotting

In [None]:
# Project the fastText drug embeddings to 2-D with PCA for plotting.
pca = PCA(n_components=2)
embedding_2d = pca.fit_transform(np.stack(df.fasttext_embedding))
# Project the query embedding with the same fitted PCA (reshape already gives a 2-D array).
drug_embedding = pca.transform(embed_fasttext.reshape(1, -1))
embedding_2d = pd.DataFrame(embedding_2d, columns=['X', 'Y'])
# Assign by position: a plain Series would be aligned on index labels and could
# mislabel points if df does not have a default 0..n-1 index.
embedding_2d['label'] = df['name_tejary'].to_numpy()
embedding_2d['score'] = scores_fasttext
# Sample row positions without replacement (capped at the number of drugs) and
# with a fixed seed, so the plot has no duplicated points and is reproducible.
n_drugs = embedding_2d.shape[0]
random_drugs = np.random.default_rng(42).choice(n_drugs, size=min(2000, n_drugs), replace=False)
embedding_2d = embedding_2d.iloc[random_drugs]

In [None]:
# Scatter the sampled drugs in PCA space, colored by their similarity score.
# hover_name surfaces the drug name from the 'label' column, which was otherwise
# computed but never shown.
fig = px.scatter(embedding_2d, x="X", y="Y", color='score', hover_name='label')
fig.update_layout(
    height=1000,
    title_text='Drugs Fasttext Embedding Chart'
)
# Overlay the projected query embedding as a black cross.
fig.add_traces(
    px.scatter(x=[drug_embedding[0, 0]], y=[drug_embedding[0, 1]]).update_traces(marker_size=10, marker_color="black", marker_symbol='cross').data
)
fig.show()

### Paraphrase Plotting

In [None]:
# Project the paraphrase-model drug embeddings to 2-D with PCA for plotting.
pca = PCA(n_components=2)
embedding_2d = pca.fit_transform(np.stack(df.paraphrase_embedding))
# Project the query embedding with the same fitted PCA (reshape already gives a 2-D array).
drug_embedding = pca.transform(embed_paraphrase.reshape(1, -1))
embedding_2d = pd.DataFrame(embedding_2d, columns=['X', 'Y'])
# Assign by position: a plain Series would be aligned on index labels and could
# mislabel points if df does not have a default 0..n-1 index.
embedding_2d['label'] = df['name_tejary'].to_numpy()
embedding_2d['score'] = scores_paraphrase
# Sample row positions without replacement (capped at the number of drugs) and
# with a fixed seed, so the plot has no duplicated points and is reproducible.
n_drugs = embedding_2d.shape[0]
random_drugs = np.random.default_rng(42).choice(n_drugs, size=min(2000, n_drugs), replace=False)
embedding_2d = embedding_2d.iloc[random_drugs]

In [None]:
# Scatter the sampled drugs in PCA space, colored by their similarity score.
# hover_name surfaces the drug name from the 'label' column, which was otherwise
# computed but never shown.
fig = px.scatter(embedding_2d, x="X", y="Y", color='score', hover_name='label')
fig.update_layout(
    height=1000,
    title_text='Drugs Paraphrase Embedding Chart'
)
# Overlay the projected query embedding as a black cross.
fig.add_traces(
    px.scatter(x=[drug_embedding[0, 0]], y=[drug_embedding[0, 1]]).update_traces(marker_size=10, marker_color="black", marker_symbol='cross').data
)
fig.show()

## Comparing the drugs suggested by each model

In [None]:
# Query both embedding spaces with the same description so the suggestions can be compared.
query = "نارسایی حاد قلبی و شوک قلبی"

# Top 3 based on Fasttext
top3_fasttext, scores_fasttext, embed_fasttext = search_over_embeds(
    df, query, "fasttext_embedding", get_fasttext_embedding, model=ft
)
print('Fasttext Drug Suggestion:', top3_fasttext)

# Top 3 based on Paraphrase
top3_paraphrase, scores_paraphrase, embed_paraphrase = search_over_embeds(
    df, query, "paraphrase_embedding", transformers_embeddings, model=model
)
print('Paraphrase Drug Suggestion:', top3_paraphrase)

Fasttext Drug Suggestion: ['Dobutrex', 'Isordil', 'Nipride']
Paraphrase Drug Suggestion: ['Apresoline', 'Lanoxine', 'Nipride']


In [112]:
# Show name and 'mavared_masraf' text for the drugs suggested by fastText.
# isin() matches however many names the search returned, instead of indexing
# exactly three list elements, and .loc selects rows and columns in one step.
df.loc[df['name_tejary'].isin(top3_fasttext), ['name_tejary', 'mavared_masraf']]

Unnamed: 0,name_tejary,mavared_masraf
2,Dobutrex,درمان نارسایی حاد قلبی و در شوک قلبی یا عفونی ...
286,Nipride,"حملات حاد فشار خون , نارسایی قلبی , سکته قلبی ..."
401,Isordil,"در درمان آنژِین قلبی , پیشگیری از درد های قلبی..."


In [113]:
# Show name and 'mavared_masraf' text for the drugs suggested by the paraphrase model.
# isin() matches however many names the search returned, instead of indexing
# exactly three list elements, and .loc selects rows and columns in one step.
df.loc[df['name_tejary'].isin(top3_paraphrase), ['name_tejary', 'mavared_masraf']]

Unnamed: 0,name_tejary,mavared_masraf
286,Nipride,"حملات حاد فشار خون , نارسایی قلبی , سکته قلبی ..."
319,Apresoline,"فشار خون بالا , نارسایی احتقانی قلب , نارسایی ..."
325,Lanoxine,"نارسایی قلبی , بی نظمی های ریتم قلبی ( نوع سوپ..."
