In [6]:
import pandas as pd
import numpy as np

In [7]:
data = pd.read_csv("words_preprocessed.csv")

READ DATA

In [8]:
data

Unnamed: 0,Words
0,ab
1,aba
2,abacı
3,abacılık
4,abadi
...,...
49796,zürriyet
49797,zürriyetli
49798,zürriyetsiz
49799,zürriyetsizli


SETTING EMBEDDINGS (EMRECAN SEMANTIC EMBEDDING)

In [9]:
from sentence_transformers import SentenceTransformer
sentences = data["Words"].tolist()
sentences

['ab',
 'aba',
 'abacı',
 'abacılık',
 'abadi',
 'abajur',
 'abajurcu',
 'abajurculuk',
 'abajurlu',
 'abajursuz',
 'abaküs',
 'abalı',
 'abana',
 'abandırma',
 'abandırmak',
 'abandone',
 'abanık',
 'abani',
 'abanma',
 'abanmak',
 'abanoz',
 'abanozgiller',
 'abanozlaşma',
 'abanozlaşmak',
 'abartı',
 'abartıcı',
 'abartıcılık',
 'abartılı',
 'abartılma',
 'abartılmak',
 'abartısız',
 'abartısızlık',
 'abartış',
 'abartma',
 'abartmacı',
 'abartmacılık',
 'abartmak',
 'abartmalı',
 'abartmasız',
 'abasız',
 'abaşo',
 'abat',
 'abaza',
 'abazaca',
 'abazan',
 'abazanlık',
 'abbas',
 'abd',
 'abdal',
 'abdal',
 'abdallık',
 'abdest',
 'abdestbozan',
 'abdesthane',
 'abdestli',
 'abdestlik',
 'abdestlilik',
 'abdestsiz',
 'abdestsizlik',
 'abdiâciz',
 'abdülleziz',
 'abece',
 'abecesel',
 'aberasyon',
 'abes',
 'abeslik',
 'abıhayat',
 'abıkevser',
 'abide',
 'abideleşme',
 'abideleşmek',
 'abideleştirme',
 'abideleştirmek',
 'abidevi',
 'abis',
 'abiye',
 'abla',
 'ablacı',
 'ablacılık

In [10]:
model = SentenceTransformer('emrecan/bert-base-turkish-cased-mean-nli-stsb-tr')
embeddings = model.encode(sentences)

Saving Embeddings for static storage

In [11]:
np.save("embeddings.npy",embeddings)

SETTING UP VECTORDB

In [12]:
embeddings

array([[ 0.760441  , -0.20330493,  0.39946017, ..., -0.3512381 ,
         0.70096797,  0.10735158],
       [ 1.0934386 , -0.79947215,  0.40679616, ..., -0.4637628 ,
        -0.06327389, -0.3521843 ],
       [ 1.1624897 , -0.45807654,  0.8009405 , ..., -0.22780861,
        -0.02661   ,  0.21961263],
       ...,
       [-1.1682118 , -0.10456763, -0.4421331 , ..., -0.7938882 ,
         1.7286984 , -0.7052603 ],
       [-1.2919304 , -0.47446468, -0.6819107 , ..., -0.5533506 ,
         1.3615257 , -0.9933367 ],
       [-0.28806633,  0.52145904, -0.91896474, ..., -0.40328017,
         0.50946033, -0.28500202]], dtype=float32)

In [13]:
embeddings.shape

(49801, 768)

EMRECAN EMBEDDING / SENTENCE SIMILARITY R&D

In [49]:
from sentence_transformers import SentenceTransformer
sentences_try = ["Selam", "İnsan","Merhaba"]

model = SentenceTransformer('emrecan/bert-base-turkish-cased-mean-nli-stsb-tr')
embeddings_try = model.encode(sentences_try)
print(embeddings_try)


[[-1.0581504   0.26353064  0.29067758 ... -0.21723221  0.4108744
   1.1782107 ]
 [ 0.47393417 -0.44023237 -0.7126212  ...  0.40980363  1.0300173
   0.9495943 ]
 [ 0.11937086  0.10059706  0.06262342 ... -0.21120255 -0.36441565
   1.1737489 ]]


In [52]:
embeddings.shape

(63840, 768)

SETTING UP VECTORDB (FAISS)

In [14]:
vector_dimension = embeddings.shape[1]


import faiss
index = faiss.IndexFlatL2(vector_dimension)
faiss.normalize_L2(embeddings)
index.add(embeddings)

Saving VectorDB for static storage

In [10]:
faiss.write_index(index,'faiss_index.bin')

SEARCHING R&D (FAISS)

In [99]:
search_text = "çorap"

encoded_search = model.encode(search_text)
search_vector = np.array([encoded_search])
faiss.normalize_L2(search_vector)

In [100]:
k = index.ntotal
distances, ann = index.search(search_vector, k=k)

In [101]:
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})

In [102]:
results

Unnamed: 0,distances,ann
0,6.484667e-13,12550
1,2.298705e-01,12551
2,5.015517e-01,12552
3,5.897416e-01,12543
4,8.519388e-01,4282
...,...,...
63835,2.420739e+00,5417
63836,2.422859e+00,53431
63837,2.424670e+00,53430
63838,2.435082e+00,53433


In [103]:
merged_df = pd.merge(results,data,left_on='ann',right_index=True)

In [104]:
merged_df[0:50]

Unnamed: 0,distances,ann,Words
0,6.484667e-13,12550,çorap
1,0.2298705,12551,çorapçı
2,0.5015517,12552,çorapçılık
3,0.5897416,12543,çor
4,0.8519388,4282,ayakkabı
5,0.8727239,36090,külotlu çorap
6,0.8796129,35320,kumaş
7,0.8942307,27449,iplik
8,0.8982747,12559,çorlu
9,0.9232085,4275,ayak


SETTING UP VECTORDB (CHROMA)

WITH INFERENCE API

In [20]:
import requests

API_URL = "https://api-inference.huggingface.co/models/emrecan/bert-base-turkish-cased-mean-nli-stsb-tr"
headers = {"Authorization": "Bearer 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"}
def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"inputs": {
	"source_sentence": "Selam",
	"sentences": [
		"Merhaba",
		"Salam",
		"Burun"
	]
},
})

In [21]:
output

[0.6838985085487366, 0.23454970121383667, 0.2540648579597473]