In [207]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
import seaborn as sns

In [208]:
import mysql.connector

In [209]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [210]:
mydb = mysql.connector.connect(
  host="xxxxx",
  user="xxxxx",
  password="xxxxx",
  database="xxxxx"
)

In [211]:
cur = mydb.cursor()

In [212]:
cur.execute("SELECT description FROM twitter_profiles ORDER BY twitter_profiles.person_id ASC limit 20000;")

In [213]:
myresult = cur.fetchall()

In [214]:
for i in range(len(myresult)):
    myresult[i] = list(myresult[i])

In [215]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [216]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').to(device)

In [217]:
def emb(text,model,tokenizer):
    encoded_input = tokenizer(text =text, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return np.array(sentence_embeddings.to('cpu'))

In [218]:
embeddings_dataset = []

In [219]:
i = 0
for text in myresult:
    embeddings_dataset.append( (f'id-{i}',emb(str(text),model,tokenizer)[0].tolist()) )
    i+=1

In [220]:
embeddings_dataset[2][0]

'id-2'

In [221]:
import pinecone

In [222]:
pinecone.init(api_key="xxxxx", environment="xxxxx")

In [223]:
index = pinecone.Index("xxxxx")

In [224]:
import random
import itertools

def chunks(iterable, batch_size=100):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

vector_dim = 384
vector_count = 20000


# Upsert data with 100 vectors per upsert request
for ids_vectors_chunk in chunks(embeddings_dataset, batch_size=100):
    index.upsert(vectors=ids_vectors_chunk)  # Assuming `index` defined elsewhere

In [228]:
top = index.query(
    vector=[embeddings_dataset[0][1]],
    top_k = 50,
    include_values=True
)

In [238]:
top['matches'][3][['id','score']]

{'id': 'id-9540',
 'score': 0.728316665,
 'sparseValues': {},
 'values': [-0.00291832234,
            -0.0933647081,
            -0.0108658019,
            0.0136476522,
            0.021051934,
            -0.03038208,
            0.0872836187,
            -0.00678862305,
            0.0144521184,
            0.0159772653,
            -0.0409194827,
            -0.0421466269,
            -0.003516739,
            -0.0336416401,
            -0.0202765,
            0.0146671478,
            -0.0494322404,
            0.0153112449,
            -0.0283294488,
            -0.142628148,
            -0.141943321,
            -0.0662141219,
            0.0138939144,
            -0.0298838858,
            -0.0196002498,
            0.0428650901,
            0.033993613,
            -0.0196875408,
            0.0237415675,
            -0.0648139268,
            0.00558584603,
            0.0199467782,
            0.0285029393,
            0.0409231149,
            0.0268077012,
            0.09