In [69]:
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import json
import numpy as np
torch.cuda.is_available()

True

In [18]:
data = pd.read_csv("reviews/data.csv")


BERT_MODEL = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
model = AutoModel.from_pretrained(BERT_MODEL)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = model.to(device)

def get_embeddings(text: str):

    tokens = tokenizer.encode(text, return_tensors="pt", padding=True, truncation=True)
    input_args = tokens.to(device)

    with torch.no_grad():
        embedding = torch.mean(model(input_args).last_hidden_state, 1).detach().cpu().numpy()

    return embedding

vectors = np.vstack(data.review.apply(get_embeddings))


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
vectors

array([[-0.22426929, -0.38108316, -0.16628402, ...,  0.4690356 ,
        -0.19725499,  0.05362346],
       [-0.3083737 , -0.05212977, -0.16806833, ...,  0.67877436,
        -0.18201084, -0.04950444],
       [-0.34247276, -0.13616918, -0.55700266, ...,  0.6820839 ,
        -0.2980253 ,  0.08833668],
       ...,
       [-0.35085803, -0.19258827, -0.2842389 , ...,  0.70045114,
        -0.11468146, -0.0144497 ],
       [-0.21089825, -0.19063358, -0.39455807, ...,  1.0517277 ,
         0.08442838,  0.02160566],
       [-0.34841445, -0.3802213 , -0.42617595, ...,  0.5886858 ,
        -0.31345776,  0.72619945]], dtype=float32)

In [31]:
import requests

headers = {
    'Content-Type': 'application/json',
}

json_data = {
    'settings': {
        'knn': True,
        'knn.algo_param.ef_search': 100,
    },
    'mappings': {
        'properties': {
            'vector_data': {
                'type': 'knn_vector',
                'dimension': 768,
                'method': {
                    'name': 'hnsw',
                    'space_type': 'cosinesimil',
                    'engine': 'nmslib',
                },
            },
        },
    },
}

response = requests.put('https://search-elastica-xxx.eu-west-1.es.amazonaws.com/elastica/',
                        headers=headers, json=json_data, auth=('alvarocp', 'xxx'))

In [32]:
response.text


'{"acknowledged":true,"shards_acknowledged":true,"index":"elastica"}'

In [81]:
bulk = []

headers = {
    'Content-Type': 'application/json',
}

for i, vector in enumerate(vectors):
    index = json.dumps({ "index": { "_index": "elastica", "_id": f"{i}" } })
    index += "\n"
    bulk_file = index + json.dumps({ "vector_data": list(vector.astype(float)), "category": int(np.random.randint(0, 10, 1)[0]) })
    bulk_file += "\n"
    response = requests.post('https://search-elastica-xxx.eu-west-1.es.amazonaws.com/_bulk', headers=headers, data=bulk_file, auth=('alvarocp', 'xxx'))

    if response.status_code != 200:
        break




response.text


KeyboardInterrupt: 

In [103]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import boto3

session = boto3.Session(
    aws_access_key_id="xxx",
    aws_secret_access_key="xxxx",
)

host = "search-elastica-xxx.eu-west-1.es.amazonaws.com"
service = 'es'
credentials = session.get_credentials()
awsauth = AWSV4SignerAuth(credentials, "eu-west-1", service)

client = OpenSearch(
    hosts = [{'host': host, 'port': 443}],
    http_auth = awsauth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    pool_maxsize = 20
)


In [104]:
client.get(index="elastica", id="0")

{'_index': 'elastica',
 '_id': '0',
 '_version': 2,
 '_seq_no': 1,
 '_primary_term': 1,
 'found': True,
 '_source': {'vector_data': [-0.22426928579807281,
   -0.3810831606388092,
   -0.16628402471542358,
   0.1347707211971283,
   0.3719465732574463,
   -0.021697569638490677,
   -0.20046158134937286,
   0.3307285010814667,
   -0.27769920229911804,
   -0.2192084938287735,
   0.16504989564418793,
   0.5258060097694397,
   0.24465250968933105,
   0.3334207534790039,
   0.27607619762420654,
   -0.2972034513950348,
   0.7605559825897217,
   -0.5224223136901855,
   -0.19517506659030914,
   -0.07840140908956528,
   -0.24117442965507507,
   -0.033128637820482254,
   0.2409949153661728,
   0.03166266158223152,
   0.16067564487457275,
   0.4150790870189667,
   -0.49226993322372437,
   0.010445085354149342,
   -0.2685537338256836,
   -0.3034372627735138,
   -0.07982682436704636,
   0.33790507912635803,
   -0.15606217086315155,
   0.4880124628543854,
   -0.14308702945709229,
   0.21544113755226135,

In [238]:
query = {
    "size": 2,
    "query": {"knn": {"vector_data": {"vector": list(vector.astype(float)), "k": 2}}},
    "_source": False,
    "fields": ["id", "category"],
}
response = client.search(body=query, index="elastica")
response

{'took': 19,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 43, 'relation': 'eq'},
  'max_score': 0.99999976,
  'hits': [{'_index': 'elastica',
    '_id': '22351',
    '_score': 0.99999976,
    'fields': {'category': [5]}},
   {'_index': 'elastica',
    '_id': '3477',
    '_score': 0.979587,
    'fields': {'category': [2]}}]}}