In [None]:
!pip3 install transformers
!pip3 install nltk
!pip3 install torch
!pip3 install weaviate-client==3.2.2

## Import the transformer model and PyTorch

This example loads the `sentence-transformers/all-mpnet-base-v2` model through `AutoModel`, but any Hugging Face transformer encoder should work. Change `MODEL_NAME` in the cell below to use a different model.

## Initialize Weaviate Client
This assumes you have Weaviate running locally on `:8080`. Adjust URL accordingly. You could also enter the WCS URL here, for example, if you are running a WCS cloud instance instead of running Weaviate locally.

In [2]:
import torch
from transformers import AutoModel, AutoTokenizer
from nltk.tokenize import sent_tokenize
import weaviate

# Inference only: disable gradient tracking globally for every later forward pass.
torch.set_grad_enabled(False)

# updated to use a different model if desired
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
model = AutoModel.from_pretrained(MODEL_NAME)
# model.to('cuda') # uncomment to run the model on a GPU
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# initialize nltk (for tokenizing sentences)
import nltk
nltk.download('punkt')

# initialize weaviate client for importing and searching
# NOTE(review): hard-coded remote address; replace it with the URL of your own
# Weaviate instance (e.g. http://localhost:8080 for a local setup).
client = weaviate.Client("http://64.71.146.93:8080")

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 571/571 [00:00<00:00, 141kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 438M/438M [02:18<00:00, 3.17MB/s] 
Downloading: 100%|██████████| 363/363 [00:00<00:00, 179kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 499kB/s] 
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 482kB/s]  
Downloading: 100%|██████████| 239/239 [00:00<00:00, 118kB/s]
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\awx626582\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


## Load dataset from disk
Define a helper that loads the dataset from disk: a text file with one `question : SQL query` pair per line. The loader is specific to this file layout, so adjust it to match the structure of your own dataset.

In [3]:
import os
import random

# Parallel lists filled by read_dataset(): questions[i] is the natural-language
# question whose SQL translation is sqls[i].
questions = []
sqls = []

def read_dataset(path="human_questions_with_sql_queries.txt"):
    """Load question/SQL pairs from a text file into `questions` and `sqls`.

    Each non-blank line must look like ``<question> : <sql>``. The line is
    split on the FIRST " : " only, so a " : " inside the SQL is preserved,
    and the trailing newline is dropped instead of being stored with the SQL.

    Args:
        path: Location of the dataset file. Defaults to the file name the
            notebook has always used, so ``read_dataset()`` keeps working.

    Raises:
        ValueError: if a non-blank line does not contain the " : " separator.
        FileNotFoundError: if `path` does not exist.
    """
    # `with` guarantees the file handle is closed, even when parsing fails.
    with open(path, 'r', encoding="utf8") as file_data:
        for line_no, line in enumerate(file_data, start=1):
            line = line.rstrip("\n")
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no data.
                continue
            question, separator, sql = line.partition(" : ")
            if not separator:
                raise ValueError(
                    "{}:{}: expected '<question> : <sql>', got {!r}".format(path, line_no, line)
                )
            questions.append(question)
            sqls.append(sql)

# Fill the module-level `questions` and `sqls` lists from the dataset file.
read_dataset()
           


FileNotFoundError: [Errno 2] No such file or directory: 'human_questions_with_sql_queries.txt'

## Vectorize Dataset using BERT

The following helper functions vectorize the questions with the transformer model loaded above. `vectorize_questions` takes a list of texts and returns their vectors in the same order. Transformer models run much faster on GPUs; if you are using a CPU, this might take a while.

In [None]:
import time

def text2vec(text):
    """Embed `text` as one vector by mean-pooling the model's token embeddings.

    Args:
        text: A string, or a list of strings (e.g. the sentences of one
            question). Entries are padded to a common length and truncated
            to at most 500 tokens.

    Returns:
        A 1-D tensor: the average token embedding over all entries. Only
        positions whose attention-mask value is 1 are counted, so the padding
        added to shorter sentences no longer dilutes the result. For a single
        unpadded string this equals the plain mean over all tokens.
    """
    tokens_pt = tokenizer(text, padding=True, truncation=True, max_length=500, add_special_tokens = True, return_tensors="pt")
    # tokens_pt.to('cuda') # uncomment (before the forward pass) when the model runs on a GPU
    outputs = model(**tokens_pt)

    # outputs[0] holds the per-token embeddings; presumably shaped
    # (entries, tokens, hidden) — the pooling below reduces the first two axes.
    token_embeddings = outputs[0]
    mask = tokens_pt["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)

    summed = (token_embeddings * mask).sum(dim=(0, 1))
    real_token_count = mask.sum().clamp(min=1)  # guard against division by zero
    return (summed / real_token_count).detach()

def vectorize_questions(posts=None):
    """Vectorize every text in `posts` and return the vectors in input order.

    Each text is split into sentences with `sent_tokenize`, and the sentence
    list is embedded into a single vector by `text2vec`. Progress is printed
    every 500 items, followed by a final summary with the elapsed time.

    Args:
        posts: Sequence of strings to embed. Defaults to an empty sequence.

    Returns:
        A list with one vector per input text; element i belongs to posts[i].
    """
    # Avoid a mutable default argument; None stands for "no posts".
    posts = [] if posts is None else posts

    post_vectors = []
    before = time.time()
    for i, post in enumerate(posts):
        # An empty/blank text yields no sentences; fall back to the raw text
        # so text2vec is never handed an empty batch.
        sentences = sent_tokenize(post) or [post]
        post_vectors.append(text2vec(sentences))
        if i % 500 == 0 and i != 0:
            print("So far {} objects vectorized in {}s".format(i, time.time()-before))
    after = time.time()

    print("Vectorized {} items in {}s".format(len(posts), after-before))

    return post_vectors

### Run everything we have so far

It is now time to put the functions defined so far to use. The cells further below vectorize every question loaded from disk (not a random sample) and then import the results into Weaviate.

## Initialize Weaviate

Once we have vectors, we can import the questions, their SQL queries and the vectors into Weaviate, so that we can then search through them.

### Init a simple schema
Our schema is very simple: a single object class that holds question/SQL pairs. The class has two properties, "question" and "sql", both of type "string". Its vectorizer is set to "none" because we supply our own vectors.

Each class in schema creates one index, so by running the below we tell weaviate to create one brand new vector index waiting for us to import data.

In [None]:
import json

# Name of the Weaviate class (collection) that stores the question/SQL pairs.
# It must match the class name used by the import and search cells, which
# address 'SQLQuestions'.
CLASS_NAME = "SQLQuestions"  # <= Change to your class name - it will be your collection

# Show the schema currently held by the server before touching it.
schema = client.schema.get()
print(json.dumps(schema, indent=4))

class_obj = {
    "class": CLASS_NAME,
    "description": "SQL to Text mapping",
    # Vectors are computed in this notebook and supplied on import.
    "vectorizer": "none",
    "properties": [
        {
            "dataType": [
                "string"
            ],
            "description": "question",
            "name": "question"
        },
        {
            "dataType": [
                "string"
            ],
            "description": "sql",
            "name": "sql"
        }
    ]
}

# Start from a clean class: drop it only when it already exists, so the cell
# also runs against a server that has never seen the class.
existing_classes = {cls["class"] for cls in (schema.get("classes") or [])}
if CLASS_NAME in existing_classes:
    client.schema.delete_class(CLASS_NAME)
client.schema.create_class(class_obj)

{
    "classes": [
        {
            "class": "Weak_Sim_Intent",
            "description": "Add weak intent similarity",
            "invertedIndexConfig": {
                "bm25": {
                    "b": 0.75,
                    "k1": 1.2
                },
                "cleanupIntervalSeconds": 60,
                "stopwords": {
                    "additions": null,
                    "preset": "en",
                    "removals": null
                }
            },
            "properties": [
                {
                    "dataType": [
                        "int"
                    ],
                    "description": "Frame_nmb",
                    "name": "frame_nmbr"
                },
                {
                    "dataType": [
                        "string"
                    ],
                    "description": "movie_id",
                    "name": "movie_id",
                    "tokenization": "word"
                },
           

In [None]:

def import_questions_with_vectors(questions, sqls, vectors, client):
    """Create one 'SQLQuestions' object per question/SQL pair with its vector.

    The import is best-effort: a failure on one object is reported and the
    remaining objects are still imported.

    Args:
        questions: Sequence of question texts.
        sqls: Sequence of SQL strings; sqls[i] belongs to questions[i].
        vectors: Sequence of vectors; vectors[i] belongs to questions[i].
        client: Weaviate client whose `data_object.create` is called.

    Raises:
        ValueError: if the three sequences do not have the same length.
    """
    if not (len(questions) == len(sqls) == len(vectors)):
        raise ValueError(
            "len of questions ({}), sqls ({}) and vectors ({}) does not match".format(
                len(questions), len(sqls), len(vectors)
            )
        )

    for i, (question, sql, vector) in enumerate(zip(questions, sqls, vectors)):
        try:
            client.data_object.create(
                data_object={"question": question, "sql": sql},
                class_name='SQLQuestions',
                vector=vector
            )
        except Exception as e:
            # Keep going, but say which object failed so it can be retried.
            print("Failed to import question #{}: {}".format(i, e))

In [None]:
def search(query="", limit=15):
    """Print the SQL of the stored questions closest to `query`.

    The query text is embedded with `text2vec` and sent to Weaviate as a
    near-vector search over the 'SQLQuestions' class.

    Args:
        query: Natural-language question to look up.
        limit: Maximum number of results to print.

    Raises:
        RuntimeError: if Weaviate answers with a GraphQL "errors" payload.
    """
    near_vec = {"vector": text2vec(query)}
    res = client \
        .query.get("SQLQuestions", ["sql", "_additional {certainty}"]) \
        .with_near_vector(near_vec) \
        .with_limit(limit) \
        .do()

    # Report a failed query explicitly instead of failing on the lookups below.
    if "errors" in res:
        raise RuntimeError("Weaviate query failed: {}".format(res["errors"]))

    for post in res["data"]["Get"]["SQLQuestions"]:
        print(post["sql"])

In [None]:
# Embed every loaded question; vectors[i] corresponds to questions[i].
vectors = vectorize_questions(questions)

So far 500 objects vectorized in 63.61125946044922s
So far 1000 objects vectorized in 122.78761100769043s
So far 1500 objects vectorized in 181.30901956558228s
So far 2000 objects vectorized in 239.4066243171692s
So far 2500 objects vectorized in 297.61777806282043s
So far 3000 objects vectorized in 355.21032333374023s
Vectorized 3044 items in 359.9342269897461s


In [None]:
# Upload each question/SQL pair to Weaviate together with its precomputed vector.
import_questions_with_vectors(questions, sqls, vectors, client)

In [None]:
# Example query: prints the stored SQL of up to 15 nearest questions.
search("all incompleted tasks by Aharon", 15)


SELECT COUNT(*), Users.Name FROM Tasks AS t join Users AS u on t.AssigneeId = u.Id WHERE u.Name LIKE '%Aharon%' and t.Completed = 0 group by u.Name;

SELECT COUNT(*), Users.Name FROM Tasks AS t join Users AS u on t.AssigneeId = u.Id WHERE u.Name LIKE '%Aharon%' and t.Completed = 0 group by u.Name;

SELECT COUNT(*), Users.Name FROM Tasks AS tasks inner join Users AS users on tasks.AssigneeId = users.Id WHERE Users.Name LIKE '%aharon%' and tasks.Completed = 0 group by Users.Name;

SELECT COUNT(*), Users.Name FROM Tasks AS tasks inner join Users AS users on tasks.AssigneeId = users.Id WHERE Users.Name LIKE '%aharon%' and tasks.Completed = 0 group by Users.Name;

SELECT COUNT(*), Users.Name FROM Tasks AS t join Users AS u on t.AssigneeId = u.Id WHERE u.Name LIKE '%Aharon%' and t.Completed = 0 group by u.Name;

SELECT COUNT(*), Users.Name FROM Tasks AS t join Users AS u on t.AssigneeId = u.Id WHERE u.Name LIKE '%Aharon%' and t.Completed = 0 group by u.Name;

SELECT t.Name AS Name_of_task, u

In [None]:
#
