## Download mpt-7B model

We first install the packages needed to run inference with MPT-7B.

In [0]:
# Download and install the CUDA 11.7 development packages for cuSPARSE, cuBLAS and
# cuSOLVER (Ubuntu 22.04 / x86_64 builds from NVIDIA's package repository). The
# commands are chained with `&&`, so the chain stops at the first failing step.
# NOTE(review): presumably needed to build the flash-attn / triton kernels installed below — confirm.
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \
  dpkg -i libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \
  wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \
  dpkg -i libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \
  wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \
  dpkg -i libcusolver-dev-11-7_11.4.0.1-1_amd64.deb
# Python dependencies; only flash-attn and triton are pinned to exact versions.
%pip install ninja
%pip install langchain chromadb einops flash-attn==v1.0.3.post0 triton==2.0.0.dev20221202

Import the necessary modules

In [0]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import pandas as pd

Load the MPT-7B Instruct model from the Hugging Face Hub

In [0]:
# The tokenizer is loaded from the GPT-NeoX-20B repo rather than from the MPT repo.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

# Fetch the model config on its own first, so the attention implementation can be
# switched to the triton kernel before the weights are instantiated.
config = AutoConfig.from_pretrained("mosaicml/mpt-7b-instruct", trust_remote_code=True)
config.attn_config["attn_impl"] = "triton"
# Left disabled: config.update({"max_seq_len": 4096})

# Load the weights in bfloat16 using the patched config ...
model = AutoModelForCausalLM.from_pretrained(
    "mosaicml/mpt-7b-instruct",
    config=config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
# ... and move the model onto the first CUDA device.
model = model.to(device="cuda:0")



In [0]:
# Pad prompts on the left.
# NOTE(review): presumably so that generated tokens directly follow the real prompt
# tokens when several prompts are batched — confirm.
tokenizer.padding_side = "left"

# Padding needs a pad token, and `tokenizer.pad_token_id` is also handed to
# `generate` later on. If the tokenizer ships without one, reuse the EOS token;
# a tokenizer that already defines a pad token is left untouched.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

In [0]:
# Sanity check of the text input: one schema-description request held in a
# single-column ("text") DataFrame.
import pandas as pd

payload_pd = pd.DataFrame(
    {
        "text": [
            "Generate description of following schema: tbl[cars] cols [price, brand, model, year, title_status, mileage, color, vin, lot, state, country, condition]"
        ]
    }
)
payload_pd

In [0]:
# Pull the first column of the payload out as a plain Python list of prompts.
first_column = payload_pd.columns[0]
instruction = payload_pd[first_column].tolist()
instruction

Create the different components necessary to perform inference with the model

In [0]:
# Markers that delimit the sections of the instruction-following prompt.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# Prompt template: intro, instruction marker, an "{instruction}" placeholder that is
# filled in later with str.format, then the response marker and a final newline.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    [INTRO_BLURB, INSTRUCTION_KEY, "{instruction}", RESPONSE_KEY, ""]
)

In [0]:
# Fill the prompt template with the first test instruction and echo the resulting
# prompt string as the cell output.
s = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction[0])
s

In [0]:
# Sampling settings forwarded to `model.generate`.
generate_kwargs = {
    "temperature": 0.5,
    # Nucleus sampling threshold: a cumulative probability, so it must lie in (0, 1].
    # A value such as 92 is outside that range and does not restrict sampling as intended.
    "top_p": 0.92,
    "top_k": 0,
    "max_new_tokens": 512,
    "use_cache": True,
    "do_sample": True,
    "eos_token_id": tokenizer.eos_token_id,
    # Fall back to EOS when the tokenizer defines no pad token, rather than passing None.
    "pad_token_id": (
        tokenizer.pad_token_id
        if tokenizer.pad_token_id is not None
        else tokenizer.eos_token_id
    ),
    "repetition_penalty": 1.1,  # 1.0 means no penalty, > 1.0 means penalty, 1.2 from CTRL paper
}

In [0]:
# Tokenize the prompt and put the ids on the same device as the model.
input_ids = tokenizer(s, return_tensors="pt").input_ids.to(model.device)

# Work on a shallow copy so the shared settings dict is never touched.
gkw = dict(generate_kwargs)
with torch.no_grad():
    output_ids = model.generate(input_ids, **gkw)

# Drop the leading prompt tokens and keep only what was newly generated.
new_tokens = output_ids[0, input_ids.shape[1]:]
output_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
output_text

In [0]:
#Wrap the above logic into a function 
#variable 'instruction' is what is asked
def generate_response(tokenizer, model, instruction):
  """Generate the model's reply to a single instruction.

  The instruction is wrapped in the instruction/response prompt template,
  tokenized, passed to ``model.generate`` with sampling enabled, and only the
  newly generated tokens (everything after the prompt) are decoded.

  Args:
    tokenizer: Callable tokenizer returning an object with ``input_ids`` when
      called with ``return_tensors="pt"``; must also expose ``eos_token_id``,
      ``pad_token_id`` and ``decode``.
    model: Model exposing ``device`` and ``generate``.
    instruction: The task or question to put into the prompt.

  Returns:
    The decoded text of the newly generated tokens, with special tokens skipped.
  """
  INSTRUCTION_KEY = "### Instruction:"
  RESPONSE_KEY = "### Response:"
  INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

  # Assemble the prompt line by line. An indented triple-quoted template inside the
  # function would leak the function's own indentation into every prompt line.
  s = "\n".join([INTRO_BLURB, INSTRUCTION_KEY, f"{instruction}", RESPONSE_KEY, ""])

  # Fall back to EOS when the tokenizer defines no pad token, rather than passing None.
  pad_token_id = tokenizer.pad_token_id
  if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

  generate_kwargs = {
    "temperature": 0.5,
    "top_p": 0.92,  # nucleus-sampling probability mass; must lie in (0, 1]
    "top_k": 0,
    "max_new_tokens": 512,
    "use_cache": True,
    "do_sample": True,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": pad_token_id,
    "repetition_penalty": 1.1,  # 1.0 means no penalty, > 1.0 means penalty, 1.2 from CTRL paper
  }

  input_ids = tokenizer(s, return_tensors="pt").input_ids
  input_ids = input_ids.to(model.device)
  with torch.no_grad():
    output_ids = model.generate(input_ids, **generate_kwargs)

  # Slice the output_ids tensor to get only new tokens
  new_tokens = output_ids[0, len(input_ids[0]):]
  return tokenizer.decode(new_tokens, skip_special_tokens=True)

Testing the inference function

In [0]:
# Smoke-test the helper with a short free-form question; the returned text is
# shown as the cell output.
generate_response(tokenizer, model, "What is SQL. Describe briefly")

## Creating Catalog, Database and example 'gold' tables
For the purpose of this demo, I will create a completely new gold layer with denormalized tables that would be representative of a toy dataset.

In [0]:
%sql
-- Create the catalog and database for the demo's gold tables if they do not exist
-- yet, and make them the current catalog / database for the following cells.
-- NOTE(review): the catalog is spelled METSQL and metsql; this relies on identifiers
-- being resolved case-insensitively — confirm.
CREATE CATALOG IF NOT EXISTS METSQL;
USE CATALOG metsql;
CREATE DATABASE IF NOT EXISTS GOLDDB;
USE GOLDDB;

Create a DataFrame holding, for each table, its schema metadata, its name, and a description generated by MPT-7B from the schema. These descriptions will later be indexed in a vector database so that a natural-language question can be matched to the relevant table.

In [0]:
# List tables in the current database
tables = spark.sql("SHOW TABLES")

# Parallel lists: entry i of each list refers to the same table
table_metadata = []
table_names = []
table_description = []

# For every table, collect its column names, build a compact schema string and ask
# the model for a short description of it
for row in tables.collect():
    table_name = row.tableName
    columns = spark.sql(f"DESCRIBE {table_name}")

    column_names = []
    for column in columns.collect():
        # DESCRIBE can append a trailer after the real columns (a blank row and/or
        # "# ..." header rows such as "# Partition Information", followed by the
        # partition columns again). Stop at the trailer so it is not mistaken for
        # column names.
        if not column.col_name or column.col_name.startswith("#"):
            break
        column_names.append(column.col_name)

    metadata_str = f"schema: tbl[{table_name}] cols [{', '.join(column_names)}]"
    table_metadata.append(metadata_str)
    table_names.append(table_name)
    description = generate_response(tokenizer, model, "Generate a brief description of the following "+metadata_str)
    table_description.append(description)

# Print the schema string, name and generated description of each table
for meta, name, desc in zip(table_metadata, table_names, table_description):
    print(meta, name, desc, sep="\n")

In [0]:
# Assemble the three parallel lists into one frame, one row per table
df = pd.DataFrame(
    dict(
        table_metadata=table_metadata,
        table_names=table_names,
        table_description=table_description,
    )
)

# Show the frame using the notebook's display helper
display(df)

In [0]:
# Persist the metadata frame as a table in the current database. Overwrite mode
# keeps this cell re-runnable: with the default save mode a second run fails
# because the table already exists.
spark.createDataFrame(df).write.mode("overwrite").saveAsTable('table_metadata')

In [0]:
# Pair every table name with its schema string as a two-element list [name, schema]
name_schema = [list(pair) for pair in zip(table_names, table_metadata)]

# Rebuild the frame, this time with the [name, schema] pairs in the metadata column
df = pd.DataFrame(
    dict(
        table_metadata=name_schema,
        table_names=table_names,
        table_description=table_description,
    )
)

# Show the frame using the notebook's display helper
display(df)

## Now save this in a table in a new database, METABASE

In [0]:
%sql
-- Create the METABASE database if it does not exist yet and make it the current
-- database, so the table written by the next cell lands there.
CREATE DATABASE IF NOT EXISTS METABASE;
USE METABASE;

In [0]:
# Persist the metadata frame as a table in the current database. Overwrite mode
# keeps this cell re-runnable: with the default save mode a second run fails
# because the table already exists.
spark.createDataFrame(df).write.mode("overwrite").saveAsTable('metadata')