In [None]:
# Install the library that contains our SQL dataset
try:
  from datasets import load_dataset
except:
  !pip install datasets
  from datasets import load_dataset

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# Fetch the text-to-SQL dataset by its Hugging Face Hub identifier
SQL_DATASET_ID = "gretelai/synthetic_text_to_sql"
ds = load_dataset(SQL_DATASET_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.18k [00:00<?, ?B/s]

(…)nthetic_text_to_sql_train.snappy.parquet:   0%|          | 0.00/32.4M [00:00<?, ?B/s]

(…)ynthetic_text_to_sql_test.snappy.parquet:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5851 [00:00<?, ? examples/s]

In [None]:
# Load the WebQuestions dataset, which supplies our non-SQL prompts
import json
import os

# Clone the repository
!git clone https://github.com/brmson/dataset-factoid-webquestions.git

# Path to the train split
data_path = 'dataset-factoid-webquestions/main/trainmodel.json'

# Load the WebQuestions data
with open(data_path, 'r') as file:
    data = json.load(file)

# Extract the questions
non_sql_prompts = [item['qText'] for item in data]
print("Number of non-SQL examples loaded:", len(non_sql_prompts))


Cloning into 'dataset-factoid-webquestions'...
remote: Enumerating objects: 708, done.[K
remote: Total 708 (delta 0), reused 0 (delta 0), pack-reused 708 (from 1)[K
Receiving objects: 100% (708/708), 20.75 MiB | 12.10 MiB/s, done.
Resolving deltas: 100% (473/473), done.
Number of non-SQL examples loaded: 2834


### Create our Binary Classifier
##### This classifier first determines whether a natural-language prompt is a SQL-style request or not

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# SQL dataset (label 1)
# list(...) guarantees a plain Python list, so the `+` concatenation below keeps
# working even if the dataset column is returned as a non-list sequence
# (NOTE(review): newer `datasets` releases appear to return a lazy column object — confirm)
sql_prompts = list(ds['train']['sql_prompt'])
sql_labels = [1] * len(sql_prompts)

# Non-SQL dataset (label 0)
non_sql_labels = [0] * len(non_sql_prompts)

# Combine data
all_prompts = sql_prompts + non_sql_prompts
all_labels = sql_labels + non_sql_labels

# Split data into train and test sets.
# The non-SQL class is tiny compared to the SQL class, so stratify on the labels
# to keep the same class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    all_prompts, all_labels, test_size=0.2, random_state=42, stratify=all_labels
)

# Vectorize the text data (fit on the training split only, then reuse for test)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the classifier
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

# Evaluate the model on the held-out split
y_pred = classifier.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=["Not SQL", "SQL"]))


              precision    recall  f1-score   support

     Not SQL       1.00      0.85      0.92       547
         SQL       1.00      1.00      1.00     20020

    accuracy                           1.00     20567
   macro avg       1.00      0.92      0.96     20567
weighted avg       1.00      1.00      1.00     20567



In [None]:
# Let's try the binary classifier on a handful of hand-written prompts

# Prompts to test
example_prompts = [
    "Can you give me the total sales for last year?",  # SQL
    "What is the capital of France?",  # Non-SQL
    "Show me all employees who joined in 2023.",  # SQL
    "Who won the 2022 World Cup?",  # Non-SQL
    "What is the temperature in New York?", # SQL
    "Show me the operating revenue from FY2023?", # SQL
    "Where is Ann Arbor, MI?", # Non-SQL
]

# Reuse the already-fitted vectorizer so the features line up with the classifier
example_tfidf = vectorizer.transform(example_prompts)

# One prediction (0 or 1) per prompt
predictions = classifier.predict(example_tfidf)

# Report each prompt together with a readable version of its prediction
for idx, prompt in enumerate(example_prompts):
    verdict = 'SQL' if predictions[idx] == 1 else 'Not SQL'
    print(f"Prompt: {prompt}\nPrediction: {verdict}\n")

Prompt: Can you give me the total sales for last year?
Prediction: SQL

Prompt: What is the capital of France?
Prediction: Not SQL

Prompt: Show me all employees who joined in 2023.
Prediction: SQL

Prompt: Who won the 2022 World Cup?
Prediction: Not SQL

Prompt: What is the temperature in New York?
Prediction: SQL

Prompt: Show me the operating revenue from FY2023?
Prediction: SQL

Prompt: Where is Ann Arbor, MI?
Prediction: Not SQL



In [None]:
# Saving the Binary Classifier to a pickle file
import pickle

# Bundle the binary classifier with the vectorizer it was trained on
binary_objects_to_save = {}
binary_objects_to_save["binary_classifier"] = classifier  # SQL-vs-not-SQL model
binary_objects_to_save["vectorizer"] = vectorizer  # text -> TF-IDF features for that model

# Write the bundle to a new pickle file
binary_pickle_path = "binary_sql_classifier.pkl"
with open(binary_pickle_path, "wb") as pickle_file:
    pickle.dump(binary_objects_to_save, pickle_file)

print("Binary classifier and vectorizer saved to 'binary_sql_classifier.pkl'.")

Binary classifier and vectorizer saved to 'binary_sql_classifier.pkl'.


# Multi-class Intent Classifier

In [None]:
# Import necessary libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Extract the features and labels
# list(...) hands scikit-learn a plain Python list regardless of how the
# dataset library represents a column
sql_prompts = list(ds['train']['sql_prompt'])  # Input feature
domain_descriptions = ds['train']['domain_description']  # Label 1
sql_complexities = ds['train']['sql_complexity']  # Label 2
sql_task_types = ds['train']['sql_task_type']  # Label 3

# Encode the target labels for each model (one encoder per target so each can
# map its own predictions back to label names)
label_encoder_domain = LabelEncoder()
y_domain = label_encoder_domain.fit_transform(domain_descriptions)

label_encoder_complexity = LabelEncoder()
y_complexity = label_encoder_complexity.fit_transform(sql_complexities)

label_encoder_task_type = LabelEncoder()
y_task_type = label_encoder_task_type.fit_transform(sql_task_types)

# Split the prompts and all three targets in ONE call: every array is cut with
# the same shuffled indices, so the rows stay aligned by construction instead of
# relying on three separate calls sharing a random_state
(
    X_train, X_test,
    y_train_domain, y_test_domain,
    y_train_complexity, y_test_complexity,
    y_train_task_type, y_test_task_type,
) = train_test_split(
    sql_prompts, y_domain, y_complexity, y_task_type,
    test_size=0.3, random_state=42
)

# Vectorize the input features (fit on the training split only)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train classifiers for Domain and Task Type.
# The `multi_class` argument is deprecated in recent scikit-learn releases; with
# the lbfgs solver a multinomial model is already what gets fitted for
# multi-class targets, so the argument is simply dropped.
# NOTE(review): these use the default iteration limit, while the combined
# pipeline later in this notebook passes max_iter=500 — consider aligning.
classifier_domain = LogisticRegression(solver='lbfgs')
classifier_domain.fit(X_train_tfidf, y_train_domain)

classifier_task_type = LogisticRegression(solver='lbfgs')
classifier_task_type.fit(X_train_tfidf, y_train_task_type)

# Train Random Forest for SQL Complexity.
# n_jobs=-1 builds the trees on all available cores; with a fixed random_state
# the fitted forest does not depend on the number of jobs.
classifier_complexity_rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
classifier_complexity_rf.fit(X_train_tfidf, y_train_complexity)

# Evaluate the models on the held-out split
y_pred_domain = classifier_domain.predict(X_test_tfidf)
y_pred_complexity_rf = classifier_complexity_rf.predict(X_test_tfidf)
y_pred_task_type = classifier_task_type.predict(X_test_tfidf)

print("Domain Description Classification Report:")
print(classification_report(y_test_domain, y_pred_domain, target_names=label_encoder_domain.classes_))

print("\nSQL Complexity Classification Report (Random Forest):")
print(classification_report(y_test_complexity, y_pred_complexity_rf, target_names=label_encoder_complexity.classes_))

print("\nSQL Task Type Classification Report:")
print(classification_report(y_test_task_type, y_pred_task_type, target_names=label_encoder_task_type.classes_))




Domain Description Classification Report:
                                                                                                                                                                     precision    recall  f1-score   support

                                                                          AI data on algorithmic fairness, AI safety, explainable AI, and creative AI applications.       0.95      0.93      0.94       356
                           Agricultural innovation metrics, rural infrastructure projects, community development initiatives, and economic diversification efforts.       0.92      0.91      0.91       411
                                                 Aircraft manufacturing data, satellite deployment projects, flight safety records, and space exploration research.       0.83      0.71      0.77       283
                                                                            Animal population data, habitat preservation efforts, and com

In [None]:
# Try the three multi-class models on a few hand-written prompts
example_prompts = [
    "Find the top 5 highest-selling products.",
    "List all employees who joined in the last 2 years.",
    "Show the average test scores for students in science subjects.",
    "What is the total revenue for the previous fiscal year?",
    "Find the top 5 departments with the highest revenue generation in the last 5 years.",
    "Show all invoices that were made during Christmas Eve, Chrsitmas Day, Boxing Day, New Year's Even, New Years' day",
    "Find the top 3 artists by total albums",
    "List all the countries our customers are from",
]

# Turn the prompts into TF-IDF features with the fitted vectorizer
example_tfidf = vectorizer.transform(example_prompts)

# Each model predicts an encoded class id per prompt
predictions_domain = classifier_domain.predict(example_tfidf)
predictions_complexity = classifier_complexity_rf.predict(example_tfidf)  # Random Forest model
predictions_task_type = classifier_task_type.predict(example_tfidf)

# Decode every prediction array back to label names in one go
domain_names = label_encoder_domain.inverse_transform(predictions_domain)
complexity_names = label_encoder_complexity.inverse_transform(predictions_complexity)
task_type_names = label_encoder_task_type.inverse_transform(predictions_task_type)

# Print each prompt followed by its three predicted labels
for prompt, domain_name, complexity_name, task_type_name in zip(
    example_prompts, domain_names, complexity_names, task_type_names
):
    print(f"Prompt: {prompt}")
    print(f"Predicted Domain Description: {domain_name}")
    print(f"Predicted SQL Complexity: {complexity_name}")
    print(f"Predicted SQL Task Type: {task_type_name}\n")


Prompt: Find the top 5 highest-selling products.
Predicted Domain Description: Retail data on circular supply chains, ethical labor practices, product transparency, and consumer education.
Predicted SQL Complexity: basic SQL
Predicted SQL Task Type: analytics and reporting

Prompt: List all employees who joined in the last 2 years.
Predicted Domain Description: Employee demographics, talent acquisition data, diversity and inclusion metrics, and training program statistics.
Predicted SQL Complexity: basic SQL
Predicted SQL Task Type: analytics and reporting

Prompt: Show the average test scores for students in science subjects.
Predicted Domain Description: Education data on student mental health, teacher professional development, open pedagogy, and lifelong learning.
Predicted SQL Complexity: basic SQL
Predicted SQL Task Type: analytics and reporting

Prompt: What is the total revenue for the previous fiscal year?
Predicted Domain Description: Restaurant revenue data, menu engineering 

In [None]:
# Persist the multi-class models together with what is needed to use them:
# the fitted vectorizer and the label encoders that turn class ids into names

# Start with the three fitted models ...
objects_to_save = {
    "classifier_domain": classifier_domain,
    "classifier_complexity": classifier_complexity_rf,
    "classifier_task_type": classifier_task_type,
}
# ... then the vectorizer ...
objects_to_save["vectorizer"] = vectorizer
# ... and finally one label encoder per target
objects_to_save.update({
    "label_encoder_domain": label_encoder_domain,
    "label_encoder_complexity": label_encoder_complexity,
    "label_encoder_task_type": label_encoder_task_type,
})

# Export the bundle as a pickle file
multiclass_pickle_path = "multiclass_intent_classifier.pkl"
with open(multiclass_pickle_path, "wb") as pickle_file:
    pickle.dump(objects_to_save, pickle_file)

print("Multiclass intent classifier and related objects saved to 'multiclass_intent_classifier.pkl'.")

Multiclass intent classifier and related objects saved to 'multiclass_intent_classifier.pkl'.


# Our new combined Intent Classifier

In [None]:
# Non-SQL prompts were loaded earlier in the notebook (label 0)
non_sql_labels = [0] * len(non_sql_prompts)  # Label 0 for non-SQL
print("Number of non-SQL examples loaded:", len(non_sql_prompts))

# Load SQL Prompts
# list(...) guarantees a plain Python list, so the `+` concatenation below keeps
# working even if the dataset column is returned as a non-list sequence
# (NOTE(review): newer `datasets` releases appear to return a lazy column object — confirm)
sql_prompts = list(ds['train']['sql_prompt'])  # SQL dataset loaded in context
sql_labels = [1] * len(sql_prompts)  # Label 1 for SQL
print("Number of SQL examples loaded:", len(sql_prompts))

# Combine SQL and Non-SQL Data
all_prompts = sql_prompts + non_sql_prompts
all_labels = sql_labels + non_sql_labels

# Train-Test Split for Binary Classification.
# The non-SQL class is tiny compared to the SQL class, so stratify on the labels
# to keep the same class ratio in both splits.
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    all_prompts, all_labels, test_size=0.2, random_state=42, stratify=all_labels
)

# Vectorizer — fitted once on the binary training split and shared by every
# model below, so a single transform serves the whole pipeline at prediction time
vectorizer = TfidfVectorizer()
X_train_binary_tfidf = vectorizer.fit_transform(X_train_binary)
X_test_binary_tfidf = vectorizer.transform(X_test_binary)

# Train Binary Classifier
binary_classifier = LogisticRegression()
binary_classifier.fit(X_train_binary_tfidf, y_train_binary)

# Multiclass Classifiers - if it passes binary classifier
domain_descriptions = ds['train']['domain_description']
sql_complexities = ds['train']['sql_complexity']
sql_task_types = ds['train']['sql_task_type']

label_encoder_domain = LabelEncoder()
y_domain = label_encoder_domain.fit_transform(domain_descriptions)

label_encoder_complexity = LabelEncoder()
y_complexity = label_encoder_complexity.fit_transform(sql_complexities)

label_encoder_task_type = LabelEncoder()
y_task_type = label_encoder_task_type.fit_transform(sql_task_types)

# Split for Multiclass Classification: one call cuts the prompts and all three
# targets with the same shuffled indices, so the rows stay aligned by
# construction instead of relying on three calls sharing a random_state
(
    X_train_multi, X_test_multi,
    y_train_domain, y_test_domain,
    y_train_complexity, y_test_complexity,
    y_train_task_type, y_test_task_type,
) = train_test_split(
    sql_prompts, y_domain, y_complexity, y_task_type,
    test_size=0.2, random_state=42
)

# Transform for Multiclass (reuses the vectorizer fitted above; no refit)
X_train_multi_tfidf = vectorizer.transform(X_train_multi)
X_test_multi_tfidf = vectorizer.transform(X_test_multi)

# Train Multiclass Classifiers.
# The `multi_class` argument is deprecated in recent scikit-learn releases; with
# the lbfgs solver a multinomial model is already what gets fitted for
# multi-class targets, so the argument is simply dropped.
classifier_domain = LogisticRegression(solver='lbfgs', max_iter=500)
classifier_domain.fit(X_train_multi_tfidf, y_train_domain)

classifier_complexity = LogisticRegression(solver='lbfgs', max_iter=500)
classifier_complexity.fit(X_train_multi_tfidf, y_train_complexity)

classifier_task_type = LogisticRegression(solver='lbfgs', max_iter=500)
classifier_task_type.fit(X_train_multi_tfidf, y_train_task_type)

# Combined classification pipeline
def classify_intent(prompt):
    """Classify a single prompt as SQL / not SQL and, if SQL, describe it.

    Uses the notebook-level `vectorizer`, `binary_classifier`, the three
    `classifier_*` models and their matching `label_encoder_*` objects.

    Args:
        prompt: One natural-language prompt (a single string).

    Returns:
        ``{"Intent": "Not SQL", "Details": None}`` when the binary classifier
        predicts 0; otherwise a dict with ``"Intent": "SQL"`` plus the decoded
        ``"Domain"``, ``"Complexity"`` and ``"Task Type"`` labels.
    """
    # The vectorizer works on a batch, so wrap the prompt in a one-item list
    features = vectorizer.transform([prompt])

    # Gate: stop early when the binary model says this is not a SQL request
    if binary_classifier.predict(features)[0] == 0:
        return {"Intent": "Not SQL", "Details": None}

    def _decoded_label(model, encoder):
        # Predict the encoded class for the prompt and map it back to its name
        return encoder.inverse_transform(model.predict(features))[0]

    # SQL request: run all three multi-class models on the same features
    return {
        "Intent": "SQL",
        "Domain": _decoded_label(classifier_domain, label_encoder_domain),
        "Complexity": _decoded_label(classifier_complexity, label_encoder_complexity),
        "Task Type": _decoded_label(classifier_task_type, label_encoder_task_type),
    }


Number of non-SQL examples loaded: 2834
Number of SQL examples loaded: 100000




In [None]:
# Exercise the combined pipeline on a mix of hand-written prompts
example_prompts = [
    "Can you give me the total sales for last year?",
    "What is the capital of France?",
    "Show me all employees who joined in 2023.",
    "Who won the last Super Bowl?",
    "What is the weather in New York today?",
    "List all employees in the finance department.",
    "What size shoes does Shaq wear?",
    "Do you have emotions?",
    "Calculate the total revenue of the HR department in FY2023.",
    "When is it suppose to rain next?",
]

# Classify each prompt (lazily, one at a time) and print it next to its result
for prompt, result in zip(example_prompts, map(classify_intent, example_prompts)):
    print(f"Prompt: {prompt}\nResult: {result}\n")

Prompt: Can you give me the total sales for last year?
Result: {'Intent': 'SQL', 'Domain': 'Clinical trial outcomes, drug approval data, sales figures, R&D expenditures, and market access strategies.', 'Complexity': 'basic SQL', 'Task Type': 'analytics and reporting'}

Prompt: What is the capital of France?
Result: {'Intent': 'Not SQL', 'Details': None}

Prompt: Show me all employees who joined in 2023.
Result: {'Intent': 'SQL', 'Domain': 'Employee demographics, talent acquisition data, diversity and inclusion metrics, and training program statistics.', 'Complexity': 'basic SQL', 'Task Type': 'data retrieval'}

Prompt: Who won the last Super Bowl?
Result: {'Intent': 'Not SQL', 'Details': None}

Prompt: What is the weather in New York today?
Result: {'Intent': 'SQL', 'Domain': 'Community health statistics, infectious disease tracking data, healthcare access metrics, and public health policy analysis.', 'Complexity': 'basic SQL', 'Task Type': 'analytics and reporting'}

Prompt: List all 

In [None]:
import pickle

# Gather every piece of the pipeline as (name, object) pairs, in the order they
# should appear in the saved dictionary
combined_components = [
    ("binary_classifier", binary_classifier),  # SQL vs Non-SQL gate
    ("vectorizer", vectorizer),  # feature extraction shared by all models
    ("classifier_domain", classifier_domain),  # multi-class: domain
    ("classifier_complexity", classifier_complexity),  # multi-class: SQL complexity
    ("classifier_task_type", classifier_task_type),  # multi-class: task type
    ("label_encoder_domain", label_encoder_domain),  # decodes domain predictions
    ("label_encoder_complexity", label_encoder_complexity),  # decodes complexity predictions
    ("label_encoder_task_type", label_encoder_task_type),  # decodes task type predictions
]
combined_classifier = dict(combined_components)

# Write the whole bundle to a single pickle file
combined_pickle_path = "combined_sql_classifier.pkl"
with open(combined_pickle_path, "wb") as pickle_file:
    pickle.dump(combined_classifier, pickle_file)

print("All classifiers, vectorizer, and encoders saved to 'combined_sql_classifier.pkl'.")


All classifiers, vectorizer, and encoders saved to 'combined_sql_classifier.pkl'.
