# Experiments Creating Search Intent Sample Utterances

Example scripts and experiments for generating sample utterances from sources including Stack Overflow top questions, top Google keyword searches, Wolfram|Alpha example queries, and popular pages on Wikipedia.

# Google Search Intents

In [None]:
# import the wolfram intents
import os, json, sys, re
# Read the simple and complex Wolfram|Alpha utterance exports from the
# shared export directory.
path_to_json = "intents-exports/"
wa_simple_path = os.path.join(path_to_json, "wolfram_simple_utterances.json")
wa_complex_path = os.path.join(path_to_json, "wolfram_complex_utterances.json")

with open(wa_simple_path) as wa_simple_file:
    wa_simple = json.load(wa_simple_file)

with open(wa_complex_path) as wa_complex_file:
    wa_complex = json.load(wa_complex_file)
      

# DDG Instant Answer Intents

Determine which queries are entities/categories that are better answered by Wikipedia or DuckDuckGo (DDG) Instant Answers.

In [None]:
import requests
response = ''  # module-level default; search_ddg() only assigns its own local value


def search_ddg(query, timeout=10):
    """Return the DuckDuckGo zero-click abstract for `query`.

    The lookup goes through the RapidAPI DuckDuckGo zero-click endpoint.

    Args:
        query: Search text, sent as the `q` parameter.
        timeout: Seconds to wait for the HTTP response (default 10).

    Returns:
        The reply's `AbstractText`, or '' when the request fails, the body
        is not valid JSON, the body is not a JSON object, or the abstract
        is missing/empty.
    """
    ddg_url = "https://duckduckgo-duckduckgo-zero-click-info.p.rapidapi.com/"

    ddg_querystring = {"q": query, "no_redirect": "1", "no_html": "1", "skip_disambig": "1", "format": "json"}

    ddg_headers = {
        # Use the RAPIDAPI_KEY environment variable when it is set so the key
        # does not have to be pasted into the notebook; otherwise keep the
        # placeholder.
        'x-rapidapi-key': os.environ.get("RAPIDAPI_KEY", "** key here ***"),
        'x-rapidapi-host': "duckduckgo-duckduckgo-zero-click-info.p.rapidapi.com"
    }

    try:
        reply = requests.request(
            "GET", ddg_url, headers=ddg_headers, params=ddg_querystring, timeout=timeout
        )
        ddg_response = json.loads(reply.text)
    except (requests.RequestException, ValueError):
        # Network/HTTP failures and malformed JSON both mean "no answer".
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate so a
        # long batch run can still be interrupted.
        return ''

    # A JSON body that is not an object cannot carry an abstract.
    if not isinstance(ddg_response, dict):
        return ''

    # `or ''` maps a null/empty abstract to '' so callers can concatenate it.
    return ddg_response.get("AbstractText") or ''


In [None]:
# Collect every Wolfram query (simple ones first, then complex ones) for
# which DDG returns a non-empty answer, printing each lookup as it goes.
wa_simple_ddg = []
for intents, key in ((wa_simple, "simple"), (wa_complex, "complex")):
    for query in intents[key]:
        ddg_result = search_ddg(query)
        print(query + ":\n" + ddg_result)
        if not ddg_result:
            continue
        wa_simple_ddg.append(query)
wa_simple_ddg

In [None]:
# Number of Wolfram queries for which DDG returned a non-empty answer.
len(wa_simple_ddg)

In [None]:
# Check if is entity using spacy
import spacy
import en_core_web_sm
# Load the en_core_web_sm pipeline once; get_entities() reuses this `nlp` object.
nlp = en_core_web_sm.load()
def get_entities(text):
    """Return the text of each entity spaCy detects in `text`.

    The string is run through the module-level `nlp` pipeline and the
    `.text` of every item in `doc.ents` is collected, in order, into a list.
    """
    return [entity.text for entity in nlp(text).ents]

# Print each complex query that spaCy recognises, in its entirety, as one
# single entity.
for query in wa_complex["complex"]:
    entities = get_entities(query)
    # `entities` is a list, so the query must be compared with its only
    # element; comparing the string with the list itself is always False.
    if len(entities) == 1 and query == entities[0]:
        print(query + ":\n" + str(entities))

In [None]:
# Spot-check the entity extractor on a single-word query.
get_entities("horse")

In [None]:
# Build the Wolfram simple and complex utterance lists, leaving out every
# query that is already in the DDG-answerable list.
wa_simple_filtered = [
    utterance for utterance in wa_simple["simple"] if utterance not in wa_simple_ddg
]
wa_complex_filtered = [
    utterance for utterance in wa_complex["complex"] if utterance not in wa_simple_ddg
]
        

In [None]:
# Save the filtered Wolfram simple/complex utterances and the DDG-answerable
# utterances, one file each.
#
# Layout of every file: a JSON array whose items are separated by a comma, a
# newline and four spaces. json.dumps quotes and escapes each item, so
# utterances containing quotes still produce valid output (rewriting the
# repr() of the list with str.replace did not), and `with` closes the file
# even if the write fails.
for out_name, utterances in (
    ("wolfram_simple_utterances_filtered.txt", wa_simple_filtered),
    ("wolfram_complex_utterances_filtered.txt", wa_complex_filtered),
    ("ddg_utterances.txt", wa_simple_ddg),
):
    serialized = "[" + ",\n    ".join(
        json.dumps(utterance, ensure_ascii=False) for utterance in utterances
    ) + "]"
    with open(out_name, "w") as utterances_file:
        utterances_file.write(serialized)


## Create Google Questions Intent

In [None]:
# Read the google_questions_combined_deduped.json export.
path_to_json = "intents-exports/"
google_questions_path = os.path.join(path_to_json, "google_questions_combined_deduped.json")
with open(google_questions_path) as google_questions_file:
    google_questions = json.load(google_questions_file)

In [None]:
# Pull the list stored under the "questions" key of the loaded export.
questions_list = google_questions["questions"]

In [None]:
# Any question containing one of these characters (punctuation or a digit)
# is rejected. Raw string: the backslash is a literal character to match,
# not the start of an escape sequence.
punctuations = r'''‐_+=|!()[]{};:’"\,<>/?@#$%^&*~–0123456789-.'''
deduped_google = []
for question in questions_list:
    # Skip blanks and questions that were already accepted.
    if question == "" or question in deduped_google:
        continue
    lowered = question.lower()
    if any(ch in punctuations for ch in lowered):
        continue
    if 'what time is it' in lowered or 'what is the weather' in lowered:
        continue
    # Rejected questions are skipped outright; appending "" in their place
    # left empty utterances in the saved output.
    deduped_google.append(question)

In [None]:
# Number of questions in the loaded export, before any filtering.
len(google_questions["questions"])

In [None]:
# Keep the cleaned question list under its final name, then show its size.
# The length must be the last statement of the cell: a bare expression that
# is followed by another statement is evaluated but never displayed.
google_questions_final = deduped_google
len(google_questions_final)

In [None]:
# Save the cleaned Google questions as a JSON array whose items are separated
# by a comma, a newline and four spaces. json.dumps quotes and escapes each
# item, so the file is valid JSON even when a question contains quotes.
serialized = "[" + ",\n    ".join(
    json.dumps(question, ensure_ascii=False) for question in google_questions_final
) + "]"
with open("google_questions.json", "w") as goog_utterances_file:
    goog_utterances_file.write(serialized)

## Create Google top queries intent

In [None]:
# Load the top Google queries and keep the ones usable as utterances.
path_to_json = "intents-exports/"
with open(os.path.join(path_to_json, "google_top_queries.json")) as google_questions_file:
    google_questions = json.load(google_questions_file)
queries_list = google_questions["queries"]
# Any query containing one of these characters is rejected. Raw string: the
# backslash is a literal character to match, not an escape sequence.
punctuations = r'''�‐_+=|!()[]{};:’"\,<>/?@#$%^&*~–0123456789-.'''
deduped_google = []
for candidate in queries_list:
    # Skip blanks, repeats, and anything already present in the Wolfram or
    # DDG utterance lists.
    if (
        candidate == ""
        or candidate in deduped_google
        or candidate in wa_simple_filtered
        or candidate in wa_complex_filtered
        or candidate in wa_simple_ddg
    ):
        continue
    lowered = candidate.lower()
    if any(ch in punctuations for ch in lowered):
        continue
    if 'what time is it' in lowered or 'what is the weather' in lowered:
        continue
    # Rejected queries are skipped outright; appending "" in their place put
    # empty strings into the list that is later looked up and saved.
    deduped_google.append(candidate)


In [None]:
# Keep only the Google queries that DDG Instant Answers cannot resolve;
# queries that do get an answer (entities) are left out. Each lookup is
# printed as it happens.
google_not_ddg = []
for query in deduped_google:
    ddg_result = search_ddg(query)
    print(query + ":\n" + ddg_result)
    if ddg_result:
        continue
    google_not_ddg.append(query)
google_queries_final = google_not_ddg

In [None]:
# Drop every empty-string entry, then show how many queries remain.
# Deleting by index while enumerating the same list skips the element that
# follows each deletion, so runs of empty strings were only partly removed.
# Slice assignment filters in place, which keeps `google_not_ddg` (the same
# list object) in sync exactly as the in-place `del` did.
google_queries_final[:] = [query for query in google_queries_final if query != ""]
len(google_queries_final)

In [None]:
# Disabled experiment: the loop below is wrapped in a string literal, so it is
# never executed (the cell only evaluates to that string).
# NOTE(review): `query == entities` compares a str with the list returned by
# get_entities(), which can never be equal - fix before re-enabling.
'''for query in deduped_google:
    entities = get_entities(query)
    #print(entities)
    if len(entities) == 1 and query == entities:
        print(query + ":\n" + str(entities))'''


In [None]:
# Save the final Google top-query utterances as a JSON array whose items are
# separated by a comma, a newline and four spaces. json.dumps quotes and
# escapes each item, so the file is valid JSON even when a query contains
# quotes.
serialized = "[" + ",\n    ".join(
    json.dumps(query, ensure_ascii=False) for query in google_queries_final
) + "]"
with open("google_queries_utterances.json", "w") as goog_queries_utterances_file:
    goog_queries_utterances_file.write(serialized)

## Create Wikipedia Intents

In [None]:
# Read the wikipedia_top1500.json export and keep the page titles that are
# not already present in any of the Google, Wolfram or DDG lists.
path_to_json = "intents-exports/"
wiki_path = os.path.join(path_to_json, "wikipedia_top1500.json")
with open(wiki_path) as wiki_file:
    wiki_pages = json.load(wiki_file)
pages_list = wiki_pages["pages"]
punctuations = '''�‐_+=|!()[]{};:’"\,<>/?@#$%^&*~–0123456789-.'''
deduped_wiki = []
for title in pages_list:
    # Already collected, already covered by another list, or blank.
    if (
        title in deduped_wiki
        or title in deduped_google
        or title in wa_simple_filtered
        or title in wa_complex_filtered
        or title in wa_simple_ddg
        or title in google_queries_final
        or title == ""
    ):
        continue
    lowered = title.lower()
    # A single listed character (punctuation or digit) rejects the title.
    if any(ch in punctuations for ch in lowered):
        continue
    if 'what time is it' in lowered or 'what is the weather' in lowered:
        continue
    deduped_wiki.append(title)

In [None]:
# Number of Wikipedia page titles that passed the filters.
len(deduped_wiki)

In [None]:
# Save the filtered Wikipedia page titles as a JSON array whose items are
# separated by a comma, a newline and four spaces. json.dumps quotes and
# escapes each item, so the file is valid JSON even when a title contains
# quotes.
serialized = "[" + ",\n    ".join(
    json.dumps(title, ensure_ascii=False) for title in deduped_wiki
) + "]"
with open("wiki_utterances.json", "w") as wiki_queries_utterances_file:
    wiki_queries_utterances_file.write(serialized)

# Generate Website Name and Domain Slots for searching

In [None]:
from bang_data import NAV_DATA
# Collect the lower-cased names and domains of searchable sites in NAV_DATA,
# then wrap each one in a {'value': ...} slot object.
searchable_names = []
searchable_domains = []
punctuations = '''�‐_+=|!()[]{};:’"\,<>/?@#$%^&*~–0123456789-.'''
domain_punctuations = '''�‐_+=|!()[]{};:’"\,<>/?@#$%^&*~–'''
for site in NAV_DATA:
    # Only entries whose URL contains the '{{{s}}}' token (presumably the
    # search-term placeholder), that have a non-empty domain, and whose URL
    # does not mention duckduckgo are considered.
    if '{{{s}}}' not in site['u'] or site['d'] == '' or 'duckduckgo' in site['u']:
        continue
    site_name = site['s'].lower()
    site_domain = site['d'].lower().replace('www.', '')
    # One listed character is enough to reject a name or a domain; domains
    # are checked against the shorter list, so digits, '-' and '.' are fine.
    if any(ch in punctuations for ch in site_name):
        site_name = ''
    if any(ch in domain_punctuations for ch in site_domain):
        site_domain = ''
    if site_name != '' and site_name not in searchable_names:
        searchable_names.append(site_name)
    if site_domain != '' and site_domain not in searchable_domains:
        searchable_domains.append(site_domain)
# Slot objects
names_slots = [{'value': name} for name in searchable_names]
domains_slots = [{'value': domain} for domain in searchable_domains]


In [None]:
import json
# Write the website-name slots and then the domain slots to their JSON files.
for slots_path, slots in (
    ("search_names_slots.json", names_slots),
    ("search_domains_slots.json", domains_slots),
):
    with open(slots_path, "w") as fp:
        json.dump(slots, fp)

# Generate knowledge slots

Generates a list of slot objects from the top 1500 Wikipedia pages, to be used for detecting Knowledge queries.


In [None]:
import os
import json
# Read the wikipedia_top1500.json export, keep each distinct page title that
# passes the character and phrase filters, and wrap every kept title in a
# {'value': ...} slot object.
path_to_json = "intents-exports/"
wiki_path = os.path.join(path_to_json, "wikipedia_top1500.json")
with open(wiki_path) as wiki_file:
    wiki_pages = json.load(wiki_file)
pages_list = wiki_pages["pages"]
punctuations = '''�‐_+=|!()[]{};:’"\,<>/?@#$%^&*~–0123456789-.'''
deduped_wiki = []
for title in pages_list:
    if title in deduped_wiki or title == "":
        continue
    lowered = title.lower()
    # A single listed character (punctuation or digit) rejects the title.
    if any(ch in punctuations for ch in lowered):
        continue
    if 'what time is it' in lowered or 'what is the weather' in lowered:
        continue
    deduped_wiki.append(title)
wiki_phrases = [{'value': wiki_phrase} for wiki_phrase in deduped_wiki]

In [None]:
import json
# Persist the Wikipedia phrase slot objects as JSON.
with open("wiki_phrase_slots.json", "w") as fp:
    fp.write(json.dumps(wiki_phrases))

# Programming Slot Types and Intent

In [None]:
import json
import requests
import tldextract
import urllib
import socket
import os
from bs4 import BeautifulSoup
path_to_json = "intents-exports/"

In [None]:
# Stack Overflow top-voted questions: fetch the first `pages` result pages
# and collect the lower-cased text of the first link in every
# "question-summary" container.
url_base = 'https://stackoverflow.com/questions?tab=votes&page='
pages = 10
question_list = []
# Result pages are numbered from 1, so iterate 1..pages; range(pages) asked
# for page=0 and never requested the last page.
for page in range(1, pages + 1):
    url = f"{url_base}{page}"
    r = requests.get(url)
    html = BeautifulSoup(r.content, 'html.parser')
    # find_all is the current Beautiful Soup name for the legacy findAll.
    questions_containers = html.find_all('div', class_="question-summary")

    for question_link in questions_containers:
        anchor = question_link.find('a')
        # A container without a link has no title to read; skip it instead
        # of failing on None.
        if anchor is None:
            continue
        question_list.append(anchor.get_text(' ').lower())
# The `platforms` list is defined in full by the next cell; it is no longer
# reset to [''] here, which clobbered the real list whenever this cell was
# re-run on its own.
question_list

In [None]:
# Substrings treated as platform names when the questions are cleaned.
# The cleaning loop tries them in list order and stops after the first one it
# replaces, so order matters: an entry that is a prefix of a later one wins
# (e.g. 'angular' is tried before 'angularjs', 'react ' before 'react native').
# Entries padded with spaces only match where those spaces are present in the
# question text.
platforms = ['python', 'javascript', 'java','swift ', ' c ', 'ruby', 'vb net', 'perl', 'htaccess', 'assembly', 'jquery', 'smalltalk', 'react ', 'react native', 'basic ', ' c# ', ' c++ ', ' f# ', 'haskell', ' r ', ' rust ', 'angular', ' vue ', 'vue js', 'json', ' git ', 'git ', ' git','html', 'actionscript', 'php', 'css', 'sql server', 'sqllite', ' sql ', 'android', 'ios ', 'mysql', 'powershell', 'maven', 'pandas', 'windows', 'github', 'vagrant', 'docker ', 'junit', 'microsoft', 'mac ', 'angularjs', 'npm ', 'bower ', 'browserify ', 'gulp ', 'grunt ', 'webpack', 'postgresql', 'excel ', 'eclipse ', 'macos', 'dockerfile', 'linux']
# Questions containing any of these substrings are discarded entirely.
remove = ['youtube']

In [86]:

# Turn the scraped question titles into de-duplicated utterances in which the
# first recognised platform name is replaced by the literal token {Platform}.
# Characters in `punctuations` (punctuation, digits, typographic quotes and
# dashes) are replaced by spaces rather than rejecting the question.
punctuations = '''�‐_+=|!()[]{};:’”"\,<>/?@#$%^&*~–0123456789-—.“””…'''
deduped_questions = []
for i in question_list:
    
    # Skip empty titles and titles identical to an utterance already kept.
    if i not in deduped_questions and i != "":
        i = i.lower()
        # Replace the first occurrence of a platform name with ' Platform '.
        # Platforms are tried in list order and the loop stops as soon as the
        # placeholder is present, so at most one replacement is made. The
        # capital 'P' can only come from that replacement because `i` was
        # lower-cased just above.
        for platform in platforms:
            if 'Platform' in i:
                #print("Platform already in:", i)
                break
            else:
                i = i.replace(platform, ' Platform ', 1)
        # Iterate over a snapshot of the characters: `i` itself is rewritten
        # inside the loop.
        for x in i.lower():
            if x in punctuations:
                i = i.replace(x," ").strip()
                #i = ""
            # A plain apostrophe discards the whole question.
            if x in ["'"]:
                i = ""
        if 'what time is it' in i.lower() or 'what is the weather' in i.lower():
            i = ""
        # Discard questions that mention any term listed in `remove`.
        for remover in remove:
            if remover in i:
                i = ""
        # Wrap the placeholder in braces, then collapse runs of whitespace
        # into single spaces.
        i = i.replace("Platform", "{Platform}")
        i = " ".join(i.split())
        # Keep the result only if something is left and it is not a repeat.
        if i != "" and not i in deduped_questions:
            deduped_questions.append(i)

In [84]:
# Display the cleaned programming utterances.
deduped_questions

['why is processing a sorted array faster than processing an unsorted array',
 'how do i undo the most recent local commits in {Platform}',
 'how do i delete a {Platform} branch locally and remotely',
 'what does the yield keyword do',
 'what is the correct {Platform} content type',
 'what is the operator in c',
 'how do i rename a local {Platform} branch',
 'how can i remove a specific item from an array',
 'what and where are the stack and heap',
 'can comments be used in {Platform}',
 'how do i check if an element is hidden in {Platform}',
 'why does {Platform} think chucknorris is a color',
 'what does use strict do in {Platform} and what is the reasoning behind it',
 'how do i redirect to another webpage',
 'how do i force {Platform} pull to overwrite local files',
 'how to modify existing unpushed commit messages',
 'how do {Platform} closures work',
 'how do i revert a {Platform} repository to a previous commit',
 'how to check whether a string contains a substring in {Platform}

In [85]:
# Save the cleaned programming utterances as a JSON array whose items are
# separated by a comma, a newline and four spaces. json.dumps quotes and
# escapes each item, so the file is valid JSON whatever the utterances
# contain.
serialized = "[" + ",\n    ".join(
    json.dumps(question, ensure_ascii=False) for question in deduped_questions
) + "]"
with open("programming_utterances.json", "w") as programming_utterances_file:
    programming_utterances_file.write(serialized)