requirements:

* LLM access

outputs:

* topics.json
* term_candidates.json
* hypothetical_term_candidates.csv

In [2]:
# import external libraries
import sys
import os
import json
import pandas as pd
import numpy as np
from io import StringIO

In [3]:
# import local modules
# Project directories are resolved from the kernel's current working
# directory: the notebook is expected to run from a folder that sits next to
# data/, src/ and prompts/ under the same parent.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, 'data')

# Make the project root, src/ and prompts/ importable so the local modules
# imported right after this block can be found.
sys.path.extend([
    parent_dir,
    os.path.join(parent_dir, 'src'),
    os.path.join(parent_dir, 'prompts'),
])

from searchUnexistent import Searcher
from utilities import get_strtime
import templates
import settings

### Topic Generation Prompt

In [4]:
# Prompt text asking for the 20 most popular internet topics.
# NOTE(review): this string is not referenced by any other visible cell;
# presumably it was sent to the LLM to produce topics.json — confirm.
topic_generator = "What are the most popular 20 topics on the internet?"

### Term Generation Prompt

In [5]:
def generate_term_generation_prompt(topics:list):
    """Lazily produce one term-generation prompt per topic.

    Each prompt is ``templates.term_generator`` with its ``{topic}``
    placeholder filled by the corresponding entry of ``topics``.

    Args:
        topics: Iterable of topic values to substitute into the template.

    Yields:
        The formatted prompt string for each topic, in input order.
    """
    yield from (
        templates.term_generator.format(topic=entry)
        for entry in topics
    )

In [6]:
# Load the topic list from data/intermediate/topics.json. The context manager
# closes the file handle deterministically (json.load(open(...)) leaves it
# open), and the file is read as UTF-8 rather than the platform default.
with open(os.path.join(data_dir, 'intermediate', 'topics.json'), encoding="utf-8") as topics_file:
    topics = json.load(topics_file)

# Generator that yields one term-generation prompt per topic, on demand.
prompt_gen = generate_term_generation_prompt(topics)

In [10]:
# Preview one generated prompt. Every execution of this cell advances
# prompt_gen by one item, so the prompt shown depends on how many times the
# cell has already been run since prompt_gen was created.
print(next(prompt_gen))

Make a list of 50 nonexistent made up terms about the following topic by using multiple common words.
Do not combine words, just use at least 4 - 5 words together as a phenomenon.
Do not use the words in the following list: ["conventional", "traditional", "holistic", " phenomenon ", "comprehensive ", "technique", "-"]
Topic: Entertainment (movies, TV shows, music, celebrities): This covers the world of entertainment, including movies, television series, music albums, celebrity news, award shows, and celebrity gossip.

Use multiple common words.
Do not combine words. Use space between words. Do not use "-" character.


### Transforming Term Candidates to Dataframe

In [11]:
# Load the raw term-candidate strings from data/intermediate/term_candidates.json.
# The context manager closes the file handle deterministically, and the file
# is read as UTF-8 rather than the platform default encoding.
with open(os.path.join(data_dir, 'intermediate', 'term_candidates.json'), encoding="utf-8") as term_candidates_file:
    term_candidates = json.load(term_candidates_file)

In [12]:
# Flatten the raw candidate strings into one record per term. The entry at
# position `index` in term_candidates is paired with topics[index].
data = []
for index, item in enumerate(term_candidates):
    # NOTE(review): the separator is the literal text "/n/n/n" (forward
    # slashes), not newline characters — kept as-is; confirm it matches how
    # term_candidates.json was written.
    terms = item.split("/n/n/n")
    for term in terms:
        if not term.strip():
            # Blank chunk (e.g. produced by a trailing separator): nothing to parse.
            continue
        # Split on the FIRST ":" only, so colons inside the explanation stay
        # part of the explanation instead of breaking the title/definition split.
        term_title, separator, definition = term.partition(":")
        if not separator:
            raise ValueError(
                f"Term candidate for topic index {index} has no ':' separator: {term!r}"
            )
        data.append({"term": term_title, "explanation": definition, "topic": topics[index]})

term_candidates_df = pd.DataFrame(data)
# NaN marks a term as "not yet verified"; search_nonexistent only processes
# rows whose 'exists' value is still NaN.
term_candidates_df["exists"]= np.nan

In [13]:
# Display the parsed candidates (the cell's last expression is rendered as its output).
term_candidates_df

Unnamed: 0,term,explanation,topic,exists
0,Nano-Sync Fusion Technology,A revolutionary advancement in technology tha...,Technology and gadgets: This topic covers the ...,
1,Quantum-Pulse Holography Interface,A cutting-edge interface that harnesses the p...,Technology and gadgets: This topic covers the ...,
2,Neuro-Cognitive Acceleration System,A sophisticated system designed to enhance hu...,Technology and gadgets: This topic covers the ...,
3,Hyper-Reality Immersion Module,An innovative module that enables users to im...,Technology and gadgets: This topic covers the ...,
4,Aero-Adaptive Haptic Feedback,A tactile feedback technology that dynamicall...,Technology and gadgets: This topic covers the ...,
...,...,...,...,...
1115,Virtual reality field trips,Immersive virtual reality experiences that si...,Education and online learning: This topic revo...,
1116,Gamified mastery learning frameworks,Instructional frameworks that combine mastery...,Education and online learning: This topic revo...,
1117,Multilingual e-learning platforms,Online learning platforms that offer educatio...,Education and online learning: This topic revo...,
1118,Data-driven curriculum personalization,Utilizing learner data and analytics to dynam...,Education and online learning: This topic revo...,


### Verifying Hypothetical Terms

In [14]:
# Build the project's Searcher from the local settings module. The instance is
# read as a global by search_nonexistent and by the test call further down.
searcher = Searcher(settings)

Getting keys


In [15]:
def search_nonexistent(df: pd.DataFrame):
    """Fill the ``exists`` column of ``df`` for every row where it is still NaN.

    Rows whose ``exists`` value is already set are left untouched, so the
    function can be re-run on a partially processed frame. ``df`` is updated
    in place and nothing is returned. The module-level ``searcher`` object
    supplies the value written to each pending row.

    Args:
        df: Frame with an ``exists`` column holding NaN for unprocessed rows.
    """
    for row_label, record in df.iterrows():
        if np.isnan(record["exists"]):
            # NOTE(review): exists() is called with no arguments, so the row's
            # term is never handed to the searcher here — confirm against the
            # Searcher API whether the term should be passed.
            df.loc[row_label, 'exists'] = searcher.exists()

In [16]:
# Smoke-test the Searcher with a single term; the returned response is shown
# as the cell output.
searcher.search("Nano-Sync Fusion Technology")

{'kind': 'customsearch#search',
 'url': {'type': 'application/json',
  'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'},
 'queries': {'request': [{'title': 'Google Custom Search - "Nano-Sync Fusion Technology"',
    'searchTerms': '"Nano-Sync Fusion Technology"',
    'count': 10,
    'startIndex': 1,
    'inputEncoding': 'utf8',
    'outputEncoding': 'utf8',
    'safe': 'off

In [17]:
# Reload the candidates from the CSV on disk. This rebinds term_candidates_df,
# replacing the frame that was built from term_candidates.json earlier.
# NOTE(review): no visible cell writes hypothetical_term_candidates.csv —
# confirm where that file is produced.
term_candidates_path = os.path.join(data_dir, "intermediate", "hypothetical_term_candidates.csv")
term_candidates_df = pd.read_csv(term_candidates_path)

In [18]:
# Fill the 'exists' column for every row that is still NaN; term_candidates_df
# is updated in place.
search_nonexistent(term_candidates_df)

In [19]:
# Write the results to a new CSV whose name embeds the value returned by
# get_strtime() — presumably a timestamp so earlier runs are not overwritten
# (TODO confirm). Note that term_candidates_path is rebound to the output path.
term_candidates_path = os.path.join(data_dir, 'intermediate', f"term_candidates_{get_strtime()}.csv")
# index=False keeps the DataFrame index out of the saved file.
term_candidates_df.to_csv(term_candidates_path, index=False)