# Q&A Pipeline

## Imports

In [None]:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from random import randint
import time
import requests
import docx
from docx import Document
import tiktoken
from tenacity import retry, stop_after_attempt,wait_random_exponential
from haystack.document_stores import FAISSDocumentStore
from haystack.utils import convert_files_to_docs, fetch_archive_from_http, clean_wiki_text, print_documents, print_answers
from haystack.nodes import DensePassageRetriever, Seq2SeqGenerator 
import os
import openai
import time

In [None]:
from haystack.document_stores import FAISSDocumentStore
from haystack.utils import convert_files_to_docs, fetch_archive_from_http, clean_wiki_text, print_documents, print_answers
from haystack.nodes import DensePassageRetriever, Seq2SeqGenerator 

In [None]:
# %env OPENAI_API_KEY = {Enter your key here}
# The key is read from the OPENAI_API_KEY environment variable (the commented
# %env line above is one way to set it from inside the notebook).
# os.getenv returns None when the variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [6]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    """Forward ``kwargs`` to ``openai.ChatCompletion.create`` and return the result.

    The call is wrapped by tenacity's ``retry`` decorator, configured with a
    randomized exponential wait (``min=1``, ``max=60``) and
    ``stop_after_attempt(6)``.
    """
    chat_response = openai.ChatCompletion.create(**kwargs)
    return chat_response



def getText(filename):
    """Return the text of a .docx file, leaving out its first paragraph.

    Args:
        filename: Path handed to ``docx.Document``.

    Returns:
        The ``text`` of every paragraph after the first one, joined with
        newlines. In this notebook the first paragraph of each saved
        document holds the source link, which is why it is skipped.
    """
    body_paragraphs = docx.Document(filename).paragraphs[1:]
    return '\n'.join(paragraph.text for paragraph in body_paragraphs)


def num_tokens_from_messages(messages, model):
    """Count the tokens a list of chat messages occupies for ``model``.

    Args:
        messages: Iterable of dicts; every value of every dict is passed to
            the tokenizer's ``encode``.
        model: Model name handed to ``tiktoken.encoding_for_model``.

    Returns:
        The integer total: the encoded length of every value, plus 4 per
        message, minus 1 for each ``"name"`` key, plus 2 for the reply primer.
    """
    tokenizer = tiktoken.encoding_for_model(model)

    # Every reply is primed with <im_start>assistant, which costs 2 tokens.
    total = 2

    for msg in messages:
        # Each message is framed as <im_start>{role/name}\n{content}<im_end>\n.
        total += 4
        for field, text in msg.items():
            total += len(tokenizer.encode(text))
            if field == "name":
                # When a name is present the role is omitted; the role
                # always counts as exactly one token.
                total -= 1

    return total


def cleanFilename(sourcestring,  removestring='\\:*?"<>|'):
    """Return ``sourcestring`` without any character found in ``removestring``.

    Args:
        sourcestring: Text to clean (for example a title used as a file name).
        removestring: Characters to drop. The default is the backslash
            followed by ``:*?"<>|``. It is written with an explicit ``\\``
            because the bare ``\:`` used before is an invalid escape sequence
            that newer Python versions warn about; the value is unchanged.

    Returns:
        A new string with the remaining characters in their original order.
    """
    return ''.join(c for c in sourcestring if c not in removestring)


## Data Extraction

In [4]:
# Start a headless Chrome session for the blog index page.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')

# Selenium 4 takes the driver location through a Service object; passing the
# path as the first positional argument is deprecated and was later removed.
driver = webdriver.Chrome(service=Service('chromedriver'), options=chrome_options)

try:
    driver.get("https://astera.com/blog/")

    # Keep clicking the "load-more" element until it can no longer be found.
    while True:
        try:
            button = driver.find_element(By.CLASS_NAME, "load-more")
        except NoSuchElementException:
            break

        # Click through JavaScript, then pause 5-8 seconds before looking again.
        driver.execute_script("arguments[0].click();", button)
        time.sleep(randint(5, 8))

    # page_source is a variable created by Selenium - it holds all the HTML
    page = driver.page_source
finally:
    # Always shut the browser down so no Chrome process is left behind,
    # even when loading or clicking fails.
    driver.quit()


In [5]:
# Parse the HTML captured from the blog index page.
soup = BeautifulSoup(page, "html.parser")

all_topics = soup.find("div", class_ = "blog-tabinfo-box-main topic active")
if all_topics is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(
        "Could not find the 'blog-tabinfo-box-main topic active' container in the page"
    )

job_elements = all_topics.find_all("div", class_="blog-tabinfo-box")
links = []

for element in job_elements:
    anchor = element.find('a', href = True)
    # A box without an <a href=...> has nothing to collect; skip it rather
    # than subscripting None.
    if anchor is not None:
        links.append(anchor['href'])


In [6]:
# NumberOflinks.txt stores the link count recorded by the previous run.
path = r"NumberOflinks.txt"
with open(path, "r") as tfile:
    num_old_links = int(tfile.read())

# Drop the last 110 collected links. The stop index is clamped at 0: with
# fewer than 110 links a negative stop would be read as "count from the end"
# and keep entries instead of producing an empty list.
# NOTE(review): the reason for the fixed 110 is not visible here - confirm.
# NOTE: this cell reassigns `links` from itself, so re-running it without
# re-running the link-collection cell trims the list again.
links = links[0:max(len(links) - 110, 0)]

if len(links) > num_old_links:
    # Record the new total for the next run.
    with open(path, 'w') as tfile:
        tfile.write(str(len(links)))

    # Only the first `num_new_links` entries are treated as new.
    num_new_links = len(links) - num_old_links
    links = links[0:num_new_links]
else:
    # Nothing new since the recorded count.
    links = []


In [None]:
# Paths of the Word documents written by this cell.
new_doc_paths = []

# Create the output folder up front so document.save() cannot fail on a
# missing directory.
output_dir = "Blog Content"
os.makedirs(output_dir, exist_ok=True)

for link in links:

    # Local names are distinct from `page` / `soup` of the index-page cells so
    # those earlier results are not overwritten.
    response = requests.get(link)
    time.sleep(3)
    post_soup = BeautifulSoup(response.content, "html.parser")

    content = post_soup.find('div', class_="blog-content")
    title = post_soup.find('h1')

    # Skip pages without a body, with an empty body, or without an <h1>
    # (the title is needed for both the heading and the file name).
    if content is None or len(content.get_text()) == 0 or title is None:
        continue

    title_text = title.get_text().strip()

    document = Document()
    # First paragraph: the source link (other cells read it back from paragraphs[0]).
    document.add_paragraph(link)
    # add_heading expects text, so pass the title's text rather than the tag object.
    document.add_heading(title_text, level = 1)

    for tag in content:
        # Bare strings between tags carry no structure; only real tags are used.
        if isinstance(tag, str):
            continue

        tag_text = tag.get_text().strip()
        tag_name = tag.name

        if tag_name == 'p':
            document.add_paragraph(tag_text)
        elif tag_name in ['h2', 'h3', 'h4', 'h5', 'h6']:
            # 'h2' -> level 2, 'h3' -> level 3, ...
            document.add_heading(tag_text, level=int(tag_name[1]))

    # save the Word document
    doc_name = cleanFilename(title_text) + '.docx'
    doc_name = doc_name.replace('/', ' or ')
    # os.path.join keeps the path valid on every OS; a hard-coded backslash
    # only works as a directory separator on Windows.
    path = os.path.join(output_dir, doc_name)
    new_doc_paths.append(path)
    document.save(path)


## Question Generation

In [8]:
# Load the questions stored so far. Lines are stripped so that membership
# tests below are not defeated by the trailing newline readlines() would keep.
path = r"all_questions.txt"
with open(path,'r') as tfile:
    all_questions = [line.strip() for line in tfile]

# Set used for de-duplication against stored and freshly generated questions.
known_questions = set(all_questions)

questions_gpt = []

for doc_path in new_doc_paths:

    content = getText(doc_path)
    gpt_prompt = 'Generate 1-2 key, relevant questions that the following article focuses on: \n\n' + 'Title: ' + content
    message = [
            {"role": "system", "content": "You are a helpful assistant that is a Search Engine Optimization Expert."},
            {"role": "user", "content": gpt_prompt},
        ]

    # Trim the prompt from the end, one sentence at a time, until prompt plus
    # the 150 reply tokens fits within 4096 tokens.
    while (num_tokens_from_messages(message, "gpt-3.5-turbo") + 150) > 4096:
        current = message[1]['content']
        shortened = current.rsplit('. ', 1)[0]
        if len(shortened) == len(current):
            # No '. ' left to split on: cut by length instead so the loop
            # always makes progress and cannot spin forever.
            if not current:
                break
            shortened = current[:len(current) * 9 // 10]
        message[1]['content'] = shortened

    response = completion_with_backoff(model = "gpt-3.5-turbo", messages = message, max_tokens = 150)

    for line in response['choices'][0]['message']['content'].splitlines():
        # Remove list numbering / bullets BEFORE comparing, so "1. What is X?"
        # is recognised as the already stored "What is X?".
        question = line.lstrip('0123456789.- ').strip()

        # Ignore blank lines and anything already known.
        if question and question not in known_questions:
            known_questions.add(question)
            questions_gpt.append(question)
  

In [9]:
# Append the newly generated questions to the question file, one per line.
path = r"all_questions.txt"

# An empty list means there is nothing to write (the list itself is never None).
if questions_gpt:
    # Look at the current content to see whether it ends with a newline.
    try:
        with open(path, 'r') as tfile:
            existing = tfile.read()
    except FileNotFoundError:
        existing = ''

    with open(path, 'a') as tfile:
        # Without this, the first new question would be glued onto the last
        # existing line when the file does not end with a newline.
        if existing and not existing.endswith('\n'):
            tfile.write('\n')
        # Terminate every question with a newline so the next append starts
        # on a fresh line; blank entries are not written.
        tfile.writelines(q + '\n' for q in questions_gpt if q.strip())


## Answer Generation

In [7]:
# Build the list of documents (text plus file metadata) from the saved posts.
documents = []

with os.scandir('Blog Content/') as entries:

    for entry in entries:
        # Only real .docx files can be opened by getText; skip sub-folders,
        # other file types, and Word's temporary "~$..." lock files.
        if not entry.is_file():
            continue
        if entry.name.startswith('~$') or not entry.name.lower().endswith('.docx'):
            continue

        documents.append({
            'content' : getText(entry.path),
            'meta' : {'name' : entry.name, 'path': entry.path}
        })



In [None]:
# Reuse the index saved at 'document_store' when that path exists; otherwise
# start a new flat FAISS store with 128-dimensional embeddings.
document_store = (
    FAISSDocumentStore.load(index_path = "document_store")
    if os.path.exists('document_store')
    else FAISSDocumentStore(embedding_dim=128, faiss_index_factory_str="Flat")
)

# Dense retriever with separate query and passage encoder models.
retriever = DensePassageRetriever(
    query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
    passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
    document_store=document_store,
)

# Write the documents, compute their embeddings with the retriever, then save
# the store back to 'document_store'.
# NOTE(review): all of `documents` is written on every run, also into a store
# that was just loaded - confirm the store handles repeated documents as intended.
document_store.write_documents(documents)
document_store.update_embeddings(retriever)
document_store.save('document_store')


In [9]:
# Every stored question, one list entry per line (line endings are kept).
path = r"all_questions.txt"
with open(path, 'r') as tfile:
    all_questions = list(tfile)

# Questions answered by earlier runs; only the number of lines is used below.
path = r"answered_questions.txt"
with open(path, 'r') as tfile:
    answered_questions = list(tfile)

# Take at most 10 not-yet-answered questions (the original note describes this
# as a cap of 10 per day). Slicing past the end of the list is safe, so one
# expression covers both "more than 10 left" and "10 or fewer left".
questions = all_questions[len(answered_questions):][:10]

In [10]:
# Question/answer texts produced by this run.
qa = []

for q in questions:

    # Lines read from the question file still carry their newline; use a
    # clean copy for the query, the prompt and the stored text.
    question_text = q.strip()

    if not question_text:
        # Blank line in the question file: nothing to answer. It is still
        # recorded so the line-count based bookkeeping stays aligned.
        answered_questions.append('\n')
        continue

    res = retriever.retrieve(query=question_text , top_k= 2)
    if not res:
        # No document to ground the answer or link to. Stop here (instead of
        # skipping) so later questions are not recorded out of position.
        print(f"No documents retrieved for: {question_text}")
        break

    # Same layout as before for two results, but also valid when only one
    # document comes back.
    context = '\n\n'.join('Title: ' + doc.content for doc in res)


    gpt_prompt = f"""Generate a search engine optimized response to the question: \'{question_text}\'. You need to follow the following guidelines: 
                    1. Follow all Search Engine Optimization guidelines and ensure usage of keywords
                    2. Make sure the text is written in a formal tone and can be understood by both technical as well as non-technical people
                    3. Make sure there is less plagiarism
                    4. You can use the following content pieces to assist your answer: \n {context}"""

    message = [
            {"role": "system", "content": "You are a helpful assistant that specializes in writing search engine optimized answers."},
            {"role": "user", "content": gpt_prompt}
        ]

    # Trim the prompt from the end, one sentence at a time, until prompt plus
    # the 1000 reply tokens fits within 8000 tokens.
    while (num_tokens_from_messages(message, "gpt-4") + 1000) > 8000:
        current = message[1]['content']
        shortened = current.rsplit('. ', 1)[0]
        if len(shortened) == len(current):
            # No '. ' left to split on: cut by length instead so the loop
            # always makes progress and cannot spin forever.
            if not current:
                break
            shortened = current[:len(current) * 9 // 10]
        message[1]['content'] = shortened

    response = completion_with_backoff(model = "gpt-4", messages = message, max_tokens = 1000, temperature = 0.4)


    answer = response['choices'][0]['message']['content']
    # The first paragraph of each saved document holds the post's link.
    answer_link = docx.Document(res[0].meta['path']).paragraphs[0].text
    end = f"""\n\nIf you want to learn more about this, you can visit our blog post: {answer_link}"""
    answer = answer + end

    # Question, blank line, answer.
    final = question_text + '\n\n' + answer

    print(final)
    qa.append(final)
    # Always newline-terminated, so entries stay one per line when written out.
    answered_questions.append(question_text + '\n')

What are some current trends in AI-powered data warehousing and how are they being used by companies to improve data processing and analysis? What are some predictions for future applications of AI in data warehousing and how will they impact businesses?

Current Trends in AI-Powered Data Warehousing and Their Impact on Businesses

Artificial Intelligence (AI) is revolutionizing data warehousing by streamlining processes and improving data processing and analysis. Approximately 44% of companies plan to invest in AI to enhance their data warehousing capabilities. The current trends in AI-powered data warehousing include AI-assisted ETL processes, smart data modeling, automated data cleansing, and continuous data quality monitoring.

1. AI-Assisted ETL Processes: AI-powered ETL tools automate repetitive tasks, optimize performance, and reduce human error. This allows data engineers to focus on higher-level tasks, such as designing data models and creating data visualizations. For example

In [12]:
# Append this run's question/answer texts. Each entry is followed by the
# separator, so the first entry of the next run does not get glued onto the
# last entry of this one.
path = r"qa.txt"
with open(path,'a') as tfile:
    tfile.writelines(entry + '\n\n\n' for entry in qa)

# `answered_questions` already holds the lines read from the file plus the
# questions answered in this run, so the file is rewritten ('w'). Appending the
# whole list would duplicate every earlier entry on each run and corrupt the
# line count used to pick the next questions.
path = r"answered_questions.txt"
with open(path,'w') as tfile:
    # Normalise to exactly one newline per entry so entries never merge.
    tfile.writelines(line.rstrip('\n') + '\n' for line in answered_questions)