# Data Acquisition

In this lab, we will learn how to acquire data from Hugging Face datasets and how to bring it into the right format for our models. Fill out the missing pieces in the source code to get everything working (indicated by `#FIXME`).

In [None]:
from datasets import load_dataset
import pandas as pd

We are going to work with the wikiqa dataset. It is a question answering dataset, where the task is to predict whether a given question can be answered by a given sentence from a Wikipedia article. The dataset is available in the [Hugging Face datasets library](https://huggingface.co/datasets/wiki_qa).

In [None]:
# Load the WikiQA dataset; its "train", "test" and "validation" splits are accessed by name below
wikiqa = load_dataset("wiki_qa")

In [None]:
def to_dataframe(dataset):
    """Return one dataset split as a pandas DataFrame.

    The split is converted through its own ``to_pandas()`` method; no column
    selection or renaming is applied here.
    """
    frame = dataset.to_pandas()
    return frame

# Convert every split with to_dataframe and stack them in train, test, validation order
wikiqa_df = pd.concat(
    [to_dataframe(wikiqa[split_name]) for split_name in ("train", "test", "validation")]
)

In [None]:
# Build the "text" column: title, question and answer, each separated by a newline
title_and_question = wikiqa_df["document_title"] + "\n" + wikiqa_df["question"]
wikiqa_df["text"] = title_and_question + "\n" + wikiqa_df["answer"]

In [None]:
# Show the distinct document titles present in the combined dataframe
list(wikiqa_df['document_title'].unique())

In [None]:
# Inspect the combined "text" column that is sent to the embedding model
wikiqa_df['text']

In [None]:
from azure.identity import DefaultAzureCredential
import openai
import os
from dotenv import load_dotenv

# Read settings from a .env file (via python-dotenv) into the process environment
load_dotenv()

# Configure the openai client for an Azure OpenAI resource.
# Endpoint and key come from the environment so no secret lives in the notebook.
openai.api_type = "azure"
openai.api_version = "2022-12-01"
openai.api_base = os.getenv("AOAI_ENDPOINT")
openai.api_key = os.getenv("AOAI_KEY")

In [None]:
# Request embeddings for every combined text from the "text-embedding-ada-002" deployment.
# The embeddings endpoint receives its texts through `input`; `prompt` belongs to the
# completions endpoint and is not a valid embeddings parameter.
# NOTE(review): the whole column is sent in a single request. The service limits how many
# inputs one request may carry, so this likely needs batching -- confirm against the
# limits of your deployment.
response = openai.Embedding.create(engine="text-embedding-ada-002",
                                   input=wikiqa_df['text'].to_list())

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import json

def getdata(url, timeout=30):
    """Download a page and return its raw body.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            host, which would stall the whole crawl.

    Returns:
        The undecoded response body (``response.content``). HTTP error
        statuses are not raised; the body of the error page is returned.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    return response.content

# Registry of relative links that get_links has already recorded. It lives at module
# level so it is shared across calls; only the keys matter (values are always None).
dict_href_links = {}

def _resolve_relative_href(page_url, href):
    """Turn a relative href found on ``page_url`` into an absolute URL.

    Returns ``None`` for hrefs that are not relative page links: empty values,
    pure ``#fragment`` anchors, anything carrying a scheme (``https:``,
    ``mailto:``, ``javascript:`` ...) and protocol-relative ``//host/...`` links.
    """
    from urllib.parse import urldefrag, urljoin, urlparse

    if not href or href.startswith("#"):
        return None
    try:
        parts = urlparse(href)
    except ValueError:
        # Malformed href (e.g. unbalanced IPv6 brackets): nothing to crawl
        return None
    if parts.scheme or parts.netloc:
        return None
    # urljoin handles both "/root/relative" and "document/relative" hrefs;
    # the fragment is dropped so "page#a" and "page#b" count as one page.
    absolute, _fragment = urldefrag(urljoin(page_url, href))
    return absolute


def get_links(website_link):
    """Collect the crawlable links found on one page.

    Two kinds of links are kept:

    * absolute hrefs that start with ``website_link`` (kept unchanged), and
    * relative hrefs, resolved against ``website_link``; each resolved URL is
      recorded in the module-level ``dict_href_links`` and reported only the
      first time it is seen across all calls.

    Absolute links to other locations, ``mailto:``/``javascript:`` links and
    pure ``#fragment`` anchors are ignored.

    NOTE(review): root-relative hrefs resolve against the host, so the crawl is
    not confined to the start URL's path -- confirm this scope is intended.

    Args:
        website_link: URL of the page to download and scan.

    Returns:
        dict mapping every collected link to the status ``"Not-checked"``.
    """
    html_data = getdata(website_link)
    soup = BeautifulSoup(html_data, "html.parser")
    page_url = str(website_link)
    list_links = []
    for link in soup.find_all("a", href=True):
        href = str(link["href"]).strip()

        # Absolute link below the crawled page: keep it as it is
        if href.startswith(page_url):
            list_links.append(href)
            continue

        # Everything else is only of interest when it is a relative page link
        resolved = _resolve_relative_href(page_url, href)
        if resolved is None or resolved in dict_href_links:
            continue
        dict_href_links[resolved] = None
        list_links.append(resolved)

    # Keys are the links, values start out as "Not-checked" (duplicates collapse)
    return dict.fromkeys(list_links, "Not-checked")

def get_subpage_links(l):
    """Crawl every link still marked ``"Not-checked"`` and merge what it finds.

    The caller's dictionary is left untouched; all updates go into a copy.

    Args:
        l: dict mapping link -> status (``"Not-checked"`` or ``"Checked"``).

    Returns:
        A new dict containing every link of ``l`` (crawled ones now
        ``"Checked"``) plus the links discovered on the crawled pages. Links
        that were already known keep their existing status.
    """
    links = dict(l)
    for link in tqdm(l):
        if links[link] != "Not-checked":
            continue
        discovered = get_links(link)
        links[link] = "Checked"
        # Known entries come last so their status wins over a fresh "Not-checked"
        links = {**discovered, **links}
    return links

In [None]:
# add website WITH slash on end
website = "https://learn.microsoft.com/en-us/azure/machine-learning/"
# create dictionary of website
dict_links = {website:"Not-checked"}

# counter = number of links still "Not-checked" (None forces the first pass);
# counter2 = number of passes made so far
counter, counter2 = None, 0
while counter != 0:
    counter2 += 1
    dict_links2 = get_subpage_links(dict_links)
    # Count number of non-values and set counter to 0 if there are no values within the dictionary equal to the string "Not-checked"
    # https://stackoverflow.com/questions/48371856/count-the-number-of-occurrences-of-a-certain-value-in-a-dictionary-in-python
    counter = sum(value == "Not-checked" for value in dict_links2.values())
    # Print some statements
    print("")
    print("THIS IS LOOP ITERATION NUMBER", counter2)
    print("LENGTH OF DICTIONARY WITH LINKS =", len(dict_links2))
    print("NUMBER OF 'Not-checked' LINKS = ", counter)
    print("")
    dict_links = dict_links2
    # Save the current state after every pass; the context manager closes the
    # file even when json.dump raises
    with open("data.json", "w") as a_file:
        json.dump(dict_links, a_file)

In [None]:
# Iterating a dict yields its keys, i.e. every link collected by the crawl
list_links = list(dict_links)
list_links

In [None]:
# Fetch the documentation landing page and print every href found on it.
# URL, page and soup stay defined at notebook level; scrape_page further down reads URL.
URL = "https://learn.microsoft.com/en-us/azure/machine-learning/"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
for a_href in soup.find_all("a", href=True):
    print(a_href["href"])

In [None]:
# Display the parsed landing page to inspect its structure
soup

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_page(url, base_url=None, timeout=30):
    """Return the links on a page that start with a base URL.

    NOTE(review): despite the name, no page text is extracted -- the result is
    a list of hrefs. Callers describe the result as "text"; confirm whether
    text extraction was intended.

    Args:
        url: Page to download.
        base_url: Prefix a link must start with to be kept. Defaults to the
            notebook-level ``URL`` constant.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive host cannot stall the run.

    Returns:
        list of href strings, in document order, that start with ``base_url``.
    """
    if base_url is None:
        base_url = URL
    # Get the page
    page = requests.get(url, timeout=timeout)
    # Create a soup object
    soup = BeautifulSoup(page.content, 'html.parser')
    # Keep only the hrefs below the base url; "anchor" avoids shadowing the url argument
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith(base_url)
    ]

def scrape_pages(urls):
    """Run ``scrape_page`` on every URL and collect the results.

    Args:
        urls: Iterable of page URLs.

    Returns:
        list with one entry per URL, in input order, holding whatever
        ``scrape_page`` returned for that URL.
    """
    return [scrape_page(page_url) for page_url in urls]

def scrape_pages_to_json(urls, file_name):
    """Scrape a list of pages and store the results as a JSON file.

    The per-page results from ``scrape_pages`` are loaded into a DataFrame
    (one row per URL) and written with ``DataFrame.to_json``.

    Args:
        urls: Iterable of page URLs handed to ``scrape_pages``.
        file_name: Path of the JSON file to write.
    """
    scraped = scrape_pages(urls)
    pd.DataFrame(scraped).to_json(file_name)

# Scrape every crawled link and store the result.
# NOTE(review): this writes to the same 'data.json' that the crawl loop uses for its
# link dictionary, so that file is replaced -- confirm this is intended.
scrape_pages_to_json(list_links, 'data.json')