# Chapter 2 - Survey of existing data sources

In this chapter I'll attempt to:
- Find websites containing data based on example subheadings
- Search websites for downloadable data
- Iterate through a list of links checking if they contain a csv etc
- Download data

In [1]:
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.llms import OpenAI
import openai
import json
import requests
import re
from bs4 import BeautifulSoup

In [2]:
import os
os.environ['OPENAI_API_KEY'] = YOUR_API_KEY
os.environ['SERPAPI_API_KEY'] = YOUR_API_KEY

## General function to prompt openai api:

In [3]:
def query_openai(prompt):
    openai.api_key = os.environ['OPENAI_API_KEY']

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are an expert data analyst who excels at finding public data sources that are helpful for analysis."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    return response.choices[0].message['content']

## Create a langchain agent to search the web using gpt and serpapi:

In [4]:
llm = OpenAI(temperature=0)

In [5]:
tools = load_tools(["serpapi", "llm-math"], llm=llm)

In [6]:
agent = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True)

## Use the langchain agent to search for suitable links:

In [7]:
industry = "musical instrument manufacturing"

prompt = f"""You are an expert data analyst
            who excels at finding public data sources
            that are helpful for analysis.
            You are helping a company
            in the {industry} industry.

            I want data about other companies in the industry,
            containing information such as their size (number of employees, revenue, etc.), 
            location, product range, pricing, and market share.

            List some sources where I can download CSV files 
            with this data about other companies in the {industry} industry.

            Give the result in the following json format:
            [
                {{
                    "source": 1,
                    "name": "...",
                    "url": "..."
                }},
                {{
                    "source": 2,
                    "name": "...",
                    "url": "..."
                }},
                {{
                    "source": 3,
                    "name": "...",
                    "url": "..."
                }}
            ]
            """ 

In [8]:
response = agent.run(prompt)


,
,[1m> Entering new  chain...[0m
,[32;1m[1;3m I need to find sources of data about other companies in the musical instrument manufacturing industry.
,Action: Search
,Action Input: "musical instrument manufacturing industry data sources"[0m
,Observation: [36;1m[1;3mMusical Instrument Mfg. - Free online guide to industry research and analysis including industry statistics, trends, and forecasts, financial ratios, ...[0m
,Thought:[32;1m[1;3m I should look at the results of this search to find sources of data.
,Action: Search
,Action Input: "musical instrument manufacturing industry data sources csv"[0m
,Observation: [36;1m[1;3mThe table below shows the industry code and title relationships applicable to NAICS coded QCEW data, and is available as an Excel workbook (.[0m
,Thought:[32;1m[1;3m I should look at the results of this search to find sources of data in CSV format.
,Final Answer: [
,    {
,        "source": 1,
,        "name": "U.S. Bureau of Labor Statistics",
,  

In [9]:
data_sources = json.loads(response)

## Using Selenium to automate downloads:

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [11]:
driver = webdriver.Chrome()

In [12]:
download_path = "H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter2DownloadedCSVs"

In [17]:
def download_csv_files_from_source(selenium_data_sources, driver, download_path):
    for source in selenium_data_sources:
        driver.get(source["url"])
        try:
            # Try to find CSV files by their visible text
            csv_links = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.PARTIAL_LINK_TEXT, ".csv"))
            )
        except:
            # If none found, try to find CSV files by their href attribute
            try:
                csv_links = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, ".csv")]'))
                )
            except:
                csv_links = []

        # If we found any CSV links, print out url then download file
        if csv_links:
            for link in csv_links:

                url = link.get_attribute('href')
                print(f"CSV file found at: {url}")

                response = requests.get(url)
                # sanitize filename to ensure it's valid
                filename = re.sub(r'[\\/*?:"<>|]', "", url.split("/")[-1])
                filename = os.path.join(download_path, filename)

                with open(filename, 'wb') as f:
                    f.write(response.content)

                print(f"Downloaded {filename}")

        else:
            print(f"No CSV file found at: {source['url']}")

In [18]:
download_csv_files_from_source(data_sources, driver, download_path)

No CSV file found at: https://www.bls.gov/cew/cewedr/data_tables.htm
,No CSV file found at: https://www.statista.com/statistics/industry/
,No CSV file found at: https://www.ibisworld.com/industry-trends/market-research-reports/manufacturing/musical-instruments-manufacturing.html


In [19]:
driver.quit()

Example webpage with a CSV in the href of anchor tag:

In [20]:
example_source = [
    {'source': 1, 'name': 'UK Trade Info - Willy Banjos', 'url': 'https://www.uktradeinfo.com/traders/willy-banjo-limited-249317?senderQueryString=commodities%3D34%26p%3D175'},
]

In [21]:
driver = webdriver.Chrome()

In [22]:
download_csv_files_from_source(example_source, driver, download_path)

CSV file found at: https://www.uktradeinfo.com/umbraco/api/searchdownload/trade?traderid=249317&filename=WILLY+BANJO+LIMITED%20trade.csv
,Downloaded H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter2DownloadedCSVs\tradetraderid=249317&filename=WILLY+BANJO+LIMITED%20trade.csv
