# Generating Company Summaries using Generative Artificial Intelligence
---
This notebook shows how you can use Web Search-Augmented Generation to quickly, cheaply, and accurately generate a short summary of a company, given its website.

## Parameters

In [None]:
# These are used by papermill; see generate-multiple-summaries.py
# company_URL: page from which the company name is inferred and which scopes
# the site-restricted web searches further down.
company_URL = "https://www.agios.com/"
# enable_test_harness: when False, test_harness() returns None without running.
enable_test_harness = False # disable test harness when generating summaries

To access the Bedrock API from this notebook the notebook's execution role must have a policy to allow Bedrock access. To find this notebook's execution role run the following code in this notebook:
```
print(sagemaker.get_execution_role())
```
and then go to the IAM console and add the policy AmazonBedrockFullAccess. You can similarly provide access to S3 and to Textract.

After signing up for an account at `serpapi.com`, enter your API key here:

In [None]:
%env SERPAPI_API_KEY=<...>

In [None]:
# S3 bucket name handed to the utilities download helpers
# (u.download_web_page_as_text and u.search_result_downloader).
BUCKET_NAME="sgh-misc"  # change this to an s3 bucket in your account

In [None]:
# Uncomment these the first time you run this notebook:
# %pip install -q requests==2.31.0
# %pip install -q langchain==0.1.9
# %pip install langchain-community==0.0.24
# %pip install -q boto3==1.34.51
# %pip install -q botocore==1.34.51
# %pip install -q python-dateutil==2.8.2
# %pip install -q dateparser==1.2.0
# %pip install -q amazon-textract-caller==0.2.2
# %pip install -q amazon-textract-textractor==1.7.4
# %pip install -q papermill==2.5.0
# %pip install -q jinja2==3.1.3
# %pip install -q beautifulsoup4==4.12.3

In [None]:
import json, os
from typing import Callable, List, Tuple, Iterable, Optional, Dict, Union
import time
from pathlib import Path
from urllib.parse import urlparse
import importlib
import datetime
from functools import lru_cache, partial

import utilities as u

import boto3, botocore
from IPython.display import HTML, display
import jinja2
import dateparser
from langchain.llms import Bedrock
from urllib3.exceptions import MaxRetryError 
import sagemaker

In [None]:
importlib.reload(u) # You can use this during development to avoid restarting the kernel

In [None]:
# Wall-clock start; the elapsed minutes are reported in the generated summary page.
start_time = time.time()

In [None]:
# Shared settings and clients used by the cells below.
# num_threads is forwarded to u.WSAG through the WSAG partial defined later.
num_threads = 10
# Jinja environment used to render every HTML fragment and the final page.
jenv = jinja2.Environment(trim_blocks=True, lstrip_blocks=True)
# AWS clients (created with boto3's default configuration; no arguments are passed).
bedrock_runtime = boto3.client("bedrock-runtime")
# NOTE(review): `bedrock` is not referenced anywhere else in this notebook — confirm it is still needed.
bedrock = boto3.client("bedrock")
s3 = boto3.client("s3")
textract = boto3.client('textract')
# model_id = "anthropic.claude-3-haiku-20240307-v1:0"
model_id="anthropic.claude-3-sonnet-20240229-v1:0"
temperature=0.1
# run_model(prompt) is what every mapper calls; its return value is passed to
# u.extract_multiple_tags.
run_model = u.create_bedrock_runner(bedrock_runtime, model_id, temperature)

In [None]:
# HTTP headers passed to the download helpers (as `headers=`); they identify the
# client as curl.
curl_headers = {
    "User-Agent": "curl/8.1.2",
    "Accept": "*/*"
}
# Companies iterated by test_harness() when enable_test_harness is True.
company_URLs = [
        "https://www.agios.com",
        "https://www.morphosys.com/en",
        "https://incyte.com",
        "https://www.protagonist-inc.com",
        "https://kerostx.com",
        "https://vaderis.com",
        "https://hemavant.com",
        "https://www.intelliatx.com",
        "https://beamtx.com",
        "https://www.capstantx.com",
        "https://renagadetx.com",
        "https://arbor.bio"
]

In [None]:
def company_name_from_URL_as_html(_company_name: str, company_URL: str ) -> str:
    """
    Test-harness adapter around `company_name_from_URL`.

    The harness calls generators as (company_name, company_URL); the name
    argument is accepted only to match that shape and is discarded.
    """
    inferred_name = company_name_from_URL(company_URL)
    return inferred_name

In [None]:
@lru_cache
def company_name_from_URL(company_URL: str) -> str:
    """
    Use the LLM to infer the name of the company from its URL.

    The page at `company_URL` is downloaded as text and the model is asked to
    return the company's full name inside <company_name> tags.

    Returns:
        The stripped contents of the <company_name> tag, or the literal
        "unknown company" when the download fails or yields nothing, or when
        the model call / tag extraction raises.

    Note: results are memoised per URL by `lru_cache`, and that includes the
    "unknown company" fallback.
    """
    try:
        text_contents = u.download_web_page_as_text(company_URL, s3, textract,
                                                    BUCKET_NAME, headers=curl_headers)
        if not text_contents:
            return "unknown company"
    except (ConnectionError, MaxRetryError) as ex:
        # The message used to reference an undefined name (`URL`), which turned
        # every handled download failure into a NameError.
        print(f"Failed to download {company_URL}: {ex}")
        return "unknown company"

    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at a biopharmaceutical company who has been tasked
        with determining what the name of a company is. The following is the contents
        of this company's main web page:
         
        <HTML_contents>
        {text_contents}
        </HTML_contents>
        
        Based on this web page you must determine the name of this company and 
        return it in <company_name></company_name> tags. Always return the full 
        name of the company, for example if the name is "Foo Therapeutics" then
        you should return that, not "Foo".
        """)
    try:
        result = run_model(prompt)
        tags = u.extract_multiple_tags(result)
        return tags["company_name"].strip()
    except Exception as ex:
        # Deliberately broad best-effort: a missing tag (KeyError) or any model
        # failure degrades to the fallback name instead of aborting the run.
        print(ex)
        return "unknown company"

In [None]:
# Infer the company name once; it is bound into the WSAG partial and used for
# every section of the summary below.
company_name = company_name_from_URL(company_URL)
company_name

In [None]:
# Pre-bind the arguments that are the same for every search so each section only
# has to supply its query, mapper and (optionally) reducer/filter.
# Note: this binds the company_name / company_URL of the papermill parameters;
# the per-call company arguments of the generator functions only shape the
# query and the mapper.
WSAG = partial(u.WSAG,
               model_runner=run_model,
               company_name=company_name,
               company_url=company_URL,
               num_threads=num_threads)

# Downloader with the S3/Textract clients, bucket and HTTP headers pre-bound.
search_result_downloader = partial(
    u.search_result_downloader,
    bucket_name=BUCKET_NAME,
    headers=curl_headers,
    s3_client=s3,
    textract_client=textract)

## Generate the Company Summary

## Is it a publicly-traded company?

In [None]:
def publicly_traded_mapper(search_result: dict,
                           text_contents: str,
                           company_name: str) -> Tuple[str, str]:
    """
    Ask the LLM whether one downloaded page says `company_name` is publicly traded.

    Args:
        search_result: the web-search hit this page came from (unused here).
        text_contents: text of the downloaded page; falsy means nothing to analyse.
        company_name: company the question is about.

    Returns:
        A `(result, ticker)` pair of strings. `result` is the stripped contents
        of the <result> tag and `ticker` the stripped contents of the <ticker>
        tag; either is "unknown" when the tag is missing or says "unknown", when
        there is no page text, or when the model call / tag extraction raises
        ValueError.
    """
    if text_contents:
        prompt = u.strip_multiline_whitespace(f"""\
            You are a researcher who has been tasked
            with determining whether {company_name} is publicly-traded.
            
            You have access to the following web page:
             
            <HTML_contents>
            {text_contents}
            </HTML_contents>
            
            Based on this web page, return either "True" or "False" if you think this company
            is or isn't publicly traded. If you don't know return "unknown". You must wrap your
            result in <result></result> tags.
            
            Also, if this company IS publicly-traded then return its stock ticker in
            <ticker></ticker> tags.
            """)
        try:
            result = run_model(prompt)
            tags = u.extract_multiple_tags(result)
        except ValueError as ex:
            # Same best-effort handling as the other mappers in this notebook:
            # one unparseable page must not abort the whole search.
            print(ex)
            return "unknown", "unknown"
        if "result" not in tags or tags["result"] == "unknown":
            result = "unknown"
        else:
            result = tags["result"].strip()
        if "ticker" not in tags or tags["ticker"] == "unknown":
            ticker = "unknown"
        else:
            ticker = tags["ticker"].strip()
        return result, ticker
    else:
        return "unknown", "unknown"


def is_publicly_traded(company_name: str,
                       company_url: str) -> Tuple[bool, str]:
    """
    Decide by majority vote over web-search hits whether the company is public.

    Each hit is analysed by `publicly_traded_mapper`; the reducer takes the most
    common answer and the most common ticker across hits.

    Args:
        company_name: company to look up.
        company_url: accepted for signature parity with the other generators;
            not used in the query.

    Returns:
        `(is_public, ticker)`. `is_public` is a real bool: True only when the
        winning answer is "True" (case-insensitive); "False" and "unknown" both
        give False. With no map results the reducer returns `(False, None)`.
    """

    def reducer(map_results: Iterable[Union[str, dict]]) -> Tuple[bool, Optional[str]]:
        map_results = list(map_results)
        if not map_results:
            return False, None
        is_publics, tickers = zip(*map_results)
        is_public = u.most_common_elem(is_publics)
        ticker = u.most_common_elem(tickers)
        # The mapper yields the *strings* "True"/"False"/"unknown". Returning the
        # string "False" unchanged made `{% if is_public %}` in the page template
        # truthy, so convert to an actual bool here.
        return str(is_public).strip().lower() == "true", ticker

    return WSAG(
        web_search_query=f"Is {company_name} publicly-traded?",
        num_web_search_results=5, web_search_timeout=20,
        search_result_downloader=search_result_downloader,
        search_result_mapper=partial(publicly_traded_mapper,
                                     company_name=company_name),
        reducer=reducer)

In [None]:
# is_publicly_traded("Amgen", "https://amgen.com")
# is_publicly_traded("protagonist therapeutics", "https://www.protagonist-inc.com")
# is_publicly_traded("relay therapeutics", "https://www.relaytx.com")

## What is the Market Opportunity?

In [None]:
def market_opportunity_mapper(search_result: dict,
                              text_contents: str,
                              company_name: str) -> str:
    """
    Ask the LLM to summarise, from one page, the market `company_name` operates in.

    Args:
        search_result: the web-search hit this page came from (unused here).
        text_contents: text of the downloaded page; falsy means nothing to analyse.
        company_name: company the question is about.

    Returns:
        The stripped contents of the <result> tag, or "unknown" when the tag is
        missing or says "unknown", when there is no page text, or when the model
        call / tag extraction raises ValueError.
    """
    if text_contents:
        prompt = u.strip_multiline_whitespace(f"""\
            You are a researcher who has been tasked
            with determining the size and growth rate of the market that
            {company_name} operates in.
            
            You have access to the following web page:
             
            <HTML_contents>
            {text_contents}
            </HTML_contents>
            
            Based on this web page, summarize the market opportunity for {company_name}
            and return your result in <result></result> tags. If, based on this web page, 
            you don't know what the market opportunity is then you must return
            <result>unknown</result>.
            """)
        try:
            result = run_model(prompt)
            tags = u.extract_multiple_tags(result)
        except ValueError as ex:
            # Match the other mappers: a page the model cannot handle is
            # reported as "unknown" instead of aborting the search.
            print(ex)
            return "unknown"
        if "result" not in tags or tags["result"] == "unknown":
            result = "unknown"
        else:
            result = tags["result"].strip()
        return result
    else:
        return "unknown"


def market_opportunity(company_name: str, company_url: str) -> bool:
    """
    Run a 10-hit web search about the company's market and map every hit with
    `market_opportunity_mapper`.

    `company_url` is accepted but not used in the query. No reducer is passed,
    so the value returned is whatever WSAG produces by default.
    NOTE(review): the `bool` return annotation looks inaccurate — confirm
    against u.WSAG.
    """
    query = f"What is the size and growth rate of the market that {company_name} operates in?"
    mapper = partial(market_opportunity_mapper, company_name=company_name)
    return WSAG(
        web_search_query=query,
        num_web_search_results=10,
        web_search_timeout=20,
        search_result_downloader=search_result_downloader,
        search_result_mapper=mapper)

In [None]:
# market_opportunity("relay therapeutics", "https://www.relaytx.com")

## What does the company do, what are its top products?

In [None]:
def top_products_mapper(search_result: dict,
                        text_contents: str,
                        company_name: str) -> Optional[str]:
    """
    Ask the LLM to summarise, from one page, the top products of `company_name`.

    Args:
        search_result: the web-search hit this page came from (unused here).
        text_contents: text of the downloaded page.
        company_name: company the question is about.

    Returns:
        The stripped contents of the <result> tag, or None when the model
        answers <no_result>, omits <result>, answers "I don't know", or when the
        model call / tag extraction raises ValueError.
    """
    # Fix: the instruction used to read "put it in n<result></result> tags"; the
    # stray "n" (a mangled newline escape) has been removed so the model is
    # asked for the tag the parser below actually looks for.
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at who has been tasked with determining what the
        top products of {company_name} are.
        
        You have access to the following web page:
         
        <HTML_contents>
        {text_contents}
        </HTML_contents>
        
        If this web page contains information about what the top products of
        {company_name} then extract and summarize that information in 5-6
        sentences and put it in <result></result> tags.
        
        If this web page doesn't contain useful information about what the top
        products of {company_name} then you MUST say <no_result></no_result>.
        """)
    try:
        result = run_model(prompt)
        tags = u.extract_multiple_tags(result)
        if "no_result" in tags:
            result = None
        elif "result" not in tags:
            result = None
        elif "i don't know" in tags["result"].strip().lower():
            result = None
        else:
            result = tags["result"].strip()
    except ValueError as ex:
        print(ex)
        result = None
    return result

In [None]:
@u.test_wrapper(globals())
def generate_top_products_html(company_name: str, company_url: str) -> str:
    """
    Search the company's own site (5 hits) for its top products and map each
    hit with `top_products_mapper`; returns whatever WSAG produces.
    """
    site = u.extract_site_from_URL(company_url)
    query = f"What are the top products of {company_name}? site:{site}"
    mapper = partial(top_products_mapper, company_name=company_name)
    return WSAG(
        web_search_query=query,
        num_web_search_results=5,
        web_search_timeout=20,
        search_result_downloader=search_result_downloader,
        search_result_mapper=mapper)

In [None]:
# Check the generator above on a single company. The test_* entry point is
# presumably registered by u.test_wrapper(globals()) — confirm in utilities.
test_generate_top_products_html("Incyte", "https://incyte.com")

In [None]:
def what_they_do_mapper(search_result: dict,
                        text_contents: str,
                        company_name: str) -> str:
    """
    Summarise, from a single page of the company's site, what the company does.

    Returns the stripped contents of the model's <result> tag, or None when the
    tag is missing, the answer contains "I don't know" (case-insensitive), or
    the model call / tag extraction raises ValueError.
    """
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher who has been tasked with determining what
        {company_name} does.
        
        You have access to the following web page from {company_name}'s web site:
         
        <HTML_contents>
        {text_contents}
        </HTML_contents>
        
        If this web page contains information about what {company_name} does
        then extract and summarize that information in 5-6 sentences and put it in
        <result></result> tags.
        
        If this web page doesn't contain useful information about what {company_name}
        does then you MUST say <result>I don't know</result>.
        """)
    try:
        tags = u.extract_multiple_tags(run_model(prompt))
        summary = tags["result"].strip() if "result" in tags else None
        # An explicit "I don't know" counts as no answer.
        if summary is not None and "i don't know" in summary.lower():
            summary = None
    except ValueError as ex:
        print(ex)
        summary = None
    return summary

In [None]:
@u.test_wrapper(globals())
def generate_what_they_do_html(company_name: str, company_url: str) -> str:
    """
    Search the company's own site (5 hits) for a description of what it does and
    map each hit with `what_they_do_mapper`; returns whatever WSAG produces.
    """
    site = u.extract_site_from_URL(company_url)
    query = f"What does {company_name} do? site:{site}"
    mapper = partial(what_they_do_mapper, company_name=company_name)
    return WSAG(
        web_search_query=query,
        num_web_search_results=5,
        web_search_timeout=20,
        search_result_mapper=mapper,
        search_result_downloader=search_result_downloader)

In [None]:
# Check the generator above on a single company. The test_* entry point is
# presumably registered by u.test_wrapper(globals()) — confirm in utilities.
test_generate_what_they_do_html("Incyte", "https://incyte.com")

In [None]:
importlib.reload(u)

@u.test_wrapper(globals())
def generate_company_summary_html(company_name: str, company_url: str) -> str:
    """
    Build the two-part company overview as one HTML string.

    The first part comes from `generate_what_they_do_html`, the second from
    `generate_top_products_html`; they are joined by a line holding a `<p>` tag.
    """
    overview_html = generate_what_they_do_html(company_name, company_url)
    products_html = generate_top_products_html(company_name, company_url)
    return "{}\n<p>\n{}".format(overview_html, products_html)

In [None]:
# Check the combined summary on a single company. The test_* entry point is
# presumably registered by u.test_wrapper(globals()) — confirm in utilities.
test_generate_company_summary_html("Incyte", "https://incyte.com")

## Leadership

In [None]:
def leadership_mapper(search_result: dict,
                      job_role: str,
                      company_name: str,
                      text_contents: str) -> str:
    """
    Ask the LLM who holds `job_role` at `company_name`, based on one web page.

    Returns whatever `u.extract_multiple_tags` extracts from the model's answer
    (the prompt asks for <name>, <title> and <bio> tags), or None when the model
    call / tag extraction raises ValueError.
    """
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at a company who has been tasked
        with finding out who the {job_role} is at {company_name}. You have
        access to the following web page from {company_name}'s web site:
         
        <HTML_contents>
        {text_contents}
        </HTML_contents>
        
        Extract all information about the current {job_role} that you can find on this web page.
        
        If you do find information about the {job_role} then return their name in <name></name>
        tags, their title in <title></title> tags, and return a 3-5 sentence bio in <bio></bio>
        tags. Return only information about THIS role/title, do not return anything about other roles.
        
        You must not make anything up, if you don't know something then leave it out.
        """)
    try:
        model_answer = run_model(prompt)
        extracted = u.extract_multiple_tags(model_answer)
    except ValueError as ex:
        print(ex)
        extracted = None
    return extracted

In [None]:
# Roles searched for, one web search per entry, by generate_leadership_team_html.
# Each string is used verbatim in the search query and in the LLM prompt.
leadership_roles = [
    "Chief Executive Officer",
    "Chief Financial Officer",
    "Chief Science Officer",
    "Chief Medical Officer",
    "Chief Operating Officer"
]

@u.test_wrapper(globals())
def generate_leadership_team_html(company_name: str, company_url: str) -> str:
    """
    Build an HTML bullet list describing the company's key leaders.

    One search (3 hits, FIRST_HIT mode) is run per entry in `leadership_roles`.
    Only hits whose host passes `u.same_TLD` against the company's site are
    downloaded and given to `leadership_mapper`. Roles that produced no name are
    dropped and every remaining value is HTML-escaped before rendering.
    """
    print(f"++++++ generate_leadership_team_html {company_name} {company_url}")

    def search_result_filter(idx: int, search_result: dict,
                             search_result_downloader: Callable[[dict, dict], str] = None
                       ) -> Tuple[bool, Optional[str]]:
        """ Accept a hit only if it is on the company's site and downloads to something. """
        hit_URL = search_result["link"]
        hit_host = urlparse(hit_URL).netloc
        if not u.same_TLD(hit_host, u.extract_site_from_URL(company_url)):
            print(f"Ignoring hit #{idx} {hit_URL}: wrong TLD")
            return False, None
        content = search_result_downloader(search_result=search_result,
                                           cached_contents=None)
        if content is None:
            print(f"Ignoring {hit_URL} due to empty content")
            return False, None
        return True, content

    def evaluator(search_result: dict, mapper_result: str):
        """ Truthy when the mapper found both a name and a title; the search then stops. """
        return mapper_result and \
               mapper_result.get("name") and \
               mapper_result.get("title")

    def find_leader(role: str):
        """ Run the search for a single role and return the WSAG result. """
        print(f"\nRole {role}")
        return WSAG(
            web_search_query=f"{role} {company_name}",
            num_web_search_results=3, web_search_timeout=20,
            mode = u.Mode.FIRST_HIT,
            search_result_downloader=search_result_downloader,
            search_result_mapper=partial(leadership_mapper,
                                         company_name=company_name,
                                         job_role=role),
            search_result_evaluator=evaluator,
            search_result_filter=partial(search_result_filter,
                                         search_result_downloader=search_result_downloader))

    candidates = [find_leader(role) for role in leadership_roles]
    # Keep only results that carry a non-blank name.
    leaders = [c for c in candidates if c and c.get("name") and c["name"].strip()]
    for leader in leaders:
        # Escape in place so the rendered fragment cannot be broken by LLM text.
        leader.update({field: u.escape_HTML_text(value)
                       for field, value in leader.items()})
    template = u.strip_multiline_whitespace("""\
        <ul>
          {% for leader in leaders %}
              <li> {{leader.name}}, {{leader.title}}. {{leader.bio}}
          {% endfor %}
        </ul>
        """)
    return jenv.from_string(template).render(leaders=leaders)

In [None]:
# Check the leadership section on a single company. The test_* entry point is
# presumably registered by u.test_wrapper(globals()) — confirm in utilities.
test_generate_leadership_team_html("Beam Therapeutics", "https://beamtx.com")

## Press Releases

The approach here is two-fold:

1. Use a web search to find individual pages on the company's web site where each page contains one press release. Then we use an LLM to verify that each page actually contains a press release and to extract metadata. This approach fails if the press releases are stored on a 3rd-party web site.

2. Use a web search to find press releases about this company on prnewswire.com. Verify and extract metadata as above. This only works if the press releases are hosted by prnewswire.

We combine these by first trying approach #1. If we get zero results then we fall back to approach #2.

In [None]:
def external_PR_mapper(search_result: dict,
                       text_contents: str,
                       company_name: str) -> dict:
    """
    Classify one externally hosted page as a press release and extract metadata.

    The model is asked for <contains_PR>, <authored_by>, <publication_date>,
    <is_financial>, <title> and <abstract> tags.

    Args:
        search_result: the web-search hit; its "link" is recorded as the URL.
        text_contents: text of the downloaded page.
        company_name: company whose press releases are wanted.

    Returns:
        The dict of extracted tags plus a "URL" entry, or `{}` when the model
        call / tag extraction raises ValueError. An empty tag dict is returned
        unchanged so the reducer can still discard it.
    """
    # Fixes in the prompt: the "otherwise" branch of the is_financial question
    # used to say True as well (so the model could never be told to answer
    # False), and the value count now matches the five bullets.
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at a biopharmaceutical company who has been tasked
        with finding out what press releases have recently published by {company_name}.
        
        You have access to the following web page:
         
        <HTML_contents>
        {text_contents}
        </HTML_contents>
        
        Given this web page I want you to analyze it and compute five values, each in XML tags:
        
         + Does this web page contain a single press release, not a listing of press releases? If so
           return <contains_PR>True</contains_PR> otherwise return <contains_PR>False</contains_PR>.
         + Is the press release on this page authored by {company_name}? Return 
           <authored_by>True/False</authored_by> accordingly.
         + If this page does contain a single press release then when was it published? Return your
           result in <publication_date></publication_date> tags.
         + If this page does contain a single press release then does it describe financial information
           like quarterly results? If so return <is_financial>True</is_financial>, otherwise
           return <is_financial>False</is_financial>.
         + If this page does contain a single press release then return the title (in <title></title> tags)
           and a 3-5 sentence abstract (in <abstract></abstract> tags).
        """)
    try:
        result = run_model(prompt)
        tags = u.extract_multiple_tags(result)
        if tags and "URL" not in tags:
            # The press-release template links each entry through pr.URL, and
            # nothing else in this notebook sets it (the poster mapper records
            # its URL the same way).
            tags["URL"] = search_result.get("link", "")
        return tags
    except ValueError as ex:
        print(ex)
        return {}

In [None]:
def press_release_reducer(map_results: Iterable[Union[str, dict]],
                          is_in_situ: bool = False) -> str:
    """
    Collect the mapper outputs into a list of press-release dicts.

    Falsy entries (None, empty dicts) are always dropped. Unless `is_in_situ`
    is set, entries are additionally required to carry an `authored_by` value
    equal to "true" (ignoring case and surrounding whitespace).
    """
    kept = [pr for pr in map_results if pr]
    if is_in_situ:
        # Pages from the company's own site are not checked for authorship.
        return kept
    return [pr for pr in kept
            if pr.get("authored_by", "False").strip().lower() == "true"]


@u.test_wrapper(globals())
def generate_list_of_externally_hosted_PRs(company_name: str, company_url: str) -> List[dict]:
    """
    Search prnewswire.com (20 hits) for the company, classify every hit with
    `external_PR_mapper` and reduce with `press_release_reducer`.

    `company_url` is accepted for signature parity and is not used in the query.
    """
    query = f"{company_name} site:prnewswire.com"
    mapper = partial(external_PR_mapper, company_name=company_name)
    return WSAG(
        web_search_query=query,
        num_web_search_results=20,
        web_search_timeout=20,
        search_result_mapper=mapper,
        search_result_downloader=search_result_downloader,
        reducer=press_release_reducer)

In [None]:
# Check the external press-release search on a single company. The test_* entry
# point is presumably registered by u.test_wrapper(globals()) — confirm in utilities.
test_generate_list_of_externally_hosted_PRs("Incyte", "https://incyte.com")

In [None]:
# Reference point and cut-offs used to decide whether a press release is recent.
today = dateparser.parse("now")
one_year = datetime.timedelta(days=365)
two_years = 2 * one_year

def parse_date(s: str) -> datetime.datetime:
    """
    Parse a free-form date string into a timezone-naive datetime.

    Any timezone information is dropped so the result can be compared directly
    with `today`. If parsing fails for any reason, `today` is returned instead
    (a known hack: a malformed date is therefore treated as published now).
    """
    try:
        return dateparser.parse(s).replace(tzinfo=None)
    except Exception:
        return today


def published_recently(pr: dict, cutoff: datetime.timedelta) -> bool:
    """
    Tell whether a press release was published less than `cutoff` ago.

    Args:
        pr: press-release tag dict; its optional "publication_date" entry is a
            free-form date string.
        cutoff: maximum age, measured back from `today`.

    Returns:
        True when the parsed publication date is newer than the cutoff, and also
        when there is no publication date at all (missing key or empty value),
        in which case the release is assumed to be recent.
    """
    # `.get` instead of indexing: the model does not always emit the tag, and a
    # missing key used to raise KeyError instead of taking the "assume recent" path.
    publication_date = pr.get("publication_date")
    if not publication_date:
        return True
    return (today - parse_date(publication_date)) < cutoff


def remove_extraneous_PRs(press_releases: List[dict]) -> List[dict]:
    """
    Keep only genuine, non-financial, recent press releases.

    An entry is kept when its "contains_PR" tag is "true" and its
    "is_financial" tag is not "true" (both compared ignoring case and
    surrounding whitespace). A missing "contains_PR" counts as not a press
    release; a missing "is_financial" counts as non-financial.

    Returns:
        The entries published within the last year; if there are none, the
        entries published within the last two years.
    """
    # `.get` with defaults: the model may omit either tag, and indexing used to
    # raise KeyError for the whole batch.
    press_releases = [pr for pr in press_releases
                       if pr.get("contains_PR", "").strip().lower() == "true" and
                          pr.get("is_financial", "").strip().lower() != "true"]
    # Ideally, limit to those published in the last year, if that produces
    # zero results then relax that to two years
    result = [pr for pr in press_releases if published_recently(pr, cutoff=one_year)]
    return result or [pr for pr in press_releases if published_recently(pr, cutoff=two_years)]

In [None]:
def in_situ_PR_mapper(search_result: dict,
                      text_contents: str,
                      company_name: str) -> dict:
    """
    Classify one page of the company's own site as a press release and extract metadata.

    The model is asked for <contains_PR>, <publication_date>, <is_financial>,
    <title> and <abstract> tags.

    Args:
        search_result: the web-search hit; its "link" is recorded as the URL.
        text_contents: text of the downloaded page.
        company_name: company whose press releases are wanted.

    Returns:
        The dict of extracted tags plus a "URL" entry, or `{}` when the model
        call / tag extraction raises ValueError. An empty tag dict is returned
        unchanged so the reducer can still discard it.
    """
    # Fix in the prompt: the "otherwise" branch of the is_financial question
    # used to say True as well, so the model was never told it may answer False.
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at a biopharmaceutical company who has been tasked
        with finding out what press releases have recently published by {company_name}.
        
        You have access to the following web page from {company_name}'s web site:
         
        <HTML_contents>
        {text_contents}
        </HTML_contents>
        
        Given this web page I want you to analyze it and compute four values, each in XML tags:
        
         + Does this web page contain a single press release, not a listing of press releases? If so
           return <contains_PR>True</contains_PR> otherwise return <contains_PR>False</contains_PR>.
         + If this page does contain a single press release then when was it published? Return your
           result in <publication_date></publication_date> tags.
         + If this page does contain a single press release then does it describe financial information
           like quarterly results? If so return <is_financial>True</is_financial>, otherwise
           return <is_financial>False</is_financial>.
         + If this page does contain a single press release then return the title (in <title></title> tags)
           and a 3-5 sentence abstract (in <abstract></abstract> tags).
        """)
    try:
        result = run_model(prompt)
        tags = u.extract_multiple_tags(result)
        if tags and "URL" not in tags:
            # The press-release template links each entry through pr.URL, and
            # nothing else in this notebook sets it (the poster mapper records
            # its URL the same way).
            tags["URL"] = search_result.get("link", "")
        return tags
    except ValueError as ex:
        print(ex)
        return {}

In [None]:
@u.test_wrapper(globals())
def generate_list_of_in_situ_PRs(company_name: str, company_url: str) -> List[dict]:
    """
    Search the company's own domain (20 hits) for press releases, classify each
    hit with `in_situ_PR_mapper` and reduce with `press_release_reducer` in
    in-situ mode (no authorship check).
    """
    domain = u.extract_TLD_from_URL(company_url)
    mapper = partial(in_situ_PR_mapper, company_name=company_name)
    in_situ_reducer = partial(press_release_reducer, is_in_situ=True)
    return WSAG(
        web_search_query=f"press releases site:{domain}",
        num_web_search_results=20,
        web_search_timeout=20,
        search_result_mapper=mapper,
        search_result_downloader=search_result_downloader,
        reducer=in_situ_reducer)


In [None]:
# Check the in-situ press-release search on a single company. The test_* entry
# point is presumably registered by u.test_wrapper(globals()) — confirm in utilities.
test_generate_list_of_in_situ_PRs("Beam Therapeutics", "https://beamtx.com")

In [None]:
@u.test_wrapper(globals())
def generate_press_releases_html(company_name: str, company_url: str) -> str:
    """
    Render the company's recent press releases as an HTML bullet list.

    Press releases found on the company's own site are preferred; only when
    none survive filtering is the externally hosted search tried. Titles and
    abstracts are HTML-escaped before rendering. When nothing is found a plain
    "No recent press releases were found." message is returned.
    """
    print(f"++++++ generate_press_releases_html {company_name} {company_url}")

    in_situ = generate_list_of_in_situ_PRs(company_name, company_url)
    press_releases = remove_extraneous_PRs(in_situ)
    if not press_releases:
        # Fall back to the externally hosted search.
        external = generate_list_of_externally_hosted_PRs(company_name, company_url)
        press_releases = remove_extraneous_PRs(external)

    escaped_fields = ("title", "abstract")
    for pr in press_releases:
        for field in escaped_fields:
            if field in pr:
                pr[field] = u.escape_HTML_text(pr[field])

    if press_releases:
        template = u.strip_multiline_whitespace("""\
            <ul>
              {% for pr in press_releases %}
                  <li> {{pr.publication_date or ""}} <a href="{{ pr.URL }}">{{ pr.title }}</a><p/>
                       {{ pr.abstract or "" }}
              {% endfor %}
            </ul>
            """)
        return jenv.from_string(template).render(press_releases=press_releases)
    return "No recent press releases were found."

In [None]:
# Check the press-release section on a single company. The test_* entry point is
# presumably registered by u.test_wrapper(globals()) — confirm in utilities.
test_generate_press_releases_html("Beam Therapeutics", "https://beamtx.com")

## Scientific posters

In [None]:
def scientific_poster_mapper(search_result: dict,
                             text_contents: str,
                             company_name: str) -> Optional[dict]:
    """
    Decide whether a downloaded PDF is a scientific poster and extract its metadata.

    Args:
        search_result: the web-search hit; its "link" is recorded as the URL.
        text_contents: text of the PDF, or None when nothing was downloaded.
        company_name: unused here; kept for the common mapper signature.

    Returns:
        A dict with "URL" plus the tags returned by the model (the prompt asks
        for <title>, <citation> and <abstract>), or None when there is no text,
        the model answers <not_a_poster>, no tags are found, or the model call /
        tag extraction raises ValueError.
    """
    if text_contents is not None:
        prompt = u.strip_multiline_whitespace(f"""\
            You are given the following contents of a PDF file:

            <PDF_contents>
            {text_contents}
            </PDF_contents>

            This PDF file may contain a scientific poster presentation.
            
            Your first task is to determine if it _does_ contain a
            scientific poster presentation. If it does not, you should
            return <not_a_poster></not_a_poster>.
            
            If it does contain a scientific poster then your second task
            is to return some metadata about the poster: return the title of
            the poster in <title></title> tags and the entire citation in
            <citation></citation> tags. Also, return a 3-5 sentence summary
            of the key points of the poster in <abstract></abstract> tags.
            """)
        try:
            resp = run_model(prompt)
            tags = u.extract_multiple_tags(resp)
        except ValueError as ex:
            # Match the other mappers: one unparseable PDF is skipped instead of
            # aborting the whole poster search.
            print(ex)
            return None
        if not tags:
            result = None
        elif "not_a_poster" in tags:
            result = None
        else:
            result = {"URL": search_result["link"], **tags}
    else:
        result = None
    return result

In [None]:
@u.test_wrapper(globals())
def generate_posters_html(company_name: str, company_url: str) -> str:
    """
    Get single-page PDFs from the company's web site and then use the LLM
    to parse the title etc out of the PDF.

    The search (20 hits) is restricted to the company's site and the downloader
    to single-page `application/pdf` documents. Posters that come back with a
    URL and a non-empty title are rendered as an HTML bullet list; otherwise a
    plain "No recent posters were found." message is returned.

    Changes from the previous version: an unused nested `search_result_filter`
    (never passed to WSAG) was removed, and the LLM-produced citation/title and
    abstract are now HTML-escaped, as the leadership and press-release sections
    already do.
    """

    def reducer(map_results: Iterable[Union[str, dict]]) -> str:
        posters = filter(None, map_results) # remove Nones
        posters = [p for p in posters if {"URL", "title"} <= p.keys() and p["title"]]
        for p in posters:
            # Prefer the full citation as link text, fall back to the title.
            p["citationOrTitle"] = u.escape_HTML_text(
                p.get("citation", "") or p.get("title", ""))
            if "abstract" in p:
                p["abstract"] = u.escape_HTML_text(p["abstract"])
        if posters:
            template = """\
             <ul>
              {% for poster in posters %}
                  <li> <a href="{{ poster.URL }}">{{ poster.citationOrTitle }}</a>
                     <p/>{{poster.abstract}}
              {% endfor %}
              </ul>
            """
            html = jenv.from_string(template).render(posters=posters)
        else:
            html = "No recent posters were found."
        return html

    return WSAG(
        web_search_query=f"poster presentation site:{u.extract_site_from_URL(company_url)}",
        num_web_search_results=20, web_search_timeout=20,
        search_result_mapper=partial(scientific_poster_mapper,
                                     company_name=company_name),
        search_result_downloader=partial(search_result_downloader,
                                         allowable_mime_types = ["application/pdf"],
                                         allow_only_single_page_PDFs=True),
        reducer=reducer)

In [None]:
# Check the posters section on a single company. The test_* entry point is
# presumably registered by u.test_wrapper(globals()) — confirm in utilities.
test_generate_posters_html("Incyte", "https://incyte.com")

## 10K report

In [None]:
def tenk_report_html(company_name: str) -> str:
    """
    Return an HTML string that describes this company's 10K report.

    Runs a web search restricted to sec.gov and links to the first organic hit.

    Returns:
        An HTML fragment linking to the first result, or the sentence
        "<company_name> does not have a 10K report" when the search response
        has no usable first result.
    """
    print(f"+++ 10k_report_html {company_name}")
    response_json = u.google_search(f"10K report {company_name} site:sec.gov")
    try:
        tenk_URL = response_json["organic_results"][0]["link"]
    except (KeyError, IndexError, TypeError) as ex:
        # KeyError: "organic_results"/"link" missing; IndexError: the result
        # list is empty (previously uncaught); TypeError: no response at all.
        tenk_URL = None
        print(ex)

    if tenk_URL:
        return f"""\
        Click <a href="{tenk_URL}">here</a> for the PDF.
        <hr/>
        """
    else:
        return f"{company_name} does not have a 10K report"

## Generate the summary

In [None]:
# Page template for the final summary. Every placeholder is filled from `kwargs`
# below; the section values are already-rendered HTML fragments.
html_template = u.strip_multiline_whitespace("""\
    <html>
      <head>
          <title>Summary of {{ company_name }}</title>
      </head>
      <body>
          <h1>Summary of {{ company_name }} <small>[generated in {{ elapsed_time }} minutes]</small></h1>
          <h3>Company Summary</h3>
          <p>{{ executive_summary }}<p/>
          {% if is_public %}
          {{ company_name }} is a public company, with stock ticker {{ ticker }}.
          {% else %}
          {{ company_name }} is a private company.
          {% endif %}
    
          <h3>Management Team</h3>
          {{ leadership_team }}
          
          <h3>Press Releases (in approx. last year)</h3>
          {{ press_releases }}
          
          <h3>Posters</h3>
          {{ posters }}
          
          <h3>Most recent 10K Report</h3>
          {{ten_k_report}}
      </body>
    </html>
    """)
# `is_public` drives the {% if %} in the template, so it must be falsy for a
# private company.
is_public, ticker = is_publicly_traded(company_name, company_URL)
# Each generator below performs its own web searches and LLM calls; they run in
# the order listed.
kwargs = dict(
    is_public=is_public,
    ticker=ticker,
    company_name=company_name.strip(),
    executive_summary=generate_company_summary_html(company_name, company_URL).strip(),
    leadership_team=generate_leadership_team_html(company_name, company_URL).strip(),
    press_releases=generate_press_releases_html(company_name, company_URL).strip(),
    posters=generate_posters_html(company_name, company_URL).strip(),
    ten_k_report=tenk_report_html(company_name))
# Computed last so that it covers all of the generation work above
# (start_time was taken near the top of the notebook).
kwargs["elapsed_time"] = f"{(time.time() - start_time)/60:.1f}"
html = jenv.from_string(html_template).render(**kwargs)
# Write the page to ./summaries/<company-name>-summary.html (spaces become dashes).
summaries_dir = Path.cwd() / "summaries"
summaries_dir.mkdir(parents=False, exist_ok=True)
this_summary = summaries_dir / f"{company_name.replace(' ', '-')}-summary.html"
this_summary.write_text(html)
print(f"Summary written to {this_summary}")

In [None]:
# Render the generated summary page inline in the notebook.
display(HTML(html))

## Test Harness

In [None]:
def show_results(results: List[dict]):
    """
    Render test-harness results as an HTML table and save it to disk.

    Each entry of `results` supplies the `URL`, `name` and `html` shown in one
    table row. The rendered markup is written to "test-harness-results.html" in
    the current working directory and also returned.
    """
    template = """\
    <table>
      <tbody>
        <tr><th>URL</th><th>name</th><th></th></tr>
        {% for result in results %}
          <tr>
            <td>{{ result.URL }}</td>
            <td>{{ result.name }}</td>
            <td>{{ result.html }}</td>
          </tr>
        {% endfor %}
      </tbody>
    </table>
    """
    rendered = jenv.from_string(template).render(results=results)
    output_path = Path.cwd() / "test-harness-results.html"
    output_path.write_text(rendered)
    return rendered

def test_harness(generator_fn: Callable[Tuple[str, str], str],
                 company_URLs: Iterable[str]):
    """
    This is useful when doing prompt engineering to see what effect a given
    change has across a set of companies, rather than just testing on
    one company.
    """
    if enable_test_harness:
        results = []
        for company_URL in company_URLs:
            company_name = company_name_from_URL(company_URL)
            html = generator_fn(company_name, company_URL)
            print(f"{company_URL} {company_name} -> <<{html}>>")
            results.append(dict(URL=company_URL, name=company_name, html=html))
            # Dump provisional results to disc:
            html_str = show_results(results)
        return HTML(html_str)
    else:
        return None

In [None]:
# Run one generator across all of company_URLs; uncomment the one to evaluate.
# This is a no-op (returns None) unless enable_test_harness is True.
# test_harness(company_name_from_URL_as_html, company_URLs)
test_harness(generate_company_summary_html, company_URLs)
# test_harness(generate_leadership_team_html, company_URLs)
# test_harness(generate_press_releases_html, company_URLs)
# test_harness(generate_posters_html, company_URLs)