# Parameters

In [2]:
# These are used by papermill; see generate-multiple-summaries.py
# company_URL is the page the company name is inferred from (see company_name_from_URL).
# enable_test_harness gates the generate_and_show_* helpers, which return None when it is False.
company_URL = "https://www.agios.com/"
enable_test_harness = False # disable test harness when generating summaries

In [3]:
# Credentials are exported into the kernel's environment; the values are redacted here.
# NOTE(review): %env echoes each value into the cell output, so clear outputs before sharing.
%env SERPAPI_API_KEY=...
%env AWS_ACCESS_KEY_ID=...
%env AWS_SECRET_ACCESS_KEY=...
%env AWS_SESSION_TOKEN=...

env: SERPAPI_API_KEY=...
env: AWS_ACCESS_KEY_ID=...
env: AWS_SECRET_ACCESS_KEY=...
env: AWS_SESSION_TOKEN=...


In [73]:
# S3 bucket name handed to u.download_web_page_as_text together with the s3/textract clients.
BUCKET_NAME="sgh-misc"  # change this to a bucket in your account

In [74]:
import datetime
import importlib
import json
import time
from functools import lru_cache
from pathlib import Path
from typing import Callable, Iterable, List, Optional, Tuple
from urllib.parse import urlparse

import boto3
import botocore
import dateparser
import jinja2
from IPython.display import HTML, display
from langchain.llms import Bedrock
from urllib3.exceptions import MaxRetryError

import utilities as u

In [75]:
# Re-import the local utilities module so edits to utilities.py take effect without a kernel restart.
importlib.reload(u)

<module 'utilities' from '/Users/simongh/projects/pfizer-opportunity-identification/utilities.py'>

In [76]:
# Wall-clock timestamp taken before any work starts (presumably used later to report run time;
# the consumer is not visible in this part of the notebook).
start_time = time.time()

In [77]:
# Shared Jinja environment used to render prompt and HTML templates; block tags do not
# leave stray blank lines or leading whitespace in the rendered text.
jenv = jinja2.Environment(trim_blocks=True, lstrip_blocks=True)
# AWS clients; they pick up the credentials exported into the environment above.
bedrock_runtime = boto3.client("bedrock-runtime")
bedrock = boto3.client("bedrock")
s3 = boto3.client("s3")
textract = boto3.client('textract')
# NOTE(review): a recorded run in this notebook fails with a Bedrock ValidationException
# stating that the Claude 3 model is "not supported on this API" and needs the Messages API.
# Confirm that u.WrappedBedrock can drive this model, or switch back to the model below.
# model_id="anthropic.claude-v2:1"
model_id="anthropic.claude-3-sonnet-20240229-v1:0"

In [78]:
# Sanity check: show the model IDs Bedrock reports for this account.
# This call needs valid, unexpired AWS credentials.
[x['modelId'] for x in bedrock.list_foundation_models()['modelSummaries']]

ClientError: An error occurred (ExpiredTokenException) when calling the ListFoundationModels operation: The security token included in the request is expired

In [58]:
# HTTP headers sent with page downloads; they present the client as curl.
curl_headers = {"User-Agent": "curl/8.1.2", "Accept": "*/*"}

# Home pages of the companies this notebook has been exercised against.
company_URLs = [
    "https://www.agios.com",
    "https://www.morphosys.com/en",
    "https://incyte.com",
    "https://www.protagonist-inc.com",
    "https://kerostx.com",
    "https://vaderis.com",
    "https://hemavant.com",
    "https://www.intelliatx.com",
    "https://beamtx.com",
    "https://www.capstantx.com",
    "https://renagadetx.com",
    "https://arbor.bio",
]

In [59]:
# Shared LLM handle: every prompt in this notebook is sent through wLLM.invoke(...).
# NOTE(review): the kwargs below ("max_tokens_to_sample", a "\n\nHuman" stop sequence) look like
# text-completion style parameters, and a recorded run in this notebook reports that the
# configured Claude 3 model requires the Messages API -- confirm model_id and kwargs agree.
wLLM = u.WrappedBedrock(
    model_id=model_id,
    client=bedrock_runtime,
    model_kwargs={
        "max_tokens_to_sample": 4096,
        "temperature": 0.5,
        "top_k": 250,
        "top_p": 0.9,
        "stop_sequences": ["\n\nHuman"],
    })

In [60]:
def company_name_from_URL_as_html(_company_name: str, company_URL: str) -> str:
    """
    Test-harness adapter around company_name_from_URL.

    The first argument is accepted only so this function has the same
    (name, URL) shape as the other generators; it is never read.
    """
    inferred_name = company_name_from_URL(company_URL)
    return inferred_name

In [61]:
@lru_cache
def company_name_from_URL(company_URL: str) -> str:
    """
    Use the LLM to infer the name of the company from its URL.

    The page at company_URL is downloaded as text and handed to the LLM, which
    is asked to answer inside <company_name></company_name> tags.

    Returns the stripped tag contents, or the literal "unknown company" when
    the page cannot be downloaded, is empty, or the LLM call / tag extraction
    fails. Results are memoised per URL by lru_cache, and that includes the
    "unknown company" fallback.
    """
    try:
        text_contents = u.download_web_page_as_text(company_URL, s3, textract,
                                                    BUCKET_NAME, headers=curl_headers)
        if not text_contents:
            return "unknown company"
    except (ConnectionError, MaxRetryError) as ex:
        # Report the URL we were asked about. The previous message referenced an
        # undefined name, turning a download failure into a NameError.
        print(f"Failed to download {company_URL}: {ex}")
        return "unknown company"

    prompt = f"""\
You are a researcher at a biopharmaceutical company who has been tasked
with determining what the name of a company is. The following is the contents
of this company's main web page:
 
<HTML_contents>
{text_contents}
</HTML_contents>

Based on this web page you must determine the name of this company and 
return it in <company_name></company_name> tags. Always return the full 
name of the company, for example if there name is "Foo Therapeutics" then
you should return that, not "Foo".
"""
    try:
        result = wLLM.invoke(prompt)
        tags = u.extract_multiple_tags(result)
        return tags["company_name"].strip()
    except Exception as ex:
        # Deliberate best-effort: any LLM or parsing failure (including a missing
        # <company_name> tag) degrades to the placeholder name.
        print(ex)
        return "unknown company"

# Generate the Company Summary

In [33]:
# Infer the company's name from the papermill-supplied URL; the bare name on the last
# line displays the result as the cell output.
company_name = company_name_from_URL(company_URL)
company_name

download_web_page https://www.agios.com/


'Agios'

## Is it a publicly-traded company?

In [62]:
def summarize_publicly_traded_result(result: dict, company_name: str) -> Tuple[str, Optional[str]]:
    """
    Ask the LLM whether one search hit says company_name is publicly traded.

    Parameters:
        result: one search result; only its "link" entry is read.
        company_name: company the question is about.

    Returns:
        A (verdict, ticker) pair. verdict is the stripped contents of the
        <result> tag (the prompt asks for "True", "False" or "unknown") and
        falls back to "unknown" when the tag is missing or empty. ticker is the
        stripped contents of the <ticker> tag, or "unknown" when missing.
        When the page cannot be downloaded, has no usable content, or the LLM
        call fails, ("unknown", None) is returned.
    """
    URL = result["link"]
    print(f"Summarizing search result {URL}:")
    try:
        contents = u.download_web_page_as_text(URL, s3, textract,
                                               BUCKET_NAME, headers=curl_headers)
        if not contents:
            print(f"Ignoring non-PDF, non-HTML content found at {URL}")
            return "unknown", None
        prompt = u.strip_multiline_whitespace(f"""\
            You are a researcher who has been tasked
            with determining whether {company_name} is publicly-traded.
            
            You have access to the following web page:
             
            <HTML_contents>
            {contents}
            </HTML_contents>
            
            Based on this web page, return either "True" or "False" if you think this company
            is or isn't publicly traded. If you don't know return "unknown". You must wrap your
            result in <result></result> tags.
            
            Also, if this company IS publicly-traded then return its stock ticker in <ticker></ticker>
            tags.
            """)
        # A separate name keeps the search-result argument from being shadowed.
        llm_output = wLLM.invoke(prompt)
        tags = u.extract_multiple_tags(llm_output)
    except (ValueError, ConnectionError, MaxRetryError) as ex:
        # Download failures are handled like LLM failures, matching the other
        # per-result summarizers: one bad hit must not abort the whole query.
        print(ex)
        return "unknown", None

    verdict = tags["result"].strip() if "result" in tags else ""
    ticker = tags["ticker"].strip() if "ticker" in tags else ""
    return verdict or "unknown", ticker or "unknown"


def is_publicly_traded(company_name: str, company_url: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Decide whether company_name is publicly traded by majority vote over search hits.

    The question is searched for, each hit is judged by
    summarize_publicly_traded_result, and the most common verdict and the most
    common ticker are taken via u.most_common_elem.

    Parameters:
        company_name: company to ask about.
        company_url: currently unused (the site: restriction is commented out).

    Returns:
        A (verdict, ticker) pair. verdict is None when the winning verdict is
        "unknown", otherwise the winning verdict string. (None, None) is
        returned when the search produced no results at all.
    """
    query = f"Is {company_name} publicly-traded?"# site:{u.extract_site_from_URL(company_url)}"
    response_json = u.google_search(query, num_results=5)
    try:
        organic_results = response_json["organic_results"]
    except KeyError:
        print("Warning: Google returned 0 results for this query")
        organic_results = []

    for result in organic_results:
        URL = result["link"]
        print(f"{URL}")

    summaries = [summarize_publicly_traded_result(result, company_name)
                 for result in organic_results]
    print(summaries)
    if not summaries:
        # zip(*[]) yields nothing, so the unpacking below would raise ValueError.
        return None, None
    is_publics, tickers = zip(*summaries)
    is_public = u.most_common_elem(is_publics)
    ticker = u.most_common_elem(tickers)
    # Parenthesised to make explicit that a tuple is returned and the conditional
    # only applies to its first element.
    return (None if is_public == "unknown" else is_public), ticker

In [63]:
# Ad-hoc check of is_publicly_traded; the commented-out calls are alternative companies to try.
# is_publicly_traded("Amgen", "https://amgen.com")
# is_publicly_traded("protagonist therapeutics", "https://www.protagonist-inc.com")
is_publicly_traded("relay therapeutics", "https://www.relaytx.com")

google_search 'Is relay therapeutics publicly-traded?' num_results=5 timeout=20
https://ir.relaytx.com/news-releases/news-release-details/relay-therapeutics-announces-pricing-public-offering-common-0
https://ir.relaytx.com/news-releases/news-release-details/relay-therapeutics-announces-closing-initial-public-offering-and
https://finance.yahoo.com/quote/RLAY/
https://ir.relaytx.com/news-releases/news-release-details/relay-therapeutics-announces-pricing-initial-public-offering
https://www.cnbc.com/quotes/RLAY
Summarizing search result https://ir.relaytx.com/news-releases/news-release-details/relay-therapeutics-announces-pricing-public-offering-common-0:
download_web_page https://ir.relaytx.com/news-releases/news-release-details/relay-therapeutics-announces-pricing-public-offering-common-0
Error raised by bedrock service: An error occurred (ValidationException) when calling the InvokeModel operation: "claude-3-sonnet-20240229" is not supported on this API. Please use the Messages API inst

(None, None)

In [54]:
def generate_stock_price_graph(company_name: str):
    """
    Ask the LLM for a table of company_name's stock price over the last year
    and print its raw answer. Nothing is returned and, despite the name,
    no graph is produced.
    """
    question = u.strip_multiline_whitespace(f"""\
        Show me a table of the stock price of {company_name} over the last year.
        """)
    answer = wLLM.invoke(question)
    print(f"-------result:\n{answer}\n-------")


In [55]:
# Manual probe; in the recorded run the model replies that it has no access to stock price data.
generate_stock_price_graph("Amgen")

-------result:
 Unfortunately, I do not have access to historical stock price data for specific companies. As an AI assistant without direct access to financial data sources, I cannot generate a table of Amgen's stock prices over the past year. However, this data should be publicly available on financial websites and in financial publications if you would like to look it up yourself. Let me know if you have any other questions I can assist with!
-------


## What does the company do, what are its top products?

In [14]:
def summarize_top_products_web_page(text_contents: str, company_name: str) -> str:
    """
    Ask the LLM to summarise what text_contents says about company_name's top products.

    Returns the stripped contents of the <result> tag. None is returned instead
    when the LLM answers with <no_result>, omits <result>, answers "I don't know"
    (case-insensitive), or when a ValueError is raised while calling the LLM or
    extracting tags.
    """
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at a biopharmaceutical company who has been tasked
        with determining what the top products of {company_name} are.
        
        You have access to the following web page:
         
        <HTML_contents>
        {text_contents}
        </HTML_contents>
        
        If this web page contains information about what the top products of {company_name}
        then extract and summarize that information in 5-6 sentences and put it in
        <result></result> tags.
        
        If this web page doesn't contain useful information about what the top
        products of {company_name} then you MUST say <no_result></no_result>.
        """)
    try:
        llm_output = wLLM.invoke(prompt)
        tags = u.extract_multiple_tags(llm_output)
        # An explicit refusal or a missing answer both mean "nothing useful here".
        if "no_result" in tags or "result" not in tags:
            return None
        answer = tags["result"].strip()
        return None if "i don't know" in answer.lower() else answer
    except ValueError as ex:
        print(ex)
        return None

In [15]:
def summarize_top_products_search_result(result: dict, company_name: str) -> str:
    """
    Download the page behind one search hit and summarise its top-products content.

    Only result["link"] is read. Returns whatever summarize_top_products_web_page
    returns, or None when the page has no usable content or the download fails
    with ConnectionError / MaxRetryError.
    """
    URL = result["link"]
    print(f"Summarizing search result {URL}:")
    try:
        page_text = u.download_web_page_as_text(URL, s3, textract,
                                                BUCKET_NAME, headers=curl_headers)
        if not page_text:
            print(f"Ignoring non-PDF, non-HTML content found at {URL}")
            return None
        return summarize_top_products_web_page(page_text, company_name)
    except (ConnectionError, MaxRetryError) as ex:
        print(f"Failed to download {URL}: {ex}")
        return None

In [16]:
def generate_top_products_html(company_name: str, company_url: str) -> str:
    """
    Return text that summarizes the top products sold by this company.

    The company's own site is searched, every hit is summarised individually,
    and the LLM is then asked to condense those summaries into one paragraph.
    A fixed apology sentence is returned when no hit yields a summary, and an
    empty string when the LLM's answer has no <summary> tag.
    """
    print(f"+++++ generate_top_products_html {company_name} {company_url}")
    query = f"What are the top products of {company_name}? site:{u.extract_site_from_URL(company_url)}"
    response_json = u.google_search(query, num_results=5)
    try:
        hits = response_json["organic_results"]
    except KeyError:
        print("Warning: Google returned 0 results for this query")
        hits = []

    # List every hit up front so the log shows what is about to be processed.
    for hit in hits:
        print(f"Result: {hit['link']}")

    per_hit = [summarize_top_products_search_result(hit, company_name) for hit in hits]
    summaries = [text for text in per_hit if text]

    if not summaries:
        return "I couldn't find any information on their web site about their top products."

    prompt_template = u.strip_multiline_whitespace("""\
            Please consider the following excerpts:
            
            {% for summary in summaries %}
            <excerpt>
            {{ summary }}
            </excerpt>
            {% endfor %}
            
            Summarize the above excerpts in 5-6 sentences. Don't include any preamble. Don't use
            the word "excerpt". Include your summary in <summary></summary> tags.
            """)
    prompt = jenv.from_string(prompt_template).render(summaries=summaries)
    tags = u.extract_multiple_tags(wLLM.invoke(prompt))
    try:
        return tags["summary"]
    except KeyError:
        return ""

In [17]:
def summarize_web_page_what_they_do(text_contents: str, company_name: str) -> str:
    """
    Ask the LLM to summarise what text_contents says about what company_name does.

    Returns the stripped contents of the <result> tag, or None when the tag is
    missing, when it contains "I don't know" (case-insensitive), or when a
    ValueError is raised while calling the LLM or extracting tags.
    """
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at a biopharmaceutical company who has been tasked
        with determining what {company_name} does.
        
        You have access to the following web page from {company_name}'s web site:
         
        <HTML_contents>
        {text_contents}
        </HTML_contents>
        
        If this web page contains information about what {company_name} does
        then extract and summarize that information in 5-6 sentences and put it in
        <result></result> tags.
        
        If this web page doesn't contain useful information about what {company_name}
        does then you MUST say <result>I don't know</result>.
        """)
    try:
        llm_output = wLLM.invoke(prompt)
        tags = u.extract_multiple_tags(llm_output)
        if "result" not in tags:
            return None
        answer = tags["result"].strip()
        # The prompt tells the model to answer "I don't know" when the page is unhelpful.
        return None if "i don't know" in answer.lower() else answer
    except ValueError as ex:
        print(ex)
        return None

In [18]:
def summarize_what_they_do_search_result(result: dict, company_name: str) -> str:
    """
    Download the page behind one search hit and summarise what it says the company does.

    Only result["link"] is read. Returns whatever summarize_web_page_what_they_do
    returns, or None when the page has no usable content or the download fails
    with ConnectionError / MaxRetryError.
    """
    URL = result["link"]
    print(f"Summarizing search result {URL}:")
    try:
        page_text = u.download_web_page_as_text(URL, s3, textract, BUCKET_NAME,
                                                headers=curl_headers)
        if not page_text:
            print(f"Ignoring non-PDF, non-HTML content found at {URL}")
            return None
        return summarize_web_page_what_they_do(page_text, company_name)
    except (ConnectionError, MaxRetryError) as ex:
        print(f"Failed to download {URL}: {ex}")
        return None

In [41]:
def generate_what_they_do_html(company_name: str, company_url: str) -> str:
    """
    Return text that describes what this company does.

    The company's own site is searched, every hit is summarised individually,
    and the LLM is asked to condense those summaries into one paragraph.

    Returns the contents of the <summary> tag, an empty string when the LLM's
    answer has no such tag, or a fixed apology sentence when no hit produced a
    usable summary.
    """
    print(f"+++++ generate_what_they_do_html {company_name} {company_url}")
    query = f"What does {company_name} do? site:{u.extract_site_from_URL(company_url)}"
    response_json = u.google_search(query, num_results=5)
    try:
        organic_results = response_json["organic_results"]
    except KeyError:
        print("Warning: Google returned 0 results for this query")
        organic_results = []

    for result in organic_results:
        URL = result["link"]
        print(f"Result: {URL}")

    summaries = [summarize_what_they_do_search_result(result, company_name)
                 for result in organic_results]
    summaries = [summary for summary in summaries if summary]

    if not summaries:
        # Without snippets the prompt below would ask the LLM to summarise nothing,
        # which invites a made-up answer. Say so instead, as the top-products
        # generator does.
        return "I couldn't find any information on their web site about what they do."

    prompt_template = u.strip_multiline_whitespace("""\
        Please consider the following snippets:
    
        {% for summary in summaries %}
        <snippet>
        {{ summary }}
        </snippet>
        {% endfor %}
    
        Summarize the above snippets in 5-6 sentences. Don't include any preamble. Include your
        summary in <summary></summary> tags.
        """)
    prompt = jenv.from_string(prompt_template).render(summaries=summaries)
    what_they_do = wLLM.invoke(prompt)
    tags = u.extract_multiple_tags(what_they_do)
    try:
        what_they_do = tags["summary"]
    except KeyError:
        what_they_do = ""
    return what_they_do

SyntaxError: unterminated string literal (detected at line 30) (1608731350.py, line 30)

In [42]:
def generate_company_summary_html(company_name: str, company_url: str) -> str:
    """
    Build the two-paragraph company summary as an HTML string.

    The first paragraph says what the company does and the second lists the
    products it sells; the two are separated by a <p> tag on its own line.
    """
    about = generate_what_they_do_html(company_name, company_url)
    products = generate_top_products_html(company_name, company_url)
    return "{}\n<p>\n{}".format(about, products)

In [43]:
def generate_and_show_company_summary(company_name: str, company_url: str):
    """
    Test-harness entry point: render the company summary as an HTML display object.

    Does nothing and returns None unless enable_test_harness is truthy.
    """
    if not enable_test_harness:
        return None
    summary_html = generate_company_summary_html(company_name, company_url)
    return HTML(summary_html)

In [22]:
# Test-harness call: shows nothing (returns None) unless enable_test_harness is True.
# generate_and_show_company_summary("agios", "https://www.agios.com")
# generate_and_show_company_summary("Arbor Biotechnologies", "https://arbor.bio")
generate_and_show_company_summary("Incyte", "https://incyte.com")

## Leadership

In [23]:
def extract_exec_info_from_page(job_role: str, company_name: str, html_contents: str) -> str:
    """
    Ask the LLM who holds job_role at company_name according to one web page.

    The page is first passed through u.remove_noise_from_web_page. The LLM is
    asked for <name>, <title> and <bio> tags, and the return value is whatever
    u.extract_multiple_tags produces for its answer (callers treat it as a
    mapping, despite the annotation). None is returned when a ValueError is
    raised along the way.
    """
    html_contents = u.remove_noise_from_web_page(html_contents)
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at a biopharmaceutical company who has been tasked
        with finding out who the {job_role} is at {company_name}. You have
        access to the following web page from {company_name}'s web site:
         
        <HTML_contents>
        {html_contents}
        </HTML_contents>
        
        Extract all information about the current {job_role} that you can find on this web page.
        
        If you do find information about the {job_role} then return their name in <name></name>
        tags, their title in <title></title> tags, and return a 3-5 sentence bio in <bio></bio>
        tags. Return only information about THIS role/title, do not return anything about other roles.
        
        You must not make anything up, if you don't know something then leave it out.
        """)
    try:
        return u.extract_multiple_tags(wLLM.invoke(prompt))
    except ValueError as ex:
        print(ex)
        return None

In [24]:
# Roles looked up by generate_leadership_team_html, one search per role.
leadership_roles = [
    "Chief Executive Officer",
    "Chief Financial Officer",
    "Chief Science Officer",
    "Chief Medical Officer",
    "Chief Operating Officer"
]

def generate_leadership_team_html(company_name: str, company_url: str) -> str:
    """
    Return an HTML string that describes the key leaders at this company.

    For every entry in leadership_roles the role plus company name is searched
    for. The first hit hosted on the company's own site (per u.same_TLD) whose
    page downloads successfully is given to extract_exec_info_from_page.
    Leaders without a non-blank name are dropped, the remaining values are
    escaped with u.escape_HTML_text, and the result is rendered as a <ul> list.
    A role whose search returns no results is skipped.
    """
    print(f"++++++ generate_leadership_team_html {company_name} {company_url}")
    leaders = []
    for role in leadership_roles:
        print(f"\nRole {role}")
        response_json = u.google_search(f"{role} {company_name}", num_results=3)
        try:
            organic_results = response_json["organic_results"]
        except KeyError:
            # Same handling as the other search callers: a query without hits
            # must not abort the remaining roles.
            print("Warning: Google returned 0 results for this query")
            organic_results = []
        for i, result in enumerate(organic_results):
            this_URL = result["link"]
            # is the top hit from the company's website?
            parsed_URL = urlparse(this_URL)
            if u.same_TLD(parsed_URL.netloc, u.extract_site_from_URL(company_url)):
                content = u.download_web_page(this_URL)
                if content is None:
                    print(f"Ignoring {this_URL} due to empty content")
                else:
                    info = extract_exec_info_from_page(role, company_name, content)
                    leaders.append(info)
                    # One usable page per role is enough.
                    break
            else:
                print(f"Ignoring hit #{i} {this_URL}: wrong TLD")
    # Drop failed extractions (None) and records without a usable name.
    leaders = [x for x in leaders if x and x.get("name", None) and x["name"].strip() != ""]
    for leader in leaders:
        for k, v in leader.items():
            leader[k] = u.escape_HTML_text(v)
    template = """\
    <ul>
      {% for leader in leaders %}
          <li> {{leader.name}}, {{leader.title}}. {{leader.bio}}
      {% endfor %}
    </ul>
    """
    html = jenv.from_string(template).render(leaders=leaders)
    return html

In [25]:
# HTML(generate_leadership_team_html("Beam Therapeutics", "https://beamtx.com"))

## Press Releases

The approach here is two-fold:

1. Use Google to find individual pages on the company's web site where each page contains one press release. We then use an LLM to verify that each page actually contains a press release and to extract its metadata. This approach fails if the press releases are stored on a third-party web site.

2. Use Google to find press releases about this company on prnewswire.com, then verify them and extract metadata as above. This only works if the press releases are hosted by prnewswire.

We combine these by first trying approach #1. If that yields zero results, we fall back to approach #2.

In [26]:
def analyze_in_situ_web_page_for_single_PR(html_contents: str, company_name: str) -> dict:
    """
    Ask the LLM whether a page from the company's own site is a single press release.

    The LLM is asked for <contains_PR>, <publication_date>, <is_financial>,
    <title> and <abstract> tags.

    Returns whatever u.extract_multiple_tags produces for the LLM's answer, or
    an empty dict when a ValueError is raised along the way.
    """
    # The is_financial instruction previously said "True" for both outcomes, so the
    # model was never told how to mark a non-financial release.
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at a biopharmaceutical company who has been tasked
        with finding out what press releases have recently published by {company_name}.
        
        You have access to the following web page from {company_name}'s web site:
         
        <HTML_contents>
        {html_contents}
        </HTML_contents>
        
        Given this web page I want you to analyze it and compute four values, each in XML tags:
        
         + Does this web page contain a single press release, not a listing of press releases? If so
           return <contains_PR>True</contains_PR> otherwise return <contains_PR>False</contains_PR>.
         + If this page does contain a single press release then when was it published? Return your
           result in <publication_date></publication_date> tags.
         + If this page does contain a single press release then does it describe financial information
           like quarterly results? If so return <is_financial>True</is_financial>, otherwise
           return <is_financial>False</is_financial>.
         + If this page does contain a single press release then return the title (in <title></title> tags)
           and a 3-5 sentence abstract (in <abstract></abstract> tags).
        """)
    try:
        result = wLLM.invoke(prompt)
        tags = u.extract_multiple_tags(result)
        return tags
    except ValueError as ex:
        print(ex)
        return {}

In [27]:
def analyze_external_web_page_for_single_PR(html_contents: str, company_name: str) -> dict:
    """
    Ask the LLM whether a third-party page is a single press release authored by company_name.

    The LLM is asked for <contains_PR>, <authored_by>, <publication_date>,
    <is_financial>, <title> and <abstract> tags.

    Returns whatever u.extract_multiple_tags produces for the LLM's answer, or
    an empty dict when a ValueError is raised along the way.
    """
    # Two prompt defects are fixed here: the is_financial instruction said "True" for
    # both outcomes, and the list was announced as four items although it has five.
    prompt = u.strip_multiline_whitespace(f"""\
        You are a researcher at a biopharmaceutical company who has been tasked
        with finding out what press releases have recently published by {company_name}.
        
        You have access to the following web page:
         
        <HTML_contents>
        {html_contents}
        </HTML_contents>
        
        Given this web page I want you to analyze it and compute five values, each in XML tags:
        
         + Does this web page contain a single press release, not a listing of press releases? If so
           return <contains_PR>True</contains_PR> otherwise return <contains_PR>False</contains_PR>.
         + Is the press release on this page authored by {company_name}? Return 
           <authored_by>True/False</authored_by> accordingly.
         + If this page does contain a single press release then when was it published? Return your
           result in <publication_date></publication_date> tags.
         + If this page does contain a single press release then does it describe financial information
           like quarterly results? If so return <is_financial>True</is_financial>, otherwise
           return <is_financial>False</is_financial>.
         + If this page does contain a single press release then return the title (in <title></title> tags)
           and a 3-5 sentence abstract (in <abstract></abstract> tags).
        """)
    try:
        result = wLLM.invoke(prompt)
        tags = u.extract_multiple_tags(result)
        return tags
    except ValueError as ex:
        print(ex)
        return {}

In [29]:
def generate_list_of_externally_hosted_PRs(company_name: str, company_url: str) -> List[dict]:
    """
    Find press releases about company_name hosted on prnewswire.com.

    Every search hit is downloaded and analysed by
    analyze_external_web_page_for_single_PR; the hit's address is added to the
    resulting record under "URL". Only records whose "authored_by" tag is
    "true" (case-insensitive) are kept.

    Parameters:
        company_name: used in the search query and in the analysis prompt.
        company_url: unused; kept so both PR list generators share a signature.

    Returns:
        The surviving records; an empty list when the search has no results.
    """
    query = f"{company_name} site:prnewswire.com"
    response_json = u.google_search(query, num_results=20)
    try:
        organic_results = response_json["organic_results"]
    except KeyError:
        # Same handling as the in-situ variant: no hits means no press releases.
        organic_results = []
    for result in organic_results:
        URL = result["link"]
        print(URL)

    def process_organic_result(result):
        """Analyse one hit; None when it cannot be downloaded or has no text."""
        URL = result["link"]
        print(f"URL {URL}:")
        try:
            content_str = u.download_web_page_as_text(URL, s3, textract,
                                                      BUCKET_NAME, headers=curl_headers)
        except (ConnectionError, MaxRetryError) as ex:
            # One unreachable page must not discard the other results.
            print(f"Failed to download {URL}: {ex}")
            return None
        if content_str:
            pr = analyze_external_web_page_for_single_PR(content_str, company_name)
            pr = {**pr, "URL": URL}
            return pr
        else:
            print(f"Ignoring non-HTML content found at {URL}")
            return None

    raw_press_releases = [process_organic_result(result) for result in organic_results]
    raw_press_releases = [pr for pr in raw_press_releases if pr]
    raw_press_releases = [pr for pr in raw_press_releases
                          if pr.get("authored_by", "False").strip().lower() == "true"]
    return raw_press_releases

In [30]:
# Reference "now" for recency checks. It is captured once when this cell runs, so
# re-run the cell if the kernel stays up for a long time.
today = dateparser.parse("now")
# Recency windows used by remove_extraneous_PRs (a year is approximated as 365 days).
one_year = datetime.timedelta(days=365)
two_years = datetime.timedelta(days=2*365)

def parse_date(s: str) -> datetime.datetime:
    """
    Parse a free-form date string into a timezone-naive datetime.

    Any timezone info is dropped so the result can be compared directly with
    the module-level `today`. If parsing fails for any reason, `today` is
    returned as a stand-in (a known hack for malformed dates).
    """
    try:
        return dateparser.parse(s).replace(tzinfo=None)
    except Exception:
        # this is a hack not sure what else to do if date is malformed
        return today

def published_recently(pr: dict, cutoff: datetime.timedelta) -> bool:
    """
    Tell whether a press-release record was published less than `cutoff` ago.

    Parameters:
        pr: press-release record; only its "publication_date" entry is read.
        cutoff: maximum age, measured against the module-level `today`.

    Returns:
        True when the age is below cutoff. A record with a missing or empty
        publication date is assumed to be recent.
    """
    # .get(): the LLM may omit the tag altogether, and a missing date should be
    # treated like an empty one rather than raising KeyError.
    publication_date = pr.get("publication_date")
    if not publication_date:
        return True
    return (today - parse_date(publication_date)) < cutoff

def remove_extraneous_PRs(press_releases: List[dict]) -> List[dict]:
    """
    Keep only genuine, non-financial, recent press releases.

    A record is kept when its "contains_PR" tag is "true" and its
    "is_financial" tag is not "true" (both case-insensitive, surrounding
    whitespace ignored). Records lacking "contains_PR" are dropped; records
    lacking "is_financial" count as non-financial.

    Of the kept records, those published within the last year are returned;
    if there are none, the window is widened to two years.
    """
    def tag_is_true(pr: dict, key: str) -> bool:
        # Records can lack tags entirely (e.g. when page analysis failed and only
        # the URL was recorded), so never index them directly.
        return (pr.get(key) or "").strip().lower() == "true"

    candidates = [pr for pr in press_releases
                  if tag_is_true(pr, "contains_PR") and not tag_is_true(pr, "is_financial")]
    # Ideally, limit to those published in the last year, if that produces
    # zero results then relax that to two years
    within_a_year = [pr for pr in candidates if published_recently(pr, cutoff=one_year)]
    return within_a_year or [pr for pr in candidates if published_recently(pr, cutoff=two_years)]

In [28]:
def generate_list_of_in_situ_PRs(company_name: str, company_url: str) -> List[dict]:
    """
    Find press releases hosted on the company's own web site.

    The site (derived via u.extract_TLD_from_URL) is searched for
    "press releases"; every hit is downloaded and analysed by
    analyze_in_situ_web_page_for_single_PR, and the hit's address is added to
    the resulting record under "URL".

    Returns:
        One record per hit that could be downloaded and had text content; an
        empty list when the search has no results. No filtering on the analysis
        outcome happens here (see remove_extraneous_PRs).
    """
    query = f"press releases site:{u.extract_TLD_from_URL(company_url)}"
    response_json = u.google_search(query, num_results=20)
    try:
        organic_results = response_json["organic_results"]
    except KeyError:
        organic_results = []

    for result in organic_results:
        URL = result["link"]
        print(URL)

    def process_organic_result(result: dict) -> Optional[dict]:
        """Analyse one hit; None when it cannot be downloaded or has no text."""
        URL = result["link"]
        print(f"URL {URL}:")
        try:
            content_str = u.download_web_page_as_text(URL, s3, textract,
                                                      BUCKET_NAME, headers=curl_headers)
        except (ConnectionError, MaxRetryError) as ex:
            # One unreachable page must not discard the other results.
            print(f"Failed to download {URL}: {ex}")
            return None
        if content_str:
            pr = analyze_in_situ_web_page_for_single_PR(content_str, company_name)
            pr = {**pr, "URL": URL}
            return pr
        else:
            print(f"Ignoring non-HTML content found at {URL}")
            return None

    raw_press_releases = [process_organic_result(result) for result in organic_results]
    raw_press_releases = [pr for pr in raw_press_releases if pr]
    return raw_press_releases

In [31]:
def generate_press_releases_html(company_name: str, company_url: str) -> str:
    """
    Build an HTML list of the company's recent press releases.

    Press releases hosted on the company's own site are tried first; if none
    survive remove_extraneous_PRs, externally hosted ones are tried instead.
    Titles and abstracts are escaped with u.escape_HTML_text before rendering.
    A fixed sentence is returned when nothing is found.
    """
    print(f"++++++ generate_press_releases_html {company_name} {company_url}")

    press_releases = remove_extraneous_PRs(
        generate_list_of_in_situ_PRs(company_name, company_url))
    if not press_releases:
        # Nothing usable on the company's own site: fall back to external hosting.
        press_releases = remove_extraneous_PRs(
            generate_list_of_externally_hosted_PRs(company_name, company_url))

    escaped_fields = ("title", "abstract")
    for release in press_releases:
        for field in escaped_fields:
            if field in release:
                release[field] = u.escape_HTML_text(release[field])

    if not press_releases:
        return "No recent press releases were found."

    template = u.strip_multiline_whitespace("""\
            <ul>
              {% for pr in press_releases %}
                  <li> {{pr.publication_date or ""}} <a href="{{ pr.URL }}">{{ pr.title }}</a><p/>
                       {{ pr.abstract or "" }}
              {% endfor %}
            </ul>
            """)
    return jenv.from_string(template).render(press_releases=press_releases)

def generate_and_show_PRs(company_name: str, company_url: str):
    """
    Test-harness entry point: render the press-release list as an HTML display object.

    Does nothing and returns None unless enable_test_harness is truthy.
    """
    if not enable_test_harness:
        return None
    releases_html = generate_press_releases_html(company_name, company_url)
    return HTML(releases_html)

In [32]:
# Test-harness call: shows nothing (returns None) unless enable_test_harness is True.
# The commented-out calls are other companies this was tried on.
# generate_and_show_PRs("Beam Therapeutics", "https://beamtx.com")
# generate_and_show_PRs("Capstan Therapeutics", "https://www.capstantx.com")
# generate_and_show_PRs("Protagonist Therapeutics Inc", "https://www.protagonist-inc.com")
# generate_and_show_PRs("Capstan Therapeutics", "https://www.capstantx.com")
# generate_and_show_PRs("Vaderis Therapeutics", "https://vaderis.com")
generate_and_show_PRs("Hemavant Sciences", "https://hemavant.com")
# generate_and_show_PRs("Intellia Therapeutics", "https://www.intelliatx.com")

## Scientific posters

In [33]:
def analyze_web_page_as_poster(search_result: dict) -> Optional[dict]:
    """
    Ask the LLM whether the document behind a search hit is a scientific
    poster and, if so, collect the metadata it reports.

    Args:
        search_result: A search-result dict; its "link" entry is the URL to
            download. A missing "link" raises KeyError (it is read before the
            try block).

    Returns:
        ``{"URL": <link>, **tags}`` where ``tags`` is whatever
        u.extract_multiple_tags parsed from the LLM reply (the prompt asks for
        title, citation and abstract tags), or ``None`` when the page yields
        no text, the reply has no tags, the reply contains a ``not_a_poster``
        tag, or any exception is raised along the way (the exception is
        printed, not propagated).
    """
    URL = search_result["link"]
    print(f"analyze_web_page_as_poster URL: {URL}")
    try:
        page_text = u.download_web_page_as_text(URL, s3, textract,
                                                BUCKET_NAME, headers=curl_headers,
                                                allow_only_single_page_PDFs=True)
        # Nothing to analyze if the download step produced no text.
        if page_text is None:
            return None

        prompt = u.strip_multiline_whitespace(f"""\
                You are given the following contents of a PDF file:
    
                <PDF_contents>
                {page_text}
                </PDF_contents>
    
                This PDF file may contain a scientific poster presentation.
                
                Your first task is to determine if it _does_ contain a scientific poster
                presentation. If it does not, you should return <not_a_poster></not_a_poster>.
                
                If it does contain a scientific poster then your second tasks to return some
                metadata about the poster: return the title of
                the poster in <title></title> tags and the entire citation in <citation>
                </citation> tags. Also, return a 3-5 sentence summary of the key points of
                the poster in <abstract></abstract> tags.
                """)
        llm_reply = wLLM.invoke(prompt)
        tags = u.extract_multiple_tags(llm_reply)

        # An empty parse and an explicit "not a poster" verdict are both misses.
        if not tags or "not_a_poster" in tags:
            return None
        return {"URL": URL, **tags}
    except Exception as ex:
        # Best effort: one bad page must not abort the whole poster scan.
        print(f"Exception: {ex}")
        return None

In [34]:
def generate_posters_html(company_name: str, company_url: str) -> str:
    """
    Get single-page PDFs from the company's web site and then use the LLM
    to parse the title etc out of the PDF.

    A Google search for "poster presentation" restricted to the company's
    site supplies candidate URLs; each one is passed to
    analyze_web_page_as_poster, and only candidates that come back with both
    a URL and a non-empty title are listed.

    Args:
        company_name: Not used by this function; present so the signature
            matches the (company_name, company_url) shape test_harness calls.
        company_url: Company web site; its site part scopes the search.

    Returns:
        An HTML ``<ul>`` with one linked entry per poster (citation if
        available, otherwise title, followed by the abstract), or the sentence
        "No recent posters were found." when nothing qualifies.
    """
    query = f"poster presentation site:{u.extract_site_from_URL(company_url)}"
    response_json = u.google_search(query, num_results=20)
    try:
        organic_results = response_json["organic_results"]
    except KeyError:
        print("Got back zero results from Google")
        organic_results = []

    for result in organic_results:
        print(result["link"])

    analyzed = [analyze_web_page_as_poster(result) for result in organic_results]
    # Keep only real hits that carry a URL and a non-empty title.
    posters = [p for p in analyzed
               if p and {"URL", "title"} <= p.keys() and p["title"]]

    for p in posters:
        # The displayed text comes from an LLM reading third-party PDFs, so it
        # is escaped before being placed in HTML, as the press-release
        # generator does for its title/abstract fields.
        p["citationOrTitle"] = u.escape_HTML_text(p.get("citation") or p["title"])
        if p.get("abstract"):
            p["abstract"] = u.escape_HTML_text(p["abstract"])

    if posters:
        template = """\
         <ul>
          {% for poster in posters %}
              <li> <a href="{{ poster.URL }}">{{ poster.citationOrTitle }}</a>
                 <p/>{{ poster.abstract or "" }}
          {% endfor %}
          </ul>
        """
        html = jenv.from_string(template).render(posters=posters)
    else:
        html = "No recent posters were found."
    return html

In [35]:
def generate_and_show_posters(company_name: str, company_url: str):
    """
    Notebook convenience wrapper around generate_posters_html.

    Returns the posters HTML wrapped in an ``HTML`` display object when
    ``enable_test_harness`` is truthy; otherwise returns ``None`` without
    generating anything.
    """
    return (HTML(generate_posters_html(company_name, company_url))
            if enable_test_harness else None)

In [36]:
# Manual smoke test for the posters section: uncomment the company to try.
# generate_and_show_posters returns None unless enable_test_harness is True.
# generate_and_show_posters("morphosys", "https://www.morphosys.com/en")
generate_and_show_posters("Incyte", "https://incyte.com")

## 10K report

In [44]:
def tenk_report_html(company_name: str) -> str:
    """
    Return an HTML string that describes this company's 10K report.

    Runs a Google search restricted to sec.gov and links to the first
    organic result.

    Args:
        company_name: Name used in the search query and in the fallback text.

    Returns:
        An HTML fragment linking to the first search hit, or the plain
        sentence "<company_name> does not have a 10K report" when the search
        returns no usable result.
    """
    print(f"+++ 10k_report_html {company_name}")
    response_json = u.google_search(f"10K report {company_name} site:sec.gov")
    try:
        tenk_URL = response_json["organic_results"][0]["link"]
    except (KeyError, IndexError) as ex:
        # KeyError: "organic_results" or "link" is absent.
        # IndexError: the result list is present but empty.
        tenk_URL = None
        print(f"No 10K search result found: {ex!r}")

    if tenk_URL:
        return f"""\
        Click <a href="{tenk_URL}">here</a> for the PDF.
        <hr/>
        """
    else:
        return f"{company_name} does not have a 10K report"

In [38]:
# Assemble the full one-page company summary, save it under ./summaries and
# display it inline. Each section placeholder receives ready-made HTML from
# the corresponding generator call below.
html_template = u.strip_multiline_whitespace("""\
    <html>
      <head>
          <meta charset="utf-8">
          <title>Summary of {{ company_name }}</title>
      </head>
      <body>
          <h1>Summary of {{ company_name }} <small>[generated in {{ elapsed_time }} minutes]</small></h1>
          <h3>Company Summary</h3>
          <p>{{ executive_summary }}</p>
    
          <h3>Management Team</h3>
          {{ leadership_team }}
          
          <h3>Press Releases (in approx. last year)</h3>
          {{ press_releases }}
          
          <h3>Posters</h3>
          {{ posters }}
          
          <h3>Most recent 10K Report</h3>
          {{ten_k_report}}
      </body>
    </html>
    """)
kwargs = dict(
    company_name=company_name.strip(),
    executive_summary=generate_company_summary_html(company_name, company_URL).strip(),
    leadership_team=generate_leadership_team_html(company_name, company_URL).strip(),
    press_releases=generate_press_releases_html(company_name, company_URL).strip(),
    posters=generate_posters_html(company_name, company_URL).strip(),
    ten_k_report=tenk_report_html(company_name))
# Measured after every section has been generated, so it covers all of the
# generator calls above.
kwargs["elapsed_time"] = f"{(time.time() - start_time)/60:.1f}"
html = jenv.from_string(html_template).render(**kwargs)

# Create the output directory on first use so a fresh checkout does not fail,
# and write UTF-8 explicitly (matching the charset declared in the template)
# instead of relying on the platform's default encoding.
summaries_dir = Path.cwd() / "summaries"
summaries_dir.mkdir(parents=True, exist_ok=True)
summary_path = summaries_dir / f"{company_name.replace(' ', '-')}-summary.html"
summary_path.write_text(html, encoding="utf-8")
display(HTML(html))

+++++ generate_what_they_do_html Agios https://www.agios.com/
google_search 'What does Agios do? site:www.agios.com' num_results=5 timeout=20
Result: https://www.agios.com/
Result: https://www.agios.com/pipeline/
Result: https://www.agios.com/about-us/
Result: https://www.agios.com/our-medicine/
Result: https://www.agios.com/about-us/history/
Summarizing search result https://www.agios.com/:
download_web_page https://www.agios.com/
Summarizing search result https://www.agios.com/pipeline/:
download_web_page https://www.agios.com/pipeline/
Summarizing search result https://www.agios.com/about-us/:
download_web_page https://www.agios.com/about-us/
Summarizing search result https://www.agios.com/our-medicine/:
download_web_page https://www.agios.com/our-medicine/
Summarizing search result https://www.agios.com/about-us/history/:
download_web_page https://www.agios.com/about-us/history/
+++++ generate_top_products_html Agios https://www.agios.com/
google_search 'What are the top products o

In [39]:
def show_results(results: List[dict]):
    """
    Render test-harness results as an HTML table and save a copy to disk.

    Args:
        results: One dict per company; the template reads the ``URL``,
            ``name`` and ``html`` entries of each.

    Returns:
        The rendered HTML table as a string. The same text is written to
        ``test-harness-results.html`` in the current working directory,
        overwriting any previous copy.
    """
    template = """\
    <table>
      <tbody>
        <tr><th>URL</th><th>name</th><th></th></tr>
        {% for result in results %}
          <tr>
            <td>{{ result.URL }}</td>
            <td>{{ result.name }}</td>
            <td>{{ result.html }}</td>
          </tr>
        {% endfor %}
      </tbody>
    </table>
    """
    html_str = jenv.from_string(template).render(results=results)
    # Explicit UTF-8: the generated text may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to handle them.
    (Path.cwd() / "test-harness-results.html").write_text(html_str, encoding="utf-8")
    return html_str

def test_harness(generator_fn: Callable[Tuple[str, str], str],
                 company_URLs: Iterable[str]):
    """
    This is useful when doing prompt engineering to see what effect a given
    change has across a set of companies, rather than just testing on
    one company.
    """
    if enable_test_harness:
        results = []
        for company_URL in company_URLs:
            company_name = company_name_from_URL(company_URL)
            html = generator_fn(company_name, company_URL)
            print(f"{company_URL} {company_name} -> <<{html}>>")
            results.append(dict(URL=company_URL, name=company_name, html=html))
            # Dump provisional results to disc:
            html_str = show_results(results)
        return HTML(html_str)
    else:
        return None

In [40]:
# Run one generator across every URL in company_URLs; uncomment the line for
# the section being tuned. test_harness returns None (and runs nothing)
# unless enable_test_harness is True.
# test_harness(company_name_from_URL_as_html, company_URLs)
test_harness(generate_company_summary_html, company_URLs)
# test_harness(generate_leadership_team_html, company_URLs)
# test_harness(generate_press_releases_html, company_URLs)
# test_harness(generate_posters_html, company_URLs)