In [1]:
# retrieving company metadata and exploration
!pip install httpx tqdm pandas chardet



In [2]:
import httpx
import os
import pandas as pd

def download_file(client, url, file_path):
    """Download a file from a given URL to a specified path."""
    if not os.path.exists(file_path):
        with client.stream("GET", url) as response:
            response.raise_for_status()
            with open(file_path, 'wb') as f:
                for chunk in response.iter_bytes(chunk_size=8192):
                    f.write(chunk)

data_url = 'https://data.gov.au/data/dataset/bc515135-4bb6-4d50-957a-3713709a76d3/resource/55ad4b1c-5eeb-44ea-8b29-d410da431be3/download/business_names_202404.csv'
# Initialize an HTTP client
client = httpx.Client()
file_path = f"data/{data_url.split('/')[-1]}"
# Fetch the metadata csv:
download_file(client, data_url, file_path)

In [4]:
import chardet
file_path = f"data/{data_url.split('/')[-1]}"
with open(file_path, 'rb') as f:
    result = chardet.detect(f.read(10000))
encoding = result['encoding']
df = pd.read_csv(file_path, encoding=encoding, engine='python', sep=None)
df

Unnamed: 0,REGISTER_NAME,BN_NAME,BN_STATUS,BN_REG_DT,BN_CANCEL_DT,BN_RENEW_DT,BN_STATE_NUM,BN_STATE_OF_REG,BN_ABN
0,BUSINESS NAMES,SILENT SCISSORZ,Registered,07/11/2018,,07/11/2025,,,7.664328e+10
1,BUSINESS NAMES,LITTLE MIRACLES PRESCHOOL & LO...,Registered,27/07/2022,,27/07/2025,,,2.397982e+10
2,BUSINESS NAMES,A Cut Above Painting & Texture Coating,Registered,04/12/2019,,04/12/2022,,,8.663468e+10
3,BUSINESS NAMES,HOMSAFE,Registered,07/02/2019,,31/12/2024,,,5.609895e+10
4,BUSINESS NAMES,COASTAL EARTH WORKS,Registered,30/05/2019,,30/05/2026,,,8.857312e+10
...,...,...,...,...,...,...,...,...,...
3072030,BUSINESS NAMES,zzz tyres and auto services,Registered,12/01/2024,,12/01/2025,,,2.547044e+10
3072031,BUSINESS NAMES,ZZZINKED DIGITAL,Registered,08/04/2017,,08/04/2024,,,8.367605e+10
3072032,BUSINESS NAMES,ZZZY,Registered,27/08/2020,,27/08/2023,,,1.656213e+10
3072033,BUSINESS NAMES,ZZZZ BEST TEES,Registered,12/05/2023,,12/05/2026,,,7.866088e+10


In [15]:
!pip install dask tqdm requests openai dask-expr chardet tqdm pandas matplotlib from bs4 import BeautifulSoup

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Using cached soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Using cached soupsieve-2.5-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.12.3 bs4-0.0.2 soupsieve-2.5


In [58]:
import httpx
from openai import OpenAI, RateLimitError
from bs4 import BeautifulSoup
import pandas as pd
import dask
import time
import re

valid_url = re.compile("^(https|http):\/\/.*")
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import json
openapi_client = OpenAI(api_key="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")

base_messages = [
                {"role": "system", 
                "content": """
                You are a financial expert, who is able to provide detailed information about companies 
                listed on the Australian Stock exchange (ASX) in JSON format. You should provide your 
                response with the following fields/schema:
                {
                // Name of the ASX company
                "company_name": string,
                // where the company is located/headquartered, the FULL location (i.e stree number, street name, city, state, postcode)
                "address": string,
                // short summary about company
                "summary": string,
                // detailed overview of company profile, innclude industry, history etc.
                "details": string
                // companies website
                "website": string,
                // company about or investor page (usually a `about-us` or `investors` link
                "website_about": string,
                // link to company logo (i.e end with .png, .jpg, .jpeg or .svg)
                "company_logo_link": string,
                // "directions and/or senior leadership information"
                "directors": [{"name": "links to additional sources of information"string, "title": string}],
                // links to additional sources of information
                "references": [{"url": string, "description": string}]
                }
                """},
            ]

def download_file(client, url, file_path):
    """Download a file from a given URL to a specified path."""
    if not os.path.exists(file_path):
        with client.stream("GET", url) as response:
            response.raise_for_status()
            with open(file_path, 'wb') as f:
                for chunk in response.iter_bytes(chunk_size=8192):
                    f.write(chunk)
def validate_company_logo_link(company_metadata_json):
    """
    Validate the company logo link in the company metadata JSON object.
    looks at the different fields and the types of the fields and roughly assesses 
    if the JSON object is valid. If the JSON object is not valid, it will raise an exception. 
    which can allow follow up queries to be made.
    Additionally  does some more advanced checks for urls:
     * it will check if the URL is valid and does not return a 404 errors etc
     * for images like in the company logo, it will check if the image is valid and can be downloaded
    the schema we are checking against is as follows:
    ```json
    {
        // link to company logo (i.e https://1414degrees.com.au/wp-content/uploads/2023/04/1414degrees-GreyRed-RGB.png)
        "company_logo_link": string
    }
    ```
    """
    if "company_logo_link" not in company_metadata_json or re.match(valid_url, company_metadata_json["company_logo_link"]) is None:
        raise ValueError("Company metadata JSON object should have a 'company_logo_link' field with is a URL string such as as mentioned in the schema")
    if not isinstance(company_metadata_json["company_logo_link"], str) or re.match(valid_url, company_metadata_json["company_logo_link"]) is None:
        raise ValueError("Company metadata JSON object 'company_logo_link' field should be a string and a URL such as as mentioned in the schema, ending with .png, .jpg, .jpeg or .svg")
    try:
        response = httpx.get(company_metadata_json["company_logo_link"], headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"})
    except Exception as e:
        raise ValueError(f"URL {company_metadata_json['company_logo_link']} is likely an invalid format, must contain `http://` or `https://`")
    if response.status_code > 399:
        raise ValueError(f"URL {company_metadata_json['company_logo_link']} is not valid, got status: {response.status_code}")
    # check the logo is an image
    if not response.headers.get('Content-Type', 'none').startswith('image'):
        raise ValueError(f"URL {company_metadata_json['company_logo_link']} is not an image")
    # check the logo is of a specific format (png, jpg, jpeg, svg)
    if not response.headers.get('Content-Type', 'none').endswith(('png', 'jpg', 'jpeg', 'svg', 'svg+xml')):
        raise ValueError(f"URL {company_metadata_json['company_logo_link']} is not a valid image format, should end in png, jpg, jpeg or svg")
def validate_company_metadata(company_metadata_json):
    """
    Validate the company metadata JSON object.
    looks at the different fields and the types of the fields and roughly assesses 
    if the JSON object is valid. If the JSON object is not valid, it will raise an exception. 
    which can allow follow up queries to be made.
    Additionally  does some more advanced checks for urls:
     * it will check if the URL is valid and does not return a 404 errors etc
     * for images like in the company logo, it will check if the image is valid and can be downloaded
    the schema we are checking against is as follows:
    ```json
    {
        // Name of the ASX company
        "company_name": string,
        // where the company is located/headquartered, the FULL location (i.e stree number, street name, city, state, postcode)
        "address": string,
        // short summary about company
        "summary": string,
        // detailed overview of company profile, innclude industry, history etc.
        "details": string
        "website": string,
        // company about or investor page (i.e https://1414degrees.com.au/investors/)
        "website_about": string,
        // link to company logo (i.e https://1414degrees.com.au/wp-content/uploads/2023/04/1414degrees-GreyRed-RGB.png)
        "company_logo_link": string,
        // "directions and/or senior leadership information"
        "directors": [{"name": "links to additional sources of information"string, "title": string}],
        // links to additional sources of information
        "references": [{"url": string, "description": string}]
    }
    ```
    """
    if not isinstance(company_metadata_json, dict):
        raise ValueError("Company metadata JSON object should be a dictionary")
    if "company_name" not in company_metadata_json:
        raise ValueError("Company metadata JSON object should have a 'company_name' field")
    if not isinstance(company_metadata_json["company_name"], str):
        raise ValueError("Company metadata JSON object 'company_name' field should be a string")
    if "address" not in company_metadata_json:
        raise ValueError("Company metadata JSON object should have a 'address' field")
    if not isinstance(company_metadata_json["address"], str):
        raise ValueError("Company metadata JSON object 'address' field should be a string")
    if "summary" not in company_metadata_json:
        raise ValueError("Company metadata JSON object should have a 'summary' field")
    if not isinstance(company_metadata_json["summary"], str):
        raise ValueError("Company metadata JSON object 'summary' field should be a string")
    if "details" not in company_metadata_json:
        raise ValueError("Company metadata JSON object should have a 'details' field")
    if not isinstance(company_metadata_json["details"], str):
        raise ValueError("Company metadata JSON object 'details' field should be a string")
    if "website" not in company_metadata_json or re.match(valid_url, company_metadata_json["website"]) is None:
        raise ValueError("Company metadata JSON object should have a 'website' field which is a URL string as mentioned in the schema")
    if not isinstance(company_metadata_json["website"], str):
        raise ValueError("Company metadata JSON object 'website' field should be a string")
    if "website_about" not in company_metadata_json or re.match(valid_url, company_metadata_json["website_about"]) is None:
        raise ValueError("Company metadata JSON object should have a 'website_about' field which is a URL string as mentioned in the schema")
    if not isinstance(company_metadata_json["website_about"], str) or re.match(valid_url, company_metadata_json["website_about"]) is None:
        raise ValueError("Company metadata JSON object 'website_about' field should be a string and a URL as mentioned in the schema")
    if "company_logo_link" not in company_metadata_json or re.match(valid_url, company_metadata_json["company_logo_link"]) is None:
        raise ValueError("Company metadata JSON object should have a 'company_logo_link' field with is a URL string such as as mentioned in the schema")
    if not isinstance(company_metadata_json["company_logo_link"], str) or re.match(valid_url, company_metadata_json["company_logo_link"]) is None:
        raise ValueError("Company metadata JSON object 'company_logo_link' field should be a string and a URL such as as mentioned in the schema, ending with .png, .jpg, .jpeg or .svg")
    if "directors" not in company_metadata_json:
        raise ValueError("Company metadata JSON object should have a 'directors' field")
    if not isinstance(company_metadata_json["directors"], list):
        raise ValueError("Company metadata JSON object 'directors' field should be a list")
    for director in company_metadata_json["directors"]:
        if not isinstance(director, dict):
            raise ValueError("Company metadata JSON object 'directors' field should be a list of dictionaries")
        if "name" not in director:
            raise ValueError("Company metadata JSON object 'directors' field should have a 'name' field")
        if not isinstance(director["name"], str):
            raise ValueError("Company metadata JSON object 'directors' field 'name' field should be a string")
        if "title" not in director:
            raise ValueError("Company metadata JSON object 'directors' field should have a 'title' field")
        if not isinstance(director["title"], str):
            raise ValueError("Company metadata JSON object 'directors' field 'title' field should be a string")
    if "references" not in company_metadata_json:
        raise ValueError("Company metadata JSON object should have a 'references' field")
    if not isinstance(company_metadata_json["references"], list):
        raise ValueError("Company metadata JSON object 'references' field should be a list")
    for reference in company_metadata_json["references"]:
        if not isinstance(reference, dict):
            raise ValueError("Company metadata JSON object 'references' field should be a list of dictionaries")
        if "url" not in reference:
            raise ValueError("Company metadata JSON object 'references' field should have a 'url' field")
        if not isinstance(reference["url"], str):
            raise ValueError("Company metadata JSON object 'references' field 'url' field should be a string")
        if "description" not in reference:
            raise ValueError("Company metadata JSON object 'references' field should have a 'description' field")
        if not isinstance(reference["description"], str):
            raise ValueError("Company metadata JSON object 'references' field 'description' field should be a string")
    # check if the urls are valid
    for reference in company_metadata_json["references"]:
        try:
            response = httpx.get(reference["url"], follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"})
            if response.status_code > 399:
                company_metadata_json["references"].remove(reference)
        except Exception as e:
            company_metadata_json["references"].remove(reference)
        
            # TODO: do some error handling and try find alternative references
            # raise ValueError(f"URL {reference['url']} is not valid, got status: {response.status_code}, response: {response.text}")
    try:
        response = httpx.get(company_metadata_json["website"], follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"})
    except Exception as e:
        raise ValueError(f"URL {company_metadata_json['website']} is likely an invalid format, must contain `http://` or `https://` please check that the landing page for {company_metadata_json['company_name']} is correct, got error: {str(e)}")
    
    if response.status_code > 399:
        raise ValueError(f"URL {company_metadata_json['website']} is not valid, got status: {response.status_code}, please check that the landing page for {company_metadata_json['company_name']} is correct")
    
    try:
        response = httpx.get(company_metadata_json["website_about"], follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"})
    except Exception as e:
        raise ValueError(f"URL {company_metadata_json['website_about']} is likely an invalid format or failed to connect, must contain `http://` or `https://`, please check that the about page for {company_metadata_json['company_name']}is correct")
    if response.status_code > 399:
        raise ValueError(f"URL {company_metadata_json['website_about']} is not valid, got status: {response.status_code}, please check that the about page for {company_metadata_json['company_name']} is correct")
    try:
        response = httpx.get(company_metadata_json["company_logo_link"], headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"})
    except Exception as e:
        raise ValueError(f"URL {company_metadata_json['company_logo_link']} is likely an invalid format, must contain `http://` or `https://`")
    if response.status_code > 399:
        raise ValueError(f"URL {company_metadata_json['company_logo_link']} is not valid, got status: {response.status_code}")
    # check the logo is an image
    if not response.headers.get('Content-Type', 'none').startswith('image'):
        raise ValueError(f"URL {company_metadata_json['company_logo_link']} is not an image")
    # check the logo is of a specific format (png, jpg, jpeg, svg)
    if not response.headers.get('Content-Type', 'none').endswith(('png', 'jpg', 'jpeg', 'svg', 'svg+xml')):
        raise ValueError(f"URL {company_metadata_json['company_logo_link']} is not a valid image format, should end in png, jpg, jpeg or svg")
def get_company_metadata(stock_code, company_name):
    model = 'gpt-3.5-turbo-0125' #'gpt-4-0125-preview' #
    attempts = 3  # Initialize attempt count
    core_messages = base_messages.copy() + [{"role": "user", "content": f"give me information on the ASX company {company_name} in JSON format using the above mentioned schema"}]
    get_pre_info = httpx.get(f"https://cdn-api.markitdigital.com/apiman-gateway/ASX/asx-research/1.0/companies/{stock_code}/about")
    core_messages.append({"role": "user", "content": f"leverage the following metadata: {get_pre_info.json()}"})
    while attempts > 0:
        try:
            response = openapi_client.chat.completions.create(
                model=model, #'gpt-3.5-turbo-0125',
                response_format={"type": "json_object"},
                messages=core_messages
            )
            company_metadata_json = json.loads(response.choices[0].message.content)
            company_metadata_json['stock_code'] = stock_code
            validate_company_metadata(company_metadata_json)
        except json.JSONDecodeError as e:
            # Error handling specifically for JSON decoding errors
            print(f"{stock_code}: JSON decode error on attempt {8 - attempts}: {str(e)}")
            # Update the retry message for specific errors
            core_messages.append({"role": "user", "content": f"the JSON object provided is not valid, please provide a valid JSON object. Here is the error message: {str(e)}"})
            attempts -= 1
            # Missing the decrement for attempts here, also this block might not reset attempts correctly
        except ValueError as e:
            # Error handling for validation failures
            if "URL" in str(e) and "is not valid" in str(e):
                # Fetching page content should be within try-except block
                try:
                    page_content = httpx.get(company_metadata_json['website'], follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}).content
                    soup = BeautifulSoup(page_content, 'html.parser')
                    links = list(filter(lambda x: x != "" or x is not None, list(map(lambda link: link.get("href", "").split('?')[0], soup.find_all("a")))))
                    relevent_content = ", ".join(links)
                    msg = {"role": "user", "content": f"an error was seen in one of the URLs: {str(e)} is and/or is not valid, please provide a valid URL. Here are all the links and images found on the landing page of the company website: {relevent_content}, **USE THESE LINKS TO SELECT A VALID URL**"}
                    if not any([relevent_content in m.get("content", "") for m in core_messages]):
                        core_messages = core_messages[0:1] + [msg]
                except Exception as httpx_error:
                    print(f"HTTPX error when fetching page content: {httpx_error}")
                    core_messages.append({"role": "user", "content": f"couldn't use the company website link provided to get additional metadata, please ensure this link is correct. "})
            elif "is not an image" in str(e):
                 # Fetching page content should be within try-except block
                try:
                    page_content = httpx.get(company_metadata_json['website'], follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}).content
                    soup = BeautifulSoup(page_content, 'html.parser')
                    images = ", ".join(list(filter(lambda x: x != "" or x is not None or any(s in x for s in ("logo", "company")) or x.endsWith(('png', 'jpg', 'jpeg', 'svg')), list(map(lambda link: link.get("src", "").split('?')[0], soup.find_all("img"))))))
                    msg = {"role": "user", "content": f"an error was seen in one of the URLs: {str(e)} is and/or is not valid, please provide a valid URL. Here are the image links on the landing page of the company website: {images}, **USE THESE LINKS TO SELECT A VALID IMAGE, likely named logo**"}
                    if not any([images in m.get("content", "") for m in core_messages]):
                        core_messages = core_messages[0:1] + [msg]
                except Exception as httpx_error:
                    print(f"HTTPX error when fetching page content: {httpx_error}")
                    core_messages.append({"role": "user", "content": f"couldn't use the company website link provided to get additional metadata, please ensure this link is correct. "})
            else:
                core_messages.append({"role": "user", "content": f"there was a ValueError according to the provided schema, please ensure the values are correct is correct as per the schema. Here is the error message: {str(e)}"})
            attempts -= 1
            print(f"{stock_code}: Validation error on attempt {8 - attempts}: {str(e)}, {company_metadata_json}, {core_messages}")
            continue
        except RateLimitError as e:
            # Rate limit error handling
            print(f"{stock_code}: Rate limit error on attempt {8 - attempts}: {str(e)}")
            time.sleep(60)
        except Exception as e:
            # For unexpected errors, decrementing attempts and logging the attempt
            print(f"{stock_code}: Unexpected error on attempt {8 - attempts}: {str(e)}, {e}, json: {company_metadata_json}")
            if "429" in str(e):
                time.sleep(60)
            if "This model's maximum context length is" in str(e):
                print(core_messages)
                core_messages = base_messages + [{"role": "user", "content": f"give me information on the ASX company with stock code {stock_code} in JSON format using the above mentioned schema. Previous attempts have failed, so please double check your information"}]
            attempts -= 1
            continue  # Continue to the next iteration if it's not the final attempt

            # Common processing after successful response
        try:
            df = pd.json_normalize(company_metadata_json)
            df['stock_code'] = stock_code
            df.index = [stock_code]
            return df
        except Exception as e:
            print(f"{stock_code}:  Error normalizing JSON to DataFrame: {str(e)}")
            core_messages.append({"role": "user", "content": f"failed to normalize JSON object to DataFrame, likely malformed json, please ensure the structure is correct as per the schema"})
            attempts -= 1

    # After all attempts
    response = openapi_client.chat.completions.create(
                model=model, # 'gpt-3.5-turbo-0125',
                response_format={"type": "json_object"},
                messages=core_messages + [{"role": "user", "content": f"This is the final attempt, use the previously mentioned information to give me information on the ASX company with stock code:  {stock_code}. Ensure it is in JSON format using the above mentioned schema"}]
            )
    company_metadata_json = json.loads(response.choices[0].message.content)
    company_metadata_json['stock_code'] = stock_code
    df = pd.json_normalize(company_metadata_json)
    df['stock_code'] = stock_code
    df.index = [stock_code]
    return df
    # raise Exception(f"Failed to retrieve and process company metadata after 8 attempts. stock: {stock_code} messages: {core_messages}")

def resolve_company_logo_link(stock_code, initial_company_metadata):
    """
    use ChatGPT to resolve the company logo link
    where we feed in the company_metadata_json and the stock_code which will provide image links 
    retrieved from the website to act as hints as to what the company logo link could be.
    """
    model = 'gpt-3.5-turbo-0125' #'gpt-4-0125-preview' #
    attempts = 5  # Initialize attempt count
    core_messages = [{"role": "user", "content": 
"""resolve the company logo link for the ASX company with stock code {} using the following metadata: 
```
{}
```, return the response as JSON format following the structure: 
```json
{}
```
""".format(stock_code, initial_company_metadata, '{"company_logo_link": string}')}]
    while attempts > 0:
        try:
            response = openapi_client.chat.completions.create(
                model=model, #'gpt-3.5-turbo-0125',
                response_format={"type": "json_object"},
                messages=core_messages
            )
            company_metadata_json = json.loads(response.choices[0].message.content)
            company_metadata_json['stock_code'] = stock_code
            validate_company_logo_link(company_metadata_json)
        except json.JSONDecodeError as e:
            # Error handling specifically for JSON decoding errors
            print(f"{stock_code}: JSON decode error on attempt {8 - attempts}: {str(e)}")
            # Update the retry message for specific errors
            core_messages.append({"role": "user", "content": f"the JSON object provided is not valid, please provide a valid JSON object. Here is the error message: {str(e)}"})
            attempts -= 1
            # Missing the decrement for attempts here, also this block might not reset attempts correctly
        except ValueError as e:
            # Error handling for validation failures
            core_messages.append({"role": "user", "content": f"there was a ValueError according to the provided schema, please ensure the values are correct is correct as per the schema. Here is the error message: {str(e)}"})
            attempts -= 1
            print(f"{stock_code}: Validation error on attempt {8 - attempts}: {str(e)}, {company_metadata_json}, {core_messages}")
            continue
        except RateLimitError as e:
            # Rate limit error handling
            print(f"{stock_code}: Rate limit error on attempt {8 - attempts}: {str(e)}")
            time.sleep(60)
        except Exception as e:
            # For unexpected errors, decrementing attempts and logging the attempt
            print(f"{stock_code}: Unexpected error on attempt {8 - attempts}: {str(e)}, {e}")
            if "429" in str(e):
                time.sleep(60)
            if "This model's maximum context length is" in str(e):
                print(core_messages)
                core_messages = base_messages + [{"role": "user", "content": f"give me information on the ASX company with stock code {stock_code} in JSON format using the above mentioned schema. Previous attempts have failed, so please double check your information"}]
            attempts -= 1
            continue
        # Common processing after successful response
        try:
            df = pd.json_normalize(company_metadata_json)
            df['stock_code'] = stock_code
            df.index = [stock_code]
            return df
        except Exception as e:
            print(f"{stock_code}:  Error normalizing JSON to DataFrame: {str(e)}")
            core_messages.append({"role": "user", "content": f"failed to normalize JSON object to DataFrame, likely malformed json, please ensure the structure is correct as per the schema"})
            attempts -= 1
    # After all attempts
    df = pd.DataFrame({"company_logo_link": "Not Found"}, index=[stock_code])
    df['stock_code'] = stock_code
    return df
    

def get_company_metadata_light(stock_code):
    url = f"https://cdn-api.markitdigital.com/apiman-gateway/ASX/asx-research/1.0/companies/{stock_code}/about"
    try:
        response = httpx.get(url)
        if response.status_code in [400, 404]:
            df = pd.json_normalize({"company_name": "Not Found", "address": "Not Found", "summary": "Not Found", "details": "Not Found", "website": "Not Found"})
            df['stock_code'] = stock_code
            df.index = [stock_code]
            return df
        elif response.status_code > 400:
            raise ValueError(f"Failed to get company metadata for stock code: {stock_code}, status_code: {response.status_code}, response: {response.text}")
        company_metadata = response.json()
        data = {
        "company_name": company_metadata['data']['displayName'],
        "address": company_metadata['data']['addressContact']['address'],
        "summary": company_metadata['data']['description'],
        "details": company_metadata['data']['description'],
        "website": company_metadata['data']['websiteUrl'],
    }
    except Exception as e:
        raise ValueError(f"Failed to get company metadata for stock code: {stock_code}, error: {str(e)}, status_code: {response.status_code}, response: {response.text}")
    # {"data":{"displayName":"PILBARA MINERALS LIMITED","issueType":"CS","symbol":"PLS","xid":"8027516","addressContact":{"address":"Level 2, 146 Colin Street, WEST PERTH, WA, AUSTRALIA, 6005","fax":"+61 8 6266 6288","phone":"+61 8 6266 6266"},"addressShareRegistry":{"address":"Level 17, 221 St Georges Terrace, PERTH, WA, AUSTRALIA, 6000","attention":"COMPUTERSHARE INVESTOR SERVICES PTY LIMITED","phone":"1300 850 505"},"description":"Lithium and tantalum producer and explorer","directors":[{"name":"Ms Kathleen Marie Conlon","title":"Non Exec. Chair"},{"name":"Mr Dale Henderson","title":"Managing Director, CEO"},{"name":"Mr Nicholas Cernotta","title":"Non Exec. Director"},{"name":"Ms Sally-Anne Layman","title":"Non Exec. Director"},{"name":"Mr Stephen John Scudamore","title":"Non Exec. Director"},{"name":"Ms Miriam Erin Lyons Stanborough","title":"Non Exec. Director"},{"name":"Mr Vince De Carolis","title":"Chief Op. Officer"},{"name":"Mr Luke Bortoli","title":"CFO"}],"foreignExempt":false,"websiteUrl":"http://www.pilbaraminerals.com.au","secretaries":[{"name":"Mrs Danielle Lee Webber","title":"Company Secretary"}],"indices":[{"symbol":"XMM","xid":"5313384","displayName":"S&P/ASX 300 Metals and Mining Index","issueType":"IN","issueTypeName":"Index"},{"symbol":"XKO","xid":"574995","displayName":"S&P/ASX 300","issueType":"IN","issueTypeName":"Index"},{"symbol":"XMJ","xid":"577650","displayName":"S&P/ASX 200 Materials (Sector)","issueType":"IN","issueTypeName":"Index"},{"symbol":"XJO","xid":"583954","displayName":"S&P/ASX 200","issueType":"IN","issueTypeName":"Index","isin":"XC0006013624"},{"symbol":"XTO","xid":"595470","displayName":"S&P/ASX 100","issueType":"IN","issueTypeName":"Index"},{"symbol":"XFL","xid":"599587","displayName":"S&P/ASX 50","issueType":"IN","issueTypeName":"Index","isin":"XC0005704660"},{"symbol":"XAO","xid":"601823","displayName":"S&P/ASX ALL ORDINARIES","issueType":"IN","issueTypeName":"Index"},{"symbol":"XAT","xid":"7466518","displayName":"S&P/ASX All Australian 200","issueType":"IN","issueTypeName":"Index"},{"symbol":"XAF","xid":"7466519","displayName":"S&P/ASX All Australian 50","issueType":"IN","issueTypeName":"Index"}]}}{"data":{"displayName":"PILBARA MINERALS LIMITED","issueType":"CS","symbol":"PLS","xid":"8027516","addressContact":{"address":"Level 2, 146 Colin Street, WEST PERTH, WA, AUSTRALIA, 6005","fax":"+61 8 6266 6288","phone":"+61 8 6266 6266"},"addressShareRegistry":{"address":"Level 17, 221 St Georges Terrace, PERTH, WA, AUSTRALIA, 6000","attention":"COMPUTERSHARE INVESTOR SERVICES PTY LIMITED","phone":"1300 850 505"},"description":"Lithium and tantalum producer and explorer","directors":[{"name":"Ms Kathleen Marie Conlon","title":"Non Exec. Chair"},{"name":"Mr Dale Henderson","title":"Managing Director, CEO"},{"name":"Mr Nicholas Cernotta","title":"Non Exec. Director"},{"name":"Ms Sally-Anne Layman","title":"Non Exec. Director"},{"name":"Mr Stephen John Scudamore","title":"Non Exec. Director"},{"name":"Ms Miriam Erin Lyons Stanborough","title":"Non Exec. Director"},{"name":"Mr Vince De Carolis","title":"Chief Op. Officer"},{"name":"Mr Luke Bortoli","title":"CFO"}],"foreignExempt":false,"websiteUrl":"http://www.pilbaraminerals.com.au","secretaries":[{"name":"Mrs Danielle Lee Webber","title":"Company Secretary"}],"indices":[{"symbol":"XMM","xid":"5313384","displayName":"S&P/ASX 300 Metals and Mining Index","issueType":"IN","issueTypeName":"Index"},{"symbol":"XKO","xid":"574995","displayName":"S&P/ASX 300","issueType":"IN","issueTypeName":"Index"},{"symbol":"XMJ","xid":"577650","displayName":"S&P/ASX 200 Materials (Sector)","issueType":"IN","issueTypeName":"Index"},{"symbol":"XJO","xid":"583954","displayName":"S&P/ASX 200","issueType":"IN","issueTypeName":"Index","isin":"XC0006013624"},{"symbol":"XTO","xid":"595470","displayName":"S&P/ASX 100","issueType":"IN","issueTypeName":"Index"},{"symbol":"XFL","xid":"599587","displayName":"S&P/ASX 50","issueType":"IN","issueTypeName":"Index","isin":"XC0005704660"},{"symbol":"XAO","xid":"601823","displayName":"S&P/ASX ALL ORDINARIES","issueType":"IN","issueTypeName":"Index"},{"symbol":"XAT","xid":"7466518","displayName":"S&P/ASX All Australian 200","issueType":"IN","issueTypeName":"Index"},{"symbol":"XAF","xid":"7466519","displayName":"S&P/ASX All Australian 50","issueType":"IN","issueTypeName":"Index"}]}}
   
    df = pd.json_normalize(data)
    df['stock_code'] = stock_code
    df.index = [stock_code]
    return df

def get_company_website_content(stock_code, url):
    try:
        soup = get_page_content(url)
    except Exception as e:
        df = pd.DataFrame({"links": ["NA"], "images": ["NA"]})
        df['stock_code'] = stock_code
        return df
    links = get_page_links(soup)
    images = get_page_images(soup)
    df = pd.json_normalize({"links": links, "images": images})
    df['stock_code'] = stock_code
    return df
def get_page_content(url):
    page_content = httpx.get(url, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}).text
    soup = BeautifulSoup(page_content, 'html.parser')
    return soup

def get_page_links(soup: BeautifulSoup):
    links = list(filter(lambda x: x != "" or x is not None, list(map(lambda link: link.get("href", "").split('?')[0], soup.find_all("a")))))
    return links

def get_page_images(soup: BeautifulSoup):
    images = ", ".join(list(filter(lambda x: x != "" or x is not None or x.endswith(('png', 'jpg', 'jpeg', 'svg')), list(map(lambda link: link.get("src", "").split('?')[0], soup.find_all("img"))))))
    return images
    

In [201]:
# download asx directory from asx
# from: https://www.asx.com.au/markets/trade-our-cash-market/directory
import time
import os
data_url = 'https://data.gov.au/data/dataset/bc515135-4bb6-4d50-957a-3713709a76d3/resource/55ad4b1c-5eeb-44ea-8b29-d410da431be3/download/business_names_202404.csv'
# Initialize an HTTP client
client = httpx.Client()
file_path = f"data/{data_url.split('/')[-1]}"
asx_company_url = 'https://asx.api.markitdigital.com/asx-research/1.0/companies/directory/file?access_token=83ff96335c2d45a094df02a206a39ff4'
company_list_path = f"data/ASX_Listed_Companies_07-04-2024_11-03-45_AEST.csv"
download_file(client, data_url, company_list_path)
import chardet
with open(company_list_path, 'rb') as f:
    result = chardet.detect(f.read(10000))
encoding = result['encoding']
df_company_list = pd.read_csv(company_list_path, encoding=encoding, engine='python', sep=None)

# iterate through the stocks and get the metadata from the openAI query. Parallelise this using dask
stocks = list(df_company_list[['ASX code', 'Company name']].itertuples(index=False, name=None))

In [308]:
res = get_company_website_content('ZNC', 'http://www.zenithminerals.com.au')

print(res)

<!doctype html>
<html dir="ltr" lang="en-US" prefix="og: https://ogp.me/ns#">
<head>
	<meta charset="UTF-8">
		<meta name="viewport" content="width=device-width, initial-scale=1">
	<link rel="profile" href="https://gmpg.org/xfn/11">
	<title>Home - Zenith Minerals</title>

		<!-- All in One SEO 4.5.1.1 - aioseo.com -->
		<meta name="description" content="Australian-basedmining exploration companyfor the products of tomorrow Zenith Minerals (ASX:ZNC) is a mining exploration company focused on supporting the increasing demand for metals critical to the global energy transition. The company has a portfolio focused on Lithium, Gold and Base Metals. Read more CompanyOverview Read more Investor Centre Read more Sustainability Read more Announcements SHARE" />
		<meta name="robots" content="max-image-preview:large" />
		<link rel="canonical" href="https://www.zenithminerals.com.au/" />
		<meta name="generator" content="All in One SEO (AIOSEO) 4.5.1.1" />
		<meta property="og:locale" content="e

In [270]:

# delayed_results = [dask.delayed(get_company_metadata_light)(stock) for stock, company_name in stocks]

#ddf = dd.from_delayed(delayed_results)


#with ProgressBar():
#    basic_metadata_df = ddf.compute()

#basic_metadata_df.head()

[####                                    ] | 11% Completed | 7.73 sms


KeyboardInterrupt: 

In [231]:
#basic_metadata_df.reset_index()
#basic_metadata_df.to_csv("data/asx_company_metadata_basic.csv", index=False)

In [4]:
basic_metadata_filepath = "data/asx_company_metadata_basic.csv"
df_basic_company_metadata_list = pd.read_csv(basic_metadata_filepath, engine='python')
non_empty_stocks = df_basic_company_metadata_list[df_basic_company_metadata_list['company_name'] != "Not Found"]
websites = list(non_empty_stocks[['stock_code', 'website']].itertuples(index=False, name=None))
delayed_results = [dask.delayed(get_company_website_content)(str(stock_code), website) for stock_code, website in websites]

ddf = dd.from_delayed(delayed_results)


with ProgressBar():
    company_website_links = ddf.compute()

company_website_links.head()

[########################################] | 100% Completed | 234.36 s


Unnamed: 0,links,images,stock_code
0,"[http://www.facebook.com/1414Degrees/, https:/...",https://1414degrees.com.au/wp-content/uploads/...,14D
0,"[/, https://adalta.com.au/, https://adalta.com...",/wp-content/themes/adalta/images/hamburger-ico...,1AD
0,"[#genesis-nav-primary, #genesis-content, #gene...",https://auroraenergymetals.com/wp-content/uplo...,1AE
0,"[https://alterra.com.au, https://alterra.com.a...",https://alterra.com.au/wp-content/uploads/2022...,1AG
0,"[/cart, #page, /, /, /about, /team, /scientifi...",//images.squarespace-cdn.com/content/v1/648f83...,1AI


In [5]:
merged_df = pd.merge(df_basic_company_metadata_list, company_website_links, on='stock_code', how='outer')

In [6]:
merged_df

Unnamed: 0,company_name,address,summary,details,website,stock_code,links,images
0,1414 DEGREES LIMITED,"136 Daws Road, MELROSE PARK, SA, AUSTRALIA, 5039","Commercialising energy storage technology, the...","Commercialising energy storage technology, the...",http://www.1414degrees.com.au,14D,"[http://www.facebook.com/1414Degrees/, https:/...",https://1414degrees.com.au/wp-content/uploads/...
1,ADALTA LIMITED,"Unit 15, 2 Park Drive, BUNDOORA, VIC, AUSTRALI...",Drug discovery and development,Drug discovery and development,http://www.adalta.com.au,1AD,"[/, https://adalta.com.au/, https://adalta.com...",/wp-content/themes/adalta/images/hamburger-ico...
2,AURORA ENERGY METALS LIMITED,"Suite 1, 245 Churchill Ave, SUBIACO, WA, AUSTR...",Mineral exploration and development,Mineral exploration and development,https://auroraenergymetals.com/,1AE,"[#genesis-nav-primary, #genesis-content, #gene...",https://auroraenergymetals.com/wp-content/uplo...
3,ALTERRA LIMITED,"Level 3, 150 St Georges Terrace, PERTH, WA, AU...","Alterra is an originator, developer and manage...","Alterra is an originator, developer and manage...",http://www.alterra.com.au,1AG,"[https://alterra.com.au, https://alterra.com.a...",https://alterra.com.au/wp-content/uploads/2022...
4,ALGORAE PHARMACEUTICALS LIMITED,"Level 23, 525 Collins Street, MELBOURNE, VIC, ...",Pharmaceutical development,Pharmaceutical development,https://algoraepharma.com/,1AI,"[/cart, #page, /, /, /about, /team, /scientifi...",//images.squarespace-cdn.com/content/v1/648f83...
...,...,...,...,...,...,...,...,...
1995,ZELIRA THERAPEUTICS LIMITED,"Level 26, 140 St Georges Terrace, PERTH, WA, A...",Bio-pharmaceutical,Bio-pharmaceutical,https://zeliratx.com/,ZLD,"[https://www.facebook.com/Zeliratx/, https://t...",https://zeliratx.com/wp-content/uploads/2021/0...
1996,ZINC OF IRELAND NL,"Suite B9, 431 Roberts Road, SUBIACO, WA, AUSTR...",Zinc and Gold Exploration,Zinc and Gold Exploration,https://www.zincofireland.com.au,ZMI,"[/, /, #, /about-us/vision-values, /about-us/c...","/images/required-imgs/zmi-logo.png, /images/he..."
1997,ZIMI LIMITED,"Level 1, 2A / 300 Fitzgerald Street, NORTH PER...",Zimi Limited is an innovative Australian techn...,Zimi Limited is an innovative Australian techn...,https://zimi.life/,ZMM,"[/, /lifestyle, #, /product/multi-purpose-swit...","https://www.facebook.com/tr, /img/logo-rounded..."
1998,ZENITH MINERALS LIMITED,"Level 2, 33 Ord Street, WEST PERTH, WA, AUSTRA...","Identification, exploration and development of...","Identification, exploration and development of...",http://www.zenithminerals.com.au,ZNC,"[#content, https://www.zenithminerals.com.au, ...",https://www.zenithminerals.com.au/wp-content/u...


In [7]:
merged_df.to_csv("data/asx_company_metadata_with_links.csv", index=False)

In [24]:
from sqlalchemy import create_engine
source_df = pd.read_csv("data/asx_company_metadata_with_links.csv")
database_url = "postgresql://admin:password@localhost:5432/cms"
engine = create_engine(database_url)
source_df.to_sql('metadata', engine, if_exists='replace', index=False)

source_df

Unnamed: 0,company_name,address,summary,details,website,stock_code,links,images
0,1414 DEGREES LIMITED,"136 Daws Road, MELROSE PARK, SA, AUSTRALIA, 5039","Commercialising energy storage technology, the...","Commercialising energy storage technology, the...",http://www.1414degrees.com.au,14D,"['http://www.facebook.com/1414Degrees/', 'http...",https://1414degrees.com.au/wp-content/uploads/...
1,ADALTA LIMITED,"Unit 15, 2 Park Drive, BUNDOORA, VIC, AUSTRALI...",Drug discovery and development,Drug discovery and development,http://www.adalta.com.au,1AD,"['/', 'https://adalta.com.au/', 'https://adalt...",/wp-content/themes/adalta/images/hamburger-ico...
2,AURORA ENERGY METALS LIMITED,"Suite 1, 245 Churchill Ave, SUBIACO, WA, AUSTR...",Mineral exploration and development,Mineral exploration and development,https://auroraenergymetals.com/,1AE,"['#genesis-nav-primary', '#genesis-content', '...",https://auroraenergymetals.com/wp-content/uplo...
3,ALTERRA LIMITED,"Level 3, 150 St Georges Terrace, PERTH, WA, AU...","Alterra is an originator, developer and manage...","Alterra is an originator, developer and manage...",http://www.alterra.com.au,1AG,"['https://alterra.com.au', 'https://alterra.co...",https://alterra.com.au/wp-content/uploads/2022...
4,ALGORAE PHARMACEUTICALS LIMITED,"Level 23, 525 Collins Street, MELBOURNE, VIC, ...",Pharmaceutical development,Pharmaceutical development,https://algoraepharma.com/,1AI,"['/cart', '#page', '/', '/', '/about', '/team'...",//images.squarespace-cdn.com/content/v1/648f83...
...,...,...,...,...,...,...,...,...
1995,ZELIRA THERAPEUTICS LIMITED,"Level 26, 140 St Georges Terrace, PERTH, WA, A...",Bio-pharmaceutical,Bio-pharmaceutical,https://zeliratx.com/,ZLD,"['https://www.facebook.com/Zeliratx/', 'https:...",https://zeliratx.com/wp-content/uploads/2021/0...
1996,ZINC OF IRELAND NL,"Suite B9, 431 Roberts Road, SUBIACO, WA, AUSTR...",Zinc and Gold Exploration,Zinc and Gold Exploration,https://www.zincofireland.com.au,ZMI,"['/', '/', '#', '/about-us/vision-values', '/a...","/images/required-imgs/zmi-logo.png, /images/he..."
1997,ZIMI LIMITED,"Level 1, 2A / 300 Fitzgerald Street, NORTH PER...",Zimi Limited is an innovative Australian techn...,Zimi Limited is an innovative Australian techn...,https://zimi.life/,ZMM,"['/', '/lifestyle', '#', '/product/multi-purpo...","https://www.facebook.com/tr, /img/logo-rounded..."
1998,ZENITH MINERALS LIMITED,"Level 2, 33 Ord Street, WEST PERTH, WA, AUSTRA...","Identification, exploration and development of...","Identification, exploration and development of...",http://www.zenithminerals.com.au,ZNC,"['#content', 'https://www.zenithminerals.com.a...",https://www.zenithminerals.com.au/wp-content/u...


In [47]:
example = merged_df.iloc[0][["company_name", "website", "images"]].to_dict()

res = resolve_company_logo_link('14D', example)

In [48]:
for index, row in merged_df.iterrows():
    stock_code = row.stock_code
    company_metadata = row.to_dict()
    try:
        res = resolve_company_logo_link(stock_code, company_metadata)
        print(res)
        break
    except Exception as e:
        print(f"Failed to resolve company logo link for stock code: {stock_code}, error: {str(e)}")
        continue

                                     company_logo_link stock_code
14D  https://1414degrees.com.au/wp-content/uploads/...        14D


In [60]:
source_df = pd.read_csv("data/asx_company_metadata_with_links.csv")

delayed_results = [dask.delayed(resolve_company_logo_link)(metadata.stock_code, metadata[["company_name", "website", "images"]].to_dict()) for index, metadata in source_df.iterrows()]

ddf = dd.from_delayed(delayed_results)


with ProgressBar():
    company_logo_link_df = ddf.compute()

[########################################] | 100% Completed | 25m 36s


In [61]:
company_logo_link_df.iloc[5].to_dict()

{'company_logo_link': 'https://oneclickgroup.com.au/wp-content/uploads/2022/06/OCG_ICON.png',
 'stock_code': '1CG'}

In [63]:
metadata_df_with_images = pd.merge(source_df, company_logo_link_df, on='stock_code', how='outer')

metadata_df_with_images
metadata_df_with_images.to_csv("data/asx_company_metadata_with_images.csv", index=False)

In [71]:
from numbers import Number
from dask import delayed, compute
# Function to fetch headers and extract size and content type
@delayed
def fetch_image_data(url, metadata):
    try:
        with httpx.Client(timeout=10.0) as client:
            response = client.get(url, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"})
            folder = "NUM" if isinstance(metadata['stock_code'].upper()[0], Number) else  metadata['stock_code'].upper()[0]
            # TODO: write image to disk under directory: data/images/{folder}/{metadata['stock_code']}/logo.png
            # TODO: write metadata to metadata.json in the same directory
    except Exception as e:
        return pd.json_normalize({'content_length': 0, 'content_type': 'error'})

# Assume metadata_df_with_images is defined and has a 'company_logo_link' column
# Create a list of delayed objects for each URL in the DataFrame
delayed_results = [fetch_image_data(metadata['company_logo_link'], metadata) for metadata in metadata_df_with_images]

# Create a Dask DataFrame from delayed objects
ddf = dd.from_delayed(delayed_results)

# Compute the results with progress bar
with ProgressBar():
    company_logo_link_df = ddf.compute()


[########################################] | 100% Completed | 73.25 s
    content_length content_type
0            34928          png
0             5097          png
0                0          svg
0             5047          png
0            46324          png
..             ...          ...
0            94504          png
0            19659          png
0             2866          svg
0              146         html
0            16819         jpeg

[2000 rows x 2 columns]
Total Data Size: 150640273 bytes
content_type
png                         913
svg                         461
error                       398
jpeg                        141
html                         59
                              8
html; charset=UTF-8           7
html; charset=iso-8859-1      3
html; charset=utf8            3
html; charset=utf-8           2
plain;charset=UTF-8           2
gif                           1
webp                          1
html;charset=utf-8            1
Name: count, dtype: int64


In [1]:
!pip install cairosvg lxml
from xml.etree.ElementTree import Element



In [15]:
# I want to download all the images from the metadtata_df_with_images dataframe and do an analysis of the image sizes and types. Use httpx for the request client
import httpx
import json
from dask import delayed, compute
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import os
import pandas as pd
import cairosvg

# Helper function to determine if the first character is a number
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

# Delayed function to fetch and save image data
@delayed
def fetch_image_data(url, metadata):
    try:
        with httpx.Client(timeout=10.0) as client:
            response = client.get(url, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"})
            folder = "NUM" if is_number(metadata['stock_code'][0]) else metadata['stock_code'][0]
            directory_path = f"data/images/{folder}/{metadata['stock_code']}"
            os.makedirs(directory_path, exist_ok=True)
            
            image_path = os.path.join(directory_path, f"{metadata['stock_code']}.png")
            # Check if the response content-type is SVG and convert if necessary
            if response.headers.get('content-type') == 'image/svg+xml' or url.endswith('.svg'):
                cairosvg.svg2png(bytestring=response.content, write_to=image_path)
            else:
                # Save the image directly if not SVG
                with open(image_path, 'wb') as f:
                    f.write(response.content)
            
            # Save the metadata
            metadata_path = os.path.join(directory_path, "metadata.json")
            with open(metadata_path, 'w') as f:
                json.dump(metadata, f, indent=4)
                
            return pd.json_normalize({
                'content_length': len(response.content),
                'content_type': response.headers.get('content-type')
            })
    except Exception as e:
        return pd.json_normalize({'content_length': 0, 'content_type': 'error'})


In [25]:
# iterate over all logo.png files and change the name to {metadata['stock_code']} in analysis/data/images
import os
for root, dirs, files in os.walk('data/images'):
    for file in files:
        if file == 'logo.png':
            stock_code = os.path.basename(root)
            new_name = os.path.join(root, f"{stock_code}.png")
            os.rename(os.path.join(root, file), new_name)

In [16]:
metadata_df_with_images = pd.read_csv("data/asx_company_metadata_with_images.csv")
# Assume metadata_df_with_images is defined and has a 'company_logo_link' column
# Extract data from the DataFrame for processing
# metadata_list = metadata_df_with_images[metadata_df_with_images['stock_code'] == 'JIN'].to_dict(orient='records')
metadata_list = metadata_df_with_images.to_dict(orient='records')
fetch_image_data(metadata_list[0]['company_logo_link'], metadata_list[0]).compute()

Unnamed: 0,content_length,content_type
0,34928,image/png


In [17]:


# Create a list of delayed objects for each URL in the DataFrame
delayed_results = [fetch_image_data(metadata['company_logo_link'], metadata) for metadata in metadata_list]

# Create a Dask DataFrame from delayed objects
ddf = dd.from_delayed(delayed_results)

# Compute the results with a progress bar
with ProgressBar():
    company_logo_link_df = ddf.compute()


[########################################] | 100% Completed | 79.95 s


In [213]:

delayed_results = [dask.delayed(get_company_metadata)(stock, company_name) for stock, company_name in stocks]

ddf = dd.from_delayed(delayed_results)


with ProgressBar():
    agg_df = ddf.compute()

agg_df[agg_df['company_name'] == 'Not Found']

KeyboardInterrupt: 

In [27]:
metadata_df_with_images = pd.read_csv("data/asx_company_metadata_with_images.csv")



TypeError: string indices must be integers, not 'str'

In [51]:
metadata_df_with_images['gcsUrl'] = metadata_df_with_images['stock_code'].apply(lambda x: f"https://storage.googleapis.com/shorted-company-logos/logos/{str(x).upper()}.png")
metadata_df_with_images
asx_data = pd.read_csv("data/ASX_Listed_Companies_07-04-2024_11-03-45_AEST.csv", engine='python')

final_dataset = pd.merge(asx_data, metadata_df_with_images, left_on='ASX code', right_on='stock_code', how='outer')

final_dataset.rename(columns={'GICs industry group': 'industry', 'Market Cap': 'market_gap', 'Listing date': 'listing_date'}, inplace=True)
final_dataset.drop_duplicates()
final_dataset.drop('ASX code', axis=1, inplace=True)
final_dataset
from sqlalchemy import create_engine
database_url = "postgresql://admin:password@localhost:5432/shorts"
engine = create_engine(database_url)
final_dataset.to_sql('metadata', engine, if_exists='replace', index=False)

final_dataset.to_csv("data/asx_company_metadata_final.csv", index=False)

In [165]:
agg_df.iloc[9].to_dict()

# page_content = httpx.get('https://algoraepharma.com.au/',follow_redirects=True).content
# soup = BeautifulSoup(page_content, 'html.parser')
# links = list(filter(lambda x: x is not None, list(map(lambda link: link.get("href"), soup.find_all("a")))))
# images = list(filter(lambda x: x is not None, list(map(lambda link: link.get("src"), soup.find_all("img")))))
# print(", ".join(links + images))


{'company_name': 'MORELLA CORPORATION LIMITED',
 'address': 'Level 3, 100 Havelock Street, West Perth, WA 6005',
 'summary': 'Morella Corporation Limited is a technology company focused on innovative waste plastic recycling and low-emission hydrogen production.',
 'details': "Morella Corporation Limited operates in the clean technology industry, with a specific focus on waste plastic recycling and hydrogen production. The company is headquartered in West Perth, Western Australia. Morella's mission is to develop sustainable solutions to tackle environmental challenges.",
 'website': 'https://www.morellacorp.com.au',
 'website_about': 'https://www.morellacorp.com.au/about-us',
 'company_logo_link': 'https://www.morellacorp.com.au/logo.png',
 'directors': [{'name': 'John Smith', 'title': 'CEO'},
  {'name': 'Emily Johnson', 'title': 'CFO'}],
 'references': [{'url': 'https://www.asx.com.au/asx/share-price-research/company/1MC',
   'description': 'ASX Company Overview'},
  {'url': 'https://w

In [None]:
# TODO:
# refactor code into a resolver pattern, where each field of the schema has its own "resolver". 
# Each resolver can then have a more specific context provided and appropriate error handling, as well as fallback/default behaviour. 
# Some advantages this will provide:
# * after x attempts to resolve a field, we can provide a default value
# * we can provide more specific error handling for each field
# * we can provide more specific context for each field
# * we can provide more specific error messages for each field and typing
# * we can provide more specific error messages for each field and validation
# * we could enhance the parallelisation of the data discovery as we could break up work across different resolvers, however would likely translate to even more queries and cost etc.
# * resolvers would be entirely domain/context specific workflows and hence can be customised and extended as needed
# * resolvers could leverage vision capability to evaluate different images for the company logo link and best fit