In [1]:
import pandas as pd
from mvodolagin_personal_imports import *

load_dotenv()

from mvodolagin_personal_imports.langchain_stuff import *

In [2]:
# Root folder of the scraped sites, stored as an absolute path.
scraped_data_dir = Path("./root_pages").absolute()

In [25]:
# Pick one scraped site at random to explore by hand: the first entry (after
# shuffling) that holds at least 10 HTML pages.
# NOTE(review): no seed is set in the visible cells, so the sampled site
# presumably differs between runs -- confirm whether that is intended.
all_subdirs = list(scraped_data_dir.iterdir())
random.shuffle(all_subdirs)

for subdir in all_subdirs:
    html_files = list(subdir.glob("*.html"))
    if len(html_files) >= 10:
        break
else:
    # Without this guard the loop would fall through silently and leave
    # `subdir` / `html_files` pointing at the last (unsuitable) entry.
    raise RuntimeError(f"No entry in {scraped_data_dir} contains at least 10 HTML files")


In [52]:
from bs4 import BeautifulSoup

# Collect every <a href> of the sampled site and split the links into
# contact / internal / external / unclassified groups.
all_links = []

for fp in html_files:
    # Explicit UTF-8: without it the platform's locale encoding is used, which
    # differs from how check_regexp_match / get_links read the same files.
    with open(fp, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html5lib")
    links = soup.find_all("a")
    links = [link.get("href") for link in links]
    # De-duplicate per page so a link's total count equals the number of pages showing it.
    links = list(set(links))
    all_links.extend(links)

# The folder name doubles as the site's host name.
root_url = subdir.name

contact_prefixes = ("mailto:", "tel:", "sms:", "whatsapp:")

# Drop anchors without an href (None) and empty hrefs.
unprocessed_links = [link for link in all_links if link]

contact_links = [link for link in unprocessed_links if link.lower().startswith(contact_prefixes)]
# Set membership replaces the former list scans (quadratic in the number of links).
already_classified = set(contact_links)
unprocessed_links = [link for link in unprocessed_links if link not in already_classified]

# Internal: site-relative paths, or any link mentioning the host name.
# Protocol-relative links ("//host/...") are never internal.
internal_links = [link for link in unprocessed_links if
                  (not link.startswith("//") and (link.startswith("/") or root_url in link))]
already_classified = set(internal_links)
unprocessed_links = [link for link in unprocessed_links if link not in already_classified]

external_links = [link for link in unprocessed_links if link.startswith(("//", "http"))]
already_classified = set(external_links)
unprocessed_links = [link for link in unprocessed_links if link not in already_classified]

print(len(all_links), len(contact_links), len(internal_links), len(external_links), len(unprocessed_links))

1385 0 1211 121 38


In [53]:
# Number of distinct internal URLs; `internal_links` itself holds one entry
# per page on which a link appears.
len(set(internal_links))

172

In [56]:
page_total = len(html_files)
print(page_total)

# How many pages each internal link appears on (links were de-duplicated per page).
link_counts = Counter(internal_links)

# Links present on at least this fraction of the pages (but not all) are "common".
threshold = 0.3
rare_cutoff = threshold * page_total

universal_links = [url for url, seen in link_counts.items() if seen == page_total]
common_links = [url for url, seen in link_counts.items() if rare_cutoff <= seen < page_total]
rare_links = [url for url, seen in link_counts.items() if seen < rare_cutoff]

# The three bucket sizes should add up to the number of distinct links.
print(len(universal_links), len(common_links), len(rare_links), len(link_counts))

13
84 1 87 172


In [59]:
rare_links

['/collections/queen-beds?page=3',
 '/collections/queen-beds/products/corbin-queen-bed-dresser-with-mirror-nightstand',
 '/collections/queen-beds?page=2',
 '/collections/queen-beds/products/ashley-wynnlow-gray-queen-bed-w-dresser-mirror-b440-grp14320',
 '/collections/queen-beds/products/kate-white-queen-bed-w-dresser-mirror-nightstand',
 '/collections/queen-beds/products/pompei-metallic-grey-queen-bed-and-dresser-with-mirror-nightstand',
 '/collections/queen-beds?page=9',
 '/collections/queen-beds/products/ashley-drystan-queen-bed-with-storage-footboard-b211-54s-57-96',
 '/collections/queen-beds/products/linda-new-merlot-king-bedroom-set',
 '/collections/queen-beds/products/kate-grey-queen-bed-w-dresser-mirror-nightstand',
 '/collections/queen-beds/products/kate-grey-queen-bed-w-dresser-mirror',
 '/collections/queen-beds/products/kate-white-queen-bed-w-dresser-mirror',
 '/collections/queen-beds/products/bevelle-queen-bed-w-dresser-mirror-nightstand',
 '/collections/queen-beds/products/

{'category': '/collections/[a-z0-9-]+',
 'item': '/collections/[a-z-]+/products/[a-z-]+'}

In [70]:
# NOTE(review): no visible cell assigns `counts` before this point in the file;
# presumably it came from a removed cell that ran the pattern check on the
# sampled site -- restore that cell so the notebook survives Restart & Run All.
counts

Unnamed: 0,both,category,item,miss,total
universal,0,64,0,20,84
common,0,1,0,0,1
rare,81,6,0,0,87


In [145]:
from bs4 import BeautifulSoup
from loguru import logger


def get_count(link_collection, patterns):
    """Tally how the links of one bucket relate to the category/item regexes.

    Matching uses ``re.match``, i.e. a pattern only has to match a prefix of
    the link unless the pattern itself is anchored at the end.

    Args:
        link_collection: Sized collection of URL strings.
        patterns: Mapping of pattern name to regex source; must contain the
            keys ``"category"`` and ``"item"``.

    Returns:
        dict with the keys ``both`` (links matching every pattern in
        ``patterns``), ``category`` / ``item`` (links matching that pattern
        but not counted under ``both``), ``miss`` (the remainder) and
        ``total`` (``len(link_collection)``).
    """
    tally = {"both": 0, "category": 0, "item": 0}

    for url in link_collection:
        matches_everything = all([re.match(regex, url) for regex in patterns.values()])
        if matches_everything:
            tally["both"] += 1
        # The single-pattern buckets exclude links already counted under "both".
        if re.match(patterns["category"], url) and not matches_everything:
            tally["category"] += 1
        if re.match(patterns["item"], url) and not matches_everything:
            tally["item"] += 1

    total = len(link_collection)
    tally["miss"] = total - tally["both"] - tally["category"] - tally["item"]
    tally["total"] = total
    return tally


def check_regexp_match(subdir, patterns=None):
    """Measure how well a site's category/item regexes cover its internal links.

    Link extraction and frequency bucketing are delegated to ``get_links`` so
    the two functions cannot drift apart (they used to carry identical copies
    of that code).

    Args:
        subdir: Directory with the site's scraped ``*.html`` pages.
        patterns: Mapping with ``"category"`` and ``"item"`` regexes. When
            empty or omitted, it is read from ``<subdir>/patterns.json``.

    Returns:
        ``None`` when ``get_links`` skips the site (too few pages); otherwise
        a tuple ``(counts, patterns)`` where ``counts`` is a DataFrame with
        one row per bucket (universal / common / rare) and the columns
        produced by ``get_count``.

    Raises:
        FileNotFoundError: ``patterns`` was not given and the site has no
            ``patterns.json``.
    """
    links_by_bucket = get_links(subdir)
    if links_by_bucket is None:
        return None

    if not patterns:
        # Context manager so the file handle is closed deterministically.
        with (subdir / "patterns.json").open("r", encoding="utf-8") as patterns_file:
            patterns = json.load(patterns_file)
    logger.debug(f"Base patterns: {patterns}")

    counts = {
        bucket: get_count(links_by_bucket[bucket], patterns)
        for bucket in ("universal", "common", "rare")
    }
    # Transpose so that buckets become rows and count kinds become columns.
    counts = pd.DataFrame(counts).T

    return counts, patterns


In [109]:
def get_links(subdir, min_pages=10, threshold=0.3):
    """Collect a scraped site's links and bucket the internal ones by frequency.

    Args:
        subdir: Directory with the site's scraped ``*.html`` pages; the
            directory name is used as the site's host name.
        min_pages: Sites with fewer HTML pages than this are skipped.
        threshold: Fraction of pages a link must appear on to count as
            "common"; links below it are "rare".

    Returns:
        ``None`` if the site has fewer than ``min_pages`` pages, otherwise a
        dict with the keys ``universal`` (internal links present on every
        page), ``common`` (on at least ``threshold`` of the pages, but not
        all), ``rare`` (the rest) and ``contact`` (mailto/tel/sms/whatsapp
        links, one entry per page they appear on).
    """
    html_files = list(subdir.glob("*.html"))
    if len(html_files) < min_pages:
        return None
    page_total = len(html_files)

    # region Extracting links

    all_links = []
    for fp in html_files:
        with open(fp, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "html5lib")
        # De-duplicate per page so a link's count equals the number of pages showing it.
        all_links.extend(set(anchor.get("href") for anchor in soup.find_all("a")))

    root_url = subdir.name
    logger.debug(f"Root URL: {root_url}")

    contact_prefixes = ("mailto:", "tel:", "sms:", "whatsapp:")
    contact_links = []
    internal_links = []
    external_total = 0
    unprocessed_total = 0

    # Classify each link exactly once; this replaces repeated `link not in <list>`
    # filtering, which was quadratic in the number of links.
    for link in all_links:
        if not link:
            # Anchor without href, or empty href.
            continue
        if link.lower().startswith(contact_prefixes):
            contact_links.append(link)
        elif not link.startswith("//") and (link.startswith("/") or root_url in link):
            # Site-relative path or a URL mentioning the host name;
            # protocol-relative links ("//host/...") are never internal.
            internal_links.append(link)
        elif link.startswith(("//", "http")):
            external_total += 1
        else:
            unprocessed_total += 1

    logger.debug(
        f"Links: {len(all_links)}, Contacts: {len(contact_links)}, Internals: {len(internal_links)}, Externals: {external_total}, Unprocessed: {unprocessed_total}"
    )

    # endregion

    # region Splitting links

    link_counts = Counter(internal_links)
    rare_cutoff = threshold * page_total

    universal_links = [link for link, count in link_counts.items() if count == page_total]
    common_links = [link for link, count in link_counts.items() if page_total > count >= rare_cutoff]
    rare_links = [link for link, count in link_counts.items() if count < rare_cutoff]

    logger.debug(
        f"Total links: {len(link_counts)}, Universal: {len(universal_links)}, Common: {len(common_links)}, Rare: {len(rare_links)}"
    )

    # endregion

    return {
        "universal": universal_links,
        "common": common_links,
        "rare": rare_links,
        "contact": contact_links,
    }

In [79]:
# Accumulators are created only when missing: a fresh kernel starts from empty
# dicts (previously a NameError, because the initialisation was commented out),
# while re-running the cell resumes and skips sites that were already processed.
if "all_counts" not in globals():
    all_counts = {}
if "all_patterns" not in globals():
    all_patterns = {}

for subdir in tqdm(all_subdirs):
    if subdir.name in all_counts:
        continue
    try:
        res = check_regexp_match(subdir)
    except Exception as e:
        # Best effort: one broken site (e.g. missing patterns.json) must not stop the batch.
        logger.error(f"Error in {subdir.name}: {e}")
        continue
    if res is not None:
        counts, base_patterns = res
        all_counts[subdir.name] = counts
        all_patterns[subdir.name] = base_patterns

  0%|          | 0/377 [00:00<?, ?it/s]

[32m2024-04-18 17:59:10.445[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcheck_regexp_match[0m:[36m40[0m - [34m[1mRoot URL: failed[0m
[32m2024-04-18 17:59:10.468[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcheck_regexp_match[0m:[36m58[0m - [34m[1mLinks: 195, Contacts: 2, Internals: 13, Externals: 178, Unprocessed: 2[0m
[32m2024-04-18 17:59:10.469[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcheck_regexp_match[0m:[36m78[0m - [34m[1mTotal links: 13, Universal: 0, Common: 0, Rare: 13[0m
[32m2024-04-18 17:59:10.472[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [31m[1mError in failed: [Errno 2] No such file or directory: 'E:\\Work\\Personal\\repos\\web_scrapers\\spiders\\texttailor\\texttailor\\notebooks\\root_pages\\failed\\patterns.json'[0m
[32m2024-04-18 17:59:20.270[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcheck_regexp_match[0m:[36m40[0m - [34m[1mRoot URL: www.teppermans.com[0m
[32m20

In [80]:
# Scratch folder for intermediate artefacts, created on first use.
debug_dir = Path("./debug").absolute()
debug_dir.mkdir(parents=True, exist_ok=True)

In [82]:
# pd.concat(all_counts).to_pickle(debug_dir / "counts.pickle")
# pd.DataFrame(all_patterns).T.to_pickle(debug_dir / "patterns.pickle")


In [87]:
# Stack the per-site count tables; the dict keys (site names) become the outer
# index level, giving one row per (site, link bucket).
df_counts = pd.concat(all_counts)

In [88]:
# One row per site, one column per pattern name (category / item).
df_patterns = pd.DataFrame(all_patterns).T

In [89]:
df_patterns

Unnamed: 0,category,item
www.marlofurniture.com,/collections/[a-z0-9-]+,/collections/[a-z-]+/products/[a-z-]+
www.darvin.com,/(browse|brand)/|\.aspx$,/item/[a-z0-9-%2b]+/\d+
cityhomepdx.com,/collections/[\w-]+(?!/products),/products/[a-z0-9-]+?\?view=quickview|/collect...
www.kitchenandcompany.com,/collections/.*,/collections/[\w-]+/products/[\w-]+
www.nelsonsfurniture.com,/collections/[^/]*$|/products/[^/]*$,/products/[a-z0-9-?=.]+
...,...,...
www.conlins.com,/browse/|/item/|\.aspx$|/brand/|/stores/|/p/|/...,/item/[a-z0-9-]+/[0-9]+
www.salvagedfurnitureparlour.com,/collections/[^/]+$,/products/[a-z0-9-%]+
outwestfurnituremt.com,/outwest-furniture-gallery-past-furniture\.php,/outwest-furniture-.+\.php
www.oakcrestfurniture.com,/(swing-sets|oak-furniture|pine-furniture|shed...,/products/[a-z0-9-]+/


In [91]:
# Summary statistics over all (site, bucket) rows.
df_counts.describe()

Unnamed: 0,both,category,item,miss,total
count,246.0,246.0,246.0,246.0,246.0
mean,15.813008,20.361789,6.589431,40.081301,82.845528
std,67.837122,41.181272,27.782797,72.055885,111.062221
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,4.0
50%,0.0,1.0,0.0,9.0,48.5
75%,0.75,20.75,0.0,42.0,117.0
max,946.0,237.0,277.0,453.0,1096.0


In [92]:
df_counts

Unnamed: 0,Unnamed: 1,both,category,item,miss,total
www.marlofurniture.com,universal,0,64,0,20,84
www.marlofurniture.com,common,0,1,0,0,1
www.marlofurniture.com,rare,81,6,0,0,87
www.darvin.com,universal,0,27,0,121,148
www.darvin.com,common,0,0,0,3,3
...,...,...,...,...,...,...
www.oakcrestfurniture.com,common,0,0,0,0,0
www.oakcrestfurniture.com,rare,0,0,0,23,23
www.qualitystorefixtures.com,universal,0,0,0,28,28
www.qualitystorefixtures.com,common,0,0,0,9,9


In [93]:
def check_quality(counts, max_both_share=0.2, min_unmatched=5):
    """Judge whether a site's category and item regexes look usable.

    The per-bucket rows of ``counts`` are summed first, then:

    * ``category`` is rejected when more than ``max_both_share`` of all links
      match both patterns, or when ``both + category`` leaves fewer than
      ``min_unmatched`` links unmatched by the category pattern (i.e. the
      pattern matches nearly everything).
    * ``item`` is rejected when no link is counted under ``item``.

    NOTE(review): ``get_count`` puts links matching both patterns under
    ``both`` rather than ``item``, so a site whose item links all match the
    category pattern too is reported as a bad item pattern -- confirm that
    this is intended.

    Args:
        counts: DataFrame with the columns ``both``, ``category``, ``item``
            and ``total`` (one row per link bucket).
        max_both_share: Largest tolerated share of links matching both patterns.
        min_unmatched: Minimum number of links the category pattern must
            leave unmatched.

    Returns:
        dict ``{"category": bool, "item": bool}``; ``True`` means the pattern passed.
    """
    summed = counts.sum()
    total = summed["total"]

    overlaps_with_items = summed["both"] > max_both_share * total
    matches_nearly_everything = summed["both"] + summed["category"] >= total - min_unmatched

    return {
        "category": not (overlaps_with_items or matches_nearly_everything),
        "item": bool(summed["item"] != 0),
    }




In [95]:
# Quality verdict (category / item flags) for every site that produced counts.
quality_check = {site: check_quality(site_counts) for site, site_counts in all_counts.items()}

In [97]:
# One row per site with the boolean columns `category` and `item`.
df_quality = pd.DataFrame(quality_check).T

In [98]:
# Sites whose category AND item patterns both passed the quality check.
all_good_index = df_quality.loc[df_quality.all(axis=1)].index

In [103]:
# for root_url in all_good_index:
#     print(root_url)
#     display_full(df_counts.loc[root_url])


In [105]:
# Side effect only: hands the list of good sites to `clipboard_copy`
# (presumably the system clipboard -- the helper comes from the star import; verify).
clipboard_copy(all_good_index.tolist())

In [106]:
# Sites where neither the category nor the item pattern passed.
all_bad_index = df_quality.loc[~(df_quality.any(axis=1))].index

In [108]:
# How many sites fall into each (category ok, item ok) combination.
df_quality.value_counts()

category  item 
True      False    35
False     False    22
True      True     20
False     True      5
Name: count, dtype: int64

In [111]:
# Working example: the site at position 21 of the failing list.
# NOTE(review): the position is hard-coded and depends on the order of
# `all_bad_index`, which follows the shuffled `all_subdirs`; pin the site by
# name to make this reproducible.
links_dict = get_links(scraped_data_dir / all_bad_index[21])

[32m2024-04-18 18:16:46.259[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_links[0m:[36m19[0m - [34m[1mRoot URL: lavishfurnitureoutlet.com[0m
[32m2024-04-18 18:16:46.261[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_links[0m:[36m37[0m - [34m[1mLinks: 513, Contacts: 24, Internals: 336, Externals: 128, Unprocessed: 24[0m
[32m2024-04-18 18:16:46.262[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_links[0m:[36m57[0m - [34m[1mTotal links: 59, Universal: 16, Common: 7, Rare: 36[0m


In [112]:
links_dict

{'universal': ['/policies/terms-of-service',
  '/cart',
  '/search',
  '/policies/privacy-policy',
  '/pages/contact-us',
  '/collections/living-room',
  '/pages/visit-outlet',
  '/collections/kids',
  '/pages/financing',
  '/collections/bedrooms',
  '/',
  '/collections/frontpage',
  '/collections/mattresses',
  '/pages/house-package-specials',
  '/policies/refund-policy',
  '/collections/dining-rooms'],
 'common': ['/collections/bedrooms/products/emily-black-storage-bedroom-set',
  '/collections/bedrooms/products/sheffield-gray-sleigh-bed',
  '/collections/bedrooms/products/dark-cherry-sheffield-bed',
  '/collections/bedrooms/products/stanley-dark-cherry-bedroom-set',
  '/collections/bedrooms/products/cristal-bed',
  '/collections/bedrooms/products/giovani-gray-bed',
  '/collections/vendors?q=Coaster'],
 'rare': ['/collections/bedrooms/products/stanley-antique-white-bed',
  '/collections/bedrooms/products/emily-dark-cherry-storage-bed',
  '/collections/bedrooms/products/lavonia-gray-

In [161]:
# Few-shot prompt that asks the model to sort a numbered list of internal
# links into catalog / product / other pages and answer with the indices.

# System message: task description and expected answer format.
role_message = "You will categorize internal web links into three categories: 'catalog_pages', 'product_pages', and 'other_pages'. For 'catalog_pages', identify links that lead to collections or broad categories of products. 'Product_pages' are links that directly access specific product details. 'Other_pages' include all links not fitting into the other categories, like informational or administrative content. Output the indices of links in a JSON dictionary under the appropriate category based on the input list of links."

# Example user turn: a numbered list in the same "N. link" layout as the real input.
example_input = """
1. /furniture/beds/king-size
2. /help/returns-policy
3. /sale
4. /furniture/beds/king-size/342322
5. /about-us
6. /blog/how-to-choose-a-bed
"""

# Example assistant turn. The braces are doubled so the prompt template treats
# them as literal characters instead of a `{variable}` placeholder.
example_output = """
{{
  "product_pages": [4],
  "catalog_pages": [1, 3],
  "other_pages": [2, 5, 6]
}}
"""

# Real user turn: filled from the `formatted_list` input variable.
input_template = """{formatted_list}"""

messages = [("system", role_message), ("human", example_input), ("ai", example_output), ("human", input_template)]
# Normalise whitespace of every message via the project helper before building the template.
messages = [(role, trim_extra_whitespace(message)) for role, message in messages]

choose_links_prompt = ChatPromptTemplate.from_messages(messages)

In [178]:
class A:
    # Scratch experiment: can `property` wrap a classmethod to get a
    # class-level property?
    @classmethod
    def v(cls):
        return 13
    
    # Rebinds `v` to a property whose getter is the classmethod object above.
    # Accessing it on the class returns the property object itself, not 13.
    v = property(v)

In [179]:
# Access on the class yields the property object rather than the value 13.
A.v

<property at 0x21d8d5dee80>

In [182]:
class ClassProperty:
    """Read-only descriptor exposing a method's result as a class-level attribute.

    Works both when stacked on top of ``@classmethod`` and when applied
    directly to a plain function taking ``cls``.
    """

    def __init__(self, method):
        # Either a classmethod object or a plain function.
        self.method = method

    def __get__(self, obj, objtype=None):
        """Call the wrapped function with the owning class and return its result."""
        if objtype is None:
            # The descriptor protocol allows omitting the owner; fall back to
            # the instance's class instead of passing None as `cls`.
            objtype = type(obj)
        # classmethod objects keep the raw function in `__func__`; plain functions are used as-is.
        func = getattr(self.method, "__func__", self.method)
        return func(objtype)


class MyClass:
    """Minimal example of a class-level computed attribute."""

    _my_value = 5

    @ClassProperty
    @classmethod
    def value(cls):
        return cls._my_value


# Usage
print(MyClass.value)  # This will print 5


5


In [162]:
def format_links(links_dict):
    """Render the internal links as a 1-based numbered list, one link per line.

    Links are numbered in the order universal, common, rare. ``unpack_links``
    rebuilds the same concatenation to translate indices back into links.
    """
    ordered_links = links_dict["universal"] + links_dict["common"] + links_dict["rare"]
    numbered_lines = []
    for position, url in enumerate(ordered_links, start=1):
        numbered_lines.append(f"{position}. {url}")
    return "\n".join(numbered_lines)


# Classification chain: add the numbered link list to the input dict, fill the
# prompt with it and send the result to the model.
chain = (
    RunnablePassthrough.assign(formatted_list=lambda d: format_links(d))
    | choose_links_prompt
    | basic_llm
)


In [133]:
# Sanity check: a bare RunnablePassthrough returns its input dict unchanged.
RunnablePassthrough().invoke({"a": 1, "b": 2})

{'a': 1, 'b': 2}

In [119]:
# Ask the model to classify the example site's links; the reply's `content`
# is a JSON object listing 1-based link indices per category.
chain.invoke(links_dict)

AIMessage(content='{\n"product_pages": [17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58],\n"catalog_pages": [6, 8, 10, 13, 16, 23, 44],\n"other_pages": [1, 2, 3, 4, 5, 7, 9, 11, 12, 14, 15, 59]\n}', response_metadata={'token_usage': {'completion_tokens': 191, 'prompt_tokens': 946, 'total_tokens': 1137}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_c2295e73ad', 'finish_reason': 'stop', 'logprobs': None}, id='run-a555f771-6f3e-4195-8190-55f4e8a43100-0')

In [170]:
def unpack_links(links_dict, llm_answer):
    """Translate the model's per-category link indices back into the links.

    Args:
        links_dict: Mapping with the keys ``universal``, ``common`` and
            ``rare``; concatenated in that order it is the numbered list the
            model was shown (see ``format_links``).
        llm_answer: Model reply whose ``content`` is parsed with
            ``safe_json_loads`` into a mapping of category name to a list of
            1-based indices.

    Returns:
        dict with the keys ``product_pages``, ``catalog_pages`` and
        ``other_pages``, each a list of links. Categories absent from the
        reply yield an empty list. Indices that are not integers in
        ``1..len(links)`` are skipped and reported in a single warning.
    """
    internal_links = links_dict["universal"] + links_dict["common"] + links_dict["rare"]
    parsed_answer = safe_json_loads(llm_answer.content)

    def _is_usable(index):
        # bool is a subclass of int; reject it so `True` is not read as index 1.
        return isinstance(index, int) and not isinstance(index, bool) and 0 < index <= len(internal_links)

    unpacked = {}
    rejected = []
    for category in ("product_pages", "catalog_pages", "other_pages"):
        indices = parsed_answer.get(category) or []
        unpacked[category] = [internal_links[index - 1] for index in indices if _is_usable(index)]
        # Report exactly what gets skipped, including zero/negative and
        # non-integer indices that the former `i > len(...)` check missed.
        rejected.extend(index for index in indices if not _is_usable(index))

    if rejected:
        logger.warning(f"Indices out of bounds: {rejected}")
    return unpacked
    

In [122]:
# Invokes the chain again (a fresh model call) and maps the returned indices back to links.
converted_links = unpack_links(links_dict, chain.invoke(links_dict))

In [123]:
converted_links

{'product_pages': ['/collections/bedrooms/products/emily-black-storage-bedroom-set',
  '/collections/bedrooms/products/sheffield-gray-sleigh-bed',
  '/collections/bedrooms/products/dark-cherry-sheffield-bed',
  '/collections/bedrooms/products/stanley-dark-cherry-bedroom-set',
  '/collections/bedrooms/products/cristal-bed',
  '/collections/bedrooms/products/giovani-gray-bed',
  '/collections/bedrooms/products/stanley-antique-white-bed',
  '/collections/bedrooms/products/emily-dark-cherry-storage-bed',
  '/collections/bedrooms/products/lavonia-gray-storage-bed',
  '/collections/bedrooms/products/veil-bed',
  '/collections/bedrooms/products/sandy-beach-bedroom-set-3',
  '/collections/bedrooms/products/jessica-contemporary-bedroom-set',
  '/collections/bedrooms/products/tatiana-bedroom-set',
  '/collections/bedrooms/products/sheffield-vintage-gray-bed',
  '/collections/bedrooms/products/kauffman-bedroom-set',
  '/collections/bedrooms/products/serenity-bedroom-set',
  '/collections/bedrooms

In [163]:
def choose_example_links(categorized_links_dict, sample_size=5):
    """Build the example text for the regex prompt from random sample links.

    Draws up to ``sample_size`` links from ``product_pages`` and from
    ``catalog_pages`` (all of them when fewer are available) and lists them
    as bullets under the headings "Product Pages:" and "Catalog Pages:". The
    text is passed through ``trim_extra_whitespace`` before being returned.
    """
    product_pool = categorized_links_dict["product_pages"]
    catalog_pool = categorized_links_dict["catalog_pages"]

    # Products are sampled before catalogs, as the order of the draws matters
    # for the state of the shared random generator.
    product_sample = random.sample(product_pool, min(sample_size, len(product_pool)))
    catalog_sample = random.sample(catalog_pool, min(sample_size, len(catalog_pool)))

    def _bullets(urls):
        return "\n".join([f"- {url}" for url in urls])

    # Indentation and blank lines are kept as in the raw template; the
    # whitespace helper is responsible for tidying them.
    raw_lines = [
        "",
        "    Product Pages:",
        "    " + _bullets(product_sample),
        "    ",
        "    Catalog Pages:",
        "    " + _bullets(catalog_sample),
        "    ",
    ]
    return trim_extra_whitespace("\n".join(raw_lines))


In [164]:
# Few-shot prompt that asks the model for two regexes (product pages and
# catalog pages) given example links of each kind.
# NOTE(review): this cell reuses the names `role_message`, `example_input`,
# `example_output`, `input_template` and `messages` from the classification
# prompt cell; the earlier prompt object was already built, but the names now
# hold different content.

# System message: task description.
role_message = "You are tasked with generating Python regular expressions to identify URLs that categorize web pages into product or catalog pages. Focus on extracting meaningful patterns that help differentiate between product details and broader catalog listings. Use elements in the URL path like 'collections', 'products', or similar indicators to guide the creation of these expressions."

# Example user turn, in the layout produced by `choose_example_links`.
example_input = """
Product Pages:
- /collections/bedrooms/products/carlton-bedroom-set
- /collections/dining-rooms/products/oak-dining-table

Catalog Pages:
- /collections/bedrooms
- /collections/dining-rooms
"""

# Example assistant turn: a JSON object with both regexes. The outer braces are
# doubled so the prompt template treats them as literal characters.
example_output = """{{"product_page_regex": "^/collections/[^/]+/products/.*$", "catalog_page_regex": "^/collections/[^/]+/?$"}}"""

# Real user turn: filled from the `formatted_link_examples` input variable.
input_template = """{formatted_link_examples}"""

messages = [("system", role_message), ("human", example_input), ("ai", example_output), ("human", input_template)]
# Normalise whitespace of every message via the project helper before building the template.
messages = [(role, trim_extra_whitespace(message)) for role, message in messages]

get_regexp_prompt = ChatPromptTemplate.from_messages(messages)


In [171]:
# End-to-end pipeline, one named step per stage:
# number the links -> let the model classify them -> map indices back to
# links -> sample examples per class -> ask the model for the two regexes.
number_links_step = RunnablePassthrough.assign(formatted_list=lambda d: format_links(d))
classify_links_step = RunnablePassthrough.assign(llm_answer=choose_links_prompt | basic_llm)
unpack_links_step = RunnablePassthrough.assign(links_dict=lambda d: unpack_links(d, d["llm_answer"]))
pick_examples_step = RunnablePassthrough.assign(
    formatted_link_examples=lambda d: choose_example_links(d["links_dict"])
)

long_chain = (
    number_links_step
    | classify_links_step
    | unpack_links_step
    | pick_examples_step
    | get_regexp_prompt
    | basic_llm
)

In [169]:
# Run the full pipeline on the example site; `res.content` carries the JSON with both regexes.
res = long_chain.invoke(links_dict)

In [172]:
res

AIMessage(content='{"product_page_regex": "^/collections/[^/]+/products/.*$", "catalog_page_regex": "^/collections/[^/]+(?:\\\\?|$)"}', response_metadata={'token_usage': {'completion_tokens': 35, 'prompt_tokens': 280, 'total_tokens': 315}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_d9767fc5b9', 'finish_reason': 'stop', 'logprobs': None}, id='run-88ff3c1a-ab5a-4d8c-864b-9461031ef756-0')

In [173]:
# Translate the model's key names into the ones `get_count` expects.
key_remapping = {
    "product_page_regex": "item",
    "catalog_page_regex": "category",
}
llm_patterns = safe_json_loads(res.content)

# Fail here, with a clear message, instead of later inside `get_count`.
missing_keys = [key for key in key_remapping if key not in llm_patterns]
if missing_keys:
    raise KeyError(f"LLM answer lacks expected keys: {missing_keys}")

# Only the expected keys are read, so extra keys in the reply are ignored
# (they used to raise a bare KeyError). The model sometimes wraps a regex in
# single quotes; strip them.
new_patterns = {new_key: llm_patterns[old_key].strip("'") for old_key, new_key in key_remapping.items()}


In [146]:
# NOTE(review): `subdir` is presumably the leftover variable of the last loop
# over `all_subdirs` (here a stray `.html` entry), not the site being re-checked.
subdir

WindowsPath('E:/Work/Personal/repos/web_scrapers/spiders/texttailor/texttailor/notebooks/root_pages/www.simsfurnitureco.com.html')

In [147]:
# Folder of the example site; same hard-coded position 21 as used for `links_dict`.
to_check_dir = scraped_data_dir / all_bad_index[21]

In [159]:
# Re-score the example site with the model-generated patterns.
# check_regexp_match returns None for sites with fewer than 10 pages, in which
# case this tuple unpacking would fail.
check_result, _ = check_regexp_match(to_check_dir, new_patterns)
check_result

[32m2024-04-19 16:08:03.470[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcheck_regexp_match[0m:[36m40[0m - [34m[1mRoot URL: lavishfurnitureoutlet.com[0m
[32m2024-04-19 16:08:03.473[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcheck_regexp_match[0m:[36m58[0m - [34m[1mLinks: 513, Contacts: 24, Internals: 336, Externals: 128, Unprocessed: 24[0m
[32m2024-04-19 16:08:03.474[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcheck_regexp_match[0m:[36m78[0m - [34m[1mTotal links: 59, Universal: 16, Common: 7, Rare: 36[0m
[32m2024-04-19 16:08:03.474[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcheck_regexp_match[0m:[36m88[0m - [34m[1mBase patterns: {'item': '^/collections/[^/]+/products/.*$', 'category': '^/collections/[^/]+(?:\\?[^/]+)?$'}[0m


Unnamed: 0,both,category,item,miss,total
universal,0,6,0,10,16
common,0,1,6,0,7
rare,0,2,34,0,36


In [157]:
# Check that removing the single quotes the model wrapped around the regex leaves a usable pattern.
new_patterns["category"].strip("'")

'^/collections/[^/]+(?:\\?[^/]+)?$'

In [155]:
# NOTE(review): `Out[143]` refers to the kernel's output history; no visible
# cell has that execution count, so this line fails on a fresh kernel.
# Re-invoke `long_chain` (or persist the reply) instead.
res = Out[143]

In [156]:
res.content

'{"product_page_regex": "\'^/collections/[^/]+/products/.*$\'", "catalog_page_regex": "\'^/collections/[^/]+(?:\\\\?[^/]+)?$\'"}'