# Tag Generator
A LangGraph LLM agent for generating taxonomy tag recommendations from a set of articles in a Sanity.io Content Lake instance.

In [None]:
# %pip install requests
# %pip install python-dotenv
# %pip install pandas
# %pip install beautifulsoup4
# %pip install html2text
# %pip install langchain
# %pip install langchain_core
# %pip install langchain_ollama
# %pip install langchain_openai
# %pip install plotly

## 1. Gather Article URLs
GROQ query to gather list of articles to assess; return their URLs

In [1]:
import os
import requests
from dotenv import load_dotenv

# Load settings from .env and read the Sanity project ID from the environment.
load_dotenv()
project_id = os.getenv("SANITY_PROJECT_ID")
if not project_id:
    # Without a project ID the URL below would point at "None.api.sanity.io"
    # and fail with an opaque connection error.
    raise RuntimeError("SANITY_PROJECT_ID is not set; add it to the environment or .env file.")
dataset = "production"
base_url = "https://www.andyfitzgeraldconsulting.com/insights/"

# Query every document whose _type is in $types and project its title,
# its public URL (base_url + slug), and the label of its insight type.
response = requests.post(
    f"https://{project_id}.api.sanity.io/v2022-03-07/data/query/{dataset}?",
    json={
        "query": f"""
            *[_type in $types]{{
            title,
            "url": "{base_url}" + slug.current,
            "type": insightType->prefLabel
            }}
        """,
        "params": {"types": ["article", "caseStudy"]},
    },
    # Never wait indefinitely on an unresponsive endpoint.
    timeout=30,
)

if response.status_code == 200:
    doc_list = response.json()
else:
    print(f"Error: {response.status_code}")
    print(response.text)
    # Stop here: later cells read `doc_list`, which would otherwise be
    # undefined (or stale from an earlier run of this cell).
    raise RuntimeError(f"Sanity query failed with status {response.status_code}")

## 2. Fetch & Tidy Articles
- Extract the content in the `<article>` tag. Remove header image and existing topic tags.
- Write content and metadata to a DataFrame 
- Write content word count

In [2]:
from bs4 import BeautifulSoup
import html2text
import pandas as pd

def fetch_page_content(url, timeout=30):
    """Fetch a page and return the content of its ``<article>`` element as markdown.

    Before conversion the article is stripped of "Topics" headings (and the
    list that follows them), banner figures, and ``<time>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The article as a markdown string, or ``None`` when the response status
        is not 200, the response body is empty, or the page has no
        ``<article>`` element.
    """
    # Fetch the page content
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch page. Status code: {response.status_code}")
        # Return early: there is no HTML to parse for a failed request.
        return None

    response.encoding = "utf-8"  # Set the encoding explicitly to UTF-8 to avoid HTML entity issues
    html_content = response.text
    if not html_content:
        print("No content to parse")
        return None

    # Parse the HTML content with BeautifulSoup and extract the <article> element
    soup = BeautifulSoup(html_content, "html.parser")
    article = soup.find("article")
    if article is None:
        return None

    # Remove existing classification and non-relevant tags
    for tag in article.find_all(["h2", "h3", "figure", "time"]):
        # Remove H2 and H3 topic headings and their associated lists. The check
        # is limited to headings so a figure whose caption merely mentions
        # "Topics" is not deleted along with an unrelated list after it.
        if tag.name in ("h2", "h3") and "Topics" in tag.get_text(strip=True):
            next_sibling = tag.find_next_sibling()
            if next_sibling and next_sibling.name == "ul":
                next_sibling.decompose()
            tag.decompose()
        # Remove banner images
        elif tag.name == "figure" and "banner" in tag.get("class", []):
            tag.decompose()
        # Remove publication timestamp
        elif tag.name == "time":
            tag.decompose()

    # Convert to markdown for easier LLM processing
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    return converter.handle(str(article))

# Fetch the content for each document and write to a dataframe, along with the metadata
df_articles = pd.DataFrame(doc_list["result"])
df_articles["content"] = df_articles["url"].apply(fetch_page_content)
# fetch_page_content can return None (e.g. when a page has no <article> element);
# count such rows as zero words instead of failing on None.split().
df_articles["word_count"] = df_articles["content"].apply(
    lambda text: len(text.split()) if isinstance(text, str) else 0
)


## 3. Topic Tag Prompt & Chain
- provide 10 - 20 tags, depending on the length of the resource, taking into account site purpose and audience profiles
- for each, return:
    - a tag label
    - a tag definition (1-2 sentences)
    - an explanation of why it was chosen (1-3 sentences)
    - a relevance score between 0 and 1


In [7]:
from langchain_openai import ChatOpenAI

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Environment settings for this session:
#   LANGCHAIN_TRACING_V2 / LANGCHAIN_PROJECT - enable LangSmith tracing under the "tag-gen" project
#   USER_AGENT                               - set the USER_AGENT environment variable
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "tag-gen",
        "USER_AGENT": "langchain-agent",
    }
)


# Define the output schema for the tag generation model
class TagOutput(BaseModel):
    """Return a list of tag recommendations."""

    # NOTE(review): the docstrings and Field descriptions in this schema are
    # presumably forwarded to the model as part of the structured-output schema,
    # so treat them as prompt text rather than ordinary documentation.

    class TagRecommendation(BaseModel):
        """Return data about each recommended tag."""

        # One recommended tag: label, definition, rationale and score.
        tag: str = Field(description="The recommended tag for the document.")
        definition: str = Field(
            description="A definition of the tag in the context of the website purpose and audience."
        )
        explanation: str = Field(
            description="An explanation of why the tag is relevant to the document."
        )
        relevance: float = Field(
            description="The relevance score of the tag to the document."
        )

    # The description lists every field of TagRecommendation; it previously
    # omitted the definition.
    tags: list[TagRecommendation] = Field(
        description="A list of recommended tags for the document, each containing the tag, definition, explanation, and relevance score."
    )


# ChatOpenAI reads its API key from the OPENAI_API_KEY environment variable (set in .env).
# temperature=0 is requested for the tagging model.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Prompt components
# `purpose` fills the {purpose} placeholder of the tagging prompt.
purpose = "The site is for a single person information architecture and content strategy consultancy that offers a multi-disciplinary, collaborative, and hands-on approach to information architecture and content strategy, design, and engineering."

# `audience` fills the {audience} placeholder: three audience profiles, numbered 1-3
# (the second profile was previously also numbered "1.").
audience = """
    1. The Mission-Driven Leader:
        - Profile: Heads a non-profit or social enterprise focused on making a positive social impact. Values clear, effective communication and understands the need for a strong digital presence.
        - Needs: Looking for expert guidance in structuring digital content to maximize impact, engage stakeholders, and communicate their mission.
        - Behavior: Seeks out proven professionals with a track record in supporting similar organizations.

    2. The Tech-Savvy Innovator:    
        - Profile: Works in a technology-driven environment, possibly in a startup or an innovative corporate department. Interested in the latest trends like LLMs and KGs.
        - Needs: Wants to integrate advanced technology into content creation and management to stay ahead in the market.
        - Behavior: Attracted to cutting-edge solutions and thought leadership in the field of information architecture and content design.
        
    3. The Established Professional:
        - Profile: A seasoned professional in a larger, established organization, possibly overseeing a content or digital marketing team.
        - Needs: Looking for high-value strategic solutions to refine and elevate their organization's content strategy and structure.
        - Behavior: Values expertise, reliability, and a demonstrable track record of successful projects.
"""

# Text of the single system message. Its placeholders ({purpose}, {audience},
# {content}, {tag_count}) are filled from the dict passed to `chain.invoke`.
system_template = """
                You are a topic tagging assistant designed to suggest tags for the provided document. Suggested tags will take into consideration the main topics and themes of the document, as well as the purpose of the site, and the needs of the target audience.

                The purpose of the site is as follows: 

                {purpose}

                The target audience is as follows: 

                {audience}

                Here is the document to be tagged: 

                {content}

                Please provide a list of {tag_count} topical tags that describe the document, in the context of the site's purpose and target audience. When creating tags, do not use parenthetical qualifiers or acronyms. If the plural version of the term and the singular version are equally valid, use the plural form. All terms should be in lower case, unless they are proper nouns.                
                
                Along with each tag, please also provide a one to two sentence definition of the tag in the context of the site purpose and audience, and a one to three sentence explanation of why the tag is relevant to the document. Finally, provide a relevance score for the tag between 0.0 and 1.0, with tags more relevant to the document scoring higher.
            """

# The prompt consists of that one system message.
prompt = ChatPromptTemplate([("system", system_template)])

# Ask the model to return its answer in the shape of the TagOutput schema.
structured_model = model.with_structured_output(TagOutput)

# Pipe the rendered prompt into the structured model.
chain = prompt | structured_model

## 4. Process Articles
Write results to a "tags" DataFrame

In [None]:
import pandas as pd

# One DataFrame of tag recommendations per article; they are concatenated once
# after the loop instead of growing `tags_df` from an empty frame on every pass.
tag_frames = []

# NOTE: only the first three articles are processed (iloc[0:3]).
for _, row in df_articles.iloc[0:3].iterrows():
    # `doc_type` rather than `type`, so the builtin `type` is not overwritten
    # for the rest of the notebook session.
    content, title, url, doc_type, word_count = row[["content", "title", "url", "type", "word_count"]]

    # adjust the tag count requested based on the word count of the document:
    # one tag per 400 words, clamped to the range 10-20
    count = min(max(10, int(word_count / 400)), 20)

    response = chain.invoke(
        {"content": content, "audience": audience, "purpose": purpose, "tag_count": count}
    )

    # one row per recommended tag, plus the document title, URL, and type on every row
    tag_frames.append(
        pd.DataFrame(response.model_dump()["tags"]).assign(title=title, url=url, type=doc_type)
    )

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# no articles were processed.
tags_df = pd.concat(tag_frames, ignore_index=True) if tag_frames else pd.DataFrame()


## 5. Process Tag List
- remove tags that appear only once and have a relevance score below 0.6
- remove tags that appear for every article (TO DO — once there are more articles)
- write remaining unique tags to a new DataFrame
    - concatenate the multiple definitions
    - average the relevance scores
    - count the number of content types represented

In [1]:
# Development cache for the raw tags (limits repeated remote calls to the LLM).
# Uncomment the `to_csv` line below to write the current `tags_df` to the cache file.
import pandas as pd
# tags_df.to_csv("unprocessed_tags.csv", index=False)
# Read the cached CSV back in; this replaces any in-memory `tags_df`.
tags_df = pd.read_csv("unprocessed_tags.csv")

In [2]:
# Keep a tag when it appears more than once or when its highest relevance score is
# at least 0.6; i.e. drop only tags that appear once with a relevance below 0.6.
filtered_tags_df = tags_df.groupby("tag").filter(
    lambda group: len(group) > 1 or group["relevance"].max() >= 0.6
)

# One row per remaining tag: number of rows carrying the tag, mean and standard
# deviation of its relevance scores, number of distinct content types, and all of
# its definitions joined into one string.
processed_tags_df = (
    filtered_tags_df.groupby("tag")
    .agg(
        tag_count=("title", "count"),
        relevance=("relevance", "mean"),
        std_dev=("relevance", "std"),
        type_count=("type", "nunique"),
        # Empty CSV cells are read back as NaN (a float); drop them so that
        # str.join only ever receives strings.
        definition=("definition", lambda values: " ".join(values.dropna().astype(str))),
    )
    .reset_index()
)


## 6. Synthesize Definitions

In [3]:
# Synthesize grouped definitions using Mistral on Ollama
from langchain_ollama import ChatOllama

# Chat client for the "mistral" model served through Ollama, created with temperature=0.
ollama = ChatOllama(model="mistral", temperature=0)

# Define a function to synthesize definitions
def synthesize_definitions(tag, definitions):
    """Ask the Ollama model to merge several definitions of one tag.

    Args:
        tag: The tag label, interpolated into the prompt.
        definitions: The joined definitions for that tag, interpolated into the prompt.

    Returns:
        The ``content`` attribute of the model's response.
    """
    synthesis_request = f"""
    You are a content strategist and copywriter. The following tags and definitions are topic tag definitions for a website that provides information architecture and content strategy services. The definitions are not a single definition, but rather several definitions for the same tag that have been joined together. 
    
    You are tasked with synthesizing these definitions into a single, concise definition of between three and five sentences that is easy to understand and captures the essence of the topic tag:

    Here's the tag: {tag}

    Here are the definitions: {definitions}

    Please provide your synthesized definition below. Your definition should be between three and five sentences long. 
    """
    return ollama.invoke(synthesis_request).content

# Fill the "synthesized_definition" column row by row.
def _resolve_definition(row):
    """Return a synthesized definition for tags used more than once, else the definition as-is."""
    if row["tag_count"] > 1:
        return synthesize_definitions(row["tag"], row["definition"])
    return row["definition"]


processed_tags_df["synthesized_definition"] = processed_tags_df.apply(_resolve_definition, axis=1)

# write the processed tags to a new CSV file
# processed_tags_df.to_csv("processed_tags.csv", index=False)


In [4]:
# Write processed_tags_df to a CSV file for later reference in development (to limit calls to the LLM)
import pandas as pd
processed_tags_df.to_csv("processed_tags.csv", index=False)
# To read the CSV file back in instead, uncomment the next line:
# processed_tags_df = pd.read_csv("processed_tags.csv")

## 7. Visualize Results

In [5]:
# create a scatter plot with relevance on the x-axis and tag_count on the y-axis and tag name available on hover

import plotly.express as px

# Only tags attached to more than one resource are plotted.
repeated_tags = processed_tags_df[processed_tags_df["tag_count"] > 1]

# Display titles for the plotted columns.
axis_labels = {
    "relevance": "Relevance Score",
    "tag_count": "Number of Resources",
    "type_count": "Number of Types",
    "std_dev": "StdDev",
}

# x = mean relevance, y = number of resources, marker size = number of content
# types, marker colour = standard deviation of relevance; tag name shown on hover.
fig = px.scatter(
    repeated_tags,
    x="relevance",
    y="tag_count",
    size="type_count",
    color="std_dev",
    color_continuous_scale=px.colors.sequential.Viridis,
    hover_data=["tag"],
    labels=axis_labels,
    title="Tag Relevance vs. Tag Count",
    width=800,
    height=800,
)

fig.show()

# Deprecated Utilities

## NLP Consolidation

In [None]:
# Use natural language processing to identify similar tags and definitions
import spacy

nlp = spacy.load("en_core_web_lg")

# Run every tag label and every synthesized definition through the spaCy pipeline.
tokenized_tags = [nlp(label) for label in processed_tags_df["tag"].tolist()]
tokenized_definitions = [
    nlp(text) for text in processed_tags_df["synthesized_definition"].tolist()
]

# create a list of dicts with the tag and definition tokens
tag_tokens = [
    {"tag": tag_doc, "definition": definition_doc}
    for tag_doc, definition_doc in zip(tokenized_tags, tokenized_definitions)
]


def _report_similar_pairs(field):
    """Print every ordered pair of distinct entries whose `field` similarity exceeds 0.98."""
    for tag in tag_tokens:
        for other_tag in tag_tokens:
            if tag != other_tag and tag[field].similarity(other_tag[field]) > 0.98:
                print(f"{tag['tag'].text} is similar to {other_tag['tag'].text}")


print("Similarity by Tag:")
_report_similar_pairs("tag")

print("\nSimilarity by Definition:")
_report_similar_pairs("definition")


# for tag in tokenized_explanations:
#     # print(tag.text, tag.has_vector, tag.vector_norm)
#     # identify tags that are similar to each other
#     for other_tag in tokenized_explanations:
#         if tag != other_tag and tag.similarity(other_tag) > 0.95:
#             print(f"{tag.text} is similar to {other_tag.text}")

