In [None]:
import sys
sys.path.append('../src')
import os
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from preprocessing \
    import WoowahanProcessor, TossProcessor, MediumProcessor, KakaoProcessor, OliveYoungProcessor
from utils.db_conn import Connection
import warnings
import boto3

# Fetch the OpenAI API key from AWS SSM (decrypted) and export it through the
# process environment so it never has to be written into the notebook.
ssm_client = boto3.client("ssm", region_name="ap-northeast-2")
os.environ["OPENAI_API_KEY"] = ssm_client.get_parameter(
    Name='/llm/apikey',
    WithDecryption=True,
)["Parameter"]["Value"]

# Blanket filter: every Python warning (deprecations included) is hidden from
# this point on.
warnings.filterwarnings("ignore")

# Project-local database handle; later cells run SQL through `conn.execute`.
conn = Connection()

In [None]:
# Draw one random article: the sub-select picks a single random article_id and
# the outer query joins back to fetch that row's columns.
# This is a plain string literal on purpose -- the SQL contains no interpolated
# values, so an f-prefix would only risk treating a future `{`/`}` as a
# placeholder.
query = """
SELECT 
    a.article_id, 
    a.blog_id,
    a.title, 
    a.description, 
    a.content
FROM 
    article AS a
JOIN (
    SELECT article_id FROM article ORDER BY RAND() LIMIT 1
) AS rand_table
ON a.article_id = rand_table.article_id;
"""

# Result is indexed by column name and `.loc` in a later cell, so it is
# presumably a pandas DataFrame -- confirm against `Connection.execute`.
sample_text = conn.execute(query)

In [None]:
def postprocess_by_blog_id(text, blog_id):
    """Run the blog-specific processor over ``text`` and return its result.

    Args:
        text: Article content handed to the processor's constructor.
        blog_id: Blog identifier. Selects the processor class (keys 1-5) and
            is also forwarded to the processor's constructor.

    Returns:
        Whatever the selected processor's ``process()`` method returns.

    Raises:
        ValueError: If ``blog_id`` has no registered processor.
    """
    # blog_id -> processor class
    registry = {
        1: WoowahanProcessor,
        2: TossProcessor,
        3: MediumProcessor,
        4: KakaoProcessor,
        5: OliveYoungProcessor,
    }

    selected = registry.get(blog_id)
    if selected is None:
        raise ValueError(f"Unsupported blog_id: {blog_id}")

    return selected(text, blog_id).process()

In [None]:
# Take the first returned row by position. `.iloc[0]` does not depend on the
# index label being 0, and the explicit guard turns an empty query result into
# a readable error instead of an opaque KeyError.
content_column = sample_text['content']
if len(content_column) == 0:
    raise ValueError("The article query returned no rows; nothing to classify.")

raw_text = content_column.iloc[0]
blog_id = sample_text['blog_id'].iloc[0]

# `text` is the cleaned article body consumed by the prompt cell below.
text = postprocess_by_blog_id(raw_text, blog_id)

In [None]:
from typing import List, Literal
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser

# Structured result the LLM is asked to produce; passed to PydanticOutputParser
# below as `pydantic_object`.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- a model docstring may be copied into the generated JSON schema and
# would then change the format instructions sent to the LLM. Confirm before
# converting these comments to a docstring.
class Classification(BaseModel):
    # Single topic label, restricted to the literals listed below.
    # '기획' is Korean (roughly "planning").
    focusing: Literal[
        'Web', 'Mobile(Android, iOS) engineering', 'Hardware & IoT', 
        'AI & ML & Data', 'Security & Network',
        'DB', 'DevOps & Infra', 'Game',
        '기획', 'Design', 'etc'
    ] = Field(
        description="Most relative topic of the text"
    )
    # List of keyword strings capped with max_length=3. No lower bound is
    # declared here, although the description and the prompt ask for three.
    keywords: List[str] = Field(
        max_length=3,
        description="Three relative keywords extracted from the text considering the focusing topic"
    )
    # Integer reported by the model itself; nothing in this notebook computes
    # or cross-checks it against the actual text length.
    content_length: int = Field(
        description="Content length of the text excluding metadata"
    )

# Parser bound to the Classification model: it supplies the format instructions
# embedded in the prompt and later parses the raw model reply into an instance.
parser = PydanticOutputParser(pydantic_object=Classification)

In [None]:
# Task description placed at the top of every prompt. The text is sent to the
# model verbatim, so any edit here changes the model's input.
question = """
You are a professional text classification AI system. Analyze the given text and provide accurate, 
consistent results. Extract the desired information from the following passage.

Follow these guidelines strictly:
1. Focusing:
   - Choose the most relevant topic from the given categories : #focusing enums
   - If the text doesn't primarily focus on a specific technical topic but rather on general 
     events, experiences, or non-technical content, classify it as 'etc'.

2. Keywords:
   - Extract exactly 3 keywords that best represent the main content of the text.
   - Focus on the overall theme and context rather than specific technical terms if the content 
     is more about general events or experiences.
   - Ensure the keywords are directly related to the main topic or event discussed in the text.
   - Exclude any corporate names, product names, or irrelevant information.

3. Content Length:
   - Count and report the number of characters in the main content, excluding any metadata.

Additional guidelines:
* Precisely grasp and concisely summarize the core of the text.
* Never include personal information or ethically problematic content.
* Do not provide subjective opinions or evaluations.
* Do not include any irrelevant information.
* Do not provide information that is not explicitly stated in the text.
"""

# Prompt layout: task description, then the parser's format instructions
# (bound once here as a partial variable), then the article content.
# `question` and `content` are filled in at format time.
prompt = PromptTemplate(
    input_variables=["question", "content"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template="{question}\n{format_instructions}\nContent:\n{content}",
)

In [None]:
# Fill the two remaining template slots to get the final prompt text, then
# print it for inspection. Note that this rebinds `query`, which held the SQL
# statement earlier in the notebook.
query = prompt.format(question=question, content=text)
print(query)

In [None]:
# Chat model used for classification. The model identifier carries a date
# suffix; no other options (e.g. temperature) are set, so library defaults apply.
# Presumably authenticates via the OPENAI_API_KEY exported in the setup cell -- confirm.
model = ChatOpenAI(model='gpt-4o-mini-2024-07-18')

In [None]:
# Send the rendered prompt to the chat model. `invoke` is used instead of the
# deprecated `predict` helper; it returns a message object whose `.content`
# holds the reply text.
response = model.invoke(query)

# Validate the reply against the Classification schema; `output` is the parsed
# model instance (the raw reply stays available as `response`).
output = parser.parse(response.content)
print(output)