In [None]:
import numpy as np
import pandas as pd
import sys
sys.path.append('../src')

from utils.db_conn import Connection
import warnings
import os
from preprocessing \
    import WoowahanProcessor, TossProcessor, MediumProcessor, KakaoProcessor, OliveYoungProcessor

# Suppress every warning for the rest of the kernel session (no category or
# module filter is given, so this is a blanket "ignore").
warnings.filterwarnings("ignore")
# Project-local DB helper from utils.db_conn; the cells below call
# conn.execute(...) on it to run SQL.
# NOTE(review): connection settings are resolved inside Connection() and are
# not visible from this notebook.
conn = Connection()

In [None]:
# Number of random articles to pull. Kept as a named constant so the sample
# size can be tuned in one place instead of editing the SQL text.
SAMPLE_SIZE = 5

# The inner SELECT picks SAMPLE_SIZE random article ids; the outer query joins
# back onto `article` to fetch the columns used by the later cells.
# NOTE(review): RAND() is called without a seed, so (assuming MySQL-style
# RAND()) every run returns a different sample.
# The `:d` format spec makes the interpolation raise for anything that is not
# an integer, so only a number can ever end up in the LIMIT clause.
query = f"""
SELECT
    a.article_id,
    a.blog_id,
    a.title,
    a.description,
    a.content
FROM
    article AS a
JOIN (
    SELECT article_id FROM article ORDER BY RAND() LIMIT {SAMPLE_SIZE:d}
) AS rand_table
ON a.article_id = rand_table.article_id;
"""

# Result is indexed by column name below (sample['content'], sample['blog_id']).
sample = conn.execute(query)

In [None]:
def postprocess_by_blog_id(text, blog_id):
    """Run the blog-specific processor over ``text`` and return its output.

    Args:
        text: Article content to process; passed through to the processor.
        blog_id: Identifier of the source blog. Supported values are 1-5
            (Woowahan, Toss, Medium, Kakao, OliveYoung, in that order).

    Returns:
        Whatever the selected processor's ``process()`` method returns.

    Raises:
        ValueError: If ``blog_id`` has no registered processor.
    """
    # blog_id -> processor class
    registry = dict(
        zip(
            (1, 2, 3, 4, 5),
            (
                WoowahanProcessor,
                TossProcessor,
                MediumProcessor,
                KakaoProcessor,
                OliveYoungProcessor,
            ),
        )
    )

    handler_cls = registry.get(blog_id)
    if handler_cls:
        # Every processor is constructed with (text, blog_id) and exposes process().
        return handler_cls(text, blog_id).process()

    raise ValueError(f"Unsupported blog_id: {blog_id}")

In [None]:
# For every sampled article, write two files that can be compared side by side:
#   sample_content_<i>      - the content exactly as stored
#   sample_content_<i>_pre  - the content after the blog-specific processor
output_dir = '../dataset_sample'
os.makedirs(output_dir, exist_ok=True)  # a fresh checkout may not have the folder yet

# Iterate over the rows that actually came back rather than range(5) + .loc[i]:
# the positional loop raised KeyError when the query returned fewer than five
# rows or when the result's index was not 0..4.
for i, (blog_id, content) in enumerate(zip(sample['blog_id'], sample['content'])):
    # Explicit UTF-8 so the write does not depend on the platform's locale
    # encoding, which may be unable to represent non-ASCII article text.
    with open(f'{output_dir}/sample_content_{i}', 'w', encoding='utf-8') as f:
        f.write(content)

    # Process before opening the "_pre" file so that a processor failure does
    # not leave a truncated, empty file behind.
    processed = postprocess_by_blog_id(content, blog_id)

    with open(f'{output_dir}/sample_content_{i}_pre', 'w', encoding='utf-8') as f:
        f.write(processed)