In [None]:
# 기본 데이터 처리 라이브러리
import numpy as np
import pandas as pd

# 텍스트 전처리
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# 모델 학습 및 평가
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

# 시각화
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup
import html
from datetime import datetime

# Silence warnings for the whole notebook session
import warnings
import os
# NOTE(review): this relative chdir is not idempotent - re-running the cell
# moves the working directory up one more level each time. A later cell
# switches to an absolute path, so confirm whether this call is still needed.
os.chdir('../')
# Suppresses every warning category, including deprecation notices from
# pandas / transformers.
warnings.filterwarnings("ignore")

# 한글 폰트 설정 (matplotlib용)
import matplotlib.font_manager as fm
import matplotlib

# font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'  # 폰트 경로 설정
# font = fm.FontProperties(fname=font_path).get_name()
# matplotlib.rc('font', family=font)

# Pick the compute device: CUDA when torch reports it as available, else CPU
has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if has_cuda else torch.device("cpu")
print(f"Using device: {device}")
if has_cuda:
    # Report the name of the first CUDA device
    print(f"Device Name : {torch.cuda.get_device_name(0)}")

# NLTK resources used by this notebook, keyed by download id and mapped to
# the path that nltk.data.find() is asked to look up
nltk_data = {
    "stopwords": "corpora/stopwords.zip",
    "punkt": "tokenizers/punkt.zip"
}


def _nltk_resource_available(resource_path):
    """Return True when ``nltk.data.find`` can locate ``resource_path``."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        return False
    return True


# Download only the resources that are not found locally
for key, path in nltk_data.items():
    if not _nltk_resource_available(path):
        print(f"Downloading {key}...")
        nltk.download(key)

# Progress bars: tqdm.pandas() registers the progress_apply / progress_map
# methods on pandas objects
from tqdm import tqdm
tqdm.pandas()


In [None]:
# Directory (relative to the project root) that holds the sample CSV files
DATA_PATH = './dataset_sample'
# Columns kept from each tech-blog CSV
FEATURES = ['title', 'description', 'content', 'contentLength']

# Working directory for every relative path used below. The original
# dev-container location stays the default; set TEXT_CLASSIFICATION_ROOT to run
# the notebook from a different checkout without editing this cell.
PROJECT_ROOT = os.environ.get(
    'TEXT_CLASSIFICATION_ROOT',
    '/workspaces/recommender/text_classification',
)
os.chdir(PROJECT_ROOT)

In [None]:
def _load_articles(source: str) -> pd.DataFrame:
    """Load one company's tech-blog CSV and tag every row with its source.

    Args:
        source: Company key; the file ``{DATA_PATH}/{source}-techblog.csv``
            is read.

    Returns:
        A new DataFrame with the ``FEATURES`` columns plus a ``from`` column
        set to ``source``.
    """
    articles = pd.read_csv(f'{DATA_PATH}/{source}-techblog.csv')[FEATURES]
    # assign() returns a fresh frame instead of adding a column to the
    # column-selected slice in place.
    return articles.assign(**{'from': source})


daangn = _load_articles('daangn')
toss = _load_articles('toss')
wooahan = _load_articles('wooahan')

# ignore_index gives the combined frame a unique 0..n-1 index; without it each
# source keeps its own row labels and label-based lookups hit several rows.
total_article = pd.concat([daangn, toss, wooahan], axis=0, ignore_index=True)

In [None]:
brand_colors = {
    'daangn': '#FF8B00',      # Daangn orange
    'toss': '#0056F6',        # Toss blue
    'wooahan': '#6BBE45'      # Woowa Brothers green
}

# Article count and mean content length per company
article_counts = total_article['from'].value_counts()
mean_lengths = total_article.groupby('from')['contentLength'].mean()


def _plot_company_bars(ax, series, title, ylabel, fmt):
    """Draw one brand-coloured bar chart of ``series`` on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        series: Values indexed by company key (keys must be in brand_colors).
        title: Axes title.
        ylabel: Y-axis label.
        fmt: Format string for the value labels printed on the bars.
    """
    colors = [brand_colors[company] for company in series.index]
    bars = ax.bar(series.index, series, color=colors)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.bar_label(bars, fmt=fmt, label_type='edge', fontsize=10)
    # Set the x label and tick rotation per axes: plt.xlabel / plt.xticks only
    # touch the current (last) axes, which left the first subplot unlabelled.
    ax.set_xlabel("출처")
    ax.tick_params(axis='x', labelrotation=45)


fig, axes = plt.subplots(1, 2, figsize=(12, 8))

# Subplot 1: article count
_plot_company_bars(axes[0], article_counts, "회사 별 아티클 개수", "개수", '%d')

# Subplot 2: mean content length
_plot_company_bars(
    axes[1], mean_lengths, "회사 별 아티클 내용 평균 길이", "평균 길이 (글자수)", '%.1f'
)

plt.tight_layout()
plt.show()

In [None]:
def save_sample(sample, ext):
    """Write a sample article to a UTF-8 text file in the working directory.

    Args:
        sample: Either the text itself (``str``), or an object indexed as
            ``sample['from'].iloc[0]`` and ``sample['content'].iloc[0]``
            (a one-row DataFrame in this notebook).
        ext: Label used in the file name. A ``str`` sample is written to
            ``{ext}_output``; any other sample is written to
            ``{from}_{ext}`` with its first ``content`` value as the body.
    """
    is_text = isinstance(sample, str)
    filename = f"{ext}_output" if is_text else f"{sample['from'].iloc[0]}_{ext}"
    with open(filename, 'w', encoding='utf-8') as handle:
        handle.write(sample if is_text else sample['content'].iloc[0])

# Fixed seed so the same article is drawn from each source on every run and
# the saved input/output files stay comparable between runs.
SAMPLE_SEED = 42

toss_sample = toss.sample(random_state=SAMPLE_SEED)
daangn_sample = daangn.sample(random_state=SAMPLE_SEED)
wooahan_sample = wooahan.sample(random_state=SAMPLE_SEED)

# Save the raw content of each sampled article as "<source>_input"
for sample in [toss_sample, daangn_sample, wooahan_sample]:
    save_sample(sample, 'input')

In [None]:
import re
from typing import List, Dict
from dataclasses import dataclass
from bs4 import BeautifulSoup
import html

@dataclass
class CodeBlock:
    """A code snippet detected inside an article, with where it was found."""

    language: str     # Language label assigned to the snippet, or 'unknown'
    code: str         # The snippet text itself
    line_number: int  # Starting line number (1-based) in the scanned text
    indentation: int  # Number of leading whitespace characters recorded for the block

class TextProcessor:
    """Heuristic cleaner for scraped tech-blog article text.

    ``preprocess`` strips HTML, detects code snippets (markdown fences,
    indented blocks and language-specific regex hits), wraps the snippets it
    can find in ``<code LANG>`` tags, collapses whitespace and trims
    site-specific boilerplate.

    Attributes:
        raw_text: The text exactly as passed to the constructor.
        cleaned_text: Working copy rewritten by each preprocessing step.
        code_blocks: ``CodeBlock`` records filled in by ``preprocess``.
        language_markers: Regex patterns per language, used for scoring and
            for locating un-fenced snippets.
        markdown_markers: Regex patterns that open / close markdown blocks.
    """

    def __init__(self, text: str):
        """Store ``text`` and set up the detection pattern tables."""
        self.raw_text = text
        self.cleaned_text = ""
        self.code_blocks = []
        self.language_markers = {
            'python': [
                r'import\s+[\w\s,]+',
                r'def\s+\w+\s*\([^)]*\)\s*:',
                r'class\s+\w+\s*[:(]',
                r'print\s*\(',
                r'if\s+__name__\s*==\s*[\'"]__main__[\'"]',
                r'with\s+.*?\s*as\s+\w+:',
                r'try:(?:\s*except|\s*finally)',
                r'for\s+\w+\s+in\s+',
                r'while\s+.*?:'
            ],
            'javascript': [
                r'function\s+\w+\s*\([^)]*\)',
                r'const\s+\w+\s*=',
                r'let\s+\w+\s*=',
                r'var\s+\w+\s*=',
                r'import\s+.*?from',
                r'=>',
                r'console\.',
                r'module\.exports',
                r'export\s+(?:default\s+)?(?:function|class|const|let|var)'
            ],
            'java': [
                r'public\s+class',
                r'private\s+\w+',
                r'protected\s+\w+',
                r'System\.out\.',
                r'import\s+java\.',
                r'@Override'
            ],
            'sql': [
                r'SELECT\s+.*?\s+FROM',
                r'INSERT\s+INTO',
                r'UPDATE\s+.*?\s+SET',
                r'DELETE\s+FROM',
                r'CREATE\s+TABLE',
                r'ALTER\s+TABLE',
                r'DROP\s+TABLE',
                r'JOIN\s+\w+'
            ],
            'html': [
                r'<(?:html|head|body|div|span|p|a|script|link|meta)',
                r'</[a-z]+>',
                r'class=["\'].*?["\']'
            ],
            'css': [
                r'{\s*[\w-]+\s*:',
                r'@media',
                r'@import',
                r'\.[a-zA-Z][\w-]*\s*{',
                r'#[\w-]+\s*{'
            ]
        }
        self.markdown_markers = {
            'start': [
                r'```\w*',           # Code fence with optional language
                r'~~~\w*',           # Alternative code fence
                r'(?:^|\n)    ',     # Indented code block (4 spaces)
                r'(?:^|\n)\t'        # Indented code block (tab)
            ],
            'end': [
                r'```',
                r'~~~',
                # Kept for compatibility. Lines are matched one at a time and
                # never contain '\n', so this pattern cannot fire; the end of
                # an indented block is detected in _find_markdown_blocks.
                r'\n\S'
            ]
        }

    def detect_language(self, code: str) -> str:
        """Guess the programming language of ``code``.

        Each language scores one point per regex hit (case-insensitive) from
        ``language_markers``.

        Returns:
            The highest-scoring language (the first one in table order on a
            tie), or ``'unknown'`` when nothing matched.
        """
        scores = {
            language: sum(
                sum(1 for _ in re.finditer(pattern, code, re.IGNORECASE | re.MULTILINE))
                for pattern in patterns
            )
            for language, patterns in self.language_markers.items()
        }
        if not scores:
            return 'unknown'

        # max() returns the first maximal key, so ties resolve in table order.
        best = max(scores, key=scores.get)
        return best if scores[best] > 0 else 'unknown'

    def _find_markdown_blocks(self) -> List[CodeBlock]:
        """Collect fenced and indented markdown code blocks from ``raw_text``."""
        blocks = []
        lines = self.raw_text.split('\n')
        i = 0

        while i < len(lines):
            line = lines[i]
            if not any(re.match(p, line) for p in self.markdown_markers['start']):
                i += 1
                continue

            start_line = i + 1  # 1-based line number of the opening line
            indentation = len(re.match(r'^\s*', line).group())

            if line.startswith(('```', '~~~')):
                # Fenced block: the fence line is not code. Collect up to the
                # closing fence (or end of text) and step over that fence.
                i += 1
                code_lines = []
                while i < len(lines) and not any(
                    re.match(p, lines[i]) for p in self.markdown_markers['end']
                ):
                    code_lines.append(lines[i])
                    i += 1
                i += 1
            else:
                # Indented block: the opening line IS code, so keep it. The
                # block runs while lines stay indented or blank. The first
                # other line is left unconsumed so it can open a block itself.
                code_lines = [line]
                i += 1
                while i < len(lines) and (
                    not lines[i].strip() or re.match(r' {4}|\t', lines[i])
                ):
                    code_lines.append(lines[i])
                    i += 1
                # Blank lines after the last code line are not part of it.
                while code_lines and not code_lines[-1].strip():
                    code_lines.pop()

            code = '\n'.join(code_lines)
            # Skip blank blocks: str.replace('') on an empty snippet would
            # inject tags between every character of the article.
            if code.strip():
                blocks.append(CodeBlock(
                    language=self.detect_language(code),
                    code=code,
                    line_number=start_line,
                    indentation=indentation
                ))

        return blocks

    def _add_pattern_blocks(self, code_blocks: List[CodeBlock]) -> None:
        """Append snippets located via ``language_markers`` to ``code_blocks``.

        Backtick-fenced regions are removed first. Every regex hit is widened
        to the start of its line and to the next blank line (or end of text).
        Snippets already contained in a collected block are skipped.
        """
        text_without_markdown = re.sub(r'```.*?```', '', self.raw_text, flags=re.DOTALL)

        for language, patterns in self.language_markers.items():
            for pattern in patterns:
                for match in re.finditer(pattern, text_without_markdown, re.MULTILINE):
                    start = match.start()
                    block_start = text_without_markdown.rfind('\n', 0, start) + 1
                    block_end = text_without_markdown.find('\n\n', start)
                    if block_end == -1:
                        block_end = len(text_without_markdown)

                    raw_block = text_without_markdown[block_start:block_end]
                    code = raw_block.strip()
                    if not code:
                        continue
                    if any(code in block.code for block in code_blocks):
                        continue

                    # Measure indentation before stripping; measuring the
                    # stripped snippet always gave 0.
                    indentation = len(re.match(r'[ \t]*', raw_block).group())
                    code_blocks.append(CodeBlock(
                        language=language,
                        code=code,
                        line_number=text_without_markdown[:start].count('\n') + 1,
                        indentation=indentation
                    ))

    def find_code_blocks(self) -> List[CodeBlock]:
        """Find code snippets in ``raw_text``.

        Markdown-style blocks are collected first, then snippets located by
        the language-specific patterns.

        Returns:
            The detected blocks, markdown blocks first.
        """
        code_blocks = self._find_markdown_blocks()
        self._add_pattern_blocks(code_blocks)
        return code_blocks

    def remove_html(self):
        """Strip HTML tags from ``raw_text`` and decode HTML entities.

        The result is stored in ``cleaned_text``.
        """
        soup = BeautifulSoup(self.raw_text, 'html.parser')
        self.cleaned_text = html.unescape(soup.get_text())

    def clean_whitespace(self):
        """Cut the text at a layout marker and collapse whitespace.

        Only the text before the first occurrence of a specific
        newline/indent sequence is kept. After that, every run of two or more
        whitespace characters (newlines included) becomes a single space and
        whitespace before ``. , ! ?`` is removed.
        """
        # Keep only what precedes the first occurrence of this whitespace run.
        # NOTE(review): presumably the gap before a page footer - confirm
        # against the scraped pages.
        leading_part = re.split(
            r'\n                \n            \n                    \n                    \n                                    \n                        ',
            self.cleaned_text,
            maxsplit=1,
        )[0]
        self.cleaned_text = leading_part
        # Made redundant by the \s{2,} rule below (which also swallows
        # newlines); kept so the sequence of steps is unchanged.
        self.cleaned_text = re.sub(r'\n{3,}', '\n\n', self.cleaned_text)
        # Collapse any run of 2+ whitespace characters into one space
        self.cleaned_text = re.sub(r'\s{2,}', ' ', self.cleaned_text)
        # Remove whitespace in front of punctuation
        self.cleaned_text = re.sub(r'\s+([.,!?])', r'\1', self.cleaned_text)

    def process_code_blocks(self):
        """Wrap detected code snippets in ``<code LANG>...</code>`` tags.

        Every occurrence of a block's code in ``cleaned_text`` is wrapped.
        Blank snippets are skipped because replacing an empty string would
        insert tags between every character. Snippets with identical code are
        wrapped once so the tags are not nested around the same text.
        """
        wrapped = set()
        for block in self.code_blocks:
            if not block.code.strip() or block.code in wrapped:
                continue
            wrapped.add(block.code)
            code_block_html = f'<code {block.language}>{block.code}</code>'
            self.cleaned_text = self.cleaned_text.replace(block.code, code_block_html)

    def split_text(self):
        """Trim site-specific boilerplate around the article body.

        Marker strings per source:
            Daangn:  'Published in' (Medium page header); text after it is kept.
            Toss:    '홈페이지회사소개채용고객센터'; text before it, and before
                     '댓글 관련 문의', is kept.
            Wooahan: '{{sub.name}}'; text after it is kept.
        """
        # Split on the FIRST occurrence only. A plain split(marker)[1] threw
        # away everything after a second occurrence of the marker.
        if 'Published in' in self.cleaned_text:
            self.cleaned_text = self.cleaned_text.split('Published in', 1)[1]

        if '홈페이지회사소개채용고객센터' in self.cleaned_text:
            self.cleaned_text = self.cleaned_text.split('홈페이지회사소개채용고객센터')[0]
            self.cleaned_text = self.cleaned_text.split('댓글 관련 문의')[0]

        if '{{sub.name}}' in self.cleaned_text:
            self.cleaned_text = self.cleaned_text.split('{{sub.name}}', 1)[1]

    def preprocess(self) -> str:
        """Run every preprocessing step and return the lower-cased result.

        Steps, in order: strip HTML, detect code blocks, wrap them in
        ``<code>`` tags, clean whitespace, trim site boilerplate.
        """
        self.remove_html()

        # NOTE(review): detection scans raw_text while wrapping replaces in
        # cleaned_text, so a snippet whose text changes when HTML is stripped
        # is not wrapped. Confirm whether detection should run on the
        # cleaned text.
        self.code_blocks = self.find_code_blocks()
        self.process_code_blocks()

        self.clean_whitespace()
        self.split_text()

        return self.cleaned_text.lower()


# Example usage
def preprocess_blog_post(raw_text) -> str:
    """Preprocess one blog post and return the cleaned text.

    Args:
        raw_text: Either the article text (``str``), or an object indexed as
            ``raw_text['content'].iloc[0]`` (a one-row DataFrame in this
            notebook) whose first ``content`` value is used.

    Returns:
        The output of ``TextProcessor.preprocess`` for that text.
    """
    # isinstance matches the check used by save_sample and also accepts
    # str subclasses.
    if not isinstance(raw_text, str):
        raw_text = raw_text['content'].iloc[0]

    processor = TextProcessor(raw_text)
    return processor.preprocess()


In [None]:
# Preprocess each sampled article and save the result as "<source>_output".
# The loop variable is not called `input`, which would shadow the builtin for
# every later cell in the notebook.
for article_sample, ext in zip([toss_sample, daangn_sample, wooahan_sample], ['toss', 'daangn', 'wooahan']):
    output = preprocess_blog_post(article_sample)
    save_sample(output, ext)