In [4]:
import re
import time
from typing import Dict, List, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import openai


class CourseParser:
    def __init__(self, base_url: str = "https://karpov.courses/", generation_model_name: str = "gpt-4", timeout: int = 10):
        """
        Initializes the CourseParser with a default base URL and a specified request timeout.
        """
        self.base_url = base_url
        self.generation_model_name = generation_model_name
        self.timeout = timeout
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def _get_response(self, prompt: str) -> str:
        """Get response from OpenAI."""
        return openai.chat.completions.create(
            model=self.generation_model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            seed=42
        ).choices[0].message.content

    def fetch_page_content(self, url: str) -> str:
        """
        Fetches the HTML content from the given URL, enforcing "https://" if it starts with "http://".
        Args:
            url (str): The URL to fetch.
        Returns:
            str: Raw HTML as a string.
        Raises:
            requests.exceptions.RequestException: If the request fails or times out.
        """
        # Ensure URL uses https
        if url.startswith('http://'):
            url = url.replace('http://', 'https://')

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            raise Exception(f"Failed to fetch page content: {e}")

    def extract_course_links(self, html_content: str) -> List[str]:
        """
        Extracts all top-level internal course links that match:
        https://karpov.courses/<slug>.
        We assume each course's path is one segment (e.g. /datascience),
        not multiple (e.g. /datascience/overview).
        Args:
            html_content (str): HTML content from which to parse out links.
        Returns:
            List[str]: Absolute URLs for each valid course link discovered.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        links = set()

        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']

            # Convert relative URLs to absolute
            absolute_url = urljoin(self.base_url, href)

            # Match pattern: https://karpov.courses/<single-segment>
            if re.match(rf'^{self.base_url}[^/]+/?$', absolute_url):
                links.add(absolute_url.rstrip('/'))

        return list(links)

    def extract_plain_text(self, html_content: str) -> str:
        """
        Extracts plain text by removing <script> and <style> tags,
        then strips out any known JSON-like fragments (e.g., {"key":"value"}).
        Args:
            html_content (str): The raw HTML content of the page.
        Returns:
            str: A cleaned, plain-text version of the webpage.
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove script and style tags
        for tag in soup(['script', 'style']):
            tag.decompose()

        # Get text with proper spacing
        text = soup.get_text(separator=' ', strip=True)

        # Remove JSON-like fragments
        text = re.sub(r'\{.*?\}', '', text)
        text = re.sub(r'\".*?\"', '', text)

        # Remove excessive whitespace
        text = ' '.join(text.split())

        return text

    def generate_cleaned_landing_text(self, plain_text: str) -> str:
        """
        Uses OpenAI to extract the landing page text content from a plain text page.
        Args:
            plain_text (str): The plain text content of the page
        Returns:
            str: The extracted text content
        """
        prompt = f"""Extract the main content from this course landing page text, removing any navigation,
footer or unrelated elements. Keep only the course description, benefits, program details and
other relevant information. Return clean, well-formatted text.

Page text:
{plain_text}"""

        return self._get_response(prompt)

    def generate_course_summary(self, raw_text: str) -> str:
        """
        Uses OpenAI to generate a summary of the course content.
        Args:
            raw_text (str): The raw text content of the course.
        Returns:
            str: The generated summary
        """
        prompt = f"""Create a concise summary (3-5 sentences) of this course description.
Highlight the key topics covered, target audience, and main benefits.

Course content:
{raw_text}"""

        return self._get_response(prompt)

    def run_parsing(self) -> Tuple[Dict[str, str], Dict[str, str]]:
        base_url_html = self.fetch_page_content(self.base_url)
        course_links = self.extract_course_links(base_url_html)

        course_summaries = {}
        courses_texts = {}

        for link in course_links:
            try:
                course_html = self.fetch_page_content(link)
                plain_text = self.extract_plain_text(course_html)

                course_text = self.generate_cleaned_landing_text(plain_text)
                course_summary = self.generate_course_summary(course_text)

                course_summaries[link] = course_summary
                courses_texts[link] = course_text

                time.sleep(2)  # Avoid rate limiting

            except Exception as e:
                print(f"Error processing {link}: {e}")
                continue

        return course_summaries, courses_texts
        """
        (
    # summaries
    {
        "https://karpov.courses/datascience": "Этот интенсивный курс...",
        "https://karpov.courses/analytics": "Курс по аналитике...",
        ...
    },
    # full_texts
    {
        "https://karpov.courses/datascience": "Полный текст курса...",
        ...
    }
)
"""

На первом этапе мы берём краткие саммари по всем курсам, объединяем их в один текст и отправляем в промпт вместе с вопросом студента. Затем просим модель на основе этого контекста дать максимально точный ответ.


На втором этапе мы берём полные тексты лендингов курсов, ссылки на которые были указаны моделью, снова объединяем их с краткими саммари, и формируем новый, расширенный промпт с запросом пользователя. Затем отправляем его в OpenAI API и получаем уже уточнённый и полный ответ.



In [5]:
from typing import Dict, List
import openai
import logging
import re

logger = logging.getLogger(__name__)

class RAGService:
    """RAG service that uses summaries first, then detailed information when needed."""

    def __init__(self, generation_model_name: str = "gpt-4o-mini"):
        """Initialize with specified model."""
        self.generation_model_name = generation_model_name
        self.course_summaries = None
        self.courses_texts = None

    def fit(self, course_summaries: Dict[str, str], courses_texts: Dict[str, str]):
        """Store course data for later use."""
        self.course_summaries = course_summaries
        self.courses_texts = courses_texts

    def _get_response(self, prompt: str) -> str:
        """Get response from OpenAI."""
        response = openai.chat.completions.create(
            model=self.generation_model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            seed=42)
        return response.choices[0].message.content

    def _generate_initial_answer(self, summaries_context: str, user_query: str) -> str:
        """Get initial answer using only course summaries."""
        prompt = f"""
        Используя только следующие краткие описания курсов, ответь на вопрос пользователя.
        Если информации недостаточно, верни специальный сигнал NEED_MORE_INFO со списком ссылок на курсы,
        по которым нужна дополнительная информация (только ссылки, разделенные запятыми).

        Описания курсов:
        {summaries_context}

        Вопрос пользователя: {user_query}
        """
        response = self._get_response(prompt)
        return response

    def _extract_needed_links(self, answer: str, courses_texts: Dict[str, str]) -> List[str]:
        """Extract and validate links from the NEED_MORE_INFO response."""
        if "NEED_MORE_INFO" not in answer:
            return []

        links = re.findall(r'https?://[^\s,]+', answer)
        valid_links = [link for link in links if link in courses_texts]
        return valid_links

    def _generate_detailed_answer(  # Переименовано в _generate_detailed_answer
            self,
            summaries_context: str,
            valid_links: List[str],
            courses_texts: Dict[str, str],
            user_query: str
            ) -> str:
        """Generate detailed answer using summaries and specific course details."""
        detailed_context = "\n".join([courses_texts[link] for link in valid_links])
        prompt = f"""
        Используя краткие описания курсов и дополнительную информацию по запрошенным курсам,
        ответь на вопрос пользователя максимально подробно.

        Краткие описания:
        {summaries_context}

        Детальная информация по запрошенным курсам:
        {detailed_context}

        Вопрос пользователя: {user_query}
        """
        response = self._get_response(prompt)
        return response

    def generate_answer(self, user_query: str) -> str:
        """Coordinate the two-step RAG process."""
        try:
            # Step 1: Format texts
            summaries_context = self._format_summaries(self.course_summaries)

            # Step 2: Get initial answer
            initial_answer = self._generate_initial_answer(summaries_context, user_query)

            # Step 3: Check if more info is needed
            needed_links = self._extract_needed_links(initial_answer, self.courses_texts)

            # Step 4: If needed, get detailed answer (теперь используем _generate_detailed_answer)
            if needed_links:
                detailed_answer = self._generate_detailed_answer(
                    summaries_context,
                    needed_links,
                    self.courses_texts,
                    user_query
                )
                return detailed_answer

            # If no valid links or no more info needed, return initial answer
            return initial_answer
        except Exception as e:
            logger.error(f"Error generating answer: {e}")
            return f"Ошибка при генерации ответа: {str(e)}"

    def _format_summaries(self, course_text: Dict[str, str]) -> str:
        """Format course summaries into a single text block."""
        return "\n".join(f"{url}: {summary}" for url, summary in course_text.items())

In [6]:
if __name__ == '__main__':
    import os
    openai.api_key = os.getenv("OPENAI_API_KEY")  # или задайте напрямую

    parser = CourseParser()
    summaries_dict, full_texts_dict = parser.run_parsing()

    rag = RAGService()
    rag.fit(summaries_dict, full_texts_dict)

    query = "Какие курсы подойдут для начинающего аналитика?"
    answer = rag.generate_answer(query)
    print(answer)

Error processing https://karpov.courses/recurrent_payments: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Error processing https://karpov.courses/#freecourse: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Error processing https://karpov.courses/simulator-ds: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Error processing https://karpov.courses/#submenu:detailsmobile: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Error processing https://karpov.courses/big-data-analytics: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Error processing https://karpov.courses/#b2b: The

ERROR:__main__:Error generating answer: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable


Error processing https://karpov.courses/#Free: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Ошибка при генерации ответа: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
