In [33]:
import os
import requests
import json
import time
import operator
import functools

# from langchain.chat_models.gigachat import GigaChat
from langchain_community.chat_models.gigachat import GigaChat

from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory, \
    ConversationSummaryMemory, VectorStoreRetrieverMemory
from langchain.chains import ConversationChain, ConversationalRetrievalChain, LLMChain
from langchain.chains.question_answering import load_qa_chain
# from langchain.chains import load_question_generator_chain

from langchain_community.embeddings import GigaChatEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings


from langchain.schema import HumanMessage, SystemMessage, AIMessage
from langchain.tools import tool

from pydantic import BaseModel
from pprint import pprint

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.prompts import PromptTemplate

from typing import Annotated, Literal, Sequence
from typing_extensions import TypedDict

from langchain_core.messages import BaseMessage, HumanMessage

from langgraph.prebuilt import create_react_agent

from langgraph.graph import END, START, StateGraph
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

ImportError: cannot import name 'load_question_generator_chain' from 'langchain.chains' (/Users/alexlakiza/py/magnifico-assistente-intervista/venv/lib/python3.10/site-packages/langchain/chains/__init__.py)

In [18]:
from dotenv import load_dotenv
# Read settings from the local .env file before anything looks them up in os.environ
load_dotenv(dotenv_path='.env')

True

In [21]:
# class SimpleChatBot:
#     def __init__(self, llm, memory_type, window_size=None):
#         if memory_type == 'window':
#             assert window_size is not None
#             assert type(window_size) == int
#
#         self.memory_types = {
#             'simple': ConversationBufferMemory(),
#             'summary': ConversationSummaryMemory(llm=llm),
#             'window': ConversationBufferWindowMemory(k=window_size)}
#
#         self.memory = self.memory_types[memory_type]
#
#
#         self.conversation = ConversationChain(
#             llm=llm,
#             verbose=True,
#             memory=self.memory
#         )
#
#
#     def _respond(self, user_input):
#         return self.conversation.invoke(user_input)
#
#     def print_memory(self):
#         print(self.conversation.memory.load_memory_variables({})['history'])
#
#     def run(self):
#         print("This is a simple chat bot:")
#         #TODO
#         while True:
#             user_input = input("User: ")
#             if user_input == "":
#                 break
#             print(f"User: {user_input}")
#             try:
#                 model_response = self._respond(user_input=user_input)
#                 print(f"AI: {model_response['response']}")
#             except Exception as e:
#                 print(f"Error: {str(e)}")
#         time.sleep(0.2)

In [19]:
# Credentials for GigaChat are taken from the SB_AUTH_DATA environment variable.
giga_key = os.getenv("SB_AUTH_DATA")

# NOTE(review): verify_ssl_certs=False switches off TLS certificate checks —
# confirm this is intended outside of local experiments.
giga = GigaChat(
    credentials=giga_key,
    model="GigaChat",
    timeout=30,
    verify_ssl_certs=False,
)
# chat = SimpleChatBot(giga, memory_type='summary', window_size=2)
# chat.run()
# print('=='*10)
# chat.print_memory()

In [20]:
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

# Pages with interview questions; their text later feeds create_knowledge_base.
urls = [
    "https://uproger.com/100-voprosov-c-sobesov-v-data-science-i-ml",
    "https://kalashnikof.com/blog/voprosy-dlya-sobesedovaniya-frontend-react/",
]

# Fetch the raw HTML of every page.
loader = AsyncHtmlLoader(urls)
docs = loader.load()

# Turn the fetched HTML documents into plain-text documents.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

Fetching pages: 100%|##########| 2/2 [00:00<00:00,  3.03it/s]


In [54]:
# Step 1: Create or Load Knowledge Base
# This section assumes you've preprocessed and indexed interview questions from multiple websites.
# You can use FAISS or other vector databases for RAG.

def create_knowledge_base(
    documents,
    chunk_size=300,
    chunk_overlap=50,
    model_name="DeepPavlov/rubert-base-cased-sentence",
):
    """Build a FAISS vector store from the given documents.

    The documents are split into overlapping chunks with
    ``RecursiveCharacterTextSplitter``, embedded with ``HuggingFaceEmbeddings``
    and indexed through ``FAISS.from_documents``.

    Args:
        documents: Documents accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` passed to the splitter (default 300).
        chunk_overlap: ``chunk_overlap`` passed to the splitter (default 50).
        model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``documents`` is empty, or if splitting produces no
            chunks (there would be nothing to index).
    """
    # Fail early with a readable message instead of an error from deep inside
    # the vector store when there is nothing to index.
    if not documents:
        raise ValueError("Cannot build a knowledge base from an empty document list.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    split_docs = text_splitter.split_documents(documents)
    if not split_docs:
        raise ValueError("Splitting the documents produced no chunks to index.")

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # Alternative embedding backend that was tried here:
    # embeddings = GigaChatEmbeddings(verify_ssl_certs=False, scope="GIGACHAT_API_PERS")
    return FAISS.from_documents(split_docs, embeddings)

def get_retrieval_chain(vectorstore, llm=None, k=5):
    """Create a conversational retrieval chain on top of ``vectorstore``.

    The chain is assembled from three parts: a question generator that rewrites
    a follow-up question into a standalone one, a retriever over
    ``vectorstore``, and a "stuff" question-answering chain that answers from
    the retrieved context. A new ``ConversationBufferMemory`` is attached to
    every chain returned by this function, stored under ``chat_history``.

    Args:
        vectorstore: Vector store exposing ``as_retriever``.
        llm: Chat model used by both sub-chains. When omitted, the
            notebook-level ``giga`` client is used, as before.
        k: Value passed to the retriever as ``search_kwargs={"k": k}``
            (default 5).

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    # Resolve the default at call time so the function keeps working with the
    # notebook-level client while still accepting any other model.
    if llm is None:
        llm = giga

    # Chain that answers the question from the retrieved context.
    combine_docs_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=(
            "You are a helpful assistant. Use the following context to answer the question.\n\n"
            "Context:\n{context}\n\n"
            "Question:\n{question}\n\n"
            "Answer:"
        ),
    )
    combine_docs_chain = load_qa_chain(llm, chain_type="stuff", prompt=combine_docs_prompt)

    # Chain that condenses chat history + follow-up into a standalone question.
    question_generator_prompt = PromptTemplate(
        input_variables=["chat_history", "question"],
        template=(
            "Given the following conversation history and a follow-up question, rewrite the follow-up "
            "question to make it a standalone question.\n\n"
            "Conversation history:\n{chat_history}\n\n"
            "Follow-up question:\n{question}\n\n"
            "Standalone question:"
        ),
    )
    question_generator_chain = LLMChain(llm=llm, prompt=question_generator_prompt)

    # Memory key matches the {chat_history} variable of the question generator.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    retrieval_chain = ConversationalRetrievalChain(
        retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
        question_generator=question_generator_chain,
        combine_docs_chain=combine_docs_chain,
        memory=memory,
    )

    return retrieval_chain


# Step 3: Generate Topics and Questions from Vacancy Description
# Define a prompt template for extracting interview topics and questions.
# Prompt that asks for preparation topics plus sample questions and answers
# for a single vacancy text supplied as {vacancy}.
# NOTE(review): none of the functions visible in this notebook section use it.
TOPICS_PROMPT_TEMPLATE = PromptTemplate(
    template="\n".join(
        [
            "Given the following job description: ",
            "{vacancy}",
            "1. Extract a list of relevant topics that the candidate should prepare for.",
            "2. Provide sample interview questions and suggested answers for each topic.",
        ]
    ),
    input_variables=["vacancy"],
)

def generate_topics_and_questions(vacancy_description, retrieval_chain=None, llm=None):
    """Generate interview topics, questions and answers for a vacancy.

    When a retrieval chain is given, it is queried with the vacancy text and
    its ``"answer"`` is inserted into the prompt as additional knowledge.
    The combined prompt is then sent to the chat model.

    Args:
        vacancy_description: Text of the job description.
        retrieval_chain: Optional chain with an ``invoke`` method that takes
            ``{"question": ...}`` and returns a mapping with an ``"answer"``
            key. When ``None``, no knowledge-base lookup is made.
        llm: Chat model with an ``invoke`` method whose result has a
            ``content`` attribute. When omitted, the notebook-level ``giga``
            client is used, as before.

    Returns:
        The ``content`` of the model's response.
    """
    retrieved_knowledge = ""

    # Step 1: Retrieve knowledge from the knowledge base if a chain is provided.
    # `invoke` is used instead of calling the chain object directly, matching
    # how the LLM itself is called below.
    if retrieval_chain is not None:
        knowledge_results = retrieval_chain.invoke({"question": vacancy_description})
        retrieved_knowledge = knowledge_results["answer"]

    # Step 2: Combine both sources of knowledge into a single prompt for the LLM
    combined_prompt = (
        "You are an IT interview assistant helping a candidate prepare for a job interview.\n\n"
        "Job Description:\n"
        f"{vacancy_description}\n\n"
        "Additional Knowledge from the database:\n"
        f"{retrieved_knowledge}\n\n"
        "Based on the job description and the additional knowledge:\n"
        "1. Extract a list of relevant topics the candidate should prepare for.\n"
        "2. Provide sample interview questions for each topic.\n"
        "3. Provide concise suggested answers for each question."
    )

    # Step 3: Generate the response using the LLM (default resolved at call time)
    if llm is None:
        llm = giga
    return llm.invoke(combined_prompt).content


# Step 4: URL Parsing Functionality
# Load content from a given URL.

def process_url(url):
    """Fetch a web page and return its text content.

    Args:
        url: Address of the page to load with ``WebBaseLoader``.

    Returns:
        The ``page_content`` of the first loaded document, or ``""`` when the
        URL is blank, the request fails, nothing is loaded, or the page
        content is only whitespace. Callers treat ``""`` as "could not
        retrieve the page".
    """
    # Typed or pasted URLs often carry stray whitespace.
    if isinstance(url, str):
        url = url.strip()
    if not url:
        return ""

    try:
        documents = WebBaseLoader(url).load()
    except requests.RequestException as exc:
        # Covers connection problems as well as malformed URLs; report the
        # reason and signal failure through the empty-string contract.
        print(f"Could not load {url!r}: {exc}")
        return ""

    if not documents:
        return ""

    content = documents[0].page_content
    # A page with only whitespace carries no usable description.
    return content if content.strip() else ""

# Main Chatbot Functionality
# Example: Case 1 (Description Input) and Case 2 (URL Input)


In [55]:
# Build the knowledge base and the retrieval chain from the pages converted
# to text in the earlier cell.
documents = docs_transformed  # Replace with your document loading logic
vectorstore = create_knowledge_base(documents)
retrieval_chain = get_retrieval_chain(vectorstore)

print("Welcome to the IT Interview Assistant Bot!")
print("Choose an option:")
print("1. Provide a job description")
print("2. Provide a URL to a job posting")
# Strip the answers so that stray whitespace is not treated as invalid input.
choice = input("Enter your choice: ").strip()

if choice == "1":
    vacancy_description = input("Enter the job description: ").strip()
    if vacancy_description:
        output = generate_topics_and_questions(vacancy_description, retrieval_chain)
        print("\nTopics and Questions:\n")
        print(output)
    else:
        # Avoid spending a retrieval + LLM call on an empty description.
        print("No job description was provided.")

elif choice == "2":
    url = input("Enter the URL of the job posting: ").strip()
    job_description = process_url(url)
    if job_description:
        output = generate_topics_and_questions(job_description, retrieval_chain)
        print("\nTopics and Questions:\n")
        print(output)
    else:
        print("Failed to retrieve content from the URL.")

else:
    print("Invalid choice. Please try again.")


No sentence-transformers model found with name DeepPavlov/rubert-base-cased-sentence. Creating a new one with mean pooling.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Welcome to the IT Interview Assistant Bot!
Choose an option:
1. Provide a job description
2. Provide a URL to a job posting

Topics and Questions:

### Подготовка к интервью: список тем и вопросы

1. **Анализ данных с использованием Python**
   - Какие библиотеки для анализа данных вы чаще всего используете? Почему именно они?
   - Как вы решаете проблему нехватки данных для обучения модели?
   - Расскажите о вашем опыте использования библиотек Pandas и NumPy для обработки больших объемов данных.

2. **SQL**
   - Опишите, как вы использовали SQL для выполнения сложных запросов и агрегации данных.
   - Приведите примеры, когда SQL был полезен для оптимизации производительности приложения.

3. **Машинное обучение и статистический анализ**
   - Что вы понимаете под термином "переобучение" в контексте машинного обучения? Как вы его избегаете?
   - Какую роль играет кросс-валидация в оценке качества модели? Приведите пример.
   - В чем разница между линейной регрессией и логистической регре