In [1]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import (
    AutoModelForQuestionAnswering,
    T5ForConditionalGeneration,
)
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import re
import faiss
import nltk
from nltk.tokenize import sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/artemiy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
df = pd.read_csv("papers.csv")
df = df.dropna(subset=["Title", "Text"])

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Модель для генерации эмбеддингов
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name).to(device)

In [6]:
# Модель для генерации инструкций
instruction_model_name = "google/flan-t5-base"
instruction_tokenizer = AutoTokenizer.from_pretrained(instruction_model_name)
instruction_model = T5ForConditionalGeneration.from_pretrained(
    instruction_model_name
).to(device)

In [7]:
# Модель для ответа на вопросы
qa_model_name = "deepset/roberta-base-squad2"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name).to(device)

In [8]:
def get_embeddings(texts, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i : i + batch_size]
        inputs = embedding_tokenizer(
            batch_texts, padding=True, truncation=True, return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            embeddings = embedding_model(**inputs).last_hidden_state.mean(dim=1)
        all_embeddings.append(embeddings.cpu())
    return torch.cat(all_embeddings)

In [9]:
def is_instructional_question(question):
    instructional_keywords = ["how", "steps", "procedure", "process", "implement", "deploy"]
    return any(word in question.lower() for word in instructional_keywords)

In [10]:
def generate_instruction_answer(question):
    input_text = f"Summarize the steps for {question}"
    inputs = instruction_tokenizer(input_text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = instruction_model.generate(
            **inputs,
            max_length=100,
            num_beams=5,
            num_return_sequences=1,
            no_repeat_ngram_size=3,
        )
    return instruction_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [11]:
def split_and_tokenize(text):
    blocks = text.split("\n\n")
    sentences = []
    for block in blocks:
        tokenized = nltk.sent_tokenize(block)
        combined = []
        i = 0
        while i < len(tokenized):
            if re.match(r"^\d+\.?$", tokenized[i]):
                if i + 1 < len(tokenized):
                    combined.append(f"{tokenized[i]} {tokenized[i + 1]}")
                    i += 2
                else:
                    combined.append(tokenized[i])
                    i += 1
            else:
                combined.append(tokenized[i])
                i += 1
        sentences.extend(combined)
    return sentences

In [12]:
df["sentences"] = df["Text"].apply(split_and_tokenize)
sentences = df.explode("sentences").reset_index(drop=True)
sentence_embeddings = get_embeddings(sentences["sentences"].tolist()).numpy()

In [13]:
# Индексирование с помощью FAISS
index = faiss.IndexFlatL2(sentence_embeddings.shape[1])
index.add(sentence_embeddings)

In [14]:
def search_relevant_sentences(question, k=5):
    question_embedding = get_embeddings([question]).detach().numpy()
    distances, indices = index.search(question_embedding, k)
    return sentences.iloc[indices[0]]["sentences"].tolist()

In [15]:
def extract_answer(question, context):
    inputs = qa_tokenizer(question, context, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits) + 1
    answer = qa_tokenizer.convert_tokens_to_string(
        qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_idx:end_idx])
    )
    return answer.strip(), outputs.start_logits.max().item()

In [16]:
def format_answer(answer):
    if answer:
        return answer[0].upper() + answer[1:]
    return answer

In [17]:
def answer_question(question):
    if is_instructional_question(question):
        relevant_sentences = search_relevant_sentences(question, k=3)
        combined_context = " ".join(relevant_sentences)
        instruction_answer = generate_instruction_answer(
            f"{question}. {combined_context}"
        )
        return format_answer(instruction_answer)

    relevant_sentences = search_relevant_sentences(question)
    answers = [extract_answer(question, sentence) for sentence in relevant_sentences]

    filtered_answers = [ans for ans in answers if ans[0]]
    if not filtered_answers:
        return "Ответ не найден"

    best_answer = max(filtered_answers, key=lambda x: x[1])
    return format_answer(best_answer[0])

In [18]:
questions = [
    "Where can I apply Convolutional Neural Network?",
    "What is Reinforcement Learning?",
    "How to deploy a machine learning model?",
    "How to implement a random forest algorithm?",
]

In [19]:
question1 = "Where can I apply Convolutional Neural Network?"

In [20]:
question2 = "What is Reinforcement Learning?"

In [21]:
question3 = "How to deploy a machine learning model?"

In [22]:
question4 = "How to implement a random forest algorithm?"

In [23]:
import textwrap

In [26]:
print(textwrap.fill(answer_question(question1)))

Image processing, classification, segmentation and also for other auto
correlated data


In [27]:
print(textwrap.fill(answer_question(question2)))

The problem faced by an agent( a program) that must learn behavior
through trial and error interactions with a dynamic environment to
maximize some reward


In [28]:
print(textwrap.fill(answer_question(question3)))

Deploy a machine learning application into a production environment.


In [29]:
print(textwrap.fill(answer_question(question4)))

Create a random forest model. Build a Random Forest Classifier.


In [24]:
for question in questions:
    print(f"Вопрос: {question}\nОтвет: {answer_question(question)}\n")

Вопрос: Where can I apply Convolutional Neural Network?
Ответ: Image processing, classification, segmentation and also for other auto correlated data

Вопрос: What is Reinforcement Learning?
Ответ: The problem faced by an agent( a program) that must learn behavior through trial and error interactions with a dynamic environment to maximize some reward

Вопрос: How to deploy a machine learning model?
Ответ: Deploy a machine learning application into a production environment.

Вопрос: How to implement a random forest algorithm?
Ответ: Create a random forest model. Build a Random Forest Classifier.

