In [86]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_ollama.llms import OllamaLLM
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader


In [88]:
def load_db(file, chain_type, k, llm_model="llama3.2:1b", embedding_model="llama3.2:1b"):
    """Build a conversational retrieval chain over a single PDF.

    The PDF is loaded, split into overlapping chunks, embedded with an Ollama
    embedding model and indexed in an in-memory vector store. The returned
    chain has no memory attached: the caller passes ``chat_history`` on every
    call.

    Args:
        file: Path to the PDF to index.
        chain_type: Document-combining strategy forwarded to
            ``ConversationalRetrievalChain.from_llm`` (e.g. ``"refine"``).
        k: Number of chunks the retriever returns per question; must be >= 1.
        llm_model: Ollama model tag used to generate answers.
        embedding_model: Ollama model tag used to embed the chunks.
            NOTE(review): the default is a generative model; a dedicated
            embedding model may retrieve better -- confirm before changing.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``. It is
        configured to return the source documents and the generated
        (rephrased) question alongside the answer.

    Raises:
        ValueError: If ``k`` is smaller than 1, or if splitting the PDF
            produced no chunks.
    """
    # Fail fast, before the PDF is loaded and embedded.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    print("loading pdf ...")
    # load documents
    documents = PyPDFLoader(file).load()
    # split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)
    if not docs:
        # Nothing to index means the retriever could never supply context.
        raise ValueError(f"no text could be extracted from {file!r}")
    # define embedding
    embeddings = OllamaEmbeddings(model=embedding_model)
    # create vector database from data
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
    # define retriever
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    # create a chatbot chain. Memory is managed externally.
    qa = ConversationalRetrievalChain.from_llm(
        llm=OllamaLLM(model=llm_model),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        return_generated_question=True,
    )

    return qa

In [90]:
# Build the retrieval chain once for the lecture PDF ("refine" chain, top-4 chunks).
qa = load_db("MachineLearning-Lecture01.pdf", "refine", 4)
# No memory object is attached to the chain, so the history lives here as a
# list of (question, answer) tuples that is passed back in on every turn.
chat_history = []
while True:
    print("User")
    try:
        question = input().strip()
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the cell was interrupted at the prompt: end the chat.
        break
    # Enter "q" (any case, surrounding whitespace ignored) to leave the loop.
    if question.lower() == "q":
        break
    if not question:
        # Skip blank input rather than spending an LLM call on it.
        continue

    # `invoke` replaces the deprecated direct-call style `qa({...})`.
    bot_response = qa.invoke({"question": question, "chat_history": chat_history})
    chat_history.append((question, bot_response["answer"]))
    print("chatbot: " + bot_response["answer"])

loading pdf ...
User


 Why is the example of predicting housing prices considered a supervised learning problem?


chatbot: The example of predicting housing prices can be considered a supervised learning problem because it involves training an algorithm on labeled data, where the goal is to predict a continuous output (housing price) based on one or more input features (e.g., age, size, location). In this case:

* The feature "age" and "size" are continuous inputs that can be used as predictors of housing prices.
* The target variable "price" is the actual housing price, which is a categorical output class (malignant or benign).
* By using labeled data with examples like "crosses" indicating malignant tumors and "O"s indicating benign tumors, we can train an algorithm to learn the relationships between these features and the target variable.

This type of problem is often referred to as regression analysis in machine learning. The goal is to develop a model that can predict the housing price based on various input features, while minimizing errors and maximizing predictions for new, unseen data.
User

 In the example of predicting tumor malignancy, what kind of variable is being predicted (malignant or benign)?


chatbot: Based on the provided context and information, I would refine the original answer as follows:

The type of variable being predicted in the context of tumor malignancy classification is a continuous variable.

This is because the algorithm is predicting whether a patient's tumor is malignant (represented by "O") or benign (represented by "B"), which are distinct categories. The variables that are input into the model include:

* Patient age
* Tumor size
* Type of tumor cross ("X" for malignancy and "B" for benign)

The output will be a continuous value representing the probability of the patient's tumor being malignant or benign, rather than a categorical variable like "O" or "B".
User


 q
