In [1]:
import fitz  
import numpy as np
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
#from sentence_transformers import SentenceTransformer
from langchain.chains import RetrievalQA
from langchain.llms import Ollama

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

## Deepseek Models

# 🚀 DeepSeek Model Versions & Recommendations

DeepSeek offers various model sizes optimized for different use cases. Below is a comparison of the available versions to help you choose the best one for your needs.

## ✅ Recommended Lightweight DeepSeek Versions

| **Model**                 | **Command**                          | **Size**  | **Best For** |
|---------------------------|------------------------------------|-----------|-------------|
| **DeepSeek 1.5B (default)** | `ollama pull deepseek-r1:1.5b` | **1.5B params (~2GB RAM needed)** | Lightest version, fastest |
| **DeepSeek 7B**            | `ollama pull deepseek-r1:7b` | **7B params (~10GB RAM needed)** | Balanced speed & power |
| **DeepSeek 8B (Llama)**    | `ollama pull deepseek-r1:8b` | **8B params (~12GB RAM needed)** | Llama3-based, better reasoning |
| **DeepSeek 14B**           | `ollama pull deepseek-r1:14b` | **14B params (~20GB RAM needed)** | More accurate but heavier |

## My Choice: 
✅ `deepseek-r1:1.5b`    



## Full RAG Pipeline


### **Overview**
This pipeline allows me to:
- **Extract text from PDFs**
- **Chunk text into smaller sections**
- **Embed text using embeddings**
- **Store and retrieve embeddings using FAISS**
- **Query the system using DeepSeek LLM via Ollama**

In [2]:
def extract_text_from_pdf(pdf_path):
    """Extracts text from a given PDF file."""
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text if text.strip() else None  # Avoid empty documents

def load_and_process_pdfs(pdf_files):
    """Loads multiple PDFs and extracts valid text."""
    documents = [extract_text_from_pdf(pdf) for pdf in pdf_files if extract_text_from_pdf(pdf)]
    return [doc for doc in documents if isinstance(doc, str) and doc.strip()]  

def split_text_into_chunks(documents, chunk_size=500, chunk_overlap=50):
    """Splits text into smaller chunks for efficient processing."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = []
    for doc in documents:
        texts.extend(text_splitter.split_text(doc))
    return texts

def create_faiss_vectorstore(texts, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Generates embeddings and stores them in a FAISS vector database."""
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    documents = [Document(page_content=text) for text in texts]  
    vectorstore = FAISS.from_documents(documents, embedding_model)
    print("✅ FAISS vector store successfully created!")
    return vectorstore

In [11]:
def run_pipeline(pdf_files, query):
    llm = Ollama(model="deepseek-r1:1.5b") 
    documents = load_and_process_pdfs(pdf_files) 
    texts = split_text_into_chunks(documents)  
    vectorstore = create_faiss_vectorstore(texts)  
    
    retriever = vectorstore.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)
    response = qa_chain.run(query)
    clean_response = response.replace("<think>", "").replace("</think>", "").strip()
    return "\n".join(clean_response.split(". "))

In [12]:
pdf_files = ["Geron2017.pdf", "knafling2015.pdf", "Mohri2018.pdf", "NG2018.pdf", "NG2023.pdf", "Russell2010.pdf", "Zheng2018.pdf"]
query = '''
Summarize the key technical and social aspects discussed in the provided books 
on machine learning. Specifically, highlight advancements in ML algorithms, 
ethical concerns, bias in AI, and the societal impact of automation.
'''

response = run_pipeline(pdf_files, query)
print(response)

✅ FAISS vector store successfully created!
Okay, so I'm trying to answer this question about machine learning
The user has given me several books and some context from them
Let me break it down step by step.

First, they mentioned Stuart Russell and Peter Norvig's "Artificial Intelligence: A Modern Approach" which is a big book covering ML topics
It includes things like algorithms and how they work, the bias-variance tradeoff, etc
Then there's a chapter on ethics in computing from Berler and Brunnstein, 2001
They talk about jobs being automated, leisure issues, uniqueness loss, AI uses, and accountability.

They also pointed out that while ML is great for automation, it can cause serious ethical problems
So the question is asking to summarize the key technical aspects and social aspects discussed in these books, specifically focusing on advancements in ML algorithms, ethical concerns, bias in AI, and societal impact of automation.

Alright, so I need to cover each of these areas from b

## Set up conversational retrieval with memory

In [None]:
def setup_conversational_chain(vectorstore, model_name="deepseek-r1:1.5b"):
    """Creates a retrieval-based QA system with memory."""
    llm = Ollama(model=model_name)  
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_chain = ConversationalRetrievalChain.from_llm(llm, retriever=vectorstore.as_retriever(), memory=memory)
    return qa_chain

def ask_question(qa_chain, query):
    """Asks a question and stores the conversation history."""
    response = qa_chain.invoke({"question": query})
    print("🔎 Answer:", response.get("answer", "No answer found."))
    return response.get("answer", "No answer found.")

def run_pipeline_with_memory(pdf_files, query):
    documents = load_and_process_pdfs(pdf_files)  
    texts = split_text_into_chunks(documents) 
    
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    documents = [Document(page_content=text) for text in texts]  
    vectorstore = FAISS.from_documents(documents, embedding_model)
    
    qa_chain = setup_conversational_chain(vectorstore)
    
    answer = ask_question(qa_chain, query)
    answer = answer.replace("<think>", "").replace("</think>", "").strip()

    return answer, qa_chain

    

In [14]:
pdf_files = ["Geron2017.pdf", "knafling2015.pdf", "Mohri2018.pdf", "NG2018.pdf", "NG2023.pdf", "Russell2010.pdf", "Zheng2018.pdf"]
query = "Summarize key technical and social aspects in ML books."
answer, qa_chain = run_pipeline_with_memory(pdf_files, query)

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


🔎 Answer: <think>
Okay, so I need to summarize the key technical and social aspects of machine learning books based on the given context. Let's break down what's provided.

First, there are a few paragraphs about specific authors and their books. Stuart Russell and Peter Norvig wrote "Artificial Intelligence: A Modern Approach," which is a comprehensive textbook covering a lot of topics including ML. Another author is John Langford from Carnegie Mellon University who has also written "Revisiting the Bias-Variance Tradeoff." There's also a chapter titled "Machine Learning for Data Science" in this book, focusing on practical aspects.

Then, there's an example about using case studies to explain concepts without getting into specifics. The book is used by many industries and helps leaders understand ML principles through examples from various fields.

So the user wants me to list these technical and social aspects. From the context, I can gather:

1. Technical Content: The book covers va

In [15]:
follow_up_query = "Which of the books focus more on ethical aspects of AI"
ask_question(qa_chain, follow_up_query)

🔎 Answer: <think>
嗯，用户问的是ML教材的主要技术和社会内容。我需要先理清这些方面。

首先，技术方面包括数据准备、特征工程、算法设计、模型评估、部署与优化、应用场景等。这些都是机器学习中的核心问题和方法论。

然后是社交方面，涉及伦理、可解释性、社区参与、团队协作、公众认知和反馈机制等。这部分是关于如何在技术上实施ML，并考虑人类的视角。

总结一下，用户可能希望了解教材中系统地涵盖了技术原理和技术应用，同时也在社会层面考虑相关问题。这样可以全面展示机器学习的发展和影响。
</think>

在 ML 教材中，有哪些主要的技术和社交方面内容？

主要的技术方面包括数据准备、特征工程、算法设计、模型评估、部署与优化以及应用场景等。

主要的社交方面包括伦理、可解释性、社区参与、团队协作、公众认知和反馈机制等。


'<think>\n嗯，用户问的是ML教材的主要技术和社会内容。我需要先理清这些方面。\n\n首先，技术方面包括数据准备、特征工程、算法设计、模型评估、部署与优化、应用场景等。这些都是机器学习中的核心问题和方法论。\n\n然后是社交方面，涉及伦理、可解释性、社区参与、团队协作、公众认知和反馈机制等。这部分是关于如何在技术上实施ML，并考虑人类的视角。\n\n总结一下，用户可能希望了解教材中系统地涵盖了技术原理和技术应用，同时也在社会层面考虑相关问题。这样可以全面展示机器学习的发展和影响。\n</think>\n\n在 ML 教材中，有哪些主要的技术和社交方面内容？\n\n主要的技术方面包括数据准备、特征工程、算法设计、模型评估、部署与优化以及应用场景等。\n\n主要的社交方面包括伦理、可解释性、社区参与、团队协作、公众认知和反馈机制等。'

In [16]:
follow_up_query = "Can you please answer my previous question in English?"
ask_question(qa_chain, follow_up_query)

🔎 Answer: <think>
嗯，用户让我把之前的英文提问重述成一个独立的问题。我需要仔细分析他们的原文，看看他们到底想要什么。

他们提到了一本关于机器学习的书籍，询问哪些书更多地关注伦理方面的内容。这可能涉及到机器学习技术中的道德问题、伦理决策或者法律层面的影响。

接下来，我要确保新提问清晰明确，不引入其他信息或假设，只专注于用户的问题。所以，我将重新表述为：“Which of the ML textbooks focus more on ethical considerations and practices in AI”？

是的，这样既保持了原意，又独立解答了问题，不会有任何混淆。
</think>

Which of the ML textbooks focus more on ethical considerations and practices in AI?


'<think>\n嗯，用户让我把之前的英文提问重述成一个独立的问题。我需要仔细分析他们的原文，看看他们到底想要什么。\n\n他们提到了一本关于机器学习的书籍，询问哪些书更多地关注伦理方面的内容。这可能涉及到机器学习技术中的道德问题、伦理决策或者法律层面的影响。\n\n接下来，我要确保新提问清晰明确，不引入其他信息或假设，只专注于用户的问题。所以，我将重新表述为：“Which of the ML textbooks focus more on ethical considerations and practices in AI”？\n\n是的，这样既保持了原意，又独立解答了问题，不会有任何混淆。\n</think>\n\nWhich of the ML textbooks focus more on ethical considerations and practices in AI?'

# Try Another Embeddings

In [17]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf

class DeepSeekEmbeddingsTF:
    def __init__(self, model_name="deepseek-ai/deepseek-embedding-v1"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = TFAutoModel.from_pretrained(model_name, from_tf=True)

    def embed_text(self, text):
        inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="tf")
        outputs = self.model(**inputs)
        return tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy().squeeze()

embedding_model = DeepSeekEmbeddingsTF()


OSError: deepseek-ai/deepseek-embedding-v1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`