In [9]:
!pip install fitz
!pip install pymupdf



### Generating Raw Data

In [10]:
import fitz  
import glob
import fitz  
import pytesseract
from pdf2image import convert_from_path
import os
import markdown
import re
# Extract text from PDFs, using OCR if necessary
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        page_text = page.get_text("text")
        if page_text.strip():  
            text += page_text + "\n"
        else:  
            images = convert_from_path(pdf_path)  # OCR fallback
            for image in images:
                text += pytesseract.image_to_string(image) + "\n"
    return text.strip()

def extract_text_from_md(md_path):
    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()
    text = markdown.markdown(content)
    # Replace unwanted characters (apply same replacements as PDF)
    text = re.sub(r"</?ol>", "", text)
    text = re.sub(r"</?p>", "", text)
    text = re.sub(r"<h1>.*?</h1>", "", text)
    return text

# Load all PDFs and Markdown files
pdf_files = glob.glob("data/*.pdf")
md_files = glob.glob("data/*.md")

documents = []
for file in pdf_files:
    documents.append(extract_text_from_pdf(file))
for file in md_files:
    documents.append(extract_text_from_md(file))

### Generate Chunks

In [11]:
!pip install nltk



In [12]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the Punkt tokenizer model for sentence tokenization
nltk.download('punkt_tab')

# Define the function to split documents into smaller chunks
def split_documents(documents, max_length=256):
    chunks = []
    for doc in documents:
        sentences = sent_tokenize(doc)  # Tokenize the document into sentences
        chunk = ""
        for sentence in sentences:
            if len(chunk) + len(sentence) <= max_length:
                chunk += sentence + " "
            else:
                chunks.append(chunk.strip())
                chunk = sentence + " "
        if chunk:
            chunks.append(chunk.strip())
    return chunks

def preprocess_chunk(chunk):
    chunk = chunk.strip()                    
    chunk = chunk.replace("\n", " ")    
    return chunk

chunks = split_documents(documents)

#for i, chunk in enumerate(chunks):
#    print(f"Chunk {i + 1}: {chunk}")

chunks = [preprocess_chunk(chunk) for chunk in chunks]

[nltk_data] Downloading package punkt_tab to /home/yasiru/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Generating Questions and Answers

In [13]:
!pip install transformers



In [14]:
from transformers import pipeline

# Initialize the pipelines
question_generator = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")
qa_extractor = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")


# Process each context chunk to generate questions and extract answers
qa_pairs_per_chunk= []  # List to hold QA pairs
for context in chunks:
    # Step 1: Generate questions
    question = question_generator(f"Generate questions from: {context}")[0]["generated_text"] 
    # Step 2: Extract answers for each question
    answer = qa_extractor(question=question, context=context)
    qa_pairs_per_chunk.append({"context":context,"question": question, "answer": answer["answer"]})


for item in qa_pairs_per_chunk:                                    # Display the QA pairs for each chunk
    print(f"Q: {item['question']}")
    print(f"A: {item['answer']}")

Device set to use cpu
Device set to use cpu


Q: What are the first-generation reasoning models?
A: DeepSeek-R1-Zero and DeepSeek-R1
Q: What is the model trained via large-scale reinforcement learning?
A: DeepSeek-R1-Zero
Q: What does DeepSeek-R1-Zero naturally emerge with?
A: powerful and intriguing reasoning behaviors
Q: What does DeepSeek-R1 do?
A: incorporates multi-stage training and cold-start data before RL
Q: What are the names of the models that we open-source?
A: DeepSeek-R1-Zero, DeepSeek-R1
Q: What is the percentage of accuracy of DeepSeek-R1?
A: 42.0
Q: What is the name of the cs.CL database?
A: arXiv:2501.12948v1
Q: What is the base model of DeepSeek?
A: Reinforcement Learning on the Base Model
Q: What is the name of the training template?
A: Template
Q: What are the main topics of DeepSeek-R1-Zero?
A: Self-evolution Process
Q: What is the name of the program that is used to teach Reinforcement Learning?
A: Rejection Sampling and Supervised Fine-Tuning
Q: What is the name of the experiment?
A: DeepSeek-R1 Evaluation


In [15]:
print(len(qa_pairs_per_chunk[1]))
print(len(chunks))

3
575


In [16]:
print(len(qa_pairs_per_chunk))

575


In [17]:
print(qa_pairs_per_chunk)

[{'context': 'DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning DeepSeek-AI research@deepseek.com Abstract We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1.', 'question': 'What are the first-generation reasoning models?', 'answer': 'DeepSeek-R1-Zero and DeepSeek-R1'}, {'context': 'DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super- vised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities.', 'question': 'What is the model trained via large-scale reinforcement learning?', 'answer': 'DeepSeek-R1-Zero'}, {'context': 'Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing.', 'question': 'What does DeepSeek-R1-Zero naturally emerge with?', 'answer': 'powerful and intriguing reasoning behaviors'}, {'context': 'To addres

In [18]:
print(chunks)

['DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning DeepSeek-AI research@deepseek.com Abstract We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1.', 'DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super- vised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities.', 'Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing.', 'To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek- R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks.', 'To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepS

In [19]:
dataset = []
for i in range(len(qa_pairs_per_chunk)):
    dataset.append({ 
        "Instruction": qa_pairs_per_chunk[i]["question"],  # ✅ No [0]
        "Answer": qa_pairs_per_chunk[i]["answer"]       # ✅ No [0]
    })


In [20]:
print(len(dataset))

575


In [21]:
import json

with open("output.json", "w") as file:
    json.dump(dataset, file)


In [22]:
print(len(documents))

6


In [23]:
import torch
print(torch.__version__)


2.6.0+cu124


In [24]:
import torch
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

No GPU available. Training will run on CPU.


In [25]:
import transformers

In [27]:
import json
import random

# Ensure reproducibility
random.seed(42)

# Shuffle dataset
random.shuffle(dataset)

# Define split ratio (80% train, 20% test)
split_ratio = 0.8
split_index = int(len(dataset) * split_ratio)

# Create train and test datasets
train_dataset = dataset[:split_index]
test_dataset = dataset[split_index:]

# Save train dataset
with open("train_dataset.json", "w", encoding="utf-8") as file:
    json.dump(train_dataset, file, indent=4, ensure_ascii=False)

# Save test dataset
with open("test_dataset.json", "w", encoding="utf-8") as file:
    json.dump(test_dataset, file, indent=4, ensure_ascii=False)

print(f"✅ Train dataset saved with {len(train_dataset)} samples.")
print(f"✅ Test dataset saved with {len(test_dataset)} samples.")


✅ Train dataset saved with 460 samples.
✅ Test dataset saved with 115 samples.
