In [None]:
!pip install arxiv requests tqdm beautifulsoup4

Collecting arxiv
  Downloading arxiv-2.3.0-py3-none-any.whl.metadata (5.2 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.3.0-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m81.5/81.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=39a17802ee5347a76d9a72937686f24907806bb5cccbf5238cc9710506664721
  Stored in directory: /root/.cache/pip/wheels/03/f5/1a/23761066dac1d0e8e683e5fdb27

In [None]:
#!/usr/bin/env python3
"""
download_engineering_pdfs.py

Goal: Download ~150 engineering PDFs (mixed research + lecture notes + textbooks)
from arXiv, NPTEL (best-effort), and any manual OpenStax/other URLs provided.

Notes:
- This script uses a "fast" mode (no duplicate checking); it enforces file size < 5 MB.
- Place any manual PDF URLs (OpenStax or other sources) line-separated in openstax_urls.txt.
- Output folder: data/raw_pdfs/engineering/
"""

import os
import sys
import time
import math
import shutil
import requests
import arxiv
from tqdm import tqdm
from bs4 import BeautifulSoup

# Destination folder for every downloaded PDF; created as soon as this cell runs.
OUT_DIR = "data/raw_pdfs/engineering"
os.makedirs(OUT_DIR, exist_ok=True)

# Total number of PDFs main() tries to collect across all sources.
TARGET_COUNT = 150
# Downloads larger than this are deleted after they finish (see file_too_large).
MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB

# Engineering search queries (broad, covers many subdomains).
# Each string is passed unchanged to arxiv.Search by download_from_arxiv, and its
# first 30 characters become part of the saved filename.
ENGINEERING_QUERIES = [
    "machine learning engineering",
    "control systems engineering",
    "thermodynamics engineering",
    "chemical engineering process",
    "heat transfer engineering",
    "transport phenomena engineering",
    "fluid mechanics engineering",
    "structural engineering",
    "power systems electrical engineering",
    "digital design computer engineering",
    "aerospace structures",
    "robotics engineering",
    "materials science engineering",
    "signal processing engineering",
    "embedded systems engineering",
    "process control chemical engineering",
    "industrial engineering optimization",
    "data science engineering",
    "electronics communication engineering",
    "civil engineering design",
]

# NPTEL base search page (best-effort scraping)
# NOTE(review): this constant is not referenced anywhere in the visible code;
# download_from_nptel hardcodes the same domain in its URL guesses. Confirm before removing.
NPTEL_SEARCH_BASE = "https://onlinecourses.nptel.ac.in"  # base domain used when linking to course pages

# Helper utils ---------------------------------------------------------
def safe_filename(s: str, max_len: int = 200) -> str:
    """Turn an arbitrary string into a filesystem-friendly name.

    Every character outside ``-_.() `` plus ASCII letters and digits is replaced
    by ``_``. The result is capped at ``max_len`` characters.

    When the cleaned name has to be shortened, a short trailing extension (for
    example ``.pdf``) is kept and the stem is truncated instead. Otherwise long
    arXiv titles would be saved without ``.pdf`` and later be skipped by code
    that filters on the extension.

    Args:
        s: Raw name, typically built from a query, author names and a title.
        max_len: Maximum length of the returned name (default 200).

    Returns:
        The sanitised name, at most ``max_len`` characters long.
    """
    keep = frozenset("-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
    cleaned = "".join(c if c in keep else "_" for c in s)
    if len(cleaned) <= max_len:
        return cleaned

    stem, ext = os.path.splitext(cleaned)
    # A "." inside a title can make splitext report a very long pseudo-extension;
    # only treat short suffixes as real extensions worth preserving.
    if not ext or len(ext) > 10 or len(ext) >= max_len:
        return cleaned[:max_len]
    return stem[: max_len - len(ext)] + ext

def save_response_stream(resp, path):
    """Write a streamed response body to ``path`` in 8192-byte pieces.

    ``resp`` must provide ``iter_content(chunk_size=...)``; empty pieces yielded
    by it are skipped. The target file is opened in binary write mode, so any
    existing file at ``path`` is overwritten.
    """
    with open(path, "wb") as sink:
        sink.writelines(piece for piece in resp.iter_content(chunk_size=8192) if piece)

def file_too_large(path):
    """Report whether the file at ``path`` exceeds MAX_FILE_SIZE_BYTES.

    A file whose size cannot be read (missing, unreadable, ...) is reported as
    too large, so callers treat it as unusable.
    """
    try:
        size_bytes = os.path.getsize(path)
    except OSError:
        return True
    return size_bytes > MAX_FILE_SIZE_BYTES

def remove_file(path):
    """Delete ``path`` if possible; any OSError (e.g. file not found) is ignored."""
    try:
        os.unlink(path)
    except OSError:
        return None

# ARXIV downloader ----------------------------------------------------
def _fetch_arxiv_pdf(result, path):
    """Download one arXiv result to ``path``.

    A streamed HTTP GET of ``result.pdf_url`` is tried first. If that request
    raises or answers with a non-200 status, ``result.download_pdf`` is used as
    a fallback.

    Returns:
        True if a file no larger than the size limit is left at ``path``.
        False otherwise; in that case any partial or oversized file is removed
        so a later run does not mistake it for a finished download.
    """
    fetched = False
    try:
        # The context manager releases the connection even for non-200 replies,
        # whose body is never read when stream=True.
        with requests.get(result.pdf_url, stream=True, timeout=30) as resp:
            if resp.status_code == 200:
                save_response_stream(resp, path)
                fetched = True
    except Exception:
        fetched = False

    if not fetched:
        try:
            result.download_pdf(filename=path)
        except Exception:
            remove_file(path)
            return False

    if file_too_large(path):
        remove_file(path)
        return False
    return True


def download_from_arxiv(queries, target_count, out_dir):
    """Download PDFs from arXiv for the given queries until ``target_count`` is reached.

    Args:
        queries: Iterable of search strings; each is queried in turn.
        target_count: Maximum number of new PDFs to save.
        out_dir: Directory the PDFs are written to.

    Returns:
        Number of PDFs saved by this call. Files that already exist, fail to
        download, or exceed the size limit are skipped and not counted.
    """
    count = 0
    # Search.results() is deprecated in arxiv 2.x; results are fetched via a Client.
    client = arxiv.Client()
    for q in queries:
        if count >= target_count:
            break
        # Never ask for more than 40 results per query (safety cap).
        max_results = min(40, target_count - count)
        search = arxiv.Search(query=q, max_results=max_results, sort_by=arxiv.SortCriterion.Relevance)
        for result in client.results(search):
            if count >= target_count:
                break
            try:
                authors = "_".join([a.name.split()[-1] for a in result.authors])[:80]
                filename = safe_filename(f"arxiv_{q[:30]}{authors}{result.title}.pdf")
                path = os.path.join(out_dir, filename)
                # Cheap duplicate check: an existing file with the same name is kept as-is.
                if os.path.exists(path):
                    continue
                if _fetch_arxiv_pdf(result, path):
                    count += 1
            except Exception as exc:
                # Best-effort: one malformed result must not abort the whole run.
                print(f"Skipped an arXiv result for query {q!r}: {exc}")
                continue
    return count

# NPTEL downloader (best-effort) -------------------------------------
def _fetch_nptel_pdf(pdf_url, out_path):
    """Stream one PDF to ``out_path``.

    Returns True if a file within the size limit was kept. On a non-200 reply,
    a failed transfer, or an oversized file nothing is left at ``out_path``, so
    a later run does not mistake a partial file for a finished download.
    """
    try:
        # The context manager closes the streamed response in every case.
        with requests.get(pdf_url, stream=True, timeout=30) as r:
            if r.status_code != 200:
                return False
            save_response_stream(r, out_path)
    except Exception:
        remove_file(out_path)
        return False
    if file_too_large(out_path):
        remove_file(out_path)
        return False
    return True


def download_from_nptel(max_courses, out_dir):
    """
    Best-effort NPTEL crawler.

    - Page URLs are read from 'nptel_course_urls.txt' (one per line) when that file exists.
    - Otherwise a list of guessed URLs is built from course-name seeds and institute names;
      this is a heuristic and many of the guesses are not expected to exist.
    - Every page that answers 200 is scanned for links ending in ".pdf", which are downloaded.

    Args:
        max_courses: Upper bound on the number of PDFs to save (despite the name,
            this counts downloaded files, not course pages).
        out_dir: Directory the PDFs are written to.

    Returns:
        Number of PDFs saved by this call.
    """
    downloaded = 0
    # Try fetching URLs from user-provided file first
    course_file = "nptel_course_urls.txt"
    urls = []
    if os.path.exists(course_file):
        with open(course_file, "r") as f:
            for line in f:
                u = line.strip()
                if u:
                    urls.append(u)
    else:
        # Fallback heuristic: combine common course-name seeds with institute names.
        seeds = [
            "control-systems",
            "thermodynamics",
            "fluid-mechanics",
            "process-control",
            "machine-learning",
            "signal-processing",
            "power-systems",
            "design-of-machines",
            "chemical-process-safety",
            "engineering-materials"
        ]
        branches = ["iit-kanpur", "iit-bombay", "iit-delhi", "iit-madras", "iit-kharagpur", "iit-roorkee"]
        for s in seeds:
            for b in branches:
                # Construct a guess URL (this is heuristic; many won't exist)
                urls.append(f"https://onlinecourses.nptel.ac.in/{b}/flights/{s}.htm")
                urls.append(f"https://onlinecourses.nptel.ac.in/{b}/{s}.htm")
        urls = list(dict.fromkeys(urls))  # de-duplicate while keeping order

    for u in urls:
        if downloaded >= max_courses:
            break
        try:
            resp = requests.get(u, timeout=15)
            if resp.status_code != 200:
                continue
            soup = BeautifulSoup(resp.text, "html.parser")
            # look for links that look like PDFs
            for a in soup.find_all("a", href=True):
                href = a["href"]
                # Ignore any query string / fragment when testing the extension,
                # matching how the filename below is derived.
                link_path = href.split("?", 1)[0].split("#", 1)[0]
                if not link_path.lower().endswith(".pdf"):
                    continue
                pdf_url = href if href.startswith("http") else requests.compat.urljoin(u, href)
                fname = safe_filename("nptel_" + os.path.basename(pdf_url.split("?", 1)[0].split("#", 1)[0]))
                out_path = os.path.join(out_dir, fname)
                if os.path.exists(out_path):
                    continue
                if _fetch_nptel_pdf(pdf_url, out_path):
                    downloaded += 1
                    if downloaded >= max_courses:
                        break
        except Exception:
            # Best-effort crawl: a page that cannot be fetched or parsed is skipped.
            continue
    return downloaded

# OPENSTAX / manual URLs loader ---------------------------------------
def _fetch_manual_pdf(url, out_path):
    """Stream ``url`` to ``out_path``.

    Returns True if a file within the size limit was kept. On a non-200 reply,
    a failed transfer, or an oversized file nothing is left at ``out_path``.
    """
    try:
        # The context manager closes the streamed response in every case.
        with requests.get(url, stream=True, timeout=30) as r:
            if r.status_code != 200:
                return False
            save_response_stream(r, out_path)
    except Exception:
        # Remove a partially written file so a re-run retries this URL.
        remove_file(out_path)
        return False
    if file_too_large(out_path):
        remove_file(out_path)
        return False
    return True


def download_manual_urls(file_with_urls, out_dir, max_count):
    """
    Download PDFs listed in a newline-separated URL file (e.g. OpenStax chapter
    or textbook links) until ``max_count`` files have been saved.

    Args:
        file_with_urls: Path of the text file holding one URL per line; blank
            lines are ignored. If the file does not exist, nothing is done.
        out_dir: Directory the PDFs are written to.
        max_count: Maximum number of files to save.

    Returns:
        Number of PDFs saved by this call. URLs whose target file already exists,
        that fail to download, or that exceed the size limit are not counted.
    """
    if not os.path.exists(file_with_urls):
        return 0

    downloaded = 0
    with open(file_with_urls, "r") as f:
        for line in f:
            if downloaded >= max_count:
                break
            url = line.strip()
            if not url:
                continue
            try:
                # Derive the name from the URL path only (query string dropped).
                fname = safe_filename("manual_" + os.path.basename(url.split("?", 1)[0]))
                out_path = os.path.join(out_dir, fname)
                if os.path.exists(out_path):
                    continue
                if _fetch_manual_pdf(url, out_path):
                    downloaded += 1
            except Exception:
                # Best-effort: a bad line must not stop the remaining downloads.
                continue
    return downloaded

# MAIN DRIVER ---------------------------------------------------------
def main():
    """Run the arXiv, NPTEL and manual-URL stages in order and print a summary.

    A stage only runs while the running total is still below TARGET_COUNT.
    The NPTEL stage is additionally capped at 40 files. Returns None.
    """
    print("=== Engineering PDF downloader (fast mode, <5MB each) ===")
    total_downloaded = 0

    # Stage 1: arXiv supplies the bulk of the research PDFs.
    remaining = TARGET_COUNT - total_downloaded
    if remaining > 0:
        print(f"\n-> Downloading from arXiv (target: {remaining}) ...")
        downloaded = download_from_arxiv(ENGINEERING_QUERIES, remaining, OUT_DIR)
        total_downloaded = total_downloaded + downloaded
        print(f"arXiv downloaded: {downloaded} (total {total_downloaded})")

    # Stage 2: NPTEL lecture notes, best-effort and limited to 40 files.
    remaining = TARGET_COUNT - total_downloaded
    if remaining > 0:
        nptel_target = min(40, remaining)
        print(f"\n-> Attempting to fetch lecture PDFs from NPTEL (target: {nptel_target}) ...")
        nptel_downloaded = download_from_nptel(nptel_target, OUT_DIR)
        total_downloaded = total_downloaded + nptel_downloaded
        print(f"NPTEL downloaded: {nptel_downloaded} (total {total_downloaded})")

    # Stage 3: user-supplied URLs listed in openstax_urls.txt.
    remaining = TARGET_COUNT - total_downloaded
    if remaining > 0:
        print(f"\n-> Downloading manual URLs from openstax_urls.txt (target: {remaining}) ...")
        manual_downloaded = download_manual_urls("openstax_urls.txt", OUT_DIR, remaining)
        total_downloaded = total_downloaded + manual_downloaded
        print(f"Manual urls downloaded: {manual_downloaded} (total {total_downloaded})")

    summary_lines = (
        "\n=== Summary ===",
        f"Total PDFs downloaded: {total_downloaded}/{TARGET_COUNT}",
        f"Saved to folder: {OUT_DIR}",
        "Note: files > 5MB are removed automatically. If you want more PDFs, you can re-run the script or add manual URLs.",
        "Done.",
    )
    for summary_line in summary_lines:
        print(summary_line)


In [None]:
main()

=== Engineering PDF downloader (fast mode, <5MB each) ===

-> Downloading from arXiv (target: 150) ...


  for result in search.results():


arXiv downloaded: 150 (total 150)

=== Summary ===
Total PDFs downloaded: 150/150
Saved to folder: data/raw_pdfs/engineering
Note: files > 5MB are removed automatically. If you want more PDFs, you can re-run the script or add manual URLs.
Done.


In [None]:
!pip install PyMuPDF nltk tqdm

Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.1/24.1 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.5


In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import fitz  # PyMuPDF
import os
from tqdm import tqdm
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the 'punkt' tokenizer data; presumably required by sent_tokenize below
# (an earlier cell also downloads 'punkt_tab').
nltk.download('punkt')

# Absolute input/output locations for the chunking step.
# NOTE(review): the downloader writes to the relative path "data/raw_pdfs/engineering",
# so PDF_DIR only matches it when the working directory is /content — confirm.
PDF_DIR = "/content/data/raw_pdfs/engineering"
CHUNK_DIR = "/content/data/chunks"
os.makedirs(CHUNK_DIR, exist_ok=True)

def extract_text_from_pdf(pdf_path):
    """Return the plain text of all pages of a PDF joined together.

    Any error while opening or reading the file is printed and an empty
    string is returned instead.
    """
    try:
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text("text") for page in doc)
    except Exception as e:
        print(f"‚ö† Error reading {pdf_path}: {e}")
        return ""

def create_chunks(text, max_words=180, tokenizer=None):
    """Group the sentences of ``text`` into chunks of at most ``max_words`` words.

    Sentences are never split. A single sentence longer than ``max_words``
    becomes a chunk of its own and is the only case in which a chunk exceeds
    the limit. Empty chunks are never produced: previously a first sentence
    that was already over the limit caused an empty string to be emitted.

    Args:
        text: The text to split.
        max_words: Soft upper bound on whitespace-separated words per chunk.
        tokenizer: Optional callable mapping ``text`` to a list of sentences.
            Defaults to ``sent_tokenize``.

    Returns:
        List of non-empty chunk strings, sentences joined by single spaces.
    """
    split_sentences = tokenizer if tokenizer is not None else sent_tokenize

    chunks = []
    current, count = [], 0
    for sent in split_sentences(text):
        n_words = len(sent.split())
        # Flush only when there is something to flush; an oversized sentence
        # arriving on an empty buffer simply starts the next chunk.
        if current and count + n_words > max_words:
            chunks.append(" ".join(current).strip())
            current, count = [], 0
        current.append(sent)
        count += n_words
    if current:
        chunks.append(" ".join(current).strip())

    return [chunk for chunk in chunks if chunk]

# ---- Process all PDFs ----
# ---- Process a sample of the downloaded PDFs ----
# Only every PDF_SAMPLE_STRIDE-th PDF is chunked; set the stride to 1 to process all.
PDF_SAMPLE_STRIDE = 20
# os.listdir() returns entries in arbitrary order, so sort first to make the
# sampled subset reproducible between runs.
pdf_files = sorted(name for name in os.listdir(PDF_DIR) if name.endswith(".pdf"))

chunk_count = 0
for pdf_file in tqdm(pdf_files[::PDF_SAMPLE_STRIDE], desc="üìö Processing PDFs"):
    pdf_path = os.path.join(PDF_DIR, pdf_file)
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        continue
    chunks = create_chunks(text)

    # Save chunks as text files named <pdf stem>chunk<i>.txt
    pdf_stem = os.path.splitext(pdf_file)[0]
    for i, chunk in enumerate(chunks):
        if not chunk:
            # An empty chunk would only waste a downstream API request.
            continue
        chunk_filename = f"{pdf_stem}chunk{i}.txt"
        # Explicit UTF-8: PDF text regularly contains non-ASCII characters.
        with open(os.path.join(CHUNK_DIR, chunk_filename), "w", encoding="utf-8") as f:
            f.write(chunk)
        chunk_count += 1

print(f"\n‚úÖ Done! Total chunks created: {chunk_count}")
print(f"üìÇ Saved inside: {CHUNK_DIR}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
üìö Processing PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [00:01<00:00,  5.47it/s]


‚úÖ Done! Total chunks created: 693
üìÇ Saved inside: /content/data/chunks





In [None]:
import os
import google.genai as genai  # <-- Keep this line

# --- Verification Step ---
# Holds the current value of GEMINI_API_KEY (or None), not just a yes/no flag.
api_key_status = os.getenv("GEMINI_API_KEY")

if not api_key_status:
    print("‚ùå GEMINI_API_KEY is NOT loaded in this session.")
    # Ask for the key without echoing it, then expose it through the environment.
    from getpass import getpass
    api_key = getpass("Enter your GEMINI_API_KEY: ")
    os.environ["GEMINI_API_KEY"] = api_key
    print("üîë GEMINI_API_KEY has been set for this session.")
else:
    print("‚úÖ GEMINI_API_KEY is loaded in this session.")

‚ùå GEMINI_API_KEY is NOT loaded in this session.
Enter your GEMINI_API_KEY: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
üîë GEMINI_API_KEY has been set for this session.


In [None]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
# Directory that will hold the generated dataset; created if missing.
# NOTE: a later cell rebinds DATASET_PATH to the train.jsonl file inside this directory.
DATASET_PATH = "/content/data/dataset/"
os.makedirs(DATASET_PATH, exist_ok=True)

In [None]:
import os
import jsonlines
from tqdm import tqdm
import google.generativeai as genai

# Folder of chunk .txt files to read from.
CHUNK_DIR = "/content/data/chunks"
# Output JSONL file; it is opened in write mode below, so each run overwrites it.
DATASET_PATH = "/content/data/dataset/train.jsonl"
# Counts JSONL records, and every chunk contributes two (summary + Q&A),
# so at most 250 chunks are processed.
MAX_SAMPLES = 500  # stops after creating 500 samples

def generate_summary(chunk):
    """Request a 10-sentence academic-style summary of ``chunk`` from gemini-2.5-flash.

    Returns the response text with leading and trailing whitespace removed.
    Errors raised by the API call propagate to the caller.
    """
    prompt = f"""
    Summarize this in 10 academic-style sentences:
    {chunk}
    """
    model = genai.GenerativeModel("gemini-2.5-flash")
    reply = model.generate_content(prompt)
    return reply.text.strip()

def generate_questions(chunk):
    """Request four exam-style question/answer pairs about ``chunk`` from gemini-2.5-flash.

    The prompt asks for a strict Q1/A1 ... Q4/A4 layout. Returns the response
    text with leading and trailing whitespace removed; it is not parsed or
    validated here. Errors raised by the API call propagate to the caller.
    """
    prompt = f"""
    Create 4 exam-style questions and answers from this:
    {chunk}

    Use this format strictly:
    Q1:
    A1:
    Q2:
    A2:
    Q3:
    A3:
    Q4:
    A4:
    """
    model = genai.GenerativeModel("gemini-2.5-flash")
    reply = model.generate_content(prompt)
    return reply.text.strip()

import time  # used only for the quota back-off in this cell

# One pause of this length is enough for a per-minute rate limit to reset.
# If the retry fails too, the quota is treated as exhausted (e.g. a daily limit).
QUOTA_RETRY_WAIT_S = 60


def _is_quota_error(exc):
    """Return True when ``exc`` looks like an HTTP 429 / quota-exceeded API error."""
    return getattr(exc, "code", None) == 429 or str(exc).lstrip().startswith("429")


def _generate_pair(chunk):
    """Return ``(summary, qa)`` for one chunk; API errors propagate to the caller."""
    return generate_summary(chunk), generate_questions(chunk)


output_count = 0
# Sorted so that the processing order, and therefore the dataset, is reproducible.
chunk_files = sorted(name for name in os.listdir(CHUNK_DIR) if name.endswith(".txt"))
with jsonlines.open(DATASET_PATH, mode='w') as writer:
    for chunk_file in tqdm(chunk_files, desc="Generating Dataset"):
        if output_count >= MAX_SAMPLES:
            break

        with open(os.path.join(CHUNK_DIR, chunk_file), 'r', encoding="utf-8", errors="replace") as f:
            chunk = f.read()
        if not chunk.strip():
            # Nothing to summarise; do not spend API quota on it.
            continue

        try:
            try:
                summary, qa = _generate_pair(chunk)
            except Exception as e:
                if not _is_quota_error(e):
                    raise
                # Rate limited: wait once and retry the same chunk.
                time.sleep(QUOTA_RETRY_WAIT_S)
                summary, qa = _generate_pair(chunk)
        except Exception as e:
            if _is_quota_error(e):
                # Still over quota after waiting: further requests would fail
                # as well, so stop instead of failing on every remaining chunk.
                print(f"Quota exhausted at {chunk_file}; stopping early: {str(e)[:200]}")
                break
            print(f"‚ö† Skipped {chunk_file}: {e}")
            continue

        writer.write({
            "input": f"<TASK_SUMMARY>\nContext:\n{chunk}",
            "output": summary
        })
        writer.write({
            "input": f"<TASK_QUESTIONS>\nContext:\n{chunk}",
            "output": qa
        })

        output_count += 2  # 1 summary + 1 Q&A

print(f"\n‚úÖ Dataset created: {DATASET_PATH}")
print(f"üì¶ Total dataset entries: {output_count}")

Generating Dataset:  13%|‚ñà‚ñé        | 93/693 [36:55<3:21:54, 20.19s/it]

‚ö† Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk137.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 33.524303552s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Generating Dataset:  14%|‚ñà‚ñé        | 94/693 [36:56<2:21:47, 14.20s/it]

‚ö† Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk258.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 33.309157942s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Generating Dataset:  14%|‚ñà‚ñé        | 95/693 [37:03<2:01:21, 12.18s/it]

‚ö† Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk27.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 25.852520062s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, re

Generating Dataset:  14%|‚ñà‚ñç        | 96/693 [37:13<1:54:44, 11.53s/it]

‚ö† Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk195.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 15.823731689s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Generating Dataset:  14%|‚ñà‚ñç        | 98/693 [37:31<1:33:25,  9.42s/it]

‚ö† Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk68.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 57.951341878s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, re

Generating Dataset:  14%|‚ñà‚ñç        | 99/693 [37:31<1:05:54,  6.66s/it]

‚ö† Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk197.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 57.745622773s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Generating Dataset:  14%|‚ñà‚ñç        | 100/693 [37:31<46:42,  4.73s/it] 

‚ö† Skipped arxiv_machine learning engineeringShafiq_Mashkoor_Mayr-Dorn_EgyedMachine Learning for Software Engineering_ A Systematic Mappingchunk88.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 57.531911014s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Generating Dataset:  15%|‚ñà‚ñç        | 101/693 [37:32<33:18,  3.38s/it]

‚ö† Skipped arxiv_control systems engineeringDasControl System Design Using Finite Laplace Transform Theorychunk23.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 57.315559759s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 57
}
]


Generating Dataset:  15%|‚ñà‚ñç        | 102/693 [37:32<23:57,  2.43s/it]

‚ö† Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk187.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 57.084445946s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Generating Dataset:  15%|‚ñà‚ñç        | 103/693 [37:41<42:57,  4.37s/it]

‚ö† Skipped arxiv_machine learning engineeringShafiq_Mashkoor_Mayr-Dorn_EgyedMachine Learning for Software Engineering_ A Systematic Mappingchunk17.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 48.19166123s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, re

Generating Dataset:  15%|‚ñà‚ñå        | 104/693 [37:41<30:41,  3.13s/it]

‚ö† Skipped arxiv_machine learning engineeringShafiq_Mashkoor_Mayr-Dorn_EgyedMachine Learning for Software Engineering_ A Systematic Mappingchunk66.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 47.974944385s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Generating Dataset:  15%|‚ñà‚ñå        | 105/693 [37:41<22:04,  2.25s/it]

‚ö† Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk71.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 47.740046714s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, re

Generating Dataset:  15%|‚ñà‚ñå        | 106/693 [37:41<16:02,  1.64s/it]

‚ö† Skipped arxiv_chemical engineering processWang_WuTowards Foundation Model for Chemical Reactor Modeling_ Meta-Learning with Physics-Informed Adaptationchunk64.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 47.534461891s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rat

Generating Dataset:  15%|█▌        | 107/693 [37:42<11:50,  1.21s/it]

⚠ Skipped arxiv_machine learning engineeringThebelt_Wiebe_Kronqvist_Tsay_MisenerMaximizing information from chemical engineering data sets_ Applications to machine learningchunk58.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 47.321167113s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/

Generating Dataset:  16%|█▌        | 108/693 [37:42<08:56,  1.09it/s]

⚠ Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk126.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 47.103616907s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Generating Dataset:  16%|█▌        | 109/693 [37:42<06:53,  1.41it/s]

⚠ Skipped arxiv_machine learning engineeringShafiq_Mashkoor_Mayr-Dorn_EgyedMachine Learning for Software Engineering_ A Systematic Mappingchunk60.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 46.881390048s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Generating Dataset:  16%|█▌        | 110/693 [37:42<05:27,  1.78it/s]

⚠ Skipped arxiv_machine learning engineeringShafiq_Mashkoor_Mayr-Dorn_EgyedMachine Learning for Software Engineering_ A Systematic Mappingchunk5.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 46.657582781s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, re

Generating Dataset:  16%|█▌        | 111/693 [37:43<04:28,  2.17it/s]

⚠ Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk88.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 46.435797954s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, re

Generating Dataset:  16%|█▌        | 112/693 [37:43<03:46,  2.57it/s]

⚠ Skipped arxiv_thermodynamics engineeringElouardThermodynamics of Quantum Open Systems_ Applications in Quantum Optics and Optomechanicschunk40.txt: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250
Please retry in 46.204073277s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, re

Generating Dataset:  16%|█▌        | 112/693 [37:43<3:15:41, 20.21s/it]


KeyboardInterrupt: 

In [None]:
# Shell escape: recursively force-delete /content/data/chunks; `-f` means no
# error is raised if the path does not exist. This is irreversible.
# NOTE(review): presumably clears stale chunk files before re-chunking — confirm
# nothing downstream still needs them; the absolute /content path looks
# Colab-specific — verify when running elsewhere.
!rm -rf /content/data/chunks