# Scrapping

In [11]:
pip install requests googlesearch-python

Collecting googlesearch-python
  Downloading googlesearch_python-1.2.5-py3-none-any.whl.metadata (2.9 kB)
Downloading googlesearch_python-1.2.5-py3-none-any.whl (4.8 kB)
Installing collected packages: googlesearch-python
Successfully installed googlesearch-python-1.2.5
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: D:\Python\python.exe -m pip install --upgrade pip


In [5]:
import os
import time
import random
from googlesearch import search
import requests
from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadError

# Directory to save the PDFs
base_save_directory = "../data"
if not os.path.exists(base_save_directory):
    os.makedirs(base_save_directory)

# List of queries to collect only SQL course and textbook PDFs explaining SQL concepts
queries = [
    "filetype:pdf Cours SQL lang:fr",
    "filetype:pdf Livre SQL lang:fr",
    "filetype:pdf Concepts de bases de données SQL lang:fr",
    "filetype:pdf Apprendre les concepts SQL lang:fr",
    "filetype:pdf Cours SQL concepts avancés lang:fr",
    "filetype:pdf Cours bases de données relationnelles SQL lang:fr",
    "filetype:pdf Introduction aux concepts SQL lang:fr",
    "filetype:pdf Livre sur les bases de données SQL lang:fr",
    "filetype:pdf Comprendre SQL concepts lang:fr",
    "filetype:pdf Cours SQL pour débutants lang:fr",
    "filetype:pdf Concepts SQL pour les bases de données lang:fr",
    "filetype:pdf Apprendre les jointures SQL lang:fr",
    "filetype:pdf Livre sur SQL et bases de données lang:fr",
    "filetype:pdf Cours SQL sous-requêtes lang:fr",
    "filetype:pdf Concepts avancés des bases de données SQL lang:fr"
]

# Directory to save the PDFs
if not os.path.exists(base_save_directory):
    os.makedirs(base_save_directory)

# Function to download a PDF file from a URL
def download_pdf(pdf_url, save_directory):
    try:
        response = requests.get(pdf_url, stream=True)
        response.raise_for_status()
        pdf_name = pdf_url.split("/")[-1]
        file_path = os.path.join(save_directory, pdf_name)

        # Check for duplicates
        if os.path.exists(file_path):
            print(f"{pdf_name} already exists. Skipping download.")
            return file_path, False

        with open(file_path, "wb") as pdf_file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    pdf_file.write(chunk)
        print(f"Downloaded: {pdf_name}")
        return file_path, True
    except Exception as e:
        print(f"Failed to download {pdf_url}: {e}")
        return None, False

# Function to extract PDF links from a Google search result page
def extract_pdf_links(query):
    pdf_links = []
    try:
        for url in search(query, num_results=100, lang="fr"):
            if url.endswith(".pdf"):
                pdf_links.append(url)
    except Exception as e:
        print(f"Error extracting PDFs for query {query}: {e}")
    return pdf_links

# Function to filter PDFs based on relevance to SQL courses (exclude tutorials, guides, etc.)
def filter_pdf_links(pdf_links):
    filtered_links = []
    for link in pdf_links:
        if 'cours' in link or 'livre' in link or 'concepts' in link:
            filtered_links.append(link)
    return filtered_links

# Function to check if a PDF is corrupted
def is_pdf_corrupted(pdf_path):
    try:
        with open(pdf_path, "rb") as file:
            reader = PdfReader(file)
            # Try to read the first page
            reader.pages[0]
        return False  # If no error occurs, the PDF is valid
    except PdfReadError as e:
        print(f"Corrupted PDF detected: {pdf_path}, Error: {e}")
        return True
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return True

# Main function to collect SQL PDFs and organize them
def collect_sql_pdfs():
    print(f"\n*****************Searching for SQL PDFs...******************")
    for query in queries:
        print(f"Searching for: {query}")
        pdf_links = extract_pdf_links(query)
        print(f"Found {len(pdf_links)} PDFs for query: {query}")

        filtered_links = filter_pdf_links(pdf_links)
        print(f"Filtered to {len(filtered_links)} relevant PDFs for query: {query}")

        for pdf_url in filtered_links:
            file_path, downloaded = download_pdf(pdf_url, base_save_directory)
            if downloaded and file_path:
                if is_pdf_corrupted(file_path):  # Check if the file is corrupted
                    print(f"Deleting corrupted PDF: {file_path}")
                    os.remove(file_path)  # Remove the corrupted PDF
            time.sleep(random.uniform(1, 3))  # Random sleep to avoid overwhelming servers

if __name__ == "__main__":
    collect_sql_pdfs()


*****************Searching for SQL PDFs...******************
Searching for: filetype:pdf Cours SQL lang:fr
Found 92 PDFs for query: filetype:pdf Cours SQL lang:fr
Filtered to 29 relevant PDFs for query: filetype:pdf Cours SQL lang:fr
Downloaded: cours-sql-sh-.pdf
Downloaded: slides-michel-slides.pdf
Downloaded: c2.pdf
Downloaded: hasssclic521.pdf
Downloaded: sgbd-b7.pdf
Downloaded: 3i009cours1rappels2017.pdf
Failed to download https://perso.liris.cnrs.fr/veronique.deslandres/lib/exe/fetch.php%3Fmedia%3Dstart:ens:java:cours6.1-jdbc-2021.pdf: [Errno 22] Invalid argument: '../data\\fetch.php%3Fmedia%3Dstart:ens:java:cours6.1-jdbc-2021.pdf'
Downloaded: JDBC.pdf
Downloaded: coursphp3.pdf
Failed to download https://lig-membres.imag.fr/genoud/teaching/PL2AI/cours/pdf/PL2/PL2_16JDBC.pdf: 404 Client Error: Not Found for url: https://lig-membres.imag.fr/genoud/teaching/PL2AI/cours/pdf/PL2/PL2_16JDBC.pdf/
Downloaded: cours.pdf
Downloaded: cours8.pdf
Downloaded: Cours-JPA-v1.3.pdf
Downloaded: NFP

In [4]:
import requests
from bs4 import BeautifulSoup
import os
from PyPDF2 import PdfReader

def google_search(query, num_results=100):
    """Perform a Google search and return result links."""
    headers = {"User-Agent": "Mozilla/5.0"}
    search_url = f"https://www.google.com/search?q={query}&num={num_results}"
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    links = [a['href'] for a in soup.select('a') if 'url?q=' in a['href']]
    return [link.split('&')[0].replace('/url?q=', '') for link in links]

def download_pdf(url, download_dir="../pdfsv"):
    """Download PDF from URL."""
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    try:
        response = requests.get(url, stream=True)
        if response.headers.get('content-type') == 'application/pdf':
            filename = os.path.join(download_dir, url.split("/")[-1])
            with open(filename, "wb") as f:
                f.write(response.content)
            return filename
    except Exception as e:
        print(f"Failed to download {url}: {e}")
    return None

def validate_pdf(pdf_path):
    """Validate content of PDF to check if it's SQL-focused."""
    try:
        reader = PdfReader(pdf_path)
        text = " ".join(page.extract_text() for page in reader.pages[:5])
        return "SQL" in text and "SGBD" in text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return False

def main():
    query = "filetype:pdf cours SQL en français -webdev -php -oracle -jdbc -java"
    links = google_search(query)
    for link in links:
        if link.endswith(".pdf"):
            pdf_path = download_pdf(link)
            if pdf_path and validate_pdf(pdf_path):
                print(f"Validated and saved: {pdf_path}")
            else:
                print(f"Skipped: {link}")

if __name__ == "__main__":
    main()

Validated and saved: ../pdfsv\Fiche%2520-%2520SQL.pdf
Skipped: https://www-igm.univ-mlv.fr/~cherrier/download/BD1/cours/SQL-syntaxe.pdf
Skipped: https://www.i3s.unice.fr/~edemaria/cours/c3.pdf
Skipped: https://repository.root-me.org/Programmation/SQL/FR%2520-%2520Les%2520bases%2520fondamentales%2520du%2520langage%2520Transact%2520SQL.pdf
Skipped: http://sql.bdpedia.fr/files/slvues.pdf
Skipped: https://ressources.unisciel.fr/sillages/informatique/bdd/tpsql_corrige.pdf
Skipped: http://pascal.delahaye1.free.fr/cpge/informatique/cours/cours11.pdf
Skipped: https://www.montefiore.uliege.be/services/verif/cours/bd/repet2014/tp6.pdf
Skipped: http://alexandre.patin.free.fr/fichiers/doc-sql.pdf
Validated and saved: ../pdfsv\cours-BD2012-cours4Et5.pdf
Skipped: http://sql.bdpedia.fr/files/sltriggers.pdf
Skipped: http://www.rzo.free.fr/docs_BD2A/memo_sql.pdf
Skipped: http://pascal.delahaye1.free.fr/cpge/informatique/td/td13.pdf
Validated and saved: ../pdfsv\Chap_10_L_acces_aux_donnees.pdf
Skipped: 

In [8]:
import requests
from bs4 import BeautifulSoup
import os
from PyPDF2 import PdfReader

download_dir = "../pdfs"

def google_search(query, num_results=100):
    """Perform a Google search and return result links."""
    headers = {"User-Agent": "Mozilla/5.0"}
    search_url = f"https://www.google.com/search?q={query}&num={num_results}"
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    links = [a['href'] for a in soup.select('a') if 'url?q=' in a['href']]
    return [link.split('&')[0].replace('/url?q=', '') for link in links]

def download_pdf(url):
    """Download PDF from URL."""
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    try:
        response = requests.get(url, stream=True)
        if response.headers.get('content-type') == 'application/pdf':
            filename = os.path.join(download_dir, url.split("/")[-1])
            with open(filename, "wb") as f:
                f.write(response.content)
            return filename
    except Exception as e:
        print(f"Failed to download {url}: {e}")
    return None

def validate_pdf(pdf_path):
    """Validate content of PDF to check if it's SQL-focused."""
    try:
        reader = PdfReader(pdf_path)
        text = " ".join(page.extract_text() for page in reader.pages[:5])
        return "SQL" in text and "SGBD" in text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return False

def main():
    # Queries targeting pure SQL courses in French
    queries = [
        "filetype:pdf cours SQL en français -webdev -php -java -html",
        "filetype:pdf introduction au langage SQL -webdev -php -java",
        "filetype:pdf apprentissage SQL bases de données en français -webdev -php",
        "filetype:pdf cours SQL norme SQL2 ou SQL3 en français -java -php",
        "filetype:pdf tutoriel SQL SGBD relationnel en français -php -java",
    ]
    for query in queries:
        print(f"Searching for: {query}")
        links = google_search(query)
        for link in links:
            if link.endswith(".pdf"):
                pdf_path = download_pdf(link)
                if pdf_path and validate_pdf(pdf_path):
                    print(f"Validated and saved: {pdf_path}")
                else:
                    print(f"Skipped: {link}")

if __name__ == "__main__":
    main()

Searching for: filetype:pdf cours SQL en français -webdev -php -java -html
Searching for: filetype:pdf introduction au langage SQL -webdev -php -java
Searching for: filetype:pdf apprentissage SQL bases de données en français -webdev -php
Searching for: filetype:pdf cours SQL norme SQL2 ou SQL3 en français -java -php
Searching for: filetype:pdf tutoriel SQL SGBD relationnel en français -php -java
