In [38]:
pip install --upgrade selenium webdriver-manager

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

def init_driver(driver_path="D:/CNN crawl/chromedriver.exe"):
    """Start a maximized Chrome session controlled by Selenium.

    Args:
        driver_path: Path to the chromedriver executable. Defaults to the
            location this notebook was written against. Pass ``None`` to
            build the ``Service`` without an explicit path and let Selenium
            resolve the driver itself.

    Returns:
        The ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when finished.
    """
    options = Options()
    options.add_argument("--start-maximized")
    # Only pin the executable when a path was actually supplied.
    service = Service(executable_path=driver_path) if driver_path else Service()
    return webdriver.Chrome(service=service, options=options)

def safe_folder_name(name, default="untitled"):
    """Turn arbitrary text into a string that is safe to use as a folder name.

    Only alphanumeric characters (Unicode letters and digits included),
    spaces, underscores and hyphens are kept; everything else is dropped.
    Surrounding whitespace is removed on both sides.

    Args:
        name: Raw text, e.g. a story or chapter title. ``None`` is treated
            as an empty string.
        default: Value returned when nothing usable is left after cleaning.
            Without it an empty result would make ``os.path.join(parent, "")``
            resolve to the parent folder itself.

    Returns:
        The cleaned name, or ``default`` if the cleaned name is empty.
    """
    # Drop characters that are not valid in folder names.
    kept = "".join(ch for ch in (name or "") if ch.isalnum() or ch in " _-")
    cleaned = kept.strip()
    return cleaned or default

def download_image(url, path):
    """Download the image at ``url`` and store it at ``path``.

    The request carries a browser-like User-Agent and a Referer header.
    Data is streamed into a temporary ``<path>.part`` file and only moved to
    ``path`` once the transfer has finished, so an interrupted download never
    leaves a truncated image under the final name.

    Args:
        url: Absolute URL of the image.
        path: Destination file path.

    Returns:
        True if the image was fully written, False otherwise. Failures are
        reported with ``print`` and never raised to the caller.
    """
    headers = {
        "Referer": "https://nettruyenzn.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    tmp_path = f"{path}.part"
    try:
        # With stream=True the connection stays open until the response is
        # closed; the context manager guarantees it is released.
        with requests.get(url, stream=True, timeout=10, headers=headers) as r:
            if r.status_code != 200:
                print(f"Không tải được ảnh {url}: {r.status_code}")
                return False
            with open(tmp_path, 'wb') as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)
        os.replace(tmp_path, path)
        return True
    except Exception as e:
        print(f"Không tải được ảnh {url}: {e}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return False

def crawl_chap(driver, chap_url, chap_folder):
    """Open one chapter page and download every image found in its reader.

    Args:
        driver: Selenium WebDriver used to load the page.
        chap_url: URL of the chapter to open.
        chap_folder: Existing directory that receives the image files.

    Images are saved as ``001.jpg``, ``002.jpg``, ... where the number is the
    1-based position of the ``<img>`` element on the page; elements without a
    ``src`` attribute are skipped but still consume their number.
    """
    driver.get(chap_url)
    time.sleep(2)  # fixed pause after navigation before querying the page
    # Chapter images usually live under the ".reading-detail" container.
    images = driver.find_elements(By.CSS_SELECTOR, ".reading-detail img")
    for position, element in enumerate(images, start=1):
        src = element.get_attribute("src")
        if not src:
            continue
        target = os.path.join(chap_folder, f"{position:03d}.jpg")
        print(f"  Tải ảnh: {src}")
        download_image(src, target)

def crawl_truyen(driver, truyen_url, truyen_folder):
    """Crawl every chapter listed on a story page.

    Args:
        driver: Selenium WebDriver used for navigation.
        truyen_url: URL of the story page that lists the chapters.
        truyen_folder: Directory that receives one sub-folder per chapter.

    Chapter links without an ``href`` are skipped. A chapter whose sanitized
    name is empty, or equal to one already used, gets a name derived from its
    position in the list so that two chapters never share a folder.
    """
    driver.get(truyen_url)
    time.sleep(2)
    # The chapter list is usually found under ".list-chapter li a".
    chap_links = driver.find_elements(By.CSS_SELECTOR, ".list-chapter li a")
    print(f"  Tổng số chap: {len(chap_links)}")
    # Read name and URL of every chapter before navigating anywhere: once the
    # driver leaves this page the link elements can no longer be queried.
    chap_info = []
    used_names = set()
    for position, chap in enumerate(chap_links, start=1):
        chap_url = chap.get_attribute("href")
        if not chap_url:
            continue  # nothing to open for this entry
        chap_name = safe_folder_name(chap.text.strip()) or f"chap_{position:03d}"
        if chap_name in used_names:
            # Sanitizing can map different titles to the same folder name.
            chap_name = f"{chap_name}_{position:03d}"
        used_names.add(chap_name)
        chap_info.append((chap_name, chap_url))
    # Visit each stored chapter in turn.
    for chap_name, chap_url in chap_info:
        chap_folder = os.path.join(truyen_folder, chap_name)
        os.makedirs(chap_folder, exist_ok=True)
        print(f"  Đang crawl chap: {chap_name}")
        crawl_chap(driver, chap_url, chap_folder)

def crawl_nettruyen(driver, base_url, max_truyen=3):
    """Crawl the first ``max_truyen`` stories listed on ``base_url``.

    Args:
        driver: Selenium WebDriver used for navigation.
        base_url: URL of the listing page.
        max_truyen: Maximum number of listed stories to crawl.

    Each story is stored under ``truyen/<sanitized story name>``. Entries
    without an ``href`` are skipped.
    """
    driver.get(base_url)
    time.sleep(2)
    # Story links are expected under "div.item h3.title a".
    truyen_links = driver.find_elements(By.CSS_SELECTOR, "div.item h3.title a")
    # Collect name and URL of every story up front. crawl_truyen() navigates
    # the same driver away from this page, after which these elements are
    # stale and reading .text / href from them raises.
    truyen_info = []
    for position, truyen in enumerate(truyen_links[:max_truyen], start=1):
        truyen_url = truyen.get_attribute("href")
        if not truyen_url:
            continue
        truyen_name = safe_folder_name(truyen.text.strip()) or f"truyen_{position:03d}"
        truyen_info.append((truyen_name, truyen_url))
    for truyen_name, truyen_url in truyen_info:
        truyen_folder = os.path.join("truyen", truyen_name)
        os.makedirs(truyen_folder, exist_ok=True)
        print(f"Đang crawl truyện: {truyen_name}")
        crawl_truyen(driver, truyen_url, truyen_folder)

if __name__ == "__main__":
    # Target story: its page URL and the folder its chapters are saved into.
    truyen_url = "https://nettruyenzn.com/truyen-tranh/nguoi-choi-moi-cap-toi-da"
    truyen_name = safe_folder_name("Người chơi mới cấp tối đa")
    truyen_folder = os.path.join("truyen", truyen_name)

    os.makedirs("truyen", exist_ok=True)
    driver = init_driver()
    try:
        os.makedirs(truyen_folder, exist_ok=True)
        crawl_truyen(driver, truyen_url, truyen_folder)
    finally:
        # Always shut the browser down, even if the crawl fails part-way.
        driver.quit()

  Tổng số chap: 20
  Đang crawl chap: Chapter 208
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/0.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/1.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/2.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/3.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/4.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/5.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/6.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/7.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/8.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/9.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da/208/10.jpg
  Tải ảnh: https://st4.aducdn.com/nettruyen/nguoi-choi-moi-cap-toi-da

In [None]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

def init_driver(driver_path="D:/CNN crawl/chromedriver.exe"):
    """Start a maximized Chrome session controlled by Selenium.

    Args:
        driver_path: Path to the chromedriver executable. Defaults to the
            location this notebook was written against. Pass ``None`` to
            build the ``Service`` without an explicit path and let Selenium
            resolve the driver itself.

    Returns:
        The ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when finished.
    """
    options = Options()
    options.add_argument("--start-maximized")
    # Only pin the executable when a path was actually supplied.
    service = Service(executable_path=driver_path) if driver_path else Service()
    return webdriver.Chrome(service=service, options=options)

def _read_job_detail(driver, link):
    """Open ``link`` in a temporary tab and return ``(description, skills_str)``.

    The tab is closed and focus is handed back to the listing tab in every
    case, including when loading the detail page raises, so the elements
    found on the listing page stay usable for the next job.
    """
    listing_handle = driver.current_window_handle
    driver.switch_to.new_window("tab")
    try:
        driver.get(link)
        time.sleep(2)

        # Job description; missing element is treated as "no description".
        try:
            description = driver.find_element(By.CSS_SELECTOR, "div.job-detail__content").text.strip()
        except Exception:
            description = ""

        # Skill tags (if any), joined into one comma-separated string.
        try:
            skills = [el.text.strip() for el in driver.find_elements(By.CSS_SELECTOR, ".job-detail__tag span")]
            skills_str = ", ".join(skills)
        except Exception:
            skills_str = ""

        return description, skills_str
    finally:
        driver.close()
        driver.switch_to.window(listing_handle)


def crawl_topcv(url, output_csv="topcv_jobs.csv"):
    """Scrape the job cards on one TopCV listing page and save them as CSV.

    For every ``div.job-item`` card the title, link, company, location and
    salary are read from the listing; the description and skill tags are read
    from the job's detail page. A job is kept only when all seven fields are
    non-empty. Jobs that raise while being processed are reported and skipped.

    Args:
        url: URL of the listing page.
        output_csv: Path of the CSV file to write. The file is only written
            when at least one complete job was collected.

    The browser is always shut down, even if the crawl fails part-way.
    """
    driver = init_driver()
    job_data = []
    try:
        driver.get(url)
        time.sleep(3)

        jobs = driver.find_elements(By.CSS_SELECTOR, "div.job-item")
        print(f"Tìm thấy {len(jobs)} việc làm trên trang này.")

        for job in jobs:
            try:
                title_elem = job.find_element(By.CSS_SELECTOR, "h3.title a")
                title = title_elem.text.strip()
                link = title_elem.get_attribute("href")
                company = job.find_element(By.CSS_SELECTOR, "div.company a").text.strip()
                location = job.find_element(By.CSS_SELECTOR, "div.location").text.strip()
                salary = job.find_element(By.CSS_SELECTOR, "div.salary").text.strip()

                # Visit the job's detail page in a separate tab.
                description, skills_str = _read_job_detail(driver, link)

                # Keep the job only when every field is present.
                if all([title, company, location, salary, link, description, skills_str]):
                    job_data.append({
                        "title": title,
                        "company": company,
                        "location": location,
                        "salary": salary,
                        "link": link,
                        "description": description,
                        "skills": skills_str
                    })
                    print(f"Đã crawl: {title} | {company} | {location} | {salary} | {skills_str}")
                else:
                    print(f"Bỏ qua job thiếu thông tin: {title} | {company} | {location} | {salary} | {skills_str}")

            except Exception as e:
                print("Bỏ qua job bị lỗi:", e)
                continue
    finally:
        driver.quit()

    # Write the CSV file only if there is data to save.
    if job_data:
        with open(output_csv, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["title", "company", "location", "salary", "link", "description", "skills"])
            writer.writeheader()
            writer.writerows(job_data)
        print(f"Đã lưu {len(job_data)} job vào {output_csv}")
    else:
        print("Không có job nào đủ thông tin để lưu vào file.")

if __name__ == "__main__":
    # Crawl the TopCV IT job listing; results go to the default CSV file.
    crawl_topcv(
        url="https://www.topcv.vn/tim-viec-lam-cong-nghe-thong-tin-cr257?type_keyword=1&sba=1&category_family=r257"
    )

In [40]:
import os
# Diagnostic: check that a regular file exists at the chromedriver path that
# this notebook passes to Selenium's Service.
print(os.path.isfile("D:/CNN crawl/chromedriver.exe"))

True


In [41]:
import requests
# Diagnostic: confirm outbound HTTP works from this kernel. The timeout keeps
# the cell from hanging indefinitely when the network is unreachable.
print(requests.get("https://www.google.com", timeout=10).status_code)

200
