In [None]:
# Mount Google Drive at /content/drive for this Colab session.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%bash
# Refresh the apt package index quietly, then install the Chromium browser
# package and the Python packages used for browser automation.
# NOTE(review): later cells point Selenium at /usr/bin/google-chrome-stable,
# not Chromium — confirm whether this cell is still needed.
apt-get -qq update
apt-get -qq --yes install chromium-browser
pip install --quiet selenium webdriver-manager


In [None]:
# Run this in its own cell: asks the shell for the installed Chromium version.
!chromium-browser --version

In [None]:
%%bash
# Install Google Chrome (stable channel) from Google's apt repository, plus
# the Python packages the scraping cells import.
apt-get update -qq
apt-get install -y -qq wget gnupg

# Store Google's public signing key in a dedicated keyring rather than the
# global trusted set: `apt-key` is deprecated and missing from newer apt
# releases. --yes lets the cell be re-run without gpg refusing to overwrite.
wget -q -O - https://dl.google.com/linux/linux_signing_key.pub \
    | gpg --dearmor --yes -o /usr/share/keyrings/google-chrome.gpg

# Register the stable-channel repository, bound to that keyring and fetched
# over HTTPS instead of plain HTTP.
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] https://dl.google.com/linux/chrome/deb/ stable main" \
    > /etc/apt/sources.list.d/google-chrome.list

apt-get update -qq
apt-get install -y -qq google-chrome-stable
pip install -q selenium webdriver-manager bs4


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def init_driver():
    """Create a headless Chrome WebDriver that uses the system Chrome binary.

    Returns:
        A ``webdriver.Chrome`` instance configured with the flags listed in
        ``chrome_flags`` and a chromedriver located via webdriver-manager.
    """
    chrome_flags = (
        "--headless",                     # keep as-is when no GUI is needed
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-software-rasterizer",
        "--window-size=1920,1080",
        "--remote-debugging-port=9222",
    )

    options = Options()
    options.binary_location = "/usr/bin/google-chrome-stable"  # the Chrome installed above
    for flag in chrome_flags:
        options.add_argument(flag)

    # webdriver-manager fetches a chromedriver for the installed Chrome and
    # hands back the path to it.
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

# Smoke test: start a driver, load Google, and print the page title.
if __name__ == "__main__":
    driver = init_driver()
    try:
        driver.get("https://www.google.com")
        print("TITLE:", driver.title)  # "Google" should be printed if everything works
    finally:
        # Always shut the browser down, even when the page load fails.
        driver.quit()


In [None]:
import time
import re
import os
import uuid
import urllib.parse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# ── Settings ──
# Desktop-Chrome user-agent string, assembled from its three segments.
USER_AGENT = "".join([
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) ",
    "Chrome/114.0.0.0 Safari/537.36",
])
# Site root that search and posting URLs are resolved against.
BASE_URL = "https://www.jobkorea.co.kr"

def init_driver():
    """Start a headless Chrome WebDriver with its own throwaway profile.

    Each call creates a new ``/tmp/selenium_profile_<hex>`` directory and
    passes it as ``--user-data-dir`` so separate driver instances do not share
    a profile. The module-level ``USER_AGENT`` is applied to the browser.

    Returns:
        A ``webdriver.Chrome`` instance using the system Chrome binary and a
        chromedriver located via webdriver-manager.
    """
    # Unique profile path per driver instance.
    # NOTE: this directory is not removed by this function.
    profile_dir = f"/tmp/selenium_profile_{uuid.uuid4().hex}"
    os.makedirs(profile_dir, exist_ok=True)

    options = Options()
    options.binary_location = "/usr/bin/google-chrome-stable"  # already-installed Chrome
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-software-rasterizer",
        "--disable-setuid-sandbox",
        "--remote-debugging-port=9222",
        # A different user-data-dir on every call.
        f"--user-data-dir={profile_dir}",
    ):
        options.add_argument(flag)

    # USER_AGENT is declared in this cell's settings but was never handed to
    # Chrome; without it the browser sends its default headless user agent.
    options.add_argument(f"--user-agent={USER_AGENT}")

    # webdriver-manager installs a matching chromedriver and returns its path.
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

def get_job_links(keyword, page_num, driver):
    """Collect (title, url) pairs for postings found by a JobKorea keyword search.

    Args:
        keyword: Search text; URL-quoted before being placed in the query string.
        page_num: Number of result pages to visit, starting at page 1.
        driver: Selenium WebDriver used to load each result page.

    Returns:
        A list of ``(title, absolute_url)`` tuples in discovery order, at most
        one per distinct posting URL. If a result page shows no posting links
        within 10 seconds, paging stops and the links gathered so far are
        returned.
    """
    from selenium.common.exceptions import TimeoutException

    anchor_css = "a[href*='/Recruit/GI_Read/']"
    links, seen = [], set()
    for n in range(1, page_num + 1):
        search_url = (
            f"{BASE_URL}/Search/?"
            f"stext={urllib.parse.quote(keyword)}&"
            f"tabType=recruit&Page_No={n}"
        )
        driver.get(search_url)
        # Wait for the search results to load.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, anchor_css))
            )
        except TimeoutException:
            # No posting links on this page (e.g. past the last result page).
            # Keep what has been collected instead of losing it to the exception.
            print(f"[페이지 {n}] 검색 결과 로딩 시간 초과 — 수집 중단")
            break
        soup = BeautifulSoup(driver.page_source, "html.parser")

        for a in soup.select(anchor_css):
            full = urllib.parse.urljoin(BASE_URL, a["href"])
            if full in seen:
                continue

            span = a.find("span")
            title = span.get_text(strip=True) if span else ""
            if len(title) < 5:
                # Not a title link. Do NOT mark the URL as seen here: another
                # anchor with the same href may carry the real title.
                continue

            seen.add(full)
            links.append((title, full))

        time.sleep(1)  # brief pause between requests
    return links

def parse_job_detail(url, driver):
    """Load one posting page in the browser and pull its fields out of the HTML.

    If the page source contains the CAPTCHA prompt text, waits up to 300
    seconds for that text to leave the ``body`` element. Then waits up to 10
    seconds for a ``span`` whose class contains ``coName``. Neither wait is
    caught here, so a wait that fails propagates to the caller.

    Args:
        url: Address of the posting page.
        driver: Selenium WebDriver used to load the page.

    Returns:
        A dict with keys ``company_name``, ``job_title``, ``simple_qual``,
        ``work`` (all strings, empty when the element is not found) and
        ``qual_details`` (dict mapping each ``dt`` text to its ``dd`` text).
    """
    captcha_text = "보안문자를 입력해 주시기 바랍니다"

    driver.get(url)
    # When a CAPTCHA page comes back, wait until it has been solved by hand.
    if captcha_text in driver.page_source:
        print("[CAPTCHA] 수동 입력 대기")
        captcha_shown = EC.text_to_be_present_in_element((By.TAG_NAME, "body"), captcha_text)
        WebDriverWait(driver, 300).until_not(captcha_shown)

    # Wait for the real posting to load (until the company-name span exists).
    company_locator = (By.CSS_SELECTOR, "span[class*='coName']")
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(company_locator))
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # ── Field extraction ──
    # Company name.
    company_tag = soup.find("span", class_=re.compile(r"coName"))
    if company_tag:
        company_name = company_tag.get_text(strip=True)
    else:
        company_name = ""

    # Posting title: strip nested span/p elements from the heading first.
    job_title = ""
    heading = soup.find("h3", class_=re.compile(r"hd_"))
    if heading:
        for nested in heading.find_all(['span','p']):
            nested.extract()
        job_title = heading.get_text(strip=True)

    # Qualification summary / working conditions.
    def section_text(label):
        """Return the text of the div that follows the div whose string matches ``label``."""
        label_div = soup.find("div", string=re.compile(label))
        body_div = label_div.find_next_sibling("div") if label_div else None
        return body_div.get_text(strip=True) if body_div else ""

    simple_qual = section_text("지원자격")
    work_cond = section_text("근무조건")

    # Detailed qualifications, taken from the first row whose h4 mentions them.
    qual_details = {}
    for row in soup.find_all("div", class_=re.compile(r"tbRow.*clear")):
        row_title = row.find("h4")
        if not (row_title and "지원자격" in row_title.get_text()):
            continue
        term_list = row.find("dl", class_="tbList")
        if term_list:
            terms = term_list.find_all("dt")
            values = term_list.find_all("dd")
            qual_details.update(
                (term.get_text(strip=True), value.get_text(strip=True))
                for term, value in zip(terms, values)
            )
        break

    return dict(
        company_name=company_name,
        job_title=job_title,
        simple_qual=simple_qual,
        work=work_cond,
        qual_details=qual_details,
    )

def main():
    """Prompt for a keyword and page count, crawl JobKorea, and write a text report.

    Starts a driver, collects posting links, then writes one section per
    posting to ``jobKorea_selenium.txt`` in the current working directory.
    A posting whose detail page fails with a Selenium error is recorded as a
    parse failure and the crawl continues with the next one. The driver is
    always quit on exit.
    """
    from selenium.common.exceptions import WebDriverException

    driver = init_driver()
    try:
        keyword  = input("키워드를 입력하세요: ")
        page_num = int(input("크롤링할 페이지 수를 입력하세요: "))

        links = get_job_links(keyword, page_num, driver)
        with open("jobKorea_selenium.txt", "w", encoding="utf-8") as f:
            for idx, (title, url) in enumerate(links, 1):
                f.write(f"=== 공고 {idx} ===\n")
                f.write(f"검색 제목: {title}\nURL: {url}\n")

                try:
                    data = parse_job_detail(url, driver)
                except WebDriverException as e:
                    # Includes wait timeouts (TimeoutException subclasses
                    # WebDriverException). One bad posting must not abort the
                    # remaining ones.
                    print(f"[상세] {url} 파싱 실패:", e)
                    f.write("상세 정보 파싱 실패\n\n")
                    continue

                f.write(f"회사명: {data['company_name']}\n")
                f.write(f"공고 제목: {data['job_title']}\n")
                f.write(f"지원자격 (요약): {data['simple_qual']}\n")
                f.write(f"근무조건: {data['work']}\n")
                f.write("지원자격 상세:\n")
                for k, v in data["qual_details"].items():
                    f.write(f"  - {k}: {v}\n")
                f.write("\n")

        print("완료: jobKorea_selenium.txt 저장됨")
    finally:
        driver.quit()

# Entry point: run the Selenium crawl when this code executes as __main__.
if __name__ == "__main__":
    main()


In [None]:
import os

# Report the kernel's working directory, then list the entries found in it.
working_dir = os.getcwd()
print("현재 작업 디렉토리:", working_dir)
print("디렉토리 목록:", os.listdir(working_dir), sep="\n")


In [None]:
from google.colab import drive
import shutil
import os

# Copy the Selenium crawl output from the Colab workspace into Google Drive.
drive.mount('/content/drive')  # step 1: mount Drive

# Step 2: make sure the destination folder exists.
dst_dir = '/content/drive/MyDrive/DILAB/OK'
os.makedirs(dst_dir, exist_ok=True)

# Step 3: resolve source and destination paths, then copy.
src = '/content/jobKorea_selenium.txt'
dst = os.path.join(dst_dir, 'jobKorea_selenium.txt')
shutil.copy(src, dst)
print(f"파일이 복사되었습니다: {dst}")

# To remove the original as well, call shutil.move(src, dst) in place of
# shutil.copy(src, dst) above.


In [None]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import re
import os

# ── Settings
# Headers sent with every HTTP request; the User-Agent is a desktop-Chrome string.
HEADERS = {
    "User-Agent": "".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
        "AppleWebKit/537.36 (KHTML, like Gecko) ",
        "Chrome/114.0.0.0 Safari/537.36",
    )),
}
# Site root that search and posting URLs are resolved against.
BASE_URL = "https://www.jobkorea.co.kr"

def get_job_links(keyword, page_num):
    """Collect (title, url) pairs for postings found by a JobKorea keyword search.

    Fetches result pages 1..page_num with ``requests`` and scans each page for
    anchors whose href contains ``/Recruit/GI_Read/``.

    Args:
        keyword: Search text; URL-quoted before being placed in the query string.
        page_num: Number of result pages to fetch, starting at page 1.

    Returns:
        A list of ``(title, absolute_url)`` tuples in discovery order, at most
        one per distinct posting URL. Pages whose request fails are reported
        with a printed message and skipped.
    """
    links = []
    seen = set()
    for n in range(1, page_num + 1):
        search_url = (
            f"{BASE_URL}/Search/?"
            f"stext={urllib.parse.quote(keyword)}&"
            f"tabType=recruit&Page_No={n}"
        )
        try:
            resp = requests.get(search_url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
        except requests.RequestException as e:
            # Only network/HTTP failures are skipped; anything else is a bug
            # and should surface.
            print(f"[페이지 {n}] 요청 실패:", e)
            continue

        soup = BeautifulSoup(resp.text, "html.parser")
        for a in soup.select("a[href*='/Recruit/GI_Read/']"):
            full_url = urllib.parse.urljoin(BASE_URL, a["href"])
            if full_url in seen:
                continue

            span = a.find("span")
            title = span.get_text(strip=True) if span else ""
            if len(title) < 5:
                # Not a title link. Do NOT mark the URL as seen here: another
                # anchor with the same href may carry the real title.
                continue

            seen.add(full_url)
            links.append((title, full_url))
    return links

def parse_job_detail(url):
    """Download one posting page with ``requests`` and extract its fields.

    Args:
        url: Address of the posting page.

    Returns:
        ``None`` when the request raises (the error is printed). Otherwise a
        dict with keys ``company_name``, ``job_title``, ``simple_qual``,
        ``work`` (strings, empty when the element is not found) and
        ``qual_details`` (dict mapping each ``dt`` text to its ``dd`` text).
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f"[상세] {url} 요청 실패:", e)
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    def text_or_empty(node):
        """Stripped text of ``node``, or an empty string when ``node`` is falsy."""
        return node.get_text(strip=True) if node else ""

    company_name = text_or_empty(soup.find("span", class_=re.compile(r"coName")))

    # Posting title: drop nested spans, then "txt"-classed paragraphs, before
    # reading the heading text.
    job_title = ""
    heading = soup.find("h3", class_=re.compile(r"hd_"))
    if heading:
        for badge in heading.find_all("span"):
            badge.extract()
        for blurb in heading.find_all("p", class_=re.compile(r"txt")):
            blurb.extract()
        job_title = heading.get_text(strip=True)

    def extract_section(title):
        """Text of the div following the div whose string matches ``title``."""
        label_div = soup.find("div", string=re.compile(title))
        if not label_div:
            return ""
        return text_or_empty(label_div.find_next_sibling("div"))

    simple_qual = extract_section("지원자격")
    work = extract_section("근무조건")

    def is_qualification_row(row):
        """True when the row has an h4 whose text mentions the qualifications label."""
        row_title = row.find("h4")
        return bool(row_title) and "지원자격" in row_title.get_text()

    # Only the first matching row is used.
    qual_details = {}
    candidate_rows = soup.find_all("div", class_=re.compile(r"tbRow.*clear"))
    first_match = next((row for row in candidate_rows if is_qualification_row(row)), None)
    if first_match is not None:
        term_list = first_match.find("dl", class_="tbList")
        if term_list:
            qual_details = {
                term.get_text(strip=True): value.get_text(strip=True)
                for term, value in zip(term_list.find_all("dt"), term_list.find_all("dd"))
            }

    return dict(
        company_name=company_name,
        job_title=job_title,
        simple_qual=simple_qual,
        work=work,
        qual_details=qual_details,
    )

def main():
    """Prompt for a keyword and page count, crawl JobKorea, and save a report to Drive.

    Writes one section per posting to ``jobKorea.txt`` under
    ``/content/drive/MyDrive/DILAB/OK``. In Colab, Drive must already be
    mounted (``from google.colab import drive; drive.mount('/content/drive')``).
    """
    keyword = input("키워드를 입력하세요: ")
    page_num = int(input("크롤링할 페이지 수를 입력하세요: "))
    links = get_job_links(keyword, page_num)

    # Output location on Drive.
    dst_dir = "/content/drive/MyDrive/DILAB/OK"
    os.makedirs(dst_dir, exist_ok=True)
    out_path = os.path.join(dst_dir, "jobKorea.txt")

    with open(out_path, "w", encoding="utf-8") as report:
        for number, (_, posting_url) in enumerate(links, 1):
            report.write(f"=== 공고 {number} ===\n")
            data = parse_job_detail(posting_url)
            if not data:
                report.write("상세 정보 파싱 실패\n\n")
                continue

            section = [
                f"회사명: {data['company_name']}\n",
                f"공고 제목: {data['job_title']}\n",
                f"지원자격 (요약): {data['simple_qual']}\n",
                f"근무조건: {data['work']}\n",
                "지원자격 상세:\n",
            ]
            section.extend(
                f"  - {k}: {v}\n" for k, v in data["qual_details"].items()
            )
            section.append("\n")
            report.writelines(section)

    print(f"완료: 파일이 저장되었습니다 → {out_path}")

# Entry point: run the requests-based crawl when this code executes as __main__.
if __name__ == "__main__":
    main()


In [None]:
import os

# Report the kernel's working directory, then list the entries found in it.
working_dir = os.getcwd()
print("현재 작업 디렉토리:", working_dir)
print("디렉토리 목록:", os.listdir(working_dir), sep="\n")


In [None]:
from google.colab import drive
import shutil
import os

# Make sure the requests-based crawl output (jobKorea.txt) ends up in Google Drive.
# 1. Mount Drive.
drive.mount('/content/drive')

# 2. Create the destination folder if it does not exist yet.
dst_dir = '/content/drive/MyDrive/DILAB/OK'
os.makedirs(dst_dir, exist_ok=True)

# 3. Copy the file when a local copy exists.
src = '/content/jobKorea.txt'
dst = os.path.join(dst_dir, 'jobKorea.txt')

if os.path.exists(src):
    shutil.copy(src, dst)
    print(f"파일이 복사되었습니다: {dst}")
elif os.path.exists(dst):
    # The requests-based main() writes jobKorea.txt directly into dst_dir, so
    # there is normally no local file to copy; an unconditional copy would
    # raise FileNotFoundError here.
    print(f"이미 Drive에 저장되어 있습니다: {dst}")
else:
    print(f"복사할 파일을 찾을 수 없습니다: {src}")

# To remove the original as well, call shutil.move(src, dst) in place of
# shutil.copy(src, dst) above.
