In [None]:
pip install numpy pandas matplotlib seaborn scikit-learn scipy statsmodels xgboost lightgbm tensorflow torch plotly bokeh altair folium jupyter_contrib_nbextensions voila jupyterlab_widgets nbdime nltk spacy transformers requests beautifulsoup4 selenium Pillow opencv-python scikit-image dask pyspark geopandas shapely fiona tqdm pytest black pylint sqlalchemy python-dotenv


In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import random
import time
from flask import Flask, render_template, redirect, url_for
import schedule
from datetime import datetime

# Keywords for job search
KEYWORDS = [
    "chemistry", "analytical chemistry", "organic chemistry",
    "quality control", "HSE", "environment", "food chemistry",
    "chimie", "chimie analytique", "chimie organique",
    "contrôle qualité", "sécurité", "hygiène", "chimie alimentaire"
]

# User-Agent list for request headers
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
]

# Job site configurations
JOB_SITES = [
    {
        "name": "Tanitjobs",
        "url": "https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D=",
        "selectors": {
            "job_card": "article.listing-item",
            "title": ".listing-item__title a",
            "link": ".listing-item__title a",
            "location": ".listing-item__info--item-location",
            "date": ".listing-item__date",
        }
    },
    {
        "name": "Keejob",
        "url": "https://www.keejob.com/offres-emploi/?keywords=",
        "selectors": {
            "job_card": ".span9",
            "title": "h6 a",
            "link": "h6 a",
            "location": ".meta_a .fa-map-marker",
            "date": None,  # Dates are unavailable on the listing page.
        }
    },
    {
        "name": "Tunisie Travail",
        "url": "https://www.tunisietravail.net/search/",
        "selectors": {
            "job_card": ".Post",
            "title": ".PostHead h1 a",
            "link": ".PostHead h1 a",
            "location": None,  # Location is unavailable on the listing page.
            "date": ".PostDateIndex strong",
        }
    },
    {
        "name": "Option Carriere",
        "url": "https://www.optioncarriere.tn/emploi?s=",
        "selectors": {
            "job_card": "article.job",
            "title": "header h2 a",
            "link": "header h2 a",
            "location": ".location li span",
            "date": ".tags li span",
        }
    }
]

# Database configuration
DATABASE = "jobs.db"

def setup_database():
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS jobs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            link TEXT UNIQUE,
            location TEXT,
            date TEXT,
            site_name TEXT,
            is_new INTEGER,
            is_applied INTEGER,
            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    conn.commit()
    conn.close()

# Web scraping function
def scrape_jobs():
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()
    headers = {"User-Agent": random.choice(USER_AGENTS)}

    for site in JOB_SITES:
        for keyword in KEYWORDS:
            search_url = f"{site['url']}{keyword}"
            try:
                response = requests.get(search_url, headers=headers)
                if response.status_code != 200:
                    print(f"Failed to fetch {search_url}")
                    continue
                
                soup = BeautifulSoup(response.text, 'html.parser')
                job_cards = soup.select(site["selectors"]["job_card"])
                for card in job_cards:
                    title_element = card.select_one(site["selectors"]["title"])
                    link_element = card.select_one(site["selectors"]["link"])
                    location_element = card.select_one(site["selectors"]["location"])
                    date_element = card.select_one(site["selectors"]["date"])
                    
                    title = title_element.text.strip() if title_element else "N/A"
                    link = title_element["href"] if title_element and "href" in title_element.attrs else "N/A"
                    if link and not link.startswith("http"):
                        link = f"https://www.{site['name'].lower()}.com{link}"  # Adjust for relative URLs.
                    location = location_element.text.strip() if location_element else "N/A"
                    date = date_element.text.strip() if date_element else "N/A"

                    cursor.execute('''
                        INSERT OR IGNORE INTO jobs (title, link, location, date, site_name, is_new, is_applied)
                        VALUES (?, ?, ?, ?, ?, 1, 0)
                    ''', (title, link, location, date, site["name"]))
            except Exception as e:
                print(f"Error scraping {site['name']}: {e}")

    conn.commit()
    conn.close()

# Update job labels
def update_job_labels():
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()
    cursor.execute('''
        UPDATE jobs
        SET is_new = 0
        WHERE timestamp <= datetime('now', '-1 day') AND is_applied = 0
    ''')
    conn.commit()
    conn.close()

# Mark a job as applied
def mark_job_as_applied(job_id):
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()
    cursor.execute('''
        UPDATE jobs
        SET is_applied = 1
        WHERE id = ?
    ''', (job_id,))
    conn.commit()
    conn.close()

# Flask web app
app = Flask(__name__)

@app.route("/")
def index():
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()
    cursor.execute('''
        SELECT id, title, link, location, date, site_name, is_new, is_applied
        FROM jobs
        ORDER BY timestamp DESC
    ''')
    jobs = cursor.fetchall()
    conn.close()
    return render_template("index.html", jobs=jobs)

@app.route("/apply/<int:job_id>")
def apply(job_id):
    mark_job_as_applied(job_id)
    return redirect(url_for("index"))

# Scheduler to run scraping and updates every 24 hours
def run_scheduler():
    schedule.every(24).hours.do(scrape_jobs)
    schedule.every(24).hours.do(update_job_labels)
    while True:
        schedule.run_pending()
        time.sleep(1)

if __name__ == "__main__":
    setup_database()
    scrape_jobs()
    from threading import Thread
    Thread(target=run_scheduler).start()
    app.run(debug=True)


In [None]:
# /job_scraper/advanced_scraper.py

import requests
from bs4 import BeautifulSoup
import sqlite3
from datetime import datetime, timedelta
import time

# Define target websites and search URL templates
WEBSITES = {
    "tanitjobs": "https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D={keyword}",
    "optioncarriere": "https://www.optioncarriere.tn/emploi?s={keyword}+&l=Tunisie",
    "keejob": "https://www.keejob.com/offres-emploi/?keywords={keyword}",
    "tunisietravail": "https://www.tunisietravail.net/search/{keyword}"
}

# Keywords to search for in English and French
KEYWORDS = [ "chemistry", "analytical chemistry", "organic chemistry",
    "quality control", "HSE", "environment", "food chemistry",
    "chimie", "chimie analytique", "chimie organique",
    "contrôle qualité", "sécurité", "hygiène", "chimie alimentaire"]

# Database setup
DB_FILE = "jobs_advanced.db"


def init_db():
    """Initialize the SQLite database."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS jobs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            site TEXT,
            title TEXT,
            link TEXT UNIQUE,
            company TEXT,
            location TEXT,
            experience TEXT,
            date_posted TEXT,
            is_new INTEGER DEFAULT 1,
            applied INTEGER DEFAULT 0,
            fetched_at TIMESTAMP
        )
    """)
    conn.commit()
    conn.close()


def fetch_jobs_from_tanitjobs(keyword):
    """Fetch jobs from TanitJobs."""
    url = WEBSITES["tanitjobs"].format(keyword=keyword)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"[ERROR] Failed to fetch TanitJobs for keyword: {keyword}")
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    jobs = []

    for article in soup.find_all("article", class_="listing-item"):
        title_elem = article.find("div", class_="media-heading")
        title = title_elem.get_text(strip=True) if title_elem else None
        link = title_elem.find("a")["href"] if title_elem else None
        company_elem = article.find("span", class_="listing-item__info--item-company")
        company = company_elem.get_text(strip=True) if company_elem else None
        location_elem = article.find("span", class_="listing-item__info--item-location")
        location = location_elem.get_text(strip=True) if location_elem else None
        date_elem = article.find("div", class_="listing-item__date")
        date_posted = date_elem.get_text(strip=True) if date_elem else None

        if title and link:
            job = ("tanitjobs", title, link, company, location, None, date_posted)
            print("[TanitJobs Fetched]", job)  # Added
            jobs.append(job)

    return jobs


def fetch_jobs_from_optioncarriere(keyword):
    """Fetch jobs from OptionCarriere."""
    url = WEBSITES["optioncarriere"].format(keyword=keyword)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"[ERROR] Failed to fetch OptionCarriere for keyword: {keyword}")
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    jobs = []

    for article in soup.find_all("article", class_="job clicky"):
        title_elem = article.find("h2")
        title = title_elem.get_text(strip=True) if title_elem else None
        link = "https://www.optioncarriere.tn" + title_elem.find("a")["href"] if title_elem else None
        company_elem = article.find("p", class_="company")
        company = company_elem.get_text(strip=True) if company_elem else None
        location_elem = article.find("li", class_="location")
        location = location_elem.get_text(strip=True) if location_elem else None
        date_elem = article.find("span", class_="badge")
        date_posted = date_elem.get_text(strip=True) if date_elem else None

        if title and link:
            job = ("optioncarriere", title, link, company, location, None, date_posted)
            print("[OptionCarriere Fetched]", job)  # Added
            jobs.append(job)

    return jobs


def fetch_jobs_from_keejob(keyword):
    """Fetch jobs from Keejob."""
    url = WEBSITES["keejob"].format(keyword=keyword)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"[ERROR] Failed to fetch Keejob for keyword: {keyword}")
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    jobs = []

    for div in soup.find_all("div", class_="span9"):
        title_elem = div.find("h6")
        title = title_elem.get_text(strip=True) if title_elem else None
        link = "https://www.keejob.com" + title_elem.find("a")["href"] if title_elem else None
        company_elem = div.find("a", {"data-original-title": ""})
        company = company_elem.get_text(strip=True) if company_elem else None
        location_elem = div.find("i", class_="fa-map-marker")
        location = location_elem.next_sibling.strip() if location_elem else None

        if title and link:
            job = ("keejob", title, link, company, location, None, None)
            print("[Keejob Fetched]", job)  # Added
            jobs.append(job)

    return jobs


def fetch_jobs_from_tunisietravail(keyword):
    """Fetch jobs from TunisieTravail."""
    url = WEBSITES["tunisietravail"].format(keyword=keyword)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"[ERROR] Failed to fetch TunisieTravail for keyword: {keyword}")
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    jobs = []

    for post in soup.find_all("div", class_="Post"):
        title_elem = post.find("h1")
        title = title_elem.get_text(strip=True) if title_elem else None
        link = title_elem.find("a")["href"] if title_elem else None
        company_elem = post.find("strong", text=lambda t: "Entreprise" in t)
        company = company_elem.get_text(strip=True) if company_elem else None
        location_elem = post.find("p", text=lambda t: "Ville" in t)
        location = location_elem.get_text(strip=True) if location_elem else None
        date_elem = post.find("p", class_="PostDateIndex")
        date_posted = date_elem.get_text(strip=True) if date_elem else None

        if title and link:
            job = ("tunisietravail", title, link, company, location, None, date_posted)
            print("[TunisieTravail Fetched]", job)  # Added
            jobs.append(job)

    return jobs


def fetch_jobs():
    """Fetch jobs from all websites."""
    jobs = []
    for keyword in KEYWORDS:
        print(f"[INFO] Fetching jobs for keyword: {keyword}")
        jobs.extend(fetch_jobs_from_tanitjobs(keyword))
        jobs.extend(fetch_jobs_from_optioncarriere(keyword))
        jobs.extend(fetch_jobs_from_keejob(keyword))
        jobs.extend(fetch_jobs_from_tunisietravail(keyword))
    return jobs


def update_jobs():
    """Update the job listings in the database."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    jobs = fetch_jobs()

    for job in jobs:
        cursor.execute("""
            INSERT OR IGNORE INTO jobs 
            (site, title, link, company, location, experience, date_posted, fetched_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, job + (datetime.now(),))
        print("[DB Inserted]", job)  # Added

    conn.commit()
    conn.close()


def mark_applied(link):
    """Mark a job as applied."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    cursor.execute("UPDATE jobs SET applied = 1 WHERE link = ?", (link,))
    conn.commit()
    conn.close()


def cleanup_jobs():
    """Remove 'new' labels from jobs older than 24 hours."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    cutoff = datetime.now() - timedelta(days=1)
    cursor.execute("UPDATE jobs SET is_new = 0 WHERE fetched_at < ?", (cutoff,))
    conn.commit()
    conn.close()


def display_jobs():
    """Display jobs stored in the database."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    cursor.execute("""
        SELECT title, company, location, date_posted, link, is_new, applied
        FROM jobs
        ORDER BY fetched_at DESC
    """)
    jobs = cursor.fetchall()
    conn.close()
    return jobs


def main():
    """Main script execution."""
    init_db()
    while True:
        print("[INFO] Updating jobs...")
        update_jobs()
        print("[INFO] Cleaning up old jobs...")
        cleanup_jobs()
        print("[INFO] Displaying jobs:")
        for job in display_jobs():
            print(job)
        print("[INFO] Sleeping for 24 hours...")
        time.sleep(86400)  # 24 hours delay


if __name__ == "__main__":
    main()


[INFO] Updating jobs...
[INFO] Fetching jobs for keyword: chemistry
[ERROR] Failed to fetch TanitJobs for keyword: chemistry
[OptionCarriere Fetched] ('optioncarriere', 'Chemistry School Teacher', 'https://www.optioncarriere.tn/jobad/tn5611f948e520464d5bc455b9d4c5c79d', 'Oxbridge International School', None, None, 'Il y a 1 mois')
[OptionCarriere Fetched] ('optioncarriere', 'Expert/Tutor in STEM Subjects', 'https://www.optioncarriere.tn/jobad/tn5ec8f1d024156890f03e29e863f08041', 'Livingston Research', None, None, 'Il y a 1 mois')


  company_elem = post.find("strong", text=lambda t: "Entreprise" in t)
  location_elem = post.find("p", text=lambda t: "Ville" in t)


[TunisieTravail Fetched] ('tunisietravail', 'Kuwait Shop Kuwait is looking for Laboratory Chemist / Qc Chemist', 'https://www.tunisietravail.net/kuwait-shop-is-looking-for-laboratory-chemist-qc-chemist-kuwait-100701/', None, None, None, 'Jan, 2023')
[INFO] Fetching jobs for keyword: analytical chemistry
[ERROR] Failed to fetch TanitJobs for keyword: analytical chemistry
[TunisieTravail Fetched] ('tunisietravail', 'Concentrix recrute des Conseillers Clients Anglophone en Réceptions', 'https://www.tunisietravail.net/concentrix-recrute-des-conseillers-clients-anglophone-en-receptions-106695/', None, None, None, None)
[TunisieTravail Fetched] ('tunisietravail', 'IKI recrute des Conseillers Commerciaux Spécialisés en Mutuelle Santé', 'https://www.tunisietravail.net/iki-recrute-des-conseillers-commerciaux-specialises-en-mutuelle-sante-119064/', None, None, None, None)
[TunisieTravail Fetched] ('tunisietravail', 'MMT recrute Contrôleur Qualité Fruits et Légumes', 'https://www.tunisietravail.n

  cursor.execute("""
  cursor.execute("UPDATE jobs SET is_new = 0 WHERE fetched_at < ?", (cutoff,))


In [7]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import random
import time
from flask import Flask, render_template, redirect, url_for
from threading import Thread
from datetime import datetime
import schedule

# Keywords for job search
KEYWORDS = [
    "chemistry", "analytical chemistry", "organic chemistry",
    "quality control", "HSE", "environment", "food chemistry",
    "chimie", "chimie analytique", "chimie organique",
    "contrôle qualité", "sécurité", "hygiène", "chimie alimentaire"
]

# User-Agent list for request headers
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
]

# Job site configurations
JOB_SITES = [
    {
        "name": "Tanitjobs",
        "url": "https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D=",
        "selectors": {
            "job_card": "article.listing-item",
            "title": ".listing-item__title a",
            "link": ".listing-item__title a",
            "location": ".listing-item__info--item-location",
            "date": ".listing-item__date",
        }
    },
    {
        "name": "Keejob",
        "url": "https://www.keejob.com/offres-emploi/?keywords=",
        "selectors": {
            "job_card": ".span9",
            "title": "h6 a",
            "link": "h6 a",
            "location": ".meta_a .fa-map-marker",
            "date": None,  # Dates are unavailable on the listing page.
        }
    },
    {
        "name": "Tunisie Travail",
        "url": "https://www.tunisietravail.net/search/",
        "selectors": {
            "job_card": ".Post",
            "title": ".PostHead h1 a",
            "link": ".PostHead h1 a",
            "location": None,
            "date": ".PostDateIndex strong",
        }
    },
    {
        "name": "Option Carriere",
        "url": "https://www.optioncarriere.tn/emploi?s=",
        "selectors": {
            "job_card": "article.job",
            "title": "header h2 a",
            "link": "header h2 a",
            "location": ".location li span",
            "date": ".tags li span",
        }
    }
]

# Database configuration
DATABASE = "jobs.db"

def setup_database():
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS jobs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            link TEXT UNIQUE,
            location TEXT,
            date TEXT,
            site_name TEXT,
            is_new INTEGER,
            is_applied INTEGER,
            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    conn.commit()
    conn.close()

# Helper function for making requests with retries
def make_request(url):
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    for _ in range(3):  # Retry 3 times
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response
            time.sleep(1)  # Wait before retrying
        except Exception as e:
            print(f"Request error: {e}")
    return None

# Web scraping function
def scrape_jobs():
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()

    for site in JOB_SITES:
        for keyword in KEYWORDS:
            search_url = f"{site['url']}{keyword.replace(' ', '+')}"
            response = make_request(search_url)
            if not response:
                print(f"Failed to fetch {search_url}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            job_cards = soup.select(site["selectors"]["job_card"])
            for card in job_cards:
                try:
                    title_element = card.select_one(site["selectors"]["title"])
                    link_element = card.select_one(site["selectors"]["link"])
                    location_element = card.select_one(site["selectors"]["location"])
                    date_element = card.select_one(site["selectors"]["date"])
                    
                    title = title_element.text.strip() if title_element else "N/A"
                    link = title_element["href"] if title_element and "href" in title_element.attrs else "N/A"
                    if link and not link.startswith("http"):
                        link = f"https://www.{site['name'].lower()}.com{link}"
                    location = location_element.text.strip() if location_element else "N/A"
                    date = date_element.text.strip() if date_element else "N/A"

                    cursor.execute('''
                        INSERT OR IGNORE INTO jobs (title, link, location, date, site_name, is_new, is_applied)
                        VALUES (?, ?, ?, ?, ?, 1, 0)
                    ''', (title, link, location, date, site["name"]))
                except Exception as e:
                    print(f"Error parsing job from {site['name']}: {e}")

    conn.commit()
    conn.close()

# Update job labels
def update_job_labels():
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()
    cursor.execute('''
        UPDATE jobs
        SET is_new = 0
        WHERE timestamp <= datetime('now', '-1 day') AND is_applied = 0
    ''')
    conn.commit()
    conn.close()

# Mark a job as applied
def mark_job_as_applied(job_id):
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()
    cursor.execute('''
        UPDATE jobs
        SET is_applied = 1
        WHERE id = ?
    ''', (job_id,))
    conn.commit()
    conn.close()

# Flask web app
app = Flask(__name__)

@app.route("/")
def index():
    conn = sqlite3.connect(DATABASE)
    cursor = conn.cursor()
    cursor.execute('''
        SELECT id, title, link, location, date, site_name, is_new, is_applied
        FROM jobs
        ORDER BY timestamp DESC
    ''')
    jobs = cursor.fetchall()
    conn.close()
    return render_template("index.html", jobs=jobs)

@app.route("/apply/<int:job_id>")
def apply(job_id):
    mark_job_as_applied(job_id)
    return redirect(url_for("index"))

# Scheduler function
def run_scheduler():
    schedule.every(24).hours.do(scrape_jobs)
    schedule.every(24).hours.do(update_job_labels)
    while True:
        schedule.run_pending()
        time.sleep(1)

# Main function
if __name__ == "__main__":
    setup_database()
    Thread(target=run_scheduler).start()  # Run the scheduler in a separate thread
    scrape_jobs()  # Perform an initial scrape
    app.run(debug=True)


Failed to fetch https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D=chemistry
Failed to fetch https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D=analytical+chemistry
Failed to fetch https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D=organic+chemistry
Failed to fetch https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D=quality+control
Failed to fetch https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D=HSE
Failed to fetch https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D=environment
Failed to fetch https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_words%5D=food+chemistry
Failed to fetch https://www.tanitjobs.com/jobs/?listing_type%5Bequal%5D=Job&action=search&keywords%5Ball_w

 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

In [None]:
pip install schedule


In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
from bs4 import BeautifulSoup
import sqlite3
from datetime import datetime
import schedule

# Constants
KEYWORDS = [
    "chemistry", "analytical chemistry", "organic chemistry",
    "quality control", "HSE", "environment", "food chemistry",
    "chimie", "chimie analytique", "chimie organique",
    "contrôle qualité", "sécurité", "hygiène", "chimie alimentaire"
]

BASE_URL = "https://www.tanitjobs.com/jobs/"
DB_NAME = "jobs.db"

def setup_database():
    """Set up the database."""
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT,
        link TEXT UNIQUE,
        date TEXT,
        location TEXT,
        experience TEXT,
        is_new INTEGER DEFAULT 1,
        applied INTEGER DEFAULT 0
    )
    """)
    conn.commit()
    conn.close()

def fetch_with_undetected_chrome():
    """Fetch job listings using undetected-chromedriver."""
    options = uc.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--headless=new")  # Use new headless mode
    options.add_argument(f"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")

    driver = uc.Chrome(options=options)
    jobs = []

    try:
        for keyword in KEYWORDS:
            print(f"\n--- Fetching jobs for keyword: {keyword} ---")
            query = f"?listing_type[equal]=Job&action=search&keywords[all_words]={keyword.replace(' ', '+')}"
            driver.get(BASE_URL + query)

            # Wait for the Cloudflare check to complete
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "listing-item__jobs"))
            )

            # Parse the page content
            soup = BeautifulSoup(driver.page_source, "html.parser")
            jobs.extend(parse_jobs(soup))
            
            # Random delay between requests
            time.sleep(random.uniform(2, 5))

    except Exception as e:
        print(f"Selenium error: {e}")
    finally:
        driver.quit()
    
    return jobs

def parse_jobs(soup):
    """Parse HTML content for job listings."""
    jobs = []
    listings = soup.find_all("article", class_="listing-item__jobs")

    for listing in listings:
        try:
            title = listing.find("div", class_="media-heading").get_text(strip=True)
            link = listing.find("a", class_="link")["href"]
            date = listing.find("div", class_="listing-item__date").get_text(strip=True)
            location = listing.find("span", class_="listing-item__info--item-location").get_text(strip=True)
            experience = listing.find("div", class_="listing-item__desc").get_text(strip=True)[:50]

            print(f"Title: {title}")
            print(f"Link: {link}")
            print(f"Date: {date}")
            print(f"Location: {location}")
            print(f"Experience: {experience}")
            print("-" * 40)

            jobs.append({
                "title": title,
                "link": link,
                "date": date,
                "location": location,
                "experience": experience
            })
        except Exception as e:
            print(f"Error parsing listing: {e}")
            
    return jobs

def update_database(jobs):
    """Update the database with new jobs."""
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()

    cursor.execute("""
    UPDATE jobs SET is_new = 0 WHERE is_new = 1 AND applied = 0 AND date(date) <= date('now', '-1 day')
    """)

    for job in jobs:
        try:
            cursor.execute("""
            INSERT INTO jobs (title, link, date, location, experience, is_new) 
            VALUES (?, ?, ?, ?, ?, 1)
            """, (job["title"], job["link"], job["date"], job["location"], job["experience"]))
            print(f"Inserted new job into database: {job['title']}")
        except sqlite3.IntegrityError:
            print(f"Job already exists in the database: {job['title']}")
    conn.commit()
    conn.close()

def display_jobs():
    """Display jobs from the database."""
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    cursor.execute("""
    SELECT title, link, date, location, experience, is_new, applied FROM jobs ORDER BY date(date) DESC
    """)
    rows = cursor.fetchall()
    conn.close()

    print("\n--- Job Listings ---")
    for row in rows:
        title, link, date, location, experience, is_new, applied = row
        status = "[NEW]" if is_new else "[OLD]"
        applied_status = "[APPLIED]" if applied else "[UNAPPLIED]"
        print(f"{status} {title} - {date} - {location}")
        print(f"Experience: {experience}")
        print(f"Link: {link}")
        print(f"Status: {applied_status}")
        print("-" * 40)

def job_runner():
    """Run the job fetching and updating process."""
    print(f"\n--- Running Job Scraper at {datetime.now()} ---")
    jobs = fetch_with_undetected_chrome()
    update_database(jobs)
    display_jobs()

# Main execution
if __name__ == "__main__":
    setup_database()
    schedule.every(24).hours.do(job_runner)

    print("Job scraper started. Press Ctrl+C to stop.")
    try:
        job_runner()  # Run initially to populate data
        while True:
            schedule.run_pending()
            time.sleep(1)
    except KeyboardInterrupt:
        print("Job scraper stopped.")


Job scraper started. Press Ctrl+C to stop.

--- Running Job Scraper at 2024-12-07 15:34:25.652100 ---

--- Fetching jobs for keyword: chemistry ---
Selenium error: Message: 
Stacktrace:
	GetHandleVerifier [0x00D53433+25059]
	(No symbol) [0x00CDCE34]
	(No symbol) [0x00BBBEC3]
	(No symbol) [0x00BFFD86]
	(No symbol) [0x00BFFFCB]
	(No symbol) [0x00C3D952]
	(No symbol) [0x00C21F44]
	(No symbol) [0x00C3B51E]
	(No symbol) [0x00C21C96]
	(No symbol) [0x00BF3FAC]
	(No symbol) [0x00BF4F3D]
	GetHandleVerifier [0x01045593+3113795]
	GetHandleVerifier [0x0105A25A+3198986]
	GetHandleVerifier [0x01052A32+3168226]
	GetHandleVerifier [0x00DF32A0+680016]
	(No symbol) [0x00CE577D]
	(No symbol) [0x00CE2A28]
	(No symbol) [0x00CE2BC5]
	(No symbol) [0x00CD5820]
	BaseThreadInitThunk [0x76027BA9+25]
	RtlInitializeExceptionChain [0x7794C0CB+107]
	RtlClearBits [0x7794C04F+191]


--- Job Listings ---


In [25]:
pip install requests beautifulsoup4 cloudscraper undetected-chromedriver fake-useragent schedule selenium

Note: you may need to restart the kernel to use updated packages.


In [48]:
pip install -U undetected-chromedriver

Note: you may need to restart the kernel to use updated packages.


In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
from datetime import datetime, timedelta
import time
import re

DB_FILE = "jobs.db"

# Keywords for filtering chemistry-related jobs (simplified for broader search)
KEYWORDS = [
    "chimie analytique", "instrumentation", "contrôle qualité",
    "chimie organique", "chimie des matériaux", "HSE", "analyse environnementale",
    "spectroscopie", "chromatographie", "validation de méthode", "calibration",
    "traitement des eaux", "chemistry", "analytical chemistry", "organic chemistry",
    "quality control", "HSE", "environment", "food chemistry",
    "chimie", "chimie alimentaire", "chimie industrielle", "QHSE"
]

BASE_URLS = {
    "optioncarriere": "https://www.optioncarriere.tn/emploi?s={query}&l=Tunisie",
    "keejob": "https://www.keejob.com/offres-emploi/?keywords={query}",
    "tunisietravail": "https://www.tunisietravail.net/search/{query}"
}

HEADERS = {"User-Agent": "Mozilla/5.0"}

def initialize_db():
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS jobs (
            id INTEGER PRIMARY KEY,
            title TEXT,
            link TEXT UNIQUE,
            publish_date TEXT,
            location TEXT,
            experience TEXT,
            description TEXT,
            status TEXT,
            added_date TIMESTAMP
        )
    ''')
    conn.commit()
    conn.close()

def save_job_to_db(title, link, publish_date, location, experience, description, status):
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    try:
        c.execute('''
            INSERT OR IGNORE INTO jobs (title, link, publish_date, location, experience, description, status, added_date)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', (title, link, publish_date, location, experience, description, status, datetime.now()))
        conn.commit()
    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        conn.close()

def score_job(title, description, site=None):
    """
    Scores a job based on the frequency of keywords in the title and description.
    Adds higher weight for matches in the title.
    For 'keejob', scores are weighted even more heavily.
    """
    title_weight = 2
    description_weight = 1

    # Apply heavier weight for 'keejob'
    if site == "keejob":
        title_weight = 4
        description_weight = 2

    score = 0
    for keyword in KEYWORDS:
        title_matches = title.lower().count(keyword.lower())
        description_matches = description.lower().count(keyword.lower())
        score += (title_matches * title_weight) + (description_matches * description_weight)
    return score

def filter_jobs(jobs, site=None):
    """
    Filters and scores jobs based on keywords.
    Logs scoring details in the console for 'keejob'.
    """
    scored_jobs = [
        (job, score_job(job[0], job[5], site))  # job[0] = title, job[5] = description
        for job in jobs
    ]

    # Debug scoring details
    for job, score in scored_jobs:
        print(f"[{site}] Job Title: {job[0]}, Score: {score}, Description: {job[5][:50]}...")

    scored_jobs = [(job, score) for job, score in scored_jobs if score > 0]
    scored_jobs.sort(key=lambda x: x[1], reverse=True)  # Sort by score, descending
    return [job for job, score in scored_jobs]

def exclude_old_jobs(jobs):
    """
    Filters out jobs older than one month based on publish date.
    """
    one_month_ago = datetime.now() - timedelta(days=30)
    valid_jobs = []
    for job in jobs:
        try:
            publish_date = datetime.strptime(job[2], "%Y-%m-%d")
            if publish_date >= one_month_ago:
                valid_jobs.append(job)
        except ValueError:
            print(f"Invalid date format for job: {job}")
            continue
    return valid_jobs

def fetch_job_details(url, site):
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch job details from {url}: {e}")
        return "N/A", "N/A", "N/A", "N/A"

    description, publish_date, location, experience = "N/A", "N/A", "N/A", "N/A"

    try:
        if site == "keejob":
            desc_section = soup.find("div", class_="block_a")
            if desc_section:
                description = desc_section.get_text(strip=True)
            publish_date_section = soup.find("i", class_="fa-clock-o")
            if publish_date_section:
                publish_date = publish_date_section.next_sibling.strip()
            location_section = soup.find("i", class_="fa-map-marker")
            if location_section:
                location = location_section.next_sibling.strip()

        elif site == "optioncarriere":
            header = soup.find("header")
            if header:
                description = header.find_next("section", class_="content").get_text(strip=True)
            publish_date_section = header.find("ul", class_="tags")
            if publish_date_section:
                publish_date = publish_date_section.get_text(strip=True)
            location_section = header.find("span")
            if location_section:
                location = location_section.get_text(strip=True)

        elif site == "tunisietravail":
            desc_section = soup.find("div", class_="PostContent")
            if desc_section:
                description = desc_section.get_text(strip=True)
            publish_date_section = soup.find("p", class_="PostDate")
            if publish_date_section:
                publish_date = publish_date_section.get_text(strip=True)
            location_section = soup.find("p", class_="PostInfo")
            if location_section:
                location = location_section.get_text(strip=True)
    except Exception as e:
        print(f"Error parsing job details from {url}: {e}")

    return description, publish_date, location, experience

def fetch_jobs_from_optioncarriere(keyword):
    url = BASE_URLS["optioncarriere"].format(query=keyword)
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")
    jobs = []
    for job in soup.find_all("article", class_="job"):
        title_tag = job.find("a")
        if not title_tag:
            continue
        title = title_tag.text.strip()
        link = f"https://www.optioncarriere.tn{title_tag['href']}"
        description, publish_date, location, experience = fetch_job_details(link, "optioncarriere")
        jobs.append((title, link, publish_date, location, experience, description))
    return jobs

def fetch_jobs_from_tunisietravail(keyword):
    url = BASE_URLS["tunisietravail"].format(query=keyword)
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")
    jobs = []
    for job in soup.find_all("div", class_="Post"):
        title_tag = job.find("a", class_="h1titleall")
        if not title_tag:
            continue
        title = title_tag.text.strip()
        link = title_tag["href"]
        description, publish_date, location, experience = fetch_job_details(link, "tunisietravail")
        jobs.append((title, link, publish_date, location, experience, description))
    return jobs

def fetch_jobs_from_keejob(keyword):
    url = BASE_URLS["keejob"].format(query=keyword)
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")
    jobs = []
    for job in soup.find_all("div", class_="block_white_a"):
        title_tag = job.find("a", style=True)
        if not title_tag:
            continue
        title = title_tag.text.strip()
        link = f"https://www.keejob.com{title_tag['href']}"
        description, publish_date, location, experience = fetch_job_details(link, "keejob")
        jobs.append((title, link, publish_date, location, experience, description))
    return jobs

def process_jobs(fetch_function, keyword, site=None):
    jobs = fetch_function(keyword)
    print(f"Fetched {len(jobs)} jobs for keyword: {keyword} from {site}")
    jobs = filter_jobs(jobs, site)
    print(f"Filtered to {len(jobs)} relevant jobs for keyword: {keyword} from {site}")
    jobs = exclude_old_jobs(jobs)
    print(f"Filtered to {len(jobs)} recent jobs for keyword: {keyword} from {site}")
    for job in jobs:
        print(f"Saving Job to DB: {job}")
        save_job_to_db(*job, "new")

def update_jobs():
    for keyword in KEYWORDS:
        print(f"\nUpdating jobs for keyword: {keyword}")
        process_jobs(fetch_jobs_from_optioncarriere, keyword, site="optioncarriere")
        process_jobs(fetch_jobs_from_tunisietravail, keyword, site="tunisietravail")
        process_jobs(fetch_jobs_from_keejob, keyword, site="keejob")

if __name__ == "__main__":
    initialize_db()
    while True:
        update_jobs()
        print("Job list updated. Sleeping for 24 hours...")
        time.sleep(86400)



Updating jobs for keyword: chimie analytique
Fetched 6 jobs for keyword: chimie analytique from optioncarriere
[optioncarriere] Job Title: Technicien Superieur en Chimie, Score: 8, Description: Dans le cadre du développement de notre client, un...
[optioncarriere] Job Title: Technicien Laboratoire Contrôle Qualité, Score: 5, Description: Pour faire face aux différents besoins et accompag...
[optioncarriere] Job Title: Technicienne Contrôle Qualité, Score: 7, Description: Société de production végétale installé à Djebel O...
[optioncarriere] Job Title: Technicien / Technicienne chimiste en laboratoire d'analyse industrielle, Score: 2, Description: Référence : 1250/2024/1653 (Offre d'emploi)Date de...
[optioncarriere] Job Title: Technico-Commercial . Produits Cosmétiques /Dispositifs Médicaux, Score: 4, Description: Compétences techniques :Maîtriser les caractéristi...
[optioncarriere] Job Title: Chef Produit Calcium, Score: 4, Description: RH Formaconsult, spécialisée dans le service R

In [5]:
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import sqlite3
from datetime import datetime
import time
import threading

# Import your existing logic here
from job_scraper import (
    initialize_db,
    update_jobs,  # Your job-fetching and updating logic
    KEYWORDS
)

# Initialize FastAPI app
app = FastAPI()

# Enable CORS for React frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins; adjust for security in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize SQLite DB
DB_FILE = "jobs.db"

# Models for API Input/Output
class Job(BaseModel):
    title: str
    link: str
    publish_date: str
    location: str
    experience: str
    description: str
    status: str

@app.on_event("startup")
def startup_event():
    initialize_db()

# Background thread for updating jobs periodically
def start_job_updater():
    def update_loop():
        while True:
            update_jobs()  # Calls the update_jobs function from your logic
            print("Jobs updated. Sleeping for 24 hours...")
            time.sleep(86400)
    thread = threading.Thread(target=update_loop, daemon=True)
    thread.start()

@app.on_event("startup")
def start_updater():
    start_job_updater()

# Endpoint: Fetch jobs from the database
@app.get("/jobs", response_model=list[Job])
def get_jobs(keyword: str = None):
    """
    Fetch jobs from the SQLite database.
    Supports filtering by a keyword.
    """
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()

    query = "SELECT title, link, publish_date, location, experience, description, status FROM jobs"
    if keyword:
        query += " WHERE title LIKE ? OR description LIKE ?"
        params = (f"%{keyword}%", f"%{keyword}%")
    else:
        params = ()

    c.execute(query, params)
    jobs = c.fetchall()
    conn.close()

    # Convert results to list of dicts
    return [
        {
            "title": row[0],
            "link": row[1],
            "publish_date": row[2],
            "location": row[3],
            "experience": row[4],
            "description": row[5],
            "status": row[6],
        }
        for row in jobs
    ]

# Endpoint: List supported keywords
@app.get("/keywords")
def get_keywords():
    """
    Return the list of keywords used for filtering jobs.
    """
    return {"keywords": KEYWORDS}



ModuleNotFoundError: No module named 'job_scraper'

In [7]:
pip install uvicorn


Collecting uvicorn
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Downloading uvicorn-0.32.1-py3-none-any.whl (63 kB)
   ---------------------------------------- 0.0/63.8 kB ? eta -:--:--
   ------------------- -------------------- 30.7/63.8 kB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 63.8/63.8 kB 687.0 kB/s eta 0:00:00
Installing collected packages: uvicorn
Successfully installed uvicorn-0.32.1
Note: you may need to restart the kernel to use updated packages.
