In [1]:
from openai import OpenAI

import os

# Shared OpenAI-compatible client used by the extraction helpers in this notebook.
# The endpoint and key are read from the OPENAI_BASE_URL / OPENAI_API_KEY
# environment variables so real credentials never have to be typed into (and
# saved with) the notebook. The original placeholder strings remain as
# fallbacks, so behaviour is unchanged when the variables are not set.
client = OpenAI(
  base_url=os.environ.get("OPENAI_BASE_URL", "OpenAI_base_URL"),
  api_key=os.environ.get("OPENAI_API_KEY", "api_key"),
)

# The commented-out code below in this cell tests the AI connection.
# job_description = """
# We are looking for a Senior Data Engineer with experience in building and maintaining scalable data pipelines.
# The ideal candidate should be proficient in Python, SQL, and have experience with cloud platforms such as AWS or Azure.
# Familiarity with Apache Spark, Databricks, and ETL frameworks is required.
# Strong knowledge of data modeling, data warehousing, and working with REST APIs is a plus.
# """

# # # Call the OpenAI API
# response = client.chat.completions.create(
#   model="gpt-3.5-turbo",
#   messages=[
#     {"role": "system", "content": "You are an AI assistant that extracts technical skills as comma separated list from job descriptions."},
#     {"role": "user", "content": job_description}
#   ]
# )
# print(response.choices[0].message.content)
# O/P: Python, SQL, AWS, Azure, Apache Spark, Databricks, ETL frameworks, data modeling, data warehousing, REST APIs



In [2]:
!pip install selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from collections import Counter

# Function to scrape LinkedIn job postings and extract skills
def scrape_linkedin_jobs(keyword, num_pages):
    """Scrape LinkedIn job search results (location: Germany) and extract job details.

    A headless Chrome session loads the search page for ``keyword``, presses END
    ``num_pages`` times (sleeping 2 seconds after each press) to load more
    results, and then collects every ``.base-card`` element. For each card the
    job link is opened in a separate, short-lived Chrome session to read the
    description, which is then passed to ``extract_skills``, ``exp_extract`` and
    ``visa_support_check``. One row per successfully processed job is written to
    ``jd.tsv`` (pipe-delimited, UTF-8).

    Args:
        keyword: Search keyword, inserted into the URL verbatim, so it must
            already be URL-encoded (e.g. ``"data%20engineer"``).
        num_pages: Number of END key presses used to load additional results.

    Returns:
        A list with one entry per job whose skills were extracted; each entry is
        the value returned by ``extract_skills`` for that job.

    Any exception raised while processing a single card is printed and that
    card is skipped. Both browser sessions are always closed, even on error.
    """
    import csv  # local import so this function does not depend on other cells

    job_skills = []

    # Set up options for the Chrome WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode (optional)
    options.add_argument("--no-sandbox")

    # Start a Selenium WebDriver with options
    driver = webdriver.Chrome(options=options)
    try:
        url = f'https://www.linkedin.com/jobs/search/?keywords={keyword}&location=Germany'
        driver.get(url)

        # Scroll to load more jobs (you may need to adjust the number of scrolls)
        for scroll_index in range(num_pages):
            print("scroll ######", scroll_index)
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
            time.sleep(2)  # Wait for content to load

        job_cards = driver.find_elements(By.CSS_SELECTOR, '.base-card')

        # newline="" plus an explicit lineterminator keeps "\n" row endings on
        # every platform; csv.writer quotes any field that contains the "|"
        # delimiter, a quote, or a line break so each job stays one record.
        with open("jd.tsv", mode="wt", encoding="utf-8", newline="") as f:
            writer = csv.writer(f, delimiter="|", lineterminator="\n")
            writer.writerow(["Job_title", "Link", "Skills", "Exp", "Visa_status"])
            for card in job_cards:
                try:
                    job_title_element = WebDriverWait(card, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '.base-search-card__title')))
                    company_element = WebDriverWait(card, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '.base-search-card__subtitle')))
                    link_element = WebDriverWait(card, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '.base-card__full-link')))

                    company_name = company_element.text
                    job_title = job_title_element.text
                    print(job_title, " ", company_name)
                    job_link = link_element.get_attribute('href')
                    print(job_link)

                    # Hit each job's URL in its own browser to get more information.
                    # The finally clause guarantees the browser is closed even when
                    # loading or reading the page fails (previously it leaked).
                    job_driver = webdriver.Chrome(options=options)
                    try:
                        job_driver.get(job_link)
                        # Expand the description by clicking on "show more"
                        expand_description(job_driver)
                        job_description = extract_job_description(job_driver)
                    finally:
                        job_driver.quit()

                    # Extract skills from the description
                    extracted_skills = extract_skills(job_description)
                    job_skills.append(extracted_skills)

                    experience = exp_extract(job_description)
                    visa_stat = visa_support_check(job_description)
                    writer.writerow(
                        [job_title, job_link, str(extracted_skills), experience, visa_stat])
                except Exception as e:
                    # Best effort: report the problem and move on to the next card.
                    print(f"Error is {e}")
    finally:
        # Close the search WebDriver when done, including on failure
        driver.quit()

    return job_skills


# Function to extract job description using JavaScript
def extract_job_description(driver):
    """Return the text content of the page's ``.description`` element.

    Waits up to 10 seconds for an element matching ``.description`` to be
    present, then reads its ``textContent`` through JavaScript.

    Args:
        driver: WebDriver that has already navigated to the job page.

    Returns:
        The value returned by the JavaScript snippet, or a fixed fallback
        message when a ``TimeoutException`` is raised.
    """
    description_locator = (By.CSS_SELECTOR, '.description')
    read_text_script = "return document.querySelector('.description').textContent"
    fallback_text = "Job description not found or couldn't be loaded"

    try:
        # Block until the description container exists in the DOM.
        waiter = WebDriverWait(driver, 10)
        waiter.until(EC.presence_of_element_located(description_locator))
        page_text = driver.execute_script(read_text_script)
    except TimeoutException:
        page_text = fallback_text
    return page_text

# Function to expand job description by clicking "show more" if available
def expand_description(driver):
    """Click the "show more" button on a job page, if one is present.

    Waits up to 10 seconds for an element matching
    ``.show-more-less-html__button`` on the page loaded in ``driver``, clicks
    it, and sleeps 2 seconds to let the description expand.

    Args:
        driver: WebDriver that has already navigated to the job page.

    Returns:
        None. This is best-effort: any error (no button, click failure, ...)
        is swallowed so the caller can still read the collapsed description.
    """
    try:
        # Wait on the driver passed in. The previous code waited on an
        # undefined name (`card`), so the NameError was swallowed below and
        # the button was never clicked.
        show_more_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.show-more-less-html__button'))
        )

        show_more_button.click()
        time.sleep(2)  # Wait for the description to expand
    except Exception:
        pass  # No "show more" button found or error occurred


# Function to extract skills from a job title
def extract_skills(description):
    """Ask the chat model for the technical skills named in a job description.

    The description is lower-cased and sent to the module-level ``client``
    with a system prompt requesting a comma-separated list of skills.

    Args:
        description: Job description text.

    Returns:
        A list of skill strings with surrounding whitespace removed. Empty
        entries (from trailing or doubled commas) are dropped, and an empty
        list is returned when the model reply has no content.
    """
    description = description.lower()
    response = client.chat.completions.create(
        model="deepseek/deepseek-r1:free",
        messages=[
            {"role": "system", "content": "You are an AI assistant that extracts technical skills as comma separated list from job descriptions."},
            {"role": "user", "content": description}
            ]
        )
    # The reply content may be None; treat that as "no skills" instead of
    # failing on .split().
    content = response.choices[0].message.content or ""
    # Strip each token once and keep only the non-empty ones so blank
    # entries do not end up in the skill counts.
    stripped = (token.strip() for token in content.split(","))
    skills_list = [skill for skill in stripped if skill]
    print(skills_list)
    return skills_list

# Function to find the year of experience needed
def exp_extract(description):
    """Ask the chat model for the minimum experience a job description requires.

    The description is lower-cased and sent to the module-level ``client``
    with a system prompt asking for the minimum experience as an integer.

    Args:
        description: Job description text.

    Returns:
        The content of the first choice in the model reply, unmodified.
    """
    system_prompt = "You are an AI assistant that extracts minimum experience needed for this role from job descriptions and gives output as integer"
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": description.lower()},
    ]
    completion = client.chat.completions.create(
        model="deepseek/deepseek-r1:free",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Function to find whether company helps with visa support
def visa_support_check(description):
    """Ask the chat model whether a job description mentions visa assistance.

    The description is lower-cased and sent to the module-level ``client``
    with a system prompt requesting a 'Yes' or 'No' answer.

    Args:
        description: Job description text.

    Returns:
        The content of the first choice in the model reply, unmodified.
    """
    lowered = description.lower()
    instruction = {
        "role": "system",
        "content": "Analyze the given job description and determine whether the organization offers assistance with obtaining a visa. Respond with only 'Yes' or 'No'.",
    }
    posting = {"role": "user", "content": lowered}
    reply = client.chat.completions.create(
        model="deepseek/deepseek-r1:free",
        messages=[instruction, posting],
    )
    return reply.choices[0].message.content

# Main function
# Script entry point: scrape the jobs, count how often each extracted skill
# appears across postings, and write the 30 most common skills to Skill_list.csv.
if __name__ == "__main__":
    import csv  # local import so this cell does not depend on other cells

    keyword = "data%20engineer"  # inserted into the search URL as-is, hence pre-encoded
    num_pages = 15 # You can adjust the number of pages to scrape

    job_skills = scrape_linkedin_jobs(keyword, num_pages)

    print(f'Data engineer jobs: {len(job_skills)}')

    # Flatten the per-job skill lists and count occurrences of each skill.
    skill_counts = Counter(skill for sublist in job_skills for skill in sublist)
    top_skills = skill_counts.most_common(30)

    # csv.writer quotes skill names that contain commas, quotes or line breaks,
    # which plain string formatting would have written as broken rows.
    # UTF-8 keeps non-ASCII skill names portable across platforms, and
    # newline="" with lineterminator="\n" preserves the original "\n" row endings.
    with open("Skill_list.csv", mode="wt", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["Skill_name", "Number_of_openings"])
        writer.writerows(top_skills)


Collecting selenium
  Downloading selenium-4.30.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.30.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-n