# 1. Imports

In [139]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import requests

# 2. Functions


**Key Points of the Script:**
1. **WebDriver Setup**: It initializes a headless Chrome driver.
2. **Job Scraping**: The function `scrape_jobs` navigates the LinkedIn job search results, automatically scrolling to load more entries until it gathers the desired number of job postings or reaches the end.
3. **Data Extraction**: For each job listing, the script extracts the job title, company name, location, and salary (if available). If salary isn't listed, it notes "Not listed".
4. **Analysis**: Converts the scraped data into a Pandas DataFrame to facilitate straightforward data manipulation and analysis.
5. **Graceful Shutdown**: Ensures the WebDriver is properly closed after the scraping is complete.

**Considerations**:
- **Dynamic Content**: Selenium waits are used to ensure that the page loads completely before attempting to find elements.
- **Error Handling**: Basic error handling includes skipping over job listings that don't have all the expected fields.
- **Headless Operation**: The script runs Chrome in headless mode for better performance, especially useful when running on servers or automated environments.

This script is configured for educational purposes and assumes ethical use, with proper respect for LinkedIn's terms of service. For practical use, you'd need to ensure compliance with LinkedIn's data use policies.


In [168]:
def setup_driver():
    # Setup Chrome WebDriver
    chrome_options = webdriver.ChromeOptions()
    #options.add_argument('--headless')  # Runs Chrome in headless mode.
    #options.add_argument('--no-sandbox')
    #options.add_argument('--disable-dev-shm-usage')
    # Opens Chrome in full screen mode.
    #chrome_options.add_argument("--start-fullscreen")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

In [169]:
def scrape_jobs(driver, url, num_results=5):
    print("Navigating to the page...")
    driver.get(url)
    time.sleep(2)  # Allow some time for the page to load
    results = []
    count = 0
    last_height = driver.execute_script("return document.body.scrollHeight")

    print(f"Starting the scrape for {num_results} job postings...")
    while count < num_results:
        job_cards = driver.find_elements(
            By.CLASS_NAME, 'base-search-card__info')
        print(f"Found {len(job_cards)} job cards on the page:")
        for card in job_cards[count:]:
            job_title = card.find_element(By.CLASS_NAME, 'base-search-card__title').text if card.find_elements(
                By.CLASS_NAME, 'base-search-card__title') else "Not listed"
            company_name = card.find_element(By.CLASS_NAME, 'base-search-card__subtitle').text if card.find_elements(
                By.CLASS_NAME, 'base-search-card__subtitle') else "Not listed"
            location = card.find_element(By.CLASS_NAME, 'job-search-card__location').text if card.find_elements(
                By.CLASS_NAME, 'job-search-card__location') else "Not listed"
            # Extract the datetime attribute from the <time> element
            post_date = card.find_element(By.CLASS_NAME, 'job-search-card__listdate').get_attribute(
                'datetime') if card.find_elements(By.CLASS_NAME, 'job-search-card__listdate') else "Not listed"

            results.append({
                'Job Title': job_title,
                'Company Name': company_name,
                'Location': location,
                # 'Salary': salary,
                'Posted Date': post_date  # Add the extracted date to the results dictionary
            })
            print(
                f"Scraped {count + 1}/{num_results}: {job_title} at {company_name}")
            count += 1

        # Scroll down to bottom
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("Reached the end of the page or no more job cards to load.")
            break
        last_height = new_height

    return results

In [173]:
def scrape_jobs(driver, url, num_results=5):
    print("Navigating to the page...")
    driver.get(url)
    time.sleep(2)  # Allow some time for the page to load
    results = []
    count = 0
    last_height = driver.execute_script("return document.body.scrollHeight")

    print(f"Starting the scrape for {num_results} job postings...")
    while count < num_results:
        job_cards = driver.find_elements(By.CLASS_NAME, 'base-card')
        print(f"Found {len(job_cards)} job cards on the page:")
        for card in job_cards[count:]:
            # Click on the job card to open the job details
            try:
                driver.execute_script("arguments[0].click();", card)
                time.sleep(4)  # Wait for the details to load
            except Exception as e:
                print(f"Error clicking on job card: {e}")
                continue

            # Click the 'Show more' button to reveal the full description
            try:
                show_more_button = driver.find_element(
                    By.CLASS_NAME, 'show-more-less-html__button--more')
                show_more_button.click()
                time.sleep(4)  # Wait for the description to expand
            except NoSuchElementException:
                print("No 'Show more' button to click.")
            except ElementClickInterceptedException:
                print("The 'Show more' button was not clickable.")

            # Scrape the job details from the details pane
            try:
                job_title = driver.find_element(
                    By.CLASS_NAME, 'topcard__title').text
                company_name = driver.find_element(
                    By.CLASS_NAME, 'topcard__flavor').text
                location = driver.find_element(
                    By.CLASS_NAME, 'topcard__flavor--bullet').text
                description_container = driver.find_element(
                    By.CLASS_NAME, 'description__text--rich')
                description = description_container.get_attribute('innerText')
                post_date = driver.find_element(
                    By.CLASS_NAME, 'posted-time-ago__text').text
                list_date = card.find_element(By.CLASS_NAME, 'job-search-card__listdate').get_attribute(
                    'datetime')
            except NoSuchElementException as e:
                print(
                    f"Not all elements could be found for job description: {e}")
                # If some elements are missing, continue with what has been found
                job_title = job_title if 'job_title' in locals() else "Not listed"
                company_name = company_name if 'company_name' in locals() else "Not listed"
                location = location if 'location' in locals() else "Not listed"
                description = description if 'description' in locals() else "Not listed"
                post_date = post_date if 'post_date' in locals() else "Not listed"
                list_date = list_date if 'list_date' in locals() else "Not listed"

            results.append({
                'Job Title': job_title,
                'Company Name': company_name,
                'Location': location,
                'Posted Date': post_date,
                'Listed Date': list_date,
                'Job Description': description
            })
            print(
                f"Scraped {count + 1}/{num_results}: {job_title} at {company_name}")
            count += 1
            if count >= num_results:
                break

        # Scroll down to bottom
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("Reached the end of the page or no more job cards to load.")
            break
        last_height = new_height

    return results

In [174]:
def analyze_data(data):
    # This function converts the list of dictionaries (data) into a pandas DataFrame
    # for easier data manipulation and subsequent analysis.
    df = pd.DataFrame(data)

    # The `describe()` method provides a summary of statistics pertaining to the DataFrame columns,
    # including mean, standard deviation, min, and max values, which can be useful for understanding
    # the distribution of numerical data in the dataset.
    df.describe()

    return df
    

def main():
    # Initialize the Chrome WebDriver using a helper function that sets up the driver with necessary options.
    driver = setup_driver()

    # URL setup for the LinkedIn job search based on specific keywords and location.
    url = 'https://www.linkedin.com/jobs/search/?keywords=sales%20development%20representative&location=United%20States'

    try:
        # Scrape job data from LinkedIn using the scrape_jobs function, aiming to collect data on 1000 job postings.
        job_data = scrape_jobs(driver, url, 40)

        # Analyze the scraped job data using the analyze_data function and print the statistical summary.
        DF = analyze_data(job_data)
        print(DF)

    finally:
        # Ensure the driver is properly quit after scraping to free resources and avoid any leftover processes.
        driver.quit()
        return DF

# Running Scraper

In [175]:
DF = main()

Navigating to the page...
Starting the scrape for 40 job postings...
Found 25 job cards on the page:
Scraped 1/40: Sales Development Representative at Runwise


UnboundLocalError: cannot access local variable 'DF' where it is not associated with a value

In [176]:
# Not using the Salary column, so dropping it
DF.drop('Salary', axis=1, inplace=True)

In [177]:
DF

Unnamed: 0,Job Title,Company Name,Location,Posted Date,Job Description
0,Sales Development Representative,Runwise,"New York, NY",1 month ago,Runwise is looking for a growth-minded Sales D...
1,Sales Development Representative,Assembled,San Francisco Bay Area,2 days ago,Assembled is building software to transform an...
2,Inside Sales Representative,Boston Celtics,"Boston, MA",3 weeks ago,Summary:\n\n\n\n\nThroughout the Boston Celtic...
3,Sales Development Representative,Flowhub,United States,2 months ago,"At Flowhub, we're about more than technology —..."
4,Sales Development Representative,Klook,"Los Angeles, CA",1 month ago,What you'll do:\n\nAcquisition and Account Man...
5,Sales Development Representative,CivicPlus,United States,3 weeks ago,If you are looking to break into tech sales or...
6,Sales Development Representative,WorkWave,United States,1 month ago,The Business Development Representative positi...
7,Sales Development Representative (SDR),Swell,"Salt Lake City, UT",6 days ago,Swell is looking for energetic and motivated p...
8,Inbound Sales Development Representative - AMER,Notion,"San Francisco, CA",1 month ago,About Us\n\nWe're on a mission to make it poss...
9,Sales Development Representative,Slang.ai,"New York, NY",11 months ago,We're hiring Sales Development Representative ...


In [156]:
len(DF)

40

In [157]:
import datetime
DF
# Get the current date and time
now = datetime.datetime.now()

# Format the date and time as a string in the desired format
date_time = now.strftime("%Y-%m-%d_%H-%M-%S")

# Save the DataFrame to a CSV file with the formatted date and time as the file name
file_name = f"{date_time}.csv"
DF.to_csv(file_name, index=False)

In [59]:
# importing the modules
from IPython.display import display

In [60]:
# displaying the DataFrame
display(DF)

Unnamed: 0,Job Title,Company Name,Location,Salary
0,Sales Development Representative,Runwise,"New York, NY",Not listed
1,Sales Development Representative,Assembled,San Francisco Bay Area,Not listed
2,Inside Sales Representative,Boston Celtics,"Boston, MA",Not listed
3,Sales Development Representative,Flowhub,United States,Not listed
4,Sales Development Representative,Klook,"Los Angeles, CA",Not listed
5,Sales Development Representative,CivicPlus,United States,Not listed
6,Sales Development Representative,WorkWave,United States,Not listed
7,Sales Development Representative (SDR),Swell,"Salt Lake City, UT",Not listed
8,Inbound Sales Development Representative - AMER,Notion,"San Francisco, CA",Not listed
9,Sales Development Representative,Slang.ai,"New York, NY",Not listed


In [120]:
API_TOKEN = "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"

In [121]:
with open('sample_job_description.txt', 'r') as file:
    job_description = file.read()

with open('sample_job_description_2.txt', 'r') as file:
    job_description_2 = file.read()

with open('sample_job_description_3.txt', 'r') as file:
    job_description_3 = file.read()

with open('sample_job_description_4.txt', 'r') as file:
    job_description_4 = file.read()

In [128]:
prompt = '''
How much is this job paying? What is the salary? (ONLY print that number; do not print anything else!!!)
'''

In [131]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {
        "inputs": {
            "question": prompt,
            "context": job_description_2,
        }
    }
)

In [132]:
data

{'score': 0.08668936043977737, 'start': 3333, 'end': 3340, 'answer': '$64,500'}

## Testing Hugging Face API

In [66]:
import requests

In [67]:
API_TOKEN = "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"

In [68]:

API_URL = "https://api-inference.huggingface.co/models/gpt2"
headers = {"Authorization": f"Bearer {API_TOKEN}"}


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query("Can you please let us know more details about your ")

In [69]:
data

[{'generated_text': 'Can you please let us know more details about your iphone, so it can get through to you?\n\nThank you for your time on our podcast.'}]

In [97]:
prompt = '''
How much is this job paying? What is the salary? (ONLY print that number; do not print anything else!!!)
'''

**Now Testing with Google Gemini 7b**

In [115]:
with open('sample_job_description.txt', 'r') as file:
    job_description = file.read()

with open('sample_job_description_2.txt', 'r') as file:
    job_description_2 = file.read()

with open('sample_job_description_3.txt', 'r') as file:
    job_description_3 = file.read()

with open('sample_job_description_4.txt', 'r') as file:
    job_description_4 = file.read()

In [111]:
API_URL = "https://api-inference.huggingface.co/models/gpt2"
headers = {"Authorization": f"Bearer {API_TOKEN}"}


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query("Can you please let us know more details about your ")

In [84]:
data

[{'generated_text': 'Can you please let us know more details about your iphone, so it can get through to you?\n\nThank you for your time on our podcast.'}]

In [85]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {
        "inputs": {
            "question": "What's my name?",
            "context": "My name is Clara and I live in Berkeley.",
        }
    }
)

In [117]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {
        "inputs": {
            "question": prompt,
            "context": job_description_4,
        }
    }
)

In [119]:
data

{'score': 0.35506680607795715,
 'start': 1033,
 'end': 1065,
 'answer': 'Sales Development Representative'}

In [None]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {
        "inputs": {
            "question": "What's my name?",
            "context": "My name is Clara and I live in Berkeley.",
        }
    }
)

In [91]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/google/gemma-1.1-7b-it"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {

        "chat" : {"role": "user", 
                 "content" : "<bos><start_of_turn>user Write a hello world program<end_of_turn> <start_of_turn>model" }
    }
)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [95]:
import requests

headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/google/gemma-1.1-7b-it"


def query(payload):
    try:
        response = requests.post(API_URL, headers=headers, json=payload)
        if response.status_code != 200:
            print(f"Error: API returned status code {response.status_code}")
            return None  # Or handle the error differently

        return response.json()
    except JSONDecodeError as e:
        print("Error decoding JSON response:", e)
        return None  # Or handle the error differently


data = query(
    {
        "chat": {
            "question": "What's my name?",
            "context": "My name is Clara and I live in Berkeley.",
        }
    }
)

print(data)

Error: API returned status code 422
None
