# 1. Imports

In [139]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import requests

# 2. Functions

**Key Points of the Script:**
1. **WebDriver Setup**: It initializes a Chrome WebDriver, not headless by default but with an option commented to enable headless mode. The driver is set to open in fullscreen mode.
2. **Job Scraping**: The function `scrape_jobs` navigates to LinkedIn job search results, automatically scrolling to load more entries and clicking on job cards to expand details. It continues until it gathers the desired number of job postings or reaches the end of available listings.
3. **Data Extraction**: For each job listing, the script extracts job title, company name, location, job description, posted date, and listed date from the details pane. It handles cases where some details might be missing by filling in "Not listed" if necessary.
4. **Analysis**: Converts the scraped data into a Pandas DataFrame to facilitate data manipulation and analysis.
5. **Graceful Shutdown**: Ensures the WebDriver is properly closed after scraping to free resources and avoid leftover processes.

**Considerations**:
- **Dynamic Content Handling**: Implements Selenium waits to manage dynamic content and ensure elements are loaded before interaction. Uses JavaScript execution for actions like scrolling and clicking, which can be more reliable on dynamic pages.
- **Error Handling**: Robust error handling to manage exceptions during element interaction, including clicks that fail due to overlays or missing elements.
- **Full Screen Mode**: The script runs Chrome in fullscreen mode by default, which can be useful for visibility during manual troubleshooting but is not typically needed in automated environments.

**Note**:
- As a caution, LinkedIn’s terms of service prohibit scraping, which could lead to account restrictions or bans if detected. This script assumes a small-scale, non-intrusive use case which typically avoids detection. However, it is important to consider these limitations and potential legal implications when automating interactions with web services.

**Web Scraper Driver Setup - Helper Function**

In [234]:
def setup_driver():
    # Setup Chrome WebDriver
    chrome_options = webdriver.ChromeOptions()
    #options.add_argument('--headless')  # Runs Chrome in headless mode.
    #options.add_argument('--no-sandbox')
    #options.add_argument('--disable-dev-shm-usage')
    # Opens Chrome in full screen mode.
    chrome_options.add_argument("--start-fullscreen")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

**Scraping Function**

In [235]:
def scrape_jobs(driver, url, num_results=5):
    print("Navigating to the page...")
    driver.get(url)
    time.sleep(2)  # Allow some time for the page to load
    results = []
    count = 0
    last_height = driver.execute_script("return document.body.scrollHeight")

    print(f"Starting the scrape for {num_results} job postings...")
    while count < num_results:
        job_cards = driver.find_elements(By.CLASS_NAME, 'base-card')
        print(f"Found {len(job_cards)} job cards on the page:")
        for card in job_cards[count:]:
            # Click on the job card to open the job details
            try:
                driver.execute_script("arguments[0].click();", card)
                time.sleep(4)  # Wait for the details to load
            except Exception as e:
                print(f"Error clicking on job card: {e}")
                continue

            # Click the 'Show more' button to reveal the full description
            try:
                show_more_button = driver.find_element(
                    By.CLASS_NAME, 'show-more-less-html__button--more')
                show_more_button.click()
                time.sleep(4)  # Wait for the description to expand
            except NoSuchElementException:
                print("No 'Show more' button to click.")
            except ElementClickInterceptedException:
                print("The 'Show more' button was not clickable.")

            # Scrape the job details from the details pane
            try:
                job_title = driver.find_element(
                    By.CLASS_NAME, 'topcard__title').text
                company_name = driver.find_element(
                    By.CLASS_NAME, 'topcard__flavor').text
                location = driver.find_element(
                    By.CLASS_NAME, 'topcard__flavor--bullet').text
                description_container = driver.find_element(
                    By.CLASS_NAME, 'description__text--rich')
                description = description_container.get_attribute('innerText')
                post_date = driver.find_element(
                    By.CLASS_NAME, 'posted-time-ago__text').text
                list_date = card.find_element(By.CLASS_NAME, 'job-search-card__listdate').get_attribute(
                    'datetime')
            except NoSuchElementException as e:
                print(
                    f"Not all elements could be found for job description: {e}")
                # If some elements are missing, continue with what has been found
                job_title = job_title if 'job_title' in locals() else "Not listed"
                company_name = company_name if 'company_name' in locals() else "Not listed"
                location = location if 'location' in locals() else "Not listed"
                description = description if 'description' in locals() else "Not listed"
                post_date = post_date if 'post_date' in locals() else "Not listed"
                list_date = list_date if 'list_date' in locals() else "Not listed"

            results.append({
                'Job Title': job_title,
                'Company Name': company_name,
                'Location': location,
                'Posted Date': post_date,
                'Listed Date': list_date,
                'Job Description': description
            })
            print(
                f"Scraped {count + 1}/{num_results}: {job_title} at {company_name}")
            count += 1
            if count >= num_results:
                break

        # Scroll down to bottom
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("Reached the end of the page or no more job cards to load.")
            break
        last_height = new_height

    return results

**Performing the Scraping Operation Function**

In [236]:
def analyze_data(data):
    df = pd.DataFrame(data)
    return df
    

def main():
    # Initialize the Chrome WebDriver using helper function that sets up the driver with necessary options
    driver = setup_driver()

    # URL setup for the LinkedIn job search based on specific keywords and location
    url = 'https://www.linkedin.com/jobs/search/?keywords=sales%20development%20representative&location=United%20States'

    try:
        # Scrape job data from LinkedIn using the scrape_jobs function
        job_data = scrape_jobs(driver, url, 40)

        # Get the dataframe representation of the scraped data
        DF = analyze_data(job_data)
        print(DF)

    finally:
        # Ensure the driver is properly quit after scraping to free resources and avoid any leftover processes.
        driver.quit()
        return DF

# Running Scraper

**Web Scraping Process**

*Note:*
- If the Chrome tab closes unexpectedly before data collection is complete, this typically indicates an error has occurred.

*Data Throttling:*
- LinkedIn may throttle your requests if excessive activity is detected in a short period, resulting in an HTTP 429 error (Too Many Requests). This error will prevent further job posting information from loading when attempting to interact with the page.
- Refer to the provided screenshot "Error 429 Screenshot - too many requests in time window.png" for a visual reference of this error.
- When throttling occurs, you must pause the scraping process to allow request limits to reset. This prevents frequent updates (e.g., every minute) and large-scale scrapes (e.g., attempting to scrape 1 million job postings at once).
- Deliberate pauses have been incorporated into the script between scraping actions to slow the rate of throttling, though this does not completely eliminate the risk.
- It is crucial to save the collected data after each successful scrape to ensure no data is lost between pauses or in case of errors.

In [None]:
DF = main()

**Show dataframe of data scrapped**

In [178]:
DF

Unnamed: 0,Job Title,Company Name,Location,Posted Date,Job Description
0,Sales Development Representative,Runwise,"New York, NY",1 month ago,Runwise is looking for a growth-minded Sales D...
1,Sales Development Representative,Assembled,San Francisco Bay Area,2 days ago,Assembled is building software to transform an...
2,Inside Sales Representative,Boston Celtics,"Boston, MA",3 weeks ago,Summary:\n\n\n\n\nThroughout the Boston Celtic...
3,Sales Development Representative,Flowhub,United States,2 months ago,"At Flowhub, we're about more than technology —..."
4,Sales Development Representative,Klook,"Los Angeles, CA",1 month ago,What you'll do:\n\nAcquisition and Account Man...
5,Sales Development Representative,CivicPlus,United States,3 weeks ago,If you are looking to break into tech sales or...
6,Sales Development Representative,WorkWave,United States,1 month ago,The Business Development Representative positi...
7,Sales Development Representative (SDR),Swell,"Salt Lake City, UT",6 days ago,Swell is looking for energetic and motivated p...
8,Inbound Sales Development Representative - AMER,Notion,"San Francisco, CA",1 month ago,About Us\n\nWe're on a mission to make it poss...
9,Sales Development Representative,Slang.ai,"New York, NY",11 months ago,We're hiring Sales Development Representative ...


Number of jobs collected

In [179]:
len(DF)

40

**Saving scrapped data into .csv file with current datetime string as file name**

In [183]:
import datetime

# Get the current date and time
now = datetime.datetime.now()

# Format the date and time as a string in the desired format
date_time = now.strftime("%Y-%m-%d_%H-%M-%S")

# Save the DataFrame to a CSV file with the formatted date and time as the file name
file_name = f"{date_time}.csv"
DF.to_csv(file_name, index=False)

# 3. Parsing Job Descriptions to Find Salary Info

**Description of this section**:

1. Ask *roberta-base-squad2* Q&A LM model to use **job description** to find **salary information** for each job.
2. Clean LM's response to get the salary number
3. Put salary number info into dataframe

LinkedIn doesn't have a clear "salary" tag, so you have to read through the job description to get the salary information. (Hence why you need LM models to process them)

Lots of jobs also don't post salary information in the job posting.

Notes:
- Roberta is pretty good, but still has small issues after cleaning up its response
- I tried using Gemma and Phi-3 models with the API, but wasn't able to get it working because of lack of documentation (the models were released less than a week ago at the time of writing)
- I could also do a local LM model, but that requires more setup and overhead work as well as computation power (using my laptop to do this)


HuggingFace Documentation Links:
- Severless Inference API: https://huggingface.co/docs/api-inference/index
- API Overview: https://huggingface.co/docs/api-inference/quicktour
- API Parameters: https://huggingface.co/docs/api-inference/detailed_parameters
- Roberta Doc: https://huggingface.co/deepset/roberta-base-squad2
----
- Online Chatbot: https://huggingface.co/chat/conversation/
----
- Gemma: https://huggingface.co/google/gemma-1.1-7b-it
- Phi-3 mini: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct

### Setup for Parsing

**Question Prompt (to prompt roberta-base-squad2)**

In [201]:
question_prompt = '''
How much is this job paying? What is the salary? (ONLY print that number; do not print anything else!!!)
'''

**API Accessing Info**

In [None]:
# API Info for Hugging Face Inference API
API_TOKEN = "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"
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"

# Define the query function to do API request
def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

**String-to-Number Parser (extracting salary information from LM response)**

In [219]:
import re


def find_first_number(text):
    # Use regular expression to find numbers followed optionally by 'k' or 'K'
    matches = re.findall(r'\d[\d,]*\.?\d*\s*[kK]?', text)

    if matches:
        first_match = matches[0]
        # Check if the number ends with 'k' or 'K'
        if 'k' in first_match.lower():
            # Remove commas, 'k' or 'K', and convert to float, then multiply by 1000
            number = float(first_match.lower().replace(
                'k', '').replace(',', '')) * 1000
        else:
            # Remove commas and convert to appropriate number type
            number_str = first_match.replace(',', '')
            if '.' in number_str:
                # If there's a decimal point, convert to float
                number = float(number_str)
            else:
                # If no decimal, convert to int
                number = int(number_str)
        return number
    else:
        # Return "N/A" if no number is found
        return "N/A"

# Example usage
result = find_first_number("$56k to $100,000")
print(result)  # Output: 56000

56000.0


**Extracting Salary Info from Job Posting Description with LM**

Note: may need to run a second time because HuggingFace mdoels sometimes need to "warm up" (load) and will return an error, so wait a bit and then try running again.

In [211]:
for index, row in DF.iterrows():
    # Access the values of each column in the row
    job_description = row['Job Description']
    
    response = query(
        {
            "inputs": {
                "question": question_prompt,
                "context": job_description,
            }
        }
    )

    # Print the response to the question
    print(f"For job posting {index}:")
    
    # FOR DEBUGGING
    # print(f"DESCRIPTION: {job_description}")

    print(f"Response: {response['answer']}")
    
    salary = find_first_number(response['answer'])
    print(f"Salary Extracted: {salary}")
    # Add to the DataFrame
    DF.at[index, 'Salary'] = salary
    #print(f"Job Description: {job_description}")
    print()

For job posting 0:
Response: $85,000- $100,000
Salary Extracted: 85000

For job posting 1:
Response: Sales Development Representative
Salary Extracted: N/A

For job posting 2:
Response: Boston Celtics Inside Sales
Salary Extracted: N/A

For job posting 3:
Response: $55,000
Salary Extracted: 55000

For job posting 4:
Response: A bachelor degree in any discipline
Salary Extracted: N/A

For job posting 5:
Response: $45,000
Salary Extracted: 45000

For job posting 6:
Response: $45,000
Salary Extracted: 45000

For job posting 7:
Response: Competitive compensation plus stock options

Salary Extracted: N/A

For job posting 8:
Response: Sales Development Representative
Salary Extracted: N/A

For job posting 9:
Response: One or more years of experience in SaaS sales
Salary Extracted: N/A

For job posting 10:
Response: $60k-$75k
Salary Extracted: 60000.0

For job posting 11:
Response: $50,000 + commission
Salary Extracted: 50000

For job posting 12:
Response: $50k-$80k+
Salary Extracted: 50000.0

**Displaying the DF**

In [212]:
DF

Unnamed: 0,Job Title,Company Name,Location,Posted Date,Job Description,Salary
0,Sales Development Representative,Runwise,"New York, NY",1 month ago,Runwise is looking for a growth-minded Sales D...,85000.0
1,Sales Development Representative,Assembled,San Francisco Bay Area,2 days ago,Assembled is building software to transform an...,
2,Inside Sales Representative,Boston Celtics,"Boston, MA",3 weeks ago,Summary:\n\n\n\n\nThroughout the Boston Celtic...,
3,Sales Development Representative,Flowhub,United States,2 months ago,"At Flowhub, we're about more than technology —...",55000.0
4,Sales Development Representative,Klook,"Los Angeles, CA",1 month ago,What you'll do:\n\nAcquisition and Account Man...,
5,Sales Development Representative,CivicPlus,United States,3 weeks ago,If you are looking to break into tech sales or...,45000.0
6,Sales Development Representative,WorkWave,United States,1 month ago,The Business Development Representative positi...,45000.0
7,Sales Development Representative (SDR),Swell,"Salt Lake City, UT",6 days ago,Swell is looking for energetic and motivated p...,
8,Inbound Sales Development Representative - AMER,Notion,"San Francisco, CA",1 month ago,About Us\n\nWe're on a mission to make it poss...,
9,Sales Development Representative,Slang.ai,"New York, NY",11 months ago,We're hiring Sales Development Representative ...,


**Data frame describe() output**

In [213]:
DF.describe()

Unnamed: 0,Job Title,Company Name,Location,Posted Date,Job Description,Salary
count,40,40,40,40,40,40.0
unique,4,38,16,11,39,11.0
top,Sales Development Representative,Roboflow,United States,1 month ago,The Business Development Representative positi...,
freq,36,2,10,13,2,22.0


**Saving Dataframe to File After Salary Extraction**

In [224]:
import datetime

# Get the current date and time
now = datetime.datetime.now()

# Format the date and time as a string in the desired format
date_time = now.strftime("%Y-%m-%d_%H-%M-%S")

# Save the DataFrame to a CSV file with the formatted date and time as the file name
file_name = f"{date_time}-with-salary.csv"
DF.to_csv(file_name, index=False)

# 4. Data Analytics (NOT FINISHED)

**Read in the dataframe from saved csv file**

In [226]:
df = pd.read_csv('2024-04-29_00-20-01-with-salary.csv')

In [227]:
df

Unnamed: 0,Job Title,Company Name,Location,Posted Date,Job Description,Salary
0,Sales Development Representative,Runwise,"New York, NY",1 month ago,Runwise is looking for a growth-minded Sales D...,85000.0
1,Sales Development Representative,Assembled,San Francisco Bay Area,2 days ago,Assembled is building software to transform an...,
2,Inside Sales Representative,Boston Celtics,"Boston, MA",3 weeks ago,Summary:\n\n\n\n\nThroughout the Boston Celtic...,
3,Sales Development Representative,Flowhub,United States,2 months ago,"At Flowhub, we're about more than technology —...",55000.0
4,Sales Development Representative,Klook,"Los Angeles, CA",1 month ago,What you'll do:\n\nAcquisition and Account Man...,
5,Sales Development Representative,CivicPlus,United States,3 weeks ago,If you are looking to break into tech sales or...,45000.0
6,Sales Development Representative,WorkWave,United States,1 month ago,The Business Development Representative positi...,45000.0
7,Sales Development Representative (SDR),Swell,"Salt Lake City, UT",6 days ago,Swell is looking for energetic and motivated p...,
8,Inbound Sales Development Representative - AMER,Notion,"San Francisco, CA",1 month ago,About Us\n\nWe're on a mission to make it poss...,
9,Sales Development Representative,Slang.ai,"New York, NY",11 months ago,We're hiring Sales Development Representative ...,


**Data Analytics**

In [232]:
# Helper function to convert salary to a float, and handle non-numeric data
def parse_salary(salary):
    try:
        # Remove dollar sign and commas
        return float(re.sub(r'[^\d.]', '', salary))
    except:
        return np.nan


# Parse salary data
df['Parsed Salary'] = df['Salary'].apply(parse_salary)

# Drop rows without a valid salary
df.dropna(subset=['Parsed Salary'], inplace=True)



# Group by 'Company Name' 
company_info = df.groupby('Company Name').agg({
    # Assuming the first job description is the company description
    'Job Description': lambda x: x.iloc[0],
    'Parsed Salary': ['mean', 'median', 'std', 'max'],
    'Location': lambda x: x.value_counts().to_dict(),  # Location distribution
    # Most used word
    'Job Description': lambda x: Counter(" ".join(x).split()).most_common(1)[0][0]
}).reset_index()


In [233]:
company_info.describe()

Unnamed: 0_level_0,Parsed Salary,Parsed Salary,Parsed Salary,Parsed Salary
Unnamed: 0_level_1,mean,median,std,max
count,0.0,0.0,0.0,0.0
mean,,,,
std,,,,
min,,,,
25%,,,,
50%,,,,
75%,,,,
max,,,,


# EXTRA STUFF (Junk)

In [121]:
with open('sample_job_description.txt', 'r') as file:
    job_description = file.read()

with open('sample_job_description_2.txt', 'r') as file:
    job_description_2 = file.read()

with open('sample_job_description_3.txt', 'r') as file:
    job_description_3 = file.read()

with open('sample_job_description_4.txt', 'r') as file:
    job_description_4 = file.read()

In [128]:
prompt = '''
How much is this job paying? What is the salary? (ONLY print that number; do not print anything else!!!)
'''

In [131]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {
        "inputs": {
            "question": prompt,
            "context": job_description_2,
        }
    }
)

In [132]:
data

{'score': 0.08668936043977737, 'start': 3333, 'end': 3340, 'answer': '$64,500'}

## Testing Hugging Face API

In [66]:
import requests

In [67]:
API_TOKEN = "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"

In [68]:

API_URL = "https://api-inference.huggingface.co/models/gpt2"
headers = {"Authorization": f"Bearer {API_TOKEN}"}


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query("Can you please let us know more details about your ")

In [69]:
data

[{'generated_text': 'Can you please let us know more details about your iphone, so it can get through to you?\n\nThank you for your time on our podcast.'}]

In [97]:
prompt = '''
How much is this job paying? What is the salary? (ONLY print that number; do not print anything else!!!)
'''

**Now Testing with Google Gemini 7b**

In [115]:
with open('sample_job_description.txt', 'r') as file:
    job_description = file.read()

with open('sample_job_description_2.txt', 'r') as file:
    job_description_2 = file.read()

with open('sample_job_description_3.txt', 'r') as file:
    job_description_3 = file.read()

with open('sample_job_description_4.txt', 'r') as file:
    job_description_4 = file.read()

In [111]:
API_URL = "https://api-inference.huggingface.co/models/gpt2"
headers = {"Authorization": f"Bearer {API_TOKEN}"}


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query("Can you please let us know more details about your ")

In [84]:
data

[{'generated_text': 'Can you please let us know more details about your iphone, so it can get through to you?\n\nThank you for your time on our podcast.'}]

In [85]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {
        "inputs": {
            "question": "What's my name?",
            "context": "My name is Clara and I live in Berkeley.",
        }
    }
)

In [117]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {
        "inputs": {
            "question": prompt,
            "context": job_description_4,
        }
    }
)

In [119]:
data

{'score': 0.35506680607795715,
 'start': 1033,
 'end': 1065,
 'answer': 'Sales Development Representative'}

In [None]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {
        "inputs": {
            "question": "What's my name?",
            "context": "My name is Clara and I live in Berkeley.",
        }
    }
)

In [91]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/google/gemma-1.1-7b-it"


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


data = query(
    {

        "chat" : {"role": "user", 
                 "content" : "<bos><start_of_turn>user Write a hello world program<end_of_turn> <start_of_turn>model" }
    }
)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [95]:
import requests

headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://api-inference.huggingface.co/models/google/gemma-1.1-7b-it"


def query(payload):
    try:
        response = requests.post(API_URL, headers=headers, json=payload)
        if response.status_code != 200:
            print(f"Error: API returned status code {response.status_code}")
            return None  # Or handle the error differently

        return response.json()
    except JSONDecodeError as e:
        print("Error decoding JSON response:", e)
        return None  # Or handle the error differently


data = query(
    {
        "chat": {
            "question": "What's my name?",
            "context": "My name is Clara and I live in Berkeley.",
        }
    }
)

print(data)

Error: API returned status code 422
None
