# Libraries

In [1]:
import json

import requests
from bs4 import BeautifulSoup

In [2]:
# Executes the shared OpenAI_API notebook inside this kernel's namespace.
# NOTE(review): `call_openai_api`, used in the main loop further down, is not
# defined anywhere in this notebook, so it presumably comes from this %run — confirm.
%run ../../../OpenAI_API.ipynb


The openai.ChatCompletion.create() function is used to generate a response to a sequence of messages in the context of a conversation. Here are the parameters of the function:

* model: The ID of the GPT model to use for generating the response. This can be a string representing the name of the model, or an instance of the openai.Model class.
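* messages: A list of message dictionaries (each with a `role` and a `content`) that make up the conversation so far; a `system` message can be used to set the context for the conversation. Note that `ChatCompletion.create()` takes `messages` rather than a `prompt` string (`prompt` belongs to the older `openai.Completion.create()` function).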
* temperature: A float specifying the "creativity" of the generated responses. Higher values result in more diverse and unexpected responses.
* max_tokens: An integer specifying the maximum number of tokens (words and punctuation) that the generated response should contain.
* n: An integer specifying the number of responses to generate. The API will return the top n results.
* stop: An optional string or list of strings specifying the stopping criteria for the generated response. When the generated text contains one of these sequences, generation stops at that point.

[Check for OpenAI credit usage here ($)](https://platform.openai.com/account/usage)

# Static parameters
These parameters define the available filtering criteria for the [ClimateBase.org Jobs](https://climatebase.org/jobs?l=&q=&p=0&remote=false) website.

In [3]:
# Base listing URL: https://climatebase.org/jobs?l=&q=&p=0&remote=false
domain_name = "https://climatebase.org"
url_path = "/jobs?l=&q=&p=0&remote=false"

# Each lookup table below maps a small integer choice to the URL-encoded value
# of one query parameter; choice 0 is always the empty string.

# Values for the `job_types` parameter, e.g.
# https://climatebase.org/jobs?l=&q=&job_types=Full+time+role&p=0&remote=false
d_job_types = dict(enumerate(["", "Full+time+role", "Internship"]))

# Values for the `categories` parameter (role type), e.g.
# https://climatebase.org/jobs?l=&q=&categories=Data+Analyst&p=0&remote=false
d_categories = dict(enumerate(["", "Data+Analyst", "Data+Scientist", "Research"]))

# Values for the `remote` parameter, e.g.
# https://climatebase.org/jobs?l=Remote&q=&p=0&remote=true
d_remote = dict(enumerate(["", "true", "false"]))

# CSS class searched for on the listing page.
css_object_class = "list_card"

# User-defined parameters

In [4]:
# These parameters are the filtering criteria for the website.
# Each value is picked by key from the lookup tables of the static-parameters cell.
# The stored values are URL-encoded ("+" stands for a space), so "+" is turned back
# into a space for display only; the variables keep the encoded form the URL needs.

job_types = d_job_types[1]
print("Job type: " + job_types.replace("+", " "))

categories = d_categories[2]
print("Category: " + categories.replace("+", " "))

remote = d_remote[1]
print("Remote: " + remote)


Job type: Fulltimerole
Category: DataScientist
Remote: true


# User-Defined Functions

In [5]:
def insert_filter(input_text, to_insert):
    """
    This function formats the url structure to make a filtered query.

    The filter is placed immediately before the first "&p=" parameter of the
    URL path.

    Parameters
    ----------
    input_text : str
        URL path including its query string, e.g. "/jobs?l=&q=&p=0&remote=false".
    to_insert : str
        Query fragment to add, e.g. "&categories=Data+Scientist".

    Returns
    -------
    str
        The path with `to_insert` placed before "&p=", or appended at the end
        when the path contains no "&p=" parameter.
    """
    # partition() splits around the first "&p=" and yields an empty separator
    # when the marker is missing.
    head, marker, tail = input_text.partition("&p=")

    if not marker:
        # str.find() would return -1 here, and slicing with -1 would wedge the
        # filter in front of the last character of the path; append instead.
        return input_text + to_insert

    return head + to_insert + marker + tail

In [6]:
def define_remote(input_text):
    """
    This function is similar to insert_filter(), but is specific for the "remote" filtering.

    Two substitutions are applied to the URL path: the location parameter
    "?l=" becomes "?l=Remote", and "&remote=false" becomes "&remote=true".

    Parameters
    ----------
    input_text : str
        URL path including its query string, e.g. "/jobs?l=&q=&p=0&remote=false".

    Returns
    -------
    str
        The path with both remote settings applied,
        e.g. "/jobs?l=Remote&q=&p=0&remote=true".
    """
    new_string = input_text

    # Skip the location rewrite when it is already in place, so calling the
    # function twice does not produce "?l=RemoteRemote".
    if "?l=Remote" not in new_string:
        new_string = new_string.replace("?l=", "?l=Remote")

    # Continue from new_string (not input_text); otherwise the location
    # substitution above would be thrown away.
    new_string = new_string.replace("&remote=false", "&remote=true")

    return new_string

In [7]:
def scraping_css_object(url_path, css_object_class, timeout=30):
    """
    Given a CSS object class, this scraper will obtain the relevant information from the website.

    Parameters
    ----------
    url_path : str
        Path (with query string) that is appended to the notebook-level
        `domain_name` to build the full URL.
    css_object_class : str
        CSS class name to search for in the downloaded page.
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error. Without a timeout the request can block indefinitely.

    Returns
    -------
    The result of BeautifulSoup's `find_all` for the given class, i.e. the
    matching elements of the page (empty when nothing matches).

    Notes
    -----
    The HTTP status code is not checked; whatever body the server returns
    is parsed.
    """
    # Example of a full URL:
    # "https://climatebase.org/jobs?l=&q=&categories=Data+Scientist&p=0&remote=true"
    url = domain_name + url_path

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all elements carrying the requested class (e.g. class="list_card")
    found_objects = soup.find_all(class_=css_object_class)

    return found_objects

In [8]:
def scrape_title(current_path):
    """
    This function will obtain the job title from a predefined CSS object specific to 
    the ClimateBase.org website.

    Parameters
    ----------
    current_path : str
        Path of the job page, relative to the site's domain.

    Returns
    -------
    str
        Text of the <h1> element whose class attribute is
        'PageLayout__Title-sc-1ri9r3s-4 fcPVcr'.

    Raises
    ------
    ValueError
        If the page contains no such <h1> element (previously this surfaced as
        an AttributeError on None).
    """
    html_title = scraping_css_object(current_path, "fcPVcr")

    # The list of matches is rendered back to text and re-parsed so that the
    # heading can be searched for by its full class attribute.
    soup = BeautifulSoup(str(html_title), 'html.parser')
    heading = soup.find('h1', {'class': 'PageLayout__Title-sc-1ri9r3s-4 fcPVcr'})

    if heading is None:
        raise ValueError("No job title found at {}".format(current_path))

    return heading.text

In [9]:
def scrape_job_description(current_path):
    """
    This function will obtain the job description from a predefined CSS object specific to
    the ClimateBase.org website.

    Parameters
    ----------
    current_path : str
        Path of the job page, relative to the site's domain.

    Returns
    -------
    str
        Stripped text of the first <div> found among the elements carrying
        the "EPUZp" class.

    Raises
    ------
    ValueError
        If no <div> is found among those elements (previously this surfaced
        as an AttributeError on None).
    """
    # Mining job description
    html_bodytext = scraping_css_object(current_path, "EPUZp")

    # The list of matches is rendered back to text and re-parsed; `soup.div`
    # is then the first <div> of that markup.
    soup = BeautifulSoup(str(html_bodytext), 'html.parser')
    first_div = soup.div

    if first_div is None:
        raise ValueError("No job description found at {}".format(current_path))

    return first_div.text.strip()

In [10]:
def chatgpt_prompt(title, bodytext, current_path=None):
    """
    This function contains the prompt with the set of rules that are to be sent to ChatGPT to process a text.

    Parameters
    ----------
    title : str
        Job title, inserted after "Job title: ".
    bodytext : str
        Job description text, inserted after the title.
    current_path : str, optional
        Path of the job page; it is appended to the notebook-level
        `domain_name` to form the URL at the end of the prompt. When omitted,
        the notebook-level variable `current_path` is used, which is what
        this function always read before the parameter existed.

    Returns
    -------
    str
        The rules followed by the job title, the description and the URL.

    Raises
    ------
    NameError
        If `current_path` is neither passed nor defined at notebook level.
    """
    if current_path is None:
        # Backward compatibility: existing calls pass only title and bodytext
        # and rely on a notebook-level `current_path` variable being set.
        try:
            current_path = globals()["current_path"]
        except KeyError:
            raise NameError("name 'current_path' is not defined") from None

    text_prompt = """I will prompt you with a job description and I want your help to categorize it, but before I will set some rules. 

    1. Replace any double quotes in the text with single quotes.
    
    2. Please categorize the job description only for the following criteria (if the information is available): 
    * Job title
    * Company mission
    * Company values 
    * Company products or services
    * Job responsibilities
    * Desired software skills
    * Education
    * Required Job Experience
    * Equal Employment Opportunity
    * Salary
    * Benefits
    * Location
    * Type of employment
    * URL

    3. Please provide your answers in a JSON object format, where the key name is the same as the categories but with spaces replaced by underscores (if necessary).
    4. Use a consistent structure for all data entries. Avoid creating nested values. Separate them with a delimiter such as ";" instead.
    5. If any category has no available information, please include a "null" value for the corresponding key in the JSON object. 
    6. Make the categorizations as concise as possible, maybe even as keywords. Be as economic as possible.
    7. Avoid paragraphs of text or long sentences. 
    8. Avoid redundant text.

    Those would be the rules. Now I will prompt you with the text for the job description: 
    """ + "Job title: " + title + ". "+ bodytext + " URL: " + domain_name + current_path
    
    return text_prompt

# Variables

In [11]:
# Formatting variables for filtering criteria on the website.
# url_path is updated in place, so each step is guarded: re-running this cell
# does not stack the same filter onto url_path a second time.

if job_types != "" and "&job_types=" not in url_path:
    url_path = insert_filter(url_path, "&job_types=" + job_types)

if categories != "" and "&categories=" not in url_path:
    url_path = insert_filter(url_path, "&categories=" + categories)

# Only an explicit "true" switches the query to remote jobs. Testing for a
# non-empty value would also fire for "false" (d_remote[2]) and turn the
# remote filter on by mistake.
if remote == "true":
    url_path = define_remote(url_path)

# Main

## > Scraping URLs
Mining URLs from the main site according to the filter criteria.

In [12]:
# The filtered listing page is scraped once; the 'href' of every matching
# element is collected as the path of a job-description page.
job_cards = scraping_css_object(url_path, css_object_class)
scraped_url_paths = [card['href'] for card in job_cards]

# Example: Visualization of the complete url
#domain_name + scraped_url_paths[0]

## > Scraping information from each url

In [13]:
# One parsed JSON object per job page that could be processed successfully.
json_list = []

# NOTE: the loop variable must stay named `current_path`; chatgpt_prompt() is
# called without the path and picks it up from the notebook-level variable.
for current_path in scraped_url_paths:
    try:
        # Scraping job title
        title = scrape_title(current_path)

        # Scraping job description
        bodytext = scrape_job_description(current_path)

        # Redacting prompt for ChatGPT
        text_prompt = chatgpt_prompt(title, bodytext)

        # Calling ChatGPT
        reply = call_openai_api(text_prompt, tokens = 1000)
        reply = reply.replace("\n", "")

        # The reply is expected to be a JSON object; anything else raises here.
        json_object = json.loads(reply)

    except Exception as error:
        # Best effort: a page that fails to scrape or a reply that is not valid
        # JSON is skipped, but the reason is reported instead of being hidden.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print("Skipping {}: {!r}".format(current_path, error))
        continue

    # Collecting JSON objects
    json_list.append(json_object)

In [16]:
# Output file stem: "<categories>_<job_types>_remote_<remote>".
filename = "_".join([categories, job_types, "remote", remote])

In [18]:
# Persist all collected records as a single JSON array in "<filename>.json".
output_path = filename + '.json'
with open(output_path, 'w') as json_file:
    json.dump(json_list, json_file)

In [19]:
# Last expression of the cell: shows the collected records as the cell output.
json_list

[{'job_title': 'Mid/Senior Product Designer',
  'company_mission': 'Developing a platform to help electricity consumers, producers and suppliers move towards 24/7 clean energy.',
  'company_values': None,
  'company_products_or_services': 'SaaS platform for trading clean energy',
  'job_responsibilities': 'Design and refine the UI/UX of the Granular web platform; Contribute to the design system and maintain design consistency across features; Lead user research, testing, and validation of new designs; Contribute to ideation along with the founders and the rest of the technology and product team; Support with occasional brand work',
  'desired_software_skills': 'Figma and its component and prototyping system',
  'education': None,
  'required_job_experience': '4+ years of designing user-centric products; Anything that is data-heavy or relating to energy is a big plus',
  'equal_employment_opportunity': 'We are committed to diversity and developing talent. If you do not have all the skil