# Libraries

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
import json
import pandas as pd
from datetime import datetime
from json2html import json2html

In [4]:
%run ../../../OpenAI_API.ipynb


The `openai.ChatCompletion.create()` function generates a response to a sequence of messages in the context of a conversation. 
The following are the parameters of the function:


Parameter,Description
model,The engine that the API will connect to generate the response.
text_prompt,Input text for ChatGPT.
temperature,"Float value ∈ [0, 2]. Controls the creativity (i.e. randomness) of the generated text. A higher value means a more creative and unexpected response; a lower value means a more focused and deterministic one."
max_tokens,"Maximum number of tokens (i.e. sub-word units of text; a token is usually shorter than a word) for the generated text."
n,An integer specifying the number of top responses to return.
stop,"An optional string or list of strings specifying the stopping criteria for the generated response. When the generated text contains any of the specified strings, the response is considered complete and the generation process stops."


[Check for OpenAI credit usage here ($)](https://platform.openai.com/account/usage)

# Static parameters
These parameters define the filter criteria available on the [ClimateBase.org Jobs](https://climatebase.org/jobs?l=&q=&p=0&remote=false) website.

In [None]:
# Unfiltered listing page: https://climatebase.org/jobs?l=&q=&p=0&remote=false
domain_name = "https://climatebase.org"
url_path = "/jobs?l=&q=&p=0&remote=false"

# Job types, keyed by a small integer (0 = no filter). Example of a filtered URL:
# https://climatebase.org/jobs?l=&q=&job_types=Full+time+role&p=0&remote=false
d_job_types = dict(enumerate(("", "Full+time+role", "Internship")))

# Role categories, keyed the same way. Example of a filtered URL:
# https://climatebase.org/jobs?l=&q=&categories=Data+Analyst&p=0&remote=false
d_categories = dict(enumerate(("", "Data+Analyst", "Data+Scientist", "Research")))

# Remote flag. Example of a remote-only URL:
# https://climatebase.org/jobs?l=Remote&q=&p=0&remote=true
d_remote = dict(enumerate(("", "true", "false")))

# CSS class handed to the scraper when collecting entries from the listing page.
css_object_class = "list_card"

# User-defined parameters
These parameters select which of the filter criteria defined above are applied to the website query.

In [None]:
# Pick the active filters by key from the lookup tables of static parameters.
# "+" is the URL encoding of a space, so it is turned back into a space when
# the selection is echoed (e.g. "Full time role" rather than "Fulltimerole").
job_types = d_job_types[1]
print("Job type: " + job_types.replace("+", " "))

categories = d_categories[2]
print("Category: " + categories.replace("+", " "))

remote = d_remote[1]
print("Remote: " + remote)


# User-Defined Functions

In [None]:
def insert_filter(input_text, to_insert):
    """
    Insert a filter fragment into a URL query string, just before "&p=".

    Args:
        input_text: URL (or URL path) to extend.
        to_insert: Query fragment to add, e.g. "&categories=Research".

    Returns:
        The URL with `to_insert` placed immediately to the left of the first
        "&p=". If the URL has no "&p=" parameter, the fragment is appended at
        the end instead of being spliced into the middle of the text.
    """
    # partition() yields (input_text, "", "") when "&p=" is missing, so the
    # same expression covers both the insert and the append case.
    before, marker, after = input_text.partition("&p=")

    return before + to_insert + marker + after

In [None]:
def define_remote(input_text):
    """
    This function is similar to insert_filter(), but is specific for the "remote" filtering.

    It sets the location parameter to "Remote" and switches "&remote=false"
    to "&remote=true", e.g.
    "/jobs?l=&q=&p=0&remote=false" -> "/jobs?l=Remote&q=&p=0&remote=true".

    Args:
        input_text: URL (or URL path) to convert.

    Returns:
        The converted URL. Applying the function to an already converted URL
        returns it unchanged.
    """
    new_string = input_text

    # Skip when the location is already "Remote" so a second call does not
    # produce "?l=RemoteRemote".
    if "?l=Remote" not in new_string:
        new_string = new_string.replace("?l=", "?l=Remote", 1)

    # Chain on new_string (not input_text) so the location change is kept.
    new_string = new_string.replace("&remote=false", "&remote=true")

    return new_string

In [None]:
def scraping_css_object(url_path, css_object_class, timeout=30):
    """
    Given a CSS object class, this scraper will obtain the relevant information from the website.

    Args:
        url_path: Path (including query string) appended to the module-level
            `domain_name`, e.g. "/jobs?l=&q=&categories=Data+Scientist&p=0&remote=true".
        css_object_class: CSS class name of the elements to collect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The result of BeautifulSoup's `find_all` for the given class
        (empty when nothing matches).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    url = domain_name + url_path

    # requests has no default timeout; without one an unresponsive server
    # would block the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all elements carrying the requested class (e.g. class="list_card").
    found_objects = soup.find_all(class_=css_object_class)

    return found_objects

In [None]:
def scrape_title(current_path):
    """
    Return the job title shown on a ClimateBase.org job page.

    The page at `current_path` is scraped for elements with the CSS class
    "fcPVcr"; their string form is parsed again and the text of the first
    <h1> whose class attribute is "PageLayout__Title-sc-1ri9r3s-4 fcPVcr"
    is returned. An AttributeError is raised when no such <h1> is present.
    """
    title_candidates = scraping_css_object(current_path, "fcPVcr")
    reparsed = BeautifulSoup(str(title_candidates), 'html.parser')
    heading = reparsed.find(
        'h1',
        attrs={'class': 'PageLayout__Title-sc-1ri9r3s-4 fcPVcr'},
    )

    return heading.text

In [None]:
def scrape_job_description(current_path):
    """
    Return the job description text of a ClimateBase.org job page.

    The page at `current_path` is scraped for elements with the CSS class
    "EPUZp"; their string form is parsed again and the whitespace-stripped
    text of the first <div> is returned. An AttributeError is raised when
    no <div> is present.
    """
    description_nodes = scraping_css_object(current_path, "EPUZp")
    reparsed = BeautifulSoup(str(description_nodes), 'html.parser')

    # Text of the first <div> in the re-parsed fragment.
    first_div = reparsed.find('div')
    return first_div.get_text().strip()

In [None]:
def chatgpt_prompt(title, bodytext, job_path=None):
    """
    This function contains the prompt with the set of rules that are to be sent to ChatGPT to process a text.

    Args:
        title: Job title, placed at the start of the quoted job description.
        bodytext: Job description text.
        job_path: Path of the job page; it is appended to the module-level
            `domain_name` to form the URL given to the model. When omitted,
            the module-level `current_path` is used, so callers that pass
            only two arguments keep working.

    Returns:
        The complete prompt: the extraction rules followed by the job title,
        description and URL enclosed in triple backticks.

    Raises:
        NameError: If `job_path` is omitted and no module-level
            `current_path` has been defined.
    """
    if job_path is None:
        # Backward compatibility: the two-argument call relies on the loop
        # variable `current_path` existing at module level.
        job_path = current_path

    # Named `category_list` so it does not shadow the module-level
    # `categories` filter selection.
    category_list = """
    * Job title
    * Company mission
    * Company values 
    * Company products or services
    * Job responsibilities
    * Desired software skills
    * Education
    * Required Job Experience
    * Equal Employment Opportunity
    * Salary
    * Benefits
    * Location
    * Type of employment
    * URL
    """
    # "* Job title" -> "job_title": drop the bullet, lower-case, underscores for spaces.
    json_keys = [category.strip('* ').lower().replace(' ', '_') for category in category_list.strip().splitlines()]

    text_prompt = f"""I will prompt you with a job description contained within ```, and I want your help to extract and categorize its information. Before we begin, please follow these rules: 

    1. Replace any double quotes in the text with single quotes.
    2. Extract and categorize the information from the job description for the following categories:{category_list}
    3. Please provide your answers in a JSON object format. The keys will be the same as the categories but in lower case and with spaces replaced by underscores. These are respectively and in order: {json_keys}.
    4. Use a consistent structure for all data entries. Never create nested values. Separate them with a delimiter such as ";" instead.
    5. If any category has no available information, please include a "null" value for the corresponding key in the JSON object. 
    6. Make the categorizations as concise as possible, maybe even as keywords. Be as economic as possible.
    7. Avoid paragraphs of text or long sentences. 
    8. Avoid redundant text.

    Please keep these rules in mind when categorizing the job description. Let's begin!: 
    """ + "```Job title: " + title + ".\n\n "+ bodytext + " URL: " + domain_name + job_path + " \n```"
    
    return text_prompt

#display(Markdown(chatgpt_prompt(title, bodytext)))

# Variables

In [None]:
# Formatting variables for filtering criteria on the website.
# An empty selection means "no filter" and leaves url_path untouched.

if job_types:
    url_path = insert_filter(url_path, "&job_types=" + job_types)

if categories:
    url_path = insert_filter(url_path, "&categories=" + categories)

# Only an explicit "true" switches the query to remote jobs. The base URL
# already carries "&remote=false", so "false" (like "") needs no rewrite;
# calling define_remote() for it would wrongly force remote=true.
if remote == "true":
    url_path = define_remote(url_path)

# Main

## > Scraping URLs
Mining URLs from the main site using the filter criteria.

In [None]:
# Collect the link (href) of every job card found on the filtered listing page;
# each entry is the path of one job description.
scraped_url_paths = [
    job_card['href']
    for job_card in scraping_css_object(url_path, css_object_class)
]

# Example: Visualization of the complete url
#domain_name + scraped_url_paths[0]

## > Scraping information from each url

In [None]:
# One parsed JSON object per job page that was processed successfully.
json_list = []

for current_path in scraped_url_paths:
    try:
        # Scraping job title
        title = scrape_title(current_path)

        # Scraping job description
        bodytext = scrape_job_description(current_path)

        # Drafting the prompt for ChatGPT
        text_prompt = chatgpt_prompt(title, bodytext)

        # Calling ChatGPT
        reply = call_openai_api(text_prompt, tokens = 1000)
        reply = reply.replace("\n", "")

        json_object = json.loads(reply)

    except Exception as error:
        # Best effort: a page that cannot be scraped, or a reply that is not
        # valid JSON, is skipped. Reporting it keeps failures visible, and
        # catching Exception (not a bare except) lets Ctrl-C stop the loop.
        print(f"Skipping {current_path}: {type(error).__name__}: {error}")
        continue

    # Collecting JSON objects
    json_list.append(json_object)

In [None]:
# Output file stem: <yymmdd>_<category>_<job type>_remote_<remote flag>
current_date = datetime.now()
formatted_date = f"{current_date:%y%m%d}"

filename = "_".join([formatted_date, categories, job_types, "remote", remote])
print(filename)

In [None]:
# Write all collected job records to "<filename>.json".
with open(f"{filename}.json", "w") as json_out:
    json.dump(json_list, json_out)

# Transforming JSON file into DataFrame

In [4]:
# Previously exported results to load back in.
file_name = "230608_Data+Scientist_Full+time+role_remote_true.json"

# The context manager closes the file handle once the JSON has been read.
with open(file_name) as json_in:
    json_file = json.load(json_in)

In [None]:
# Start from an empty frame carrying the first record's keys as columns, then
# add one normalized single-row frame per record.
frames = [pd.DataFrame(columns=json_file[0].keys())]

for record in json_file:
    normalized = pd.json_normalize(record)

    # Patch: lower-case the column names so differently-cased keys share a column.
    normalized.columns = normalized.columns.str.lower()

    frames.append(normalized)

# A single concat at the end avoids re-copying the growing frame on every
# iteration, which is what concatenating inside the loop does.
df = pd.concat(frames, ignore_index=True)

print(df.shape)

# NOTE(review): the CSV name comes from `filename` (built in the export step),
# not from `file_name` loaded above - confirm this is the intended name.
df.to_csv("{}.csv".format(filename), index=False,  sep='~')

# Displaying individual JSON objects as HTML

In [7]:
# Convert JSON to HTML
# `json_object` is not assigned in this cell: it is whatever the scraping loop
# parsed last. Uncomment the next line to render the first record of the
# loaded file instead.
#json_object = json_file[0]
html_table = json2html.convert(json.dumps(json_object))

# Wrap the generated table in a centered flex container with inline CSS.
# Braces belonging to the CSS are doubled ({{ }}) because this is an f-string;
# only {html_table} is interpolated.
display_json_as_html = f"""
<div style="display: flex; justify-content: center;">
    <style>
      table {{
        width: 60%;
        border-collapse: collapse;
      }}
      th, td {{
        padding: 8px;
        border-bottom: 1px solid #ddd;
        word-wrap: break-word;
      }}
      th:nth-child(2), td:nth-child(2) {{
        width: 400px;
      }}
    </style>
    {html_table}
</div>
"""

# NOTE(review): `display` and `HTML` are not imported in the visible cells -
# presumably provided by the notebook environment or the %run notebook; confirm.
display(HTML(display_json_as_html))

0,1
job_title,Content writer
company_mission,Solving the #1 challenge faced by organizations working on climate: Hiring.
company_values,
company_products_or_services,"Climatebase, the world's leading platform for climate jobs, talent, and community"
job_responsibilities,"Contribute as a content writer for Climatebase, writing deep dives of companies on Climatebase, interviews with founders of climate organizations, climate technology reports/summaries, weekly climate news summaries, climate op-eds, and other types of content; syndicate content across newsletters channels and twitter account; content will be published on the company's blog"
desired_software_skills,
education,
required_job_experience,
equal_employment_opportunity,
salary,
