In [1]:
import pandas as pd
import numpy as np
import google.generativeai as palm
from utils import VectorEmbeddings, remove_html_tags, remove_req_qual_str
import re, ast, json

from bs4 import BeautifulSoup
from tqdm.auto import tqdm

import psycopg2
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
knowledge_df = pd.read_excel('../onet_competencies/knowledge_competencies.xlsx', skiprows=3)

In [3]:
knowledge_df.columns

Index(['Element ID', 'Element Name', 'Element Description',
       'Synonym (Source: My Next Move Easy Read Content)',
       'O*NET Database Occupation Linkages'],
      dtype='object')

In [4]:
# Read the API key from the first line of the local key file.
# readline() keeps the trailing newline, so strip surrounding whitespace
# before handing the key to the embeddings client.
with open('../.api_key', 'r') as file:
    api_key = file.readline().strip()
vector_embeddings = VectorEmbeddings(api_key=api_key)

In [5]:
job_df = pd.read_json('job_descriptions.json', lines=True)

In [6]:
# Keep every model that lists 'generateText' among its supported generation
# methods, then use the first one returned for all prompts below.
text_models = list(
    filter(
        lambda candidate: 'generateText' in candidate.supported_generation_methods,
        palm.list_models(),
    )
)
text_model = text_models[0]

In [38]:
# Hand-written example of one knowledge element, used while drafting the prompt.
# k_high / k_low are example "high level" and "low level" requirements; they match
# the placeholder sentence kept as a comment above make_ksa_prompt and are not
# passed to any function in this notebook.
k_skill = 'Medicine and Dentistry'
k_description = 'knowledge of the information and techniques needed to diagnose and treat injuries, diseases, and deformities. This includes symptoms, treatment alternatives, and preventive health-care measures'
k_high = 'Performing open-heart surgery'
k_low = 'Using a small bandage'

In [7]:
# An example high level requirement for this knowledge is {} while an example low level requirement for this knowledge is {}.

def make_ksa_prompt(k_skill, k_description, job_ad):
    """Build the prompt asking whether a job ad requires one knowledge area.

    Args:
        k_skill: Name of the knowledge area (inserted verbatim into the text).
        k_description: Description of the knowledge area (inserted verbatim).
        job_ad: Job ad text, forwarded unchanged to ``vector_embeddings.make_prompt``.

    Returns:
        Whatever ``vector_embeddings.make_prompt`` returns for the instruction
        text, the question and the job ad.
    """
    question_template = 'Does this job ad require {} knowledge to perform the job?'
    instruction_template = '{} is the {}. Look at the job ad and decide if {} is required to perform this job. If {} is required please provide the specific tasks in a bulletized format from the job ad that relate to {}, if not simply say no'

    question = question_template.format(k_skill)
    # The skill name fills every slot except the second, which takes the description.
    instructions = instruction_template.format(k_skill, k_description, k_skill, k_skill, k_skill)

    return vector_embeddings.make_prompt(instructions, question, job_ad)

In [8]:
def generate_answers(prompt, text_model, candidate_count: int = 3, temperature: float = 0.5,
                     max_output_tokens: int = 1000):
    """Request several candidate completions for ``prompt`` from the text model.

    Args:
        prompt: Prompt passed as ``prompt=`` to ``palm.generate_text``.
        text_model: Model passed as ``model=`` to ``palm.generate_text``.
        candidate_count: Forwarded as ``candidate_count``. Defaults to 3.
        temperature: Forwarded as ``temperature``. Defaults to 0.5.
        max_output_tokens: Forwarded as ``max_output_tokens``. Defaults to
            1000, the value that was previously hard-coded.

    Returns:
        The object returned by ``palm.generate_text``; other cells in this
        notebook read its ``candidates`` attribute.
    """
    return palm.generate_text(
        prompt=prompt,
        model=text_model,
        candidate_count=candidate_count,
        temperature=temperature,
        max_output_tokens=max_output_tokens,
    )

In [9]:
def determine_if_ksa_is_present(answers):
    """Decide from the model's candidate answers whether the KSA is required.

    The prompt tells the model to "simply say no" when the knowledge area is
    not required, so any candidate whose text starts with "no"
    (case-insensitive, ignoring leading whitespace) counts as a negative
    answer. A single negative candidate is enough to return False.

    Args:
        answers: Object with a ``candidates`` attribute holding a sequence of
            mappings, each with an ``'output'`` entry containing the text.

    Returns:
        bool: True only when at least one candidate exists and none of them
        starts with "no"; False otherwise.
    """
    candidates = answers.candidates
    if not candidates:
        # With no candidates there is nothing supporting "present"; the old
        # loop fell through and reported True in this case.
        return False

    for answer in candidates:
        # Strip first so a leading space/newline before "No" is still caught.
        # A missing (None) output is treated as empty text rather than raising.
        text = (answer['output'] or '').strip().lower()
        if text.startswith('no'):
            return False
    return True

In [10]:
job = remove_req_qual_str(remove_html_tags(job_df.iloc[65]["requirements_qualifications"]))

In [11]:
def determine_ksa_vector(job, ksa_df, text_model, **kwargs):
    """Sort every element of ``ksa_df`` into "required" or "not required" for a job.

    For each row, a prompt is built from the lower-cased 'Element Name' and
    'Element Description', sent to the model, and the answers are judged by
    ``determine_if_ksa_is_present``.

    Args:
        job: Job ad text handed to ``make_ksa_prompt``.
        ksa_df: DataFrame with 'Element Name' and 'Element Description' columns.
        text_model: Model forwarded to ``generate_answers``.
        **kwargs: Extra keyword arguments forwarded to ``generate_answers``.

    Returns:
        tuple[list, list]: (names judged present, names judged absent), each in
        row order and using the original (not lower-cased) element names.
    """
    present, absent = [], []
    element_names = ksa_df['Element Name']
    element_descriptions = ksa_df['Element Description']

    for name, description in zip(element_names, element_descriptions):
        prompt = make_ksa_prompt(name.lower(), description.lower(), job)
        answers = generate_answers(prompt, text_model, **kwargs)
        # Pick the destination list from the verdict, then record the name.
        bucket = present if determine_if_ksa_is_present(answers) else absent
        bucket.append(name)

    return present, absent

In [15]:
job_df.columns

Index(['usajobs_control_number', 'position_open_date', 'position_close_date',
       'summary', 'hiring_path_explanation', 'duties', 'major_duties_list',
       'requirements_conditions_of_employment', 'requirements_qualifications',
       'requirements_education', 'required_standard_documents',
       'required_documents', 'how_to_apply', 'how_to_apply_next_steps',
       'requirements', 'evaluations', 'benefits_url', 'benefits',
       'other_information', 'appointment_type_override',
       'position_schedule_override', 'exclusive_clarification_text',
       'video_url', 'hiring_agency_code', 'hiring_agency_name',
       'hiring_department_code', 'hiring_department_name', 'agency_level',
       'agency_level_sort', 'appointment_type', 'work_schedule', 'pay_scale',
       'salary_type', 'vendor', 'travel_requirement', 'telework_eligible',
       'service_type', 'security_clearance_required', 'security_clearance',
       'who_may_apply', 'announcement_closing_type_code',
       'annou

In [16]:
for i, row in job_df.head().iterrows():
    print(row['usajobs_control_number'])

514962300
514177900
540927700
534312000
546156800


In [19]:
from concurrent.futures import ThreadPoolExecutor, as_completed

In [25]:
test_list = job_df.head()['requirements_qualifications'].to_list()

In [None]:
# Run determine_ksa_vector for every job text in test_list on a thread pool.
# All jobs are submitted first and only then collected: waiting on the futures
# inside the submit loop blocks after each submit (so nothing runs in parallel)
# and re-appends every earlier result on each pass.
job_responses = []
response_list = []
with ThreadPoolExecutor(max_workers=10) as executor:
    for job_text in test_list:
        job_responses.append(executor.submit(determine_ksa_vector, job_text, ksa_df=knowledge_df, text_model=text_model))
    # Collect in submission order so response_list[i] belongs to test_list[i].
    for future in job_responses:
        response_list.append(future.result())

In [22]:
# Run determine_ksa_vector for the first five jobs on a thread pool.
# All jobs are submitted first and only then collected: waiting on the futures
# inside the submit loop blocks after each submit (so nothing runs in parallel)
# and re-appends every earlier result on each pass.
job_responses = []
response_list = []
with ThreadPoolExecutor(max_workers=5) as executor:
    for i, row in job_df.head().iterrows():
        print(row['usajobs_control_number'])
        job_responses.append(executor.submit(determine_ksa_vector, job=row['requirements_qualifications'], ksa_df=knowledge_df, text_model=text_model))
    # Collect in submission order so response_list[i] matches the i-th printed
    # control number.
    for future in job_responses:
        response_list.append(future.result())

514962300
514177900
540927700
534312000
546156800


In [23]:
response_list

[(['Economics and Accounting',
   'English Language',
   'Communications',
   'Transportation'],
  ['Business and Management',
   'Administration and Management',
   'Administrative',
   'Sales and Marketing',
   'Customer and Personal Service',
   'Personnel and Human Resources',
   'Manufacturing and Production',
   'Production and Processing',
   'Food Production',
   'Engineering and Technology',
   'Computers and Electronics',
   'Engineering and Technology',
   'Design',
   'Building and Construction',
   'Mechanical',
   'Mathematics and Science',
   'Mathematics',
   'Physics',
   'Chemistry',
   'Biology',
   'Psychology',
   'Sociology and Anthropology',
   'Geography',
   'Health Services',
   'Medicine and Dentistry',
   'Therapy and Counseling',
   'Education and Training',
   'Arts and Humanities',
   'Foreign Language',
   'Fine Arts',
   'History and Archeology',
   'Philosophy and Theology',
   'Law and Public Safety',
   'Public Safety and Security',
   'Law and Gover

In [None]:
def batch_api_calls(url_list, chunk_number):
    """Call ``get_job_announcement_text`` for every URL on a thread pool.

    Args:
        url_list: Iterable of URLs; each one is passed as ``url=`` to
            ``get_job_announcement_text``.
        chunk_number: Not used by the body; kept so existing calls still work.

    Returns:
        list: One result per URL, in the order the calls finish (not
        necessarily the order of ``url_list``).
    """
    # NOTE(review): get_job_announcement_text is not defined anywhere in this
    # notebook -- it has to be imported or defined before this is called.
    response_list = []
    # Use the names imported via `from concurrent.futures import ...`; the bare
    # `concurrent` module name is never imported in this notebook.
    with ThreadPoolExecutor(max_workers=10) as executor:
        job_responses = [
            executor.submit(get_job_announcement_text, url=url)
            for url in url_list
        ]
        for body in as_completed(job_responses):
            response_list.append(body.result())
    # The collected results were previously discarded (implicit None).
    return response_list

In [12]:
determine_ksa_vector(job, knowledge_df, text_model)

(['Administration and Management',
  'Engineering and Technology',
  'Engineering and Technology',
  'Mathematics and Science',
  'Mathematics',
  'Education and Training'],
 ['Business and Management',
  'Administrative',
  'Economics and Accounting',
  'Sales and Marketing',
  'Customer and Personal Service',
  'Personnel and Human Resources',
  'Manufacturing and Production',
  'Production and Processing',
  'Food Production',
  'Computers and Electronics',
  'Design',
  'Building and Construction',
  'Mechanical',
  'Physics',
  'Chemistry',
  'Biology',
  'Psychology',
  'Sociology and Anthropology',
  'Geography',
  'Health Services',
  'Medicine and Dentistry',
  'Therapy and Counseling',
  'Arts and Humanities',
  'English Language',
  'Foreign Language',
  'Fine Arts',
  'History and Archeology',
  'Philosophy and Theology',
  'Law and Public Safety',
  'Public Safety and Security',
  'Law and Government',
  'Communications',
  'Telecommunications',
  'Communications and Media

In [15]:
determine_ksa_vector(job, knowledge_df, text_model)

(['Engineering and Technology',
  'Engineering and Technology',
  'Mathematics and Science',
  'Mathematics',
  'Education and Training',
  'Public Safety and Security'],
 ['Business and Management',
  'Administration and Management',
  'Administrative',
  'Economics and Accounting',
  'Sales and Marketing',
  'Customer and Personal Service',
  'Personnel and Human Resources',
  'Manufacturing and Production',
  'Production and Processing',
  'Food Production',
  'Computers and Electronics',
  'Design',
  'Building and Construction',
  'Mechanical',
  'Physics',
  'Chemistry',
  'Biology',
  'Psychology',
  'Sociology and Anthropology',
  'Geography',
  'Health Services',
  'Medicine and Dentistry',
  'Therapy and Counseling',
  'Arts and Humanities',
  'English Language',
  'Foreign Language',
  'Fine Arts',
  'History and Archeology',
  'Philosophy and Theology',
  'Law and Public Safety',
  'Law and Government',
  'Communications',
  'Telecommunications',
  'Communications and Media

In [144]:
ksa, non_ksa

(['Administration and Management',
  'Engineering and Technology',
  'Engineering and Technology',
  'Design',
  'Building and Construction',
  'Mathematics and Science',
  'Mathematics',
  'Physics',
  'Geography',
  'Medicine and Dentistry',
  'Therapy and Counseling',
  'Education and Training',
  'English Language'],
 ['Business and Management',
  'Administrative',
  'Economics and Accounting',
  'Sales and Marketing',
  'Customer and Personal Service',
  'Personnel and Human Resources',
  'Manufacturing and Production',
  'Production and Processing',
  'Food Production',
  'Computers and Electronics',
  'Mechanical',
  'Chemistry',
  'Biology',
  'Psychology',
  'Sociology and Anthropology',
  'Health Services',
  'Arts and Humanities',
  'Foreign Language',
  'Fine Arts',
  'History and Archeology',
  'Philosophy and Theology',
  'Law and Public Safety',
  'Public Safety and Security',
  'Law and Government',
  'Communications',
  'Telecommunications',
  'Communications and Media

In [142]:
ksa, non_ksa

(['Administration and Management',
  'Engineering and Technology',
  'Engineering and Technology',
  'Design',
  'Building and Construction',
  'Mathematics and Science',
  'Mathematics',
  'Physics',
  'Geography',
  'Medicine and Dentistry',
  'Therapy and Counseling',
  'Education and Training',
  'English Language'],
 ['Business and Management',
  'Administrative',
  'Economics and Accounting',
  'Sales and Marketing',
  'Customer and Personal Service',
  'Personnel and Human Resources',
  'Manufacturing and Production',
  'Production and Processing',
  'Food Production',
  'Computers and Electronics',
  'Mechanical',
  'Chemistry',
  'Biology',
  'Psychology',
  'Sociology and Anthropology',
  'Health Services',
  'Arts and Humanities',
  'Foreign Language',
  'Fine Arts',
  'History and Archeology',
  'Philosophy and Theology',
  'Law and Public Safety',
  'Public Safety and Security',
  'Law and Government',
  'Communications',
  'Telecommunications',
  'Communications and Media

In [140]:
ksa, non_ksa

(['Administration and Management',
  'Engineering and Technology',
  'Engineering and Technology',
  'Design',
  'Building and Construction',
  'Mathematics and Science',
  'Mathematics',
  'Physics',
  'Geography',
  'Medicine and Dentistry',
  'Therapy and Counseling',
  'Education and Training'],
 ['Business and Management',
  'Administrative',
  'Economics and Accounting',
  'Sales and Marketing',
  'Customer and Personal Service',
  'Personnel and Human Resources',
  'Manufacturing and Production',
  'Production and Processing',
  'Food Production',
  'Computers and Electronics',
  'Mechanical',
  'Chemistry',
  'Biology',
  'Psychology',
  'Sociology and Anthropology',
  'Health Services',
  'Arts and Humanities',
  'English Language',
  'Foreign Language',
  'Fine Arts',
  'History and Archeology',
  'Philosophy and Theology',
  'Law and Public Safety',
  'Public Safety and Security',
  'Law and Government',
  'Communications',
  'Telecommunications',
  'Communications and Media

In [112]:
# make_ksa_prompt takes the job ad text as its third argument; pass the cleaned
# `job` text built earlier so this call matches the function's signature.
prompt = make_ksa_prompt(knowledge_df.iloc[3]['Element Name'].lower(), knowledge_df.iloc[3]['Element Description'].lower(), job)

In [119]:
temperature = 0.5
# Go through the shared helper so this experiment sends the same request as
# determine_ksa_vector does (it also caps output at 1000 tokens by default).
answers = generate_answers(prompt, text_model, candidate_count=5, temperature=temperature)
# Iterate over `answers`, the response just received; `answer` is not assigned
# anywhere before this point on a fresh top-to-bottom run.
for i, candidate in enumerate(answers.candidates):
  print(f"Candidate {i}: {candidate['output']}\n")

Candidate 0: No

Candidate 1: No

Candidate 2: No, transportation is not required to perform the job.

Candidate 3: No

Candidate 4: No



In [126]:
# Try the "starts with no" rule on each candidate: one 'yes' is printed for
# every candidate whose output begins with "no" (case-insensitive).
for answer in answers.candidates:
    begins_with_no = answer['output'].lower().startswith('no')
    if not begins_with_no:
        continue
    print('yes')

yes
yes
yes
yes
yes


In [36]:
# Print each candidate's text. The response object is `answers`; `answer` is
# only ever a loop variable holding a single candidate dict, which has no
# `.candidates` attribute.
for i, candidate in enumerate(answers.candidates):
  print(f"Candidate {i}: {candidate['output']}\n")

In [110]:
knowledge_df.iloc[2]['Element Description']

'Knowledge of administrative and office procedures and systems such as word processing, managing files and records, stenography and transcription, designing forms, and workplace terminology.'