In [42]:
import pandas as pd
import numpy as np
import google.generativeai as palm
from utils import VectorEmbeddings
import re, ast, json

from bs4 import BeautifulSoup
from tqdm.auto import tqdm

import psycopg2
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [71]:
knowledge_df = pd.read_excel('../onet_competencies/knowledge_competencies.xlsx', skiprows=3)

In [72]:
knowledge_df.columns

Index(['Element ID', 'Element Name', 'Element Description',
       'Synonym (Source: My Next Move Easy Read Content)',
       'O*NET Database Occupation Linkages'],
      dtype='object')

In [2]:
with open('../.api_key', 'r') as file:
    api_key = file.readline()
vector_embeddings = VectorEmbeddings(api_key=api_key)

In [7]:
job_df = pd.read_json('job_descriptions.json', lines=True)

In [25]:
text_models = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods]
text_model = text_models[0]

In [38]:
k_skill = 'Medicine and Dentistry'
k_description = 'knowledge of the information and techniques needed to diagnose and treat injuries, diseases, and deformities. This includes symptoms, treatment alternatives, and preventive health-care measures'
k_high = 'Performing open-heart surgery'
k_low = 'Using a small bandage'

In [114]:
# An example high level requirement for this knowledge is {} while an example low level requirement for this knowledge is {}.

def make_ksa_prompt(k_skill, k_description, job_ad):
    query = 'Does this job ad require {} knowledge to perform the job?'.format(k_skill)
    leading_text = '{} is the {}. Look at the job ad and decide if {} is required to perform this job. If {} is required please provide the specific tasks in a bulletized format from the job ad that relate to {}, if not simply say no'.format(k_skill, k_description,k_skill, k_skill, k_skill)
    prompt = vector_embeddings.make_prompt(leading_text, query, job_ad)
    
    return prompt

In [125]:
def generate_answers(prompt, text_model, candidate_count: int = 3, temperature:float = 0.5):
    answers = palm.generate_text(prompt=prompt,
                            model=text_model,
                            candidate_count=candidate_count,
                            temperature=temperature,
                            max_output_tokens=1000)
    return answers

In [127]:
def determine_if_ksa_is_present(answers):
    for answer in answers.candidates:
        if answer['output'].lower().startswith('no'):
            return False
    return True

In [143]:
ksa = []
non_ksa = []
for i, row in knowledge_df.iterrows():
    prompt = make_ksa_prompt(row['Element Name'].lower(), row['Element Description'].lower(), job_df.iloc[65]["requirements_qualifications"])

    answers = generate_answers(prompt, text_model, temperature=0.1)

    add = determine_if_ksa_is_present(answers)

    if add:
        ksa.append(row['Element Name'])
    else:
        non_ksa.append(row['Element Name'])

In [144]:
ksa, non_ksa

(['Administration and Management',
  'Engineering and Technology',
  'Engineering and Technology',
  'Design',
  'Building and Construction',
  'Mathematics and Science',
  'Mathematics',
  'Physics',
  'Geography',
  'Medicine and Dentistry',
  'Therapy and Counseling',
  'Education and Training',
  'English Language'],
 ['Business and Management',
  'Administrative',
  'Economics and Accounting',
  'Sales and Marketing',
  'Customer and Personal Service',
  'Personnel and Human Resources',
  'Manufacturing and Production',
  'Production and Processing',
  'Food Production',
  'Computers and Electronics',
  'Mechanical',
  'Chemistry',
  'Biology',
  'Psychology',
  'Sociology and Anthropology',
  'Health Services',
  'Arts and Humanities',
  'Foreign Language',
  'Fine Arts',
  'History and Archeology',
  'Philosophy and Theology',
  'Law and Public Safety',
  'Public Safety and Security',
  'Law and Government',
  'Communications',
  'Telecommunications',
  'Communications and Media

In [142]:
ksa, non_ksa

(['Administration and Management',
  'Engineering and Technology',
  'Engineering and Technology',
  'Design',
  'Building and Construction',
  'Mathematics and Science',
  'Mathematics',
  'Physics',
  'Geography',
  'Medicine and Dentistry',
  'Therapy and Counseling',
  'Education and Training',
  'English Language'],
 ['Business and Management',
  'Administrative',
  'Economics and Accounting',
  'Sales and Marketing',
  'Customer and Personal Service',
  'Personnel and Human Resources',
  'Manufacturing and Production',
  'Production and Processing',
  'Food Production',
  'Computers and Electronics',
  'Mechanical',
  'Chemistry',
  'Biology',
  'Psychology',
  'Sociology and Anthropology',
  'Health Services',
  'Arts and Humanities',
  'Foreign Language',
  'Fine Arts',
  'History and Archeology',
  'Philosophy and Theology',
  'Law and Public Safety',
  'Public Safety and Security',
  'Law and Government',
  'Communications',
  'Telecommunications',
  'Communications and Media

In [140]:
ksa, non_ksa

(['Administration and Management',
  'Engineering and Technology',
  'Engineering and Technology',
  'Design',
  'Building and Construction',
  'Mathematics and Science',
  'Mathematics',
  'Physics',
  'Geography',
  'Medicine and Dentistry',
  'Therapy and Counseling',
  'Education and Training'],
 ['Business and Management',
  'Administrative',
  'Economics and Accounting',
  'Sales and Marketing',
  'Customer and Personal Service',
  'Personnel and Human Resources',
  'Manufacturing and Production',
  'Production and Processing',
  'Food Production',
  'Computers and Electronics',
  'Mechanical',
  'Chemistry',
  'Biology',
  'Psychology',
  'Sociology and Anthropology',
  'Health Services',
  'Arts and Humanities',
  'English Language',
  'Foreign Language',
  'Fine Arts',
  'History and Archeology',
  'Philosophy and Theology',
  'Law and Public Safety',
  'Public Safety and Security',
  'Law and Government',
  'Communications',
  'Telecommunications',
  'Communications and Media

In [112]:
prompt = make_ksa_prompt(knowledge_df.iloc[3]['Element Name'].lower(), knowledge_df.iloc[3]['Element Description'].lower())

In [119]:
temperature = 0.5
answers = palm.generate_text(prompt=prompt,
                            model=text_model,
                            candidate_count=5,
                            temperature=temperature,
                            max_output_tokens=1000)
for i, candidate in enumerate(answer.candidates):
  print(f"Candidate {i}: {candidate['output']}\n")

Candidate 0: No

Candidate 1: No

Candidate 2: No, transportation is not required to perform the job.

Candidate 3: No

Candidate 4: No



In [126]:
for answer in answers.candidates:
    if answer['output'].lower().startswith('no'):
        print('yes')

yes
yes
yes
yes
yes


In [36]:
for i, candidate in enumerate(answer.candidates):
  print(f"Candidate {i}: {candidate['output']}\n")

In [110]:
knowledge_df.iloc[2]['Element Description']

'Knowledge of administrative and office procedures and systems such as word processing, managing files and records, stenography and transcription, designing forms, and workplace terminology.'