In [None]:
import pandas as pd
import json

with open('./data/SkillSpan/json/test.json', 'r') as file:
    data = json.load(file)

flattened_data = []

for sentence_idx, entry in enumerate(data):
    idx = entry['idx']
    source = entry['source']
    for token, skill_tag, knowledge_tag in zip(entry['tokens'], entry['tags_skill'], entry['tags_knowledge']):
        flattened_data.append({'text_idx': idx, "sentence_idx": sentence_idx,  'token': token, 'skill_tag': skill_tag, 'knowledge_tag': knowledge_tag, 'source': source})

test_df = pd.DataFrame(flattened_data)
test_df['skills_tag_predicted'] = 'O' 
test_df['knowledge_tag_predicted'] = 'O' 


test_df = test_df[test_df["source"] == 'tech']

test_df

In [None]:
# Classify using JobBert

In [None]:
from transformers import pipeline

token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")

df_sentences = test_df.groupby(['text_idx', 'sentence_idx'])['token'].apply(list)
df_sentences

In [None]:
def classify_sentence(sentence):
    skill_predictions = token_skill_classifier(sentence)
    knowledge_predictions = token_knowledge_classifier(sentence)
    return skill_predictions, knowledge_predictions


def assign_bio_tags(token_list, sentence, predictions, tag_type):
    bio_tags = ['O'] * len(token_list)  # Initialize all tokens with 'O'

    for pred in predictions:
        # Split the predicted span into individual words
        predicted_words = pred['word'].split()
        num_pred_words = len(predicted_words)
        start_idx = None

        # Iterate through the tokens to find the matching span
        for i in range(len(token_list)):
            if token_list[i:i + num_pred_words] == predicted_words:
                # Calculate the cumulative length of tokens up to the current index
                cumulative_length = sum(len(token) + 1 for token in token_list[:i])  # +1 for space after each token
                if cumulative_length == pred['start']:
                    start_idx = i
                    break

        # Assign BIO tags
        if start_idx is not None:
            for i in range(start_idx, start_idx + num_pred_words):
                # Assign 'B' or 'I' based on the entity_group of the prediction
                bio_tag = ('B' if pred['entity_group'] == 'B' else 'I')
                bio_tags[i] = bio_tag

    return bio_tags


def update_predictions(tokens_df, sentences_df):
    # Iterate through sentences and classify
    for (text_idx, sentence_idx), token_list in sentences_df.items():
        sentence = ' '.join(token_list)
        skill_preds, knowledge_preds = classify_sentence(sentence)
        
        skill_bio_tags = assign_bio_tags(token_list, sentence, skill_preds, 'Skill')
        knowledge_bio_tags = assign_bio_tags(token_list, sentence, knowledge_preds, 'Knowledge')

        # Update the DataFrame with BIO tags
        for token_idx, (token, skill_tag, knowledge_tag) in enumerate(zip(token_list, skill_bio_tags, knowledge_bio_tags)):
            condition = ((tokens_df['text_idx'] == text_idx) & 
                         (tokens_df['sentence_idx'] == sentence_idx) & 
                         (tokens_df['token'] == token))
            tokens_df.loc[condition, 'skills_tag_predicted'] = skill_tag
            tokens_df.loc[condition, 'knowledge_tag_predicted'] = knowledge_tag

# Update the DataFrame with predictions
update_predictions(test_df, df_sentences)

# Display the updated DataFrame
print(test_df)

In [None]:
pd.set_option('display.max_rows', 20)  # or use a specific large number
pd.set_option('display.max_columns', 10)  # or use a specific large number

print(test_df.head(100))

In [None]:
test_df['skill_tag'] = test_df['skill_tag'].apply(lambda x: 'B-Skill' if x == 'B' else ('I-Skill' if x == 'I' else x))
test_df['skills_tag_predicted'] = test_df['skills_tag_predicted'].apply(lambda x: 'B-Skill' if x == 'B' else ('I-Skill' if x == 'I' else x))


In [None]:
test_df['knowledge_tag'] = test_df['knowledge_tag'].apply(lambda x: 'B-Knowledge' if x == 'B' else ('I-Knowledge' if x == 'I' else x))
test_df['knowledge_tag_predicted'] = test_df['knowledge_tag_predicted'].apply(lambda x: 'B-Knowledge' if x == 'B' else ('I-Knowledge' if x == 'I' else x))


In [None]:
test_df

In [None]:
test_df.to_csv('./extracted_skills/test_predicted_job_bert.csv', index=False)

In [1]:
def create_conll_from_df(df, output_path):
    with open(output_path, 'w') as file:
        for _, row in df.iterrows():
            token = row['token']
            skill_tag = row['skill_tag']
            knowledge_tag = row['knowledge_tag']
            skills_tag_predicted = row['skills_tag_predicted']
            knowledge_tag_predicted = row['knowledge_tag_predicted']

            # Write the token and tags separated by tabs
            file.write(f"1\t{token}\t{skill_tag}\t{knowledge_tag}\t{skills_tag_predicted}\t{knowledge_tag_predicted}\n")

            # Add a new line after each sentence
            if _ + 1 in df.index and df.loc[_ + 1, 'sentence_idx'] != row['sentence_idx']:
                file.write("\n")

                
create_conll_from_df(test_df, './extracted_skills/test_predicted_job_bert.conll')

NameError: name 'test_df' is not defined

In [2]:
import os

out = os.popen(
    f"perl nereval.perl < ./extracted_skills/test_predicted_job_bert.conll"
).read()

print(out)

STRICT: Found: 487 outer and 776 inner phrases; Gold: 440 (outer) and 806 (inner).
LOOSE: Found: 487 outer and 776 inner phrases; Gold: 440 (outer) and 806 (inner).

1. Strict, Combined Evaluation (official):
Accuracy:  95.23%;
Precision:  46.56%;
Recall:  47.19%;
FB1:  46.87

2. Loose, Combined Evaluation:
Accuracy:  95.23%;
Precision:  46.56%;
Recall:  47.19%;
FB1:  46.87

3.1 Per-Level Evaluation (outer chunks):
Accuracy:  94.53%;
Precision:  36.14%;
Recall:  40.00%;
FB1:  37.97

3.2 Per-Level Global Evaluation (inner chunks):
Accuracy:  95.92%;
Precision:  53.09%;
Recall:  51.12%;
FB1:  52.09


Evaluation per type and mode:

==>  Knowledge
Outer strict: Precision:   0.00%; Recall:   0.00%; FB1:   0.00
Inner strict: Precision:  53.09%; Recall:  51.12%; FB1:  52.09
Outer loose: Precision:   0.00%; Recall:   0.00%; FB1:   0.00
Inner loose: Precision:  53.09%; Recall:  51.12%; FB1:  52.09
==>      Skill
Outer strict: Precision:  36.14%; Recall:  40.00%; FB1:  37.97
Inner strict: Precis

In [None]:
### Now let's try to do the same with LLMs

# We will have several prompts for the detection of skills and knowledge

In [None]:
test_tech_df_merged = pd.read_csv('./data/merged/test_df_tech_only.csv')

test_tech_df_merged.shape

In [None]:
verbose_template = """
You are a top-notch recruiter and data labeler.
Your task is to thoroughly analyze the following part of a job posting and extract text spans including skills or knowledge.

You should use the following guideline for annotation:

Instruction for Annotating Text for Skills and Knowledge Components
Objective: Identify and annotate spans of text in job postings (JPs) that represent specific skills or knowledge required for the position.

1. Identifying Skills:

General Rule: A skill is usually indicated by a VERB or an (ADJECTIVE) + NOUN combination.
Exclusion of Modal Verbs: Do not tag modal verbs (e.g., can, will) as part of the skill.
Phrase Separation: Split phrases containing prepositions and/or conjunctions, except when conjunctions coordinate two nouns as a single argument.
Handling Anaphoric Pronouns: Avoid tagging skills with anaphoric pronouns. Only tag the preceding skill.
Splitting Coordinated Components: Split nouns and adjectives in coordination if they lack a verb.
Listing of Skills: If skills are listed leading to different subtasks, annotate each separately.
Brevity in Contextual Information: Keep skill annotations concise, especially when followed by company-specific info.
Inclusion of 'skills' or 'knowledge' in Tags: Include these words in the annotation if their omission alters the meaning.
Parenthetical Information: Include if it elaborates on the skill or is an abbreviation.
Adverbial Inclusion: Include adverbials that describe the manner of doing something.
Attitudes as Skills: Annotate attitudes as skills, omitting articles.
Exclusions:
Avoid tagging ironic skills.
Avoid nested annotations; use one span for overlapping skills.
Do not tag skills in top headlines, but do tag in sub-headlines and body text.
Exclude fluff and triggers surrounding the skill component.
Exclude participation, contributing, and transfer expressions from skill annotations.
Do not annotate occupations or positions.
Focus on skills related and specific to the position.

2. Identifying Knowledge:
General Rule: Knowledge is non-executable and possessed by an individual.
Inclusion of Parenthetical Information: Include if it is related to the knowledge component.
Licenses and Certifications: Include additional words like "certificate," "license," etc.
Vague Preceding Verbs: Only tag the knowledge component if preceded by a vague verb.
Specificity: Annotate only specified knowledge components.
Nested Knowledge in Skills: Knowledge components can be nested within skill annotations.
Coordination of Knowledge Components: Annotate as one if all components share one knowledge tag.
Listing Knowledge Tags: Annotate all knowledge tags separately.
Annotation of Industries and Fields: Tag these as knowledge components.

3. Prioritization and Other Considerations:
Prefer skills over knowledge in uncertain cases.
Prioritize skills over attitudes; only tag the skill within an attitude.
Keep annotations concise and relevant to the job.
Annotate skills and knowledge in unconventional places if related to the position.
Consider annotating a combination of skill and knowledge when applicable.
Exclude knowledge/skill components in positions.
Focus on annotations relevant to the specific position and its future expectations.

Output just the JSON array, wihtout any other text. 
Start your response with "[" symbol, and finish with "]".
Make sure your response doesn't include anything else.

Job posting:
{job_posting}

Response:"""

short_template = """
    You are a top-notch recruiter and data labeler.
    Your task is to thoroughly analyze the following part of a job posting and extract text spans including skills and knowledge.
    A skill in a job context is typically indicated by a verb or an adjective-noun combination, reflecting an executable ability or a specific way of performing a task. 
    Knowledge, on the other hand, refers to non-executable information that an individual possesses, often indicated by specific fields, industries, or certifications, and is distinct from direct action or skills.

    Output the JSON array, wihtout any other text. 
    Start your response with "[" symbol, and finish with "]".
    Make sure your response doesn't include anything else.
        
    Job posting:
    {job_posting}
"""

few_shot_template = """
    You are a top-notch recruiter and data labeler.
    Your task is to thoroughly analyze the following part of a job posting and extract text spans including job-related skills or knowledge.
    A skill in a job context is typically indicated by a verb or an adjective-noun combination, reflecting an executable ability or a specific way of performing a task. 
    Knowledge, on the other hand, refers to non-executable information that an individual possesses, often indicated by specific fields, industries, or certifications, and is distinct from direct action or skills.
    
    Below you can see some examples. Pay attention to what is included in detected spans. We want not to include false positive skills and knowledge.

    Job posting:
    Ability to work in large collaborative teams to achieve organizational goals
    Detected spans:
    ["work in large collaborative teams"]

    Job posting:
    Work hands-on together with the other engineers within the Agile team 
    Detected spans:
    ["Work hands-on"]
    
    Job posting:
    Requirements At least 5 years of combined experience in Java or Kotlin and JavaScript or TypeScript programming and related test frameworks ( Selenium TestCafe etc.) .
    Detected spans:
    ["Java", "Kotlin", "JavaScript", "Typescript"]
    
    Job posting:
    A degree in Computer Science or related fields or equivalent practical experience . 
    Detected spans:
    ["degree in Computer Science"]
    
    Job posting:
    Experience in working on a cloud-based application running on Docker .
    Detected spans:
    ["Docker", "working on a cloud-based application"]
    
    Make sure to never include protected attributes like gender identity or veteran status in the returned list:
    
    Job posting:
    We do not discriminate on the basis of any protected attribute including race religion color national origin gender sexual orientation .
    Detected spans:
    []

    Output just the JSON array, wihtout any other text. 
    Start your response with "[" symbol, and finish with "]".
    Make sure your response doesn't include data from examples and anything else except from JSON array.
    Return empty array If there are no obvious skill and knowledge spans in the given posting.
    
    Job posting:
    {job_posting}
    Detected spans:
"""

template_collection = {
    'verbose': verbose_template,
    'short': short_template,
    'few_shot': few_shot_template
}

In [None]:
from langchain import LLMChain, PromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

prompt_template = PromptTemplate(input_variables=["job_posting"], template=template_collection['few_shot'])