In [44]:
import json
import os
import random

import pandas as pd

# Seed Python's global `random` generator; the random.sample draws made later
# in this notebook consume this state.
random.seed(99)

# Experiment label; it is used as the output sub-folder under data/experiments/.
experiment_name = "eighth_experiment"
# Number of entity-free sentences sampled for rewriting from each source
# ('house' and 'tech'); the same numbers are embedded in the output file names.
number_of_house_sentences = 221
number_of_tech_sentences = 423

In [45]:
import json

# Load the SkillSpan training split. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's default locale encoding.
with open('./data/SkillSpan/json/train.json', 'r', encoding='utf-8') as file:
    skillspan_raw_data = json.load(file)

In [21]:
def filter_json_data(data):
    """Split records by their 'source' field.

    Args:
        data: Iterable of dicts, each with a 'source' key.

    Returns:
        Tuple ``(house_rows, tech_rows)``: the records whose 'source' is
        'house' and 'tech' respectively, in input order. Records with any
        other source are dropped.
    """
    house_rows, tech_rows = [], []

    for record in data:
        origin = record['source']
        if origin == 'house':
            house_rows.append(record)
            continue
        if origin == 'tech':
            tech_rows.append(record)

    return house_rows, tech_rows

# Split the raw SkillSpan records into the 'house' and 'tech' subsets.
house_data, tech_data = filter_json_data(skillspan_raw_data)

In [22]:
def annotate_sentence(tokens, tags_skill, tags_knowledge):
    """Render a tagged token sequence as one string with inline span markers.

    Skill spans are wrapped in ``@@`` and knowledge spans in ``##``. A span
    opens in front of a token tagged 'B' and closes behind a token tagged
    'B' or 'I' when the next tag is 'O' or 'B', or when the token is the
    last one in ``tokens``.

    Args:
        tokens: Sequence of token strings.
        tags_skill: Per-token skill tags, compared against 'B', 'I' and 'O'.
        tags_knowledge: Per-token knowledge tags, compared the same way.

    Returns:
        The tokens joined by single spaces, with markers attached directly
        to the tokens they open or close on.
    """
    in_span = ('B', 'I')
    ends_span = ('O', 'B')
    rendered = []

    for position, (word, skill, knowledge) in enumerate(zip(tokens, tags_skill, tags_knowledge)):
        # Opening markers: skill first, then knowledge.
        opening = ("@@" if skill == 'B' else "") + ("##" if knowledge == 'B' else "")

        if position + 1 == len(tokens):
            # The end of the sentence closes whatever span is still open.
            knowledge_ends = skill_ends = True
        else:
            knowledge_ends = tags_knowledge[position + 1] in ends_span
            skill_ends = tags_skill[position + 1] in ends_span

        # Closing markers mirror the opening order: knowledge first, then skill.
        closing = (
            ("##" if knowledge in in_span and knowledge_ends else "")
            + ("@@" if skill in in_span and skill_ends else "")
        )

        rendered.append(''.join((opening, word, closing)))

    return ' '.join(rendered)


# Spot-check the annotation on one tech record: print the raw record, then
# display its marked-up sentence as the cell output.
idx = 105
sample_record = tech_data[idx]

print(sample_record)
annotate_sentence(
    sample_record['tokens'],
    sample_record['tags_skill'],
    sample_record['tags_knowledge'],
)

{'idx': 2, 'tokens': ['*', 'Hands-on', 'experience', 'in', 'Core', 'Java', 'Spring', 'Boot', 'and', 'Microservices'], 'tags_skill': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tags_knowledge': ['O', 'O', 'O', 'O', 'B', 'I', 'B', 'I', 'O', 'B'], 'source': 'tech'}


'* Hands-on experience in ##Core Java## ##Spring Boot## and ##Microservices##'

In [23]:
def annotate_data(data):
    """Enrich every record of ``data`` in place.

    Each record gains three keys:
      * 'highlighted_sentence' - result of annotate_sentence for the record,
      * 'count_skills'         - number of 'B' tags in 'tags_skill',
      * 'count_knowledge'      - number of 'B' tags in 'tags_knowledge'.

    Args:
        data: List of record dicts with 'tokens', 'tags_skill' and
            'tags_knowledge' entries.

    Returns:
        None; the records are modified in place.
    """
    for record in data:
        word_list = record['tokens']
        skill_tags = record['tags_skill']
        knowledge_tags = record['tags_knowledge']

        record['highlighted_sentence'] = annotate_sentence(word_list, skill_tags, knowledge_tags)
        record['count_skills'] = skill_tags.count("B")
        record['count_knowledge'] = knowledge_tags.count("B")


# Adds 'highlighted_sentence', 'count_skills' and 'count_knowledge' to every
# record of both subsets (the records are mutated in place).
annotate_data(tech_data)
annotate_data(house_data)

In [24]:
# Keep only the sentences in which no skill span and no knowledge span starts.
tech_data_without_entities = [
    record for record in tech_data
    if record['count_skills'] == 0 and record.get('count_knowledge', 0) == 0
]
house_data_without_entities = [
    record for record in house_data
    if record['count_skills'] == 0 and record.get('count_knowledge', 0) == 0
]

In [25]:
def generate_rewrite_prompts(data, n):
    """Build rewrite prompts for ``n`` randomly chosen records.

    The records are drawn without replacement via ``random.sample``, so the
    selection depends on the state of the global ``random`` generator.

    Args:
        data: Sequence of record dicts with a 'highlighted_sentence' entry.
        n: Number of records to sample.

    Returns:
        List of ``n`` prompt strings, one per sampled record, in sample order.
    """
    template = (
        "Take this sentence from a Job Posting.\n\n"
        "Sentence: \"{sentence}\"\n\n"
        "Rewrite it by finding synonyms but not changing the meaning. "
        "Don't change the style of the text. The sentence should look like it is from the vacancy."
    )

    return [
        template.format(sentence=picked['highlighted_sentence'])
        for picked in random.sample(data, n)
    ]

In [26]:
from dotenv import load_dotenv
import ast
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Load variables from a .env file into the environment.
# NOTE(review): presumably this supplies the OpenAI API key read by ChatOpenAI - confirm.
load_dotenv()

# Pinned model snapshot with temperature=0.
# NOTE(review): temperature=0 is presumably meant to keep rewrites as repeatable as possible - confirm.
llm = ChatOpenAI(model="gpt-4-0613", temperature=0)

# Chat template with a single user message whose whole content is the `input` variable.
prompt = ChatPromptTemplate.from_messages([
    ("user", "{input}")
])

output_parser = StrOutputParser()

# Pipeline: fill the template, call the model, pass the reply through StrOutputParser.
chain = prompt | llm | output_parser

In [27]:
# Sample entity-free tech sentences and wrap each one in the rewrite prompt.
prompts_for_rewrite_tech = generate_rewrite_prompts(tech_data_without_entities, number_of_tech_sentences)

# Preview the first five prompts.
prompts_for_rewrite_tech[:5]

['Take this sentence from a Job Posting.\n\nSentence: "Technologies"\n\nRewrite it by finding synonyms but not changing the meaning. Don\'t change the style of the text. The sentence should look like it is from the vacancy.',
 'Take this sentence from a Job Posting.\n\nSentence: "Likes: 2 Dislikes: 2 Love: 0"\n\nRewrite it by finding synonyms but not changing the meaning. Don\'t change the style of the text. The sentence should look like it is from the vacancy.',
 'Take this sentence from a Job Posting.\n\nSentence: "Why wait? Apply now to build an amazing career and be part of a brilliant team ."\n\nRewrite it by finding synonyms but not changing the meaning. Don\'t change the style of the text. The sentence should look like it is from the vacancy.',
 'Take this sentence from a Job Posting.\n\nSentence: "We embrace a culture of experimentation and constantly strive for improvement and learning ."\n\nRewrite it by finding synonyms but not changing the meaning. Don\'t change the style o

In [28]:
def get_results(prompts, chain):
    """Run every prompt through ``chain`` one after another and collect the replies.

    Args:
        prompts: Iterable of prompt strings; each is passed to the chain as
            the value of the ``input`` key.
        chain: Object exposing ``invoke(dict)``.

    Returns:
        List with one ``chain.invoke`` result per prompt, in input order.
    """
    return [chain.invoke({"input": prompt_text}) for prompt_text in prompts]

# Send every tech prompt through the chain, one invoke call per prompt.
rewritten_tech = get_results(prompts_for_rewrite_tech, chain)

In [29]:
# Preview the first five raw replies for the tech prompts.
rewritten_tech[0:5]

['"Tech Tools"',
 '"Appreciates: 2 Disapprovals: 2 Adores: 0"',
 '"Why hesitate? Submit your application today to forge an incredible career and join a fantastic team."',
 '"We foster an environment of exploration and continually aim for enhancement and knowledge acquisition."',
 '"We cherish the opportunity to care for our exceptional talent at <ORGANIZATION> <ORGANIZATION>."']

In [30]:
def clean_results(res):
    """Strip one pair of wrapping quotes from each string.

    Each string is first unwrapped from a surrounding pair of double quotes
    and then, on the result, from a surrounding pair of single quotes.
    Strings without a matching pair at both ends are returned unchanged.

    Args:
        res: Iterable of strings.

    Returns:
        List of cleaned strings, in input order.
    """
    def _unwrap(text, quote):
        # A pair needs at least two characters: a string that is a single
        # quote character both starts and ends with the quote but is not
        # wrapped, so it must not be reduced to ''.
        if text.startswith(quote) and text.endswith(quote) and len(text) >= 2:
            return text[1:-1]
        return text

    return [_unwrap(_unwrap(s, '"'), '\'') for s in res]

# Remove the wrapping quotes from the tech replies.
cleaned_rewritten_tech = clean_results(rewritten_tech)

# Preview the first five cleaned rewrites.
cleaned_rewritten_tech[:5]

['Tech Tools',
 'Appreciates: 2 Disapprovals: 2 Adores: 0',
 'Why hesitate? Submit your application today to forge an incredible career and join a fantastic team.',
 'We foster an environment of exploration and continually aim for enhancement and knowledge acquisition.',
 'We cherish the opportunity to care for our exceptional talent at <ORGANIZATION> <ORGANIZATION>.']

In [31]:
# Persist the cleaned tech rewrites as a JSON list of strings.
tech_rewrites_dir = f"data/experiments/{experiment_name}"
# Create the experiment folder on first use; without this a new
# experiment_name fails with FileNotFoundError when the file is opened.
os.makedirs(tech_rewrites_dir, exist_ok=True)
with open(f"{tech_rewrites_dir}/{number_of_tech_sentences}_tech_rewrites.json", 'w', encoding='utf-8') as f:
    json.dump(cleaned_rewritten_tech, f)

In [46]:
# Sample entity-free house sentences and wrap each one in the rewrite prompt.
# NOTE(review): the sample is drawn from the global `random` state, so it depends
# on how many draws happened since random.seed(99) was last executed. The
# non-sequential execution counts suggest the seed cell was re-run just before
# this one; a top-to-bottom run would likely select different sentences - confirm.
prompts_for_rewrite_house = generate_rewrite_prompts(house_data_without_entities, number_of_house_sentences)

In [47]:
# Send every house prompt through the chain, one invoke call per prompt.
rewritten_house = get_results(prompts_for_rewrite_house, chain)

In [48]:
# Preview the first five raw replies for the house prompts.
rewritten_house[0:5]

['"<ORGANIZATION> thrives by possessing profound understanding of all Danish markets and partnering with international corporations such as <ORGANIZATION> in <LOCATION> and <ORGANIZATION> <ORGANIZATION> in <LOCATION>, in addition to subsidiary entities in <LOCATION> <LOCATION> and <LOCATION>."',
 '"In addition to marketing various brands, this company has also created their own merchandise."',
 '"We require a new team member in our Research and Development department to spearhead the embedded software development for <ORGANIZATION> product."',
 '"Allowance for which the Research Associate may be eligible could become available at the conclusion of the 15-month tenure."',
 '"The department fosters a global atmosphere with 40% of its workforce being international employees."']

In [49]:
# Remove the wrapping quotes from the house replies.
cleaned_rewritten_house = clean_results(rewritten_house)

# Preview the first five cleaned rewrites.
cleaned_rewritten_house[:5]

['<ORGANIZATION> thrives by possessing profound understanding of all Danish markets and partnering with international corporations such as <ORGANIZATION> in <LOCATION> and <ORGANIZATION> <ORGANIZATION> in <LOCATION>, in addition to subsidiary entities in <LOCATION> <LOCATION> and <LOCATION>.',
 'In addition to marketing various brands, this company has also created their own merchandise.',
 'We require a new team member in our Research and Development department to spearhead the embedded software development for <ORGANIZATION> product.',
 'Allowance for which the Research Associate may be eligible could become available at the conclusion of the 15-month tenure.',
 'The department fosters a global atmosphere with 40% of its workforce being international employees.']

In [50]:
# Persist the cleaned house rewrites as a JSON list of strings.
house_rewrites_dir = f"data/experiments/{experiment_name}"
# Create the experiment folder on first use; without this a new
# experiment_name fails with FileNotFoundError when the file is opened.
os.makedirs(house_rewrites_dir, exist_ok=True)
with open(f"{house_rewrites_dir}/{number_of_house_sentences}_house_rewrites.json", 'w', encoding='utf-8') as f:
    json.dump(cleaned_rewritten_house, f)