In [4]:
import json
import os
import random

import pandas as pd

# Seed the stdlib RNG so the random.sample draws made by later cells are
# repeatable when the notebook is executed top to bottom.
random.seed(42)

# Sub-folder of data/experiments/ that the output files are written to.
experiment_name = "second_experiment"
# Number of sentences sampled per source; also embedded in the output file names.
number_of_house_sentences = 221
number_of_tech_sentences = 423

In [5]:
import json

# Load the SkillSpan train.json file. The encoding is stated explicitly so the
# result does not depend on the platform's default text encoding.
with open('./data/SkillSpan/json/train.json', 'r', encoding='utf-8') as file:
    skillspan_raw_data = json.load(file)

In [6]:
def filter_json_data(data):
    """Split records into house and tech lists based on their 'source' field.

    Args:
        data: Iterable of dict-like records, each having a 'source' key.

    Returns:
        A ``(house_records, tech_records)`` tuple of lists. Records whose
        source is neither 'house' nor 'tech' are left out; input order is
        preserved within each list.
    """
    house_records, tech_records = [], []

    for record in data:
        origin = record['source']
        if origin == 'house':
            destination = house_records
        elif origin == 'tech':
            destination = tech_records
        else:
            # Any other source is ignored.
            continue
        destination.append(record)

    return house_records, tech_records

# Partition the raw records by their 'source' field.
house_data, tech_data = filter_json_data(skillspan_raw_data)

In [7]:
def annotate_sentence(tokens, tags_skill, tags_knowledge):
    """Render a tagged token sequence as one string with inline span markers.

    Skill spans are wrapped in ``@@`` and knowledge spans in ``##``. An
    opening marker is emitted on a ``'B'`` tag. A closing marker is emitted
    after a ``'B'``/``'I'`` token when the following tag is ``'O'`` or ``'B'``,
    or when the token is the last one. If both kinds open on the same token
    the skill marker is outermost (``@@##token##@@``).

    Args:
        tokens: Sequence of token strings.
        tags_skill: Per-token skill tags.
        tags_knowledge: Per-token knowledge tags.

    Returns:
        The annotated tokens joined by single spaces.
    """
    in_span = ('B', 'I')
    span_boundary = ('O', 'B')
    final_position = len(tokens) - 1
    rendered = []

    for position, (word, skill, knowledge) in enumerate(zip(tokens, tags_skill, tags_knowledge)):
        if position == final_position:
            # The end of the sentence terminates whatever span is open.
            knowledge_ends = skill_ends = True
        else:
            # A span ends when the next tag leaves it ('O') or starts a new one ('B').
            knowledge_ends = tags_knowledge[position + 1] in span_boundary
            skill_ends = tags_skill[position + 1] in span_boundary

        # Openers: skill before knowledge. Closers: knowledge before skill,
        # so the markers nest.
        opening = ("@@" if skill == 'B' else "") + ("##" if knowledge == 'B' else "")
        closing = (
            ("##" if knowledge in in_span and knowledge_ends else "")
            + ("@@" if skill in in_span and skill_ends else "")
        )
        rendered.append("".join((opening, word, closing)))

    return ' '.join(rendered)


idx = 105

# Sanity check: print one raw tech record, then let the cell display its
# annotated rendering.
example = tech_data[idx]
print(example)
annotate_sentence(example['tokens'], example['tags_skill'], example['tags_knowledge'])

{'idx': 2, 'tokens': ['*', 'Hands-on', 'experience', 'in', 'Core', 'Java', 'Spring', 'Boot', 'and', 'Microservices'], 'tags_skill': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tags_knowledge': ['O', 'O', 'O', 'O', 'B', 'I', 'B', 'I', 'O', 'B'], 'source': 'tech'}


'* Hands-on experience in ##Core Java## ##Spring Boot## and ##Microservices##'

In [8]:
def annotate_data(data):
    """Attach the highlighted sentence and entity counts to each entry, in place.

    For every entry this sets:
      * 'highlighted_sentence' - result of annotate_sentence on the entry's
        tokens and tag lists;
      * 'count_skills' - number of 'B' tags in 'tags_skill';
      * 'count_knowledge' - number of 'B' tags in 'tags_knowledge'.

    Args:
        data: List of dict entries with 'tokens', 'tags_skill' and
            'tags_knowledge' keys. The entries are mutated; nothing is returned.
    """
    for entry in data:
        words = entry['tokens']
        skill_tags = entry['tags_skill']
        knowledge_tags = entry['tags_knowledge']

        entry['highlighted_sentence'] = annotate_sentence(words, skill_tags, knowledge_tags)
        # Each span starts with exactly one 'B', so counting 'B' counts spans.
        entry['count_skills'] = skill_tags.count("B")
        entry['count_knowledge'] = knowledge_tags.count("B")


# Both lists are mutated in place: every entry gains 'highlighted_sentence',
# 'count_skills' and 'count_knowledge'.
annotate_data(tech_data)
annotate_data(house_data)

In [9]:
# Keep only the entries that contain no skill span and no knowledge span.
tech_data_without_entities = [
    obj for obj in tech_data
    if obj['count_skills'] == 0 and obj.get('count_knowledge', 0) == 0
]
house_data_without_entities = [
    obj for obj in house_data
    if obj['count_skills'] == 0 and obj.get('count_knowledge', 0) == 0
]

In [10]:
def generate_rewrite_prompts(data, n):
    """Build synonym-rewrite prompts for ``n`` randomly sampled entries.

    Args:
        data: Sequence of entries, each with a 'highlighted_sentence' key.
        n: How many entries to draw with ``random.sample``.

    Returns:
        A list of ``n`` prompt strings, one per sampled entry, in the order
        the entries were sampled.
    """
    template = (
        "Take this sentence from a Job Posting.\n\n"
        "Sentence: \"{sentence}\"\n\n"
        "Rewrite it by finding synonyms but not changing the meaning. "
        "Don't change the style of the text. The sentence should look like it is from the vacancy."
    )

    # The draw uses the module-level RNG, so results depend on its current state.
    return [
        template.format(sentence=chosen['highlighted_sentence'])
        for chosen in random.sample(data, n)
    ]

In [16]:
from dotenv import load_dotenv
import ast
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# NOTE(review): presumably loads the OpenAI credentials from a local .env file
# into the environment - confirm against the project setup.
load_dotenv()

# Fixed model snapshot with temperature 0.
llm = ChatOpenAI(model="gpt-4-0613", temperature=0)

# A single user message whose entire content is the {input} variable.
prompt = ChatPromptTemplate.from_messages([
    ("user", "{input}")
])

output_parser = StrOutputParser()

# Pipeline: prompt template -> model -> output parser.
chain = prompt | llm | output_parser

In [18]:
# Sample entity-free tech sentences and build one rewrite prompt per sentence.
prompts_for_rewrite_tech = generate_rewrite_prompts(tech_data_without_entities, number_of_tech_sentences)

# Preview the first few prompts.
prompts_for_rewrite_tech[:5]

['Take this sentence from a Job Posting.\n\nSentence: "Job benefits: -"\n\nRewrite it by finding synonyms but not changing the meaning. Don\'t change the style of the text. The sentence should look like it is from the vacancy.',
 'Take this sentence from a Job Posting.\n\nSentence: "To find out more about working with us search <ORGANIZATION> on social media ."\n\nRewrite it by finding synonyms but not changing the meaning. Don\'t change the style of the text. The sentence should look like it is from the vacancy.',
 'Take this sentence from a Job Posting.\n\nSentence: "<ORGANIZATION> <ORGANIZATION> <ORGANIZATION> <ORGANIZATION> <ORGANIZATION>"\n\nRewrite it by finding synonyms but not changing the meaning. Don\'t change the style of the text. The sentence should look like it is from the vacancy.',
 'Take this sentence from a Job Posting.\n\nSentence: "We value fluidity and shared work between teams and you will have plenty of opportunities to explore the different parts of our infrastr

In [19]:
def get_results(prompts, chain):
    """Send each prompt through ``chain`` one after another and collect the replies.

    Args:
        prompts: Iterable of prompt strings.
        chain: Object exposing ``invoke``; it is called as
            ``chain.invoke({"input": prompt})`` for every prompt.

    Returns:
        A list with the value returned by ``chain.invoke`` for each prompt,
        in the same order as ``prompts``.
    """
    return [chain.invoke({"input": text}) for text in prompts]

# Makes one chain.invoke call per tech prompt, sequentially.
rewritten_tech = get_results(prompts_for_rewrite_tech, chain)

In [20]:
# Preview the raw replies before any clean-up.
rewritten_tech[0:5]

['"Employment perks: -"',
 '"For additional insights about being part of our team, look up <ORGANIZATION> on social media."',
 '<COMPANY> <FIRM> <INSTITUTION> <CORPORATION> <ENTERPRISE>',
 '"We appreciate flexibility and collaborative efforts among teams, and you will have ample chances to delve into the various aspects of our infrastructure."',
 '<PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE>']

In [22]:
def clean_results(res):
    """Strip one pair of wrapping quotes from each string.

    Each string is first unwrapped from surrounding double quotes and the
    result is then unwrapped from surrounding single quotes. Strings that are
    not wrapped are returned unchanged.

    A string must be at least two characters long to count as wrapped. A lone
    quote character both starts and ends with the quote, but it is not a quoted
    pair and would otherwise be reduced to an empty string.

    Args:
        res: Iterable of strings.

    Returns:
        A new list with the cleaned strings, in input order.
    """
    def _unwrap(text, quote):
        # Remove the first and last character only when they form a real pair.
        if len(text) >= 2 and text.startswith(quote) and text.endswith(quote):
            return text[1:-1]
        return text

    return [_unwrap(_unwrap(s, '"'), '\'') for s in res]

# Remove the wrapping quotes from the tech replies.
cleaned_rewritten_tech = clean_results(rewritten_tech)

# Preview the cleaned replies.
cleaned_rewritten_tech[:5]

['Employment perks: -',
 'For additional insights about being part of our team, look up <ORGANIZATION> on social media.',
 '<COMPANY> <FIRM> <INSTITUTION> <CORPORATION> <ENTERPRISE>',
 'We appreciate flexibility and collaborative efforts among teams, and you will have ample chances to delve into the various aspects of our infrastructure.',
 '<PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE> <PLACE>']

In [25]:
# Persist the cleaned tech rewrites. The experiment directory is created on
# demand so that a new experiment_name does not fail with FileNotFoundError
# after all the LLM calls have already been made.
tech_rewrites_path = f"data/experiments/{experiment_name}/{number_of_tech_sentences}_tech_rewrites.json"
os.makedirs(os.path.dirname(tech_rewrites_path), exist_ok=True)
with open(tech_rewrites_path, 'w', encoding='utf-8') as f:
    json.dump(cleaned_rewritten_tech, f)

In [26]:
# Sample entity-free house sentences and build one rewrite prompt per sentence.
prompts_for_rewrite_house = generate_rewrite_prompts(house_data_without_entities, number_of_house_sentences)

In [28]:
# Send every sampled house prompt, not a preview slice, so the saved file
# really holds number_of_house_sentences rewrites as its file name states.
# This matches how the tech prompts are processed.
rewritten_house = get_results(prompts_for_rewrite_house, chain)

In [29]:
# Preview the raw replies before any clean-up.
rewritten_house[0:5]

['"Suggestions for a logical application of these forecasting techniques should finalize the project."',
 '"Acknowledging the potency of <ORGANIZATION> <ORGANIZATION> <ORGANIZATION> business framework and triumph, a formidable group of investors of <ORGANIZATION> in <LOCATION> has infused DKK 80M (approximately 12M USD) into the firm in 2019. With the company\'s value doubling in a mere year, <ORGANIZATION> <ORGANIZATION> <ORGANIZATION> is currently valued at 800 million DKK (roughly 120M USD)."',
 '"92% of our clientele receive a diagnosis and treatment strategy within sixty minutes, and in instances where surgery is required, the patient is also provided with a scheduled date and time for the procedure prior to departing the outpatient facility."',
 '"We are a vibrant, lively, and varied work environment promoting a candid and casual atmosphere, robust solidarity, and lofty aspirations."',
 '"In this initiative, a cross-disciplinary group will tackle this query utilizing cutting-edge

In [30]:
# Remove the wrapping quotes from the house replies.
cleaned_rewritten_house = clean_results(rewritten_house)

# Preview the cleaned replies.
cleaned_rewritten_house[:5]

['Suggestions for a logical application of these forecasting techniques should finalize the project.',
 "Acknowledging the potency of <ORGANIZATION> <ORGANIZATION> <ORGANIZATION> business framework and triumph, a formidable group of investors of <ORGANIZATION> in <LOCATION> has infused DKK 80M (approximately 12M USD) into the firm in 2019. With the company's value doubling in a mere year, <ORGANIZATION> <ORGANIZATION> <ORGANIZATION> is currently valued at 800 million DKK (roughly 120M USD).",
 '92% of our clientele receive a diagnosis and treatment strategy within sixty minutes, and in instances where surgery is required, the patient is also provided with a scheduled date and time for the procedure prior to departing the outpatient facility.',
 'We are a vibrant, lively, and varied work environment promoting a candid and casual atmosphere, robust solidarity, and lofty aspirations.',
 'In this initiative, a cross-disciplinary group will tackle this query utilizing cutting-edge organotyp

In [32]:
# Persist the cleaned house rewrites. The experiment directory is created on
# demand so that a new experiment_name does not fail with FileNotFoundError
# after the LLM calls have already been made.
house_rewrites_path = f"data/experiments/{experiment_name}/{number_of_house_sentences}_house_rewrites.json"
os.makedirs(os.path.dirname(house_rewrites_path), exist_ok=True)
with open(house_rewrites_path, 'w', encoding='utf-8') as f:
    json.dump(cleaned_rewritten_house, f)