In [1]:
from dotenv import load_dotenv

load_dotenv()

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andriimyronenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import pandas as pd
import json

base_path = 'data/SkillSpan/json/'
file_names = ['dev.json', 'train.json', 'test.json']

dfs = {}

for file_name in file_names:
    file_path = base_path + file_name
    with open(file_path, 'r') as file:
        json_data = json.load(file)
        dfs[file_name.split('.')[0]] = pd.DataFrame(json_data)

dev_df = dfs['dev']
train_df = dfs['train']
test_df = dfs['test']

print("Dev DataFrame:")
print(dev_df.head())

print("\nTrain DataFrame:")
print(train_df.head())

print("\nTest DataFrame:")
print(test_df.head())

Dev DataFrame:
   idx                                             tokens  \
0    1  [DevOps, Engineer, (, CI, CD, Cloud, Docker, J...   
1    1  [<ADDRESS>, <ADDRESS>, <LOCATION>, -, <LOCATION>]   
2    1                        [Date, posted:, 2021-04-22]   
3    1                [Likes:, 0, Dislikes:, 0, Love:, 0]   
4    1                                [Job, description:]   

                       tags_skill                  tags_knowledge source  
0  [O, O, O, O, O, O, O, O, O, O]  [O, O, O, O, O, O, O, O, O, O]   tech  
1                 [O, O, O, O, O]                 [O, O, O, O, O]   tech  
2                       [O, O, O]                       [O, O, O]   tech  
3              [O, O, O, O, O, O]              [O, O, O, O, O, O]   tech  
4                          [O, O]                          [O, O]   tech  

Train DataFrame:
   idx                                             tokens  \
0    1  [Senior, QA, Engineer, (, m/f/d, ), <ORGANIZAT...   
1    1  [<ADDRESS>, <ADDRESS

In [3]:
def process_entries(group):
    combined_text = []
    combined_skills = []
    combined_knowledge = []
    source = None

    for _, row in group.iterrows():
        combined_text.extend(row['tokens'])
        
        temp_skills = []
        temp_knowledge = []
        
        for token, skill_tag, knowledge_tag in zip(row['tokens'], row['tags_skill'], row['tags_knowledge']):
            if skill_tag.startswith('B'):
                temp_skills.append(token)
            elif skill_tag.startswith('I') and temp_skills:
                temp_skills[-1] += f' {token}'
            
            if knowledge_tag.startswith('B'):
                temp_knowledge.append(token)
            elif knowledge_tag.startswith('I') and temp_knowledge:
                temp_knowledge[-1] += f' {token}'

        if temp_skills:
            combined_skills.extend(temp_skills)
        if temp_knowledge:
            combined_knowledge.extend(temp_knowledge)
        source = row['source']

    text = ' '.join(combined_text)
    
    return pd.Series([text, combined_skills, combined_knowledge, source], index=['text', 'skills', 'knowledge', 'source'])

def process_dataframe(df):
    grouped_df = df.groupby('idx').apply(process_entries).reset_index()
    return grouped_df

processed_dev_df = process_dataframe(dev_df)
processed_train_df = process_dataframe(train_df)
processed_test_df = process_dataframe(test_df)

final_df = pd.concat([processed_dev_df, processed_train_df, processed_test_df], ignore_index=True)

processed_test_df.head()

Unnamed: 0,idx,text,skills,knowledge,source
0,1,Full Stack Software Engineer - Java / JavaScri...,"[solving business problems, apply your depth o...","[javascript, reactjs, java, javascript, reactj...",tech
1,2,Software Engineer - Java <ORGANIZATION> <ORGAN...,"[solving business problems, apply your depth o...","[java, spring, java-ee, java, spring, java-ee,...",tech
2,3,DevOps Ninja Engineer ( . NET/Azure ) <ORGANIZ...,"[Open for continuous change, Enhance and maint...","[Financial Technology, SaaS, wealthtech, azure...",tech
3,4,Principle Software Engineer | Java <ORGANIZATI...,"[solving business problems, apply your depth o...","[Financial Services, java, reactjs, web-servic...",tech
4,5,Python Software Engineer Intern for <ORGANIZAT...,[build highly efficient accurate and scalable ...,"[Artificial Intelligence, Computer Graphics, S...",tech


In [4]:
test_df_tech_only = processed_test_df[processed_test_df['source'] == 'tech']
test_df_tech_only.to_csv('./data/test_df_tech_only_combined.csv', index=False)

test_df_tech_only

Unnamed: 0,idx,text,skills,knowledge,source
0,1,Full Stack Software Engineer - Java / JavaScri...,"[solving business problems, apply your depth o...","[javascript, reactjs, java, javascript, reactj...",tech
1,2,Software Engineer - Java <ORGANIZATION> <ORGAN...,"[solving business problems, apply your depth o...","[java, spring, java-ee, java, spring, java-ee,...",tech
2,3,DevOps Ninja Engineer ( . NET/Azure ) <ORGANIZ...,"[Open for continuous change, Enhance and maint...","[Financial Technology, SaaS, wealthtech, azure...",tech
3,4,Principle Software Engineer | Java <ORGANIZATI...,"[solving business problems, apply your depth o...","[Financial Services, java, reactjs, web-servic...",tech
4,5,Python Software Engineer Intern for <ORGANIZAT...,[build highly efficient accurate and scalable ...,"[Artificial Intelligence, Computer Graphics, S...",tech
5,6,Senior Ruby on Rails Developer 12.000-18.000 P...,"[gather technical requirements, architect solu...","[e-commerce, open-source, ruby-on-rails, sql, ...",tech
6,7,UI Software Engineer <ORGANIZATION> <ORGANIZAT...,"[Deliver initiatives, modernizing and transfor...","[Financial Services, reactjs, cloud, python, F...",tech
7,8,Backend Engineer ( m/f/d ) <ORGANIZATION> <ORG...,"[build software, help our customers, build and...","[django, postgresql, redis, django, postgresql...",tech
8,9,BI Developer <LOCATION> <LOCATION> <LOCATION> ...,"[building front-end layer, Design and build fr...","[Power BI reports, DAX queries, Azure Analysis...",tech
9,10,System Architect for our new Digital Developme...,"[making the right decisions, definition & comm...","[Agile Software Development, Financial Technol...",tech


In [6]:
def extract_columns(row):
    print(row['text'])
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
    print(row['skills'])
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
    print(row['knowledge'])
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n')

train_df_tech_only = processed_train_df[processed_train_df['source'] == 'tech']
train_df_tech_only.apply(extract_columns, axis=1)

Senior QA Engineer ( m/f/d ) <ORGANIZATION> <ADDRESS> <ADDRESS> <ADDRESS> <ADDRESS> <LOCATION> Date posted: 2021-07-14 Likes: 0 Dislikes: 0 Love: 0 Job description: Location options: Remote Visa sponsor Paid relocation Job type: Full-time Experience level: Senior Role: QA/Test Developer Industry: Business to Business Information Technology Web Technology Company size: 501-1k people Company type: Private Technologies docker agile selenium circleci jenkins Job description In order to support our ongoing international growth we are looking for a Senior QA Engineer to join our Engineering department . You will be working in an end-to-end cross-functional team being responsible for implementing and promoting all QA relevant topics on team level . Responsibilities Design and implement complex end-to-end tests . Work hands-on together with the other engineers within the Agile team - to ensure continuous quality delivery of automated acceptance API and performance tests - while constantly coll

0     None
1     None
2     None
3     None
4     None
      ... 
75    None
76    None
77    None
78    None
79    None
Length: 80, dtype: object