In [1]:
import nltk
# Fetch the NLTK 'punkt' package into the local nltk_data directory.
# NOTE(review): no other cell visible in this notebook references nltk —
# confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andriimyronenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import pandas as pd
import json

base_path = 'data/SkillSpan/json/'
file_names = ['dev.json', 'train.json', 'test.json']

# One DataFrame per split, keyed by the file name without its extension
# ('dev', 'train', 'test').
dfs = {}

for file_name in file_names:
    file_path = base_path + file_name
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's locale encoding, which can mis-decode non-ASCII tokens.
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    # The parsed data is fully in memory, so the frame is built after the
    # file handle has been released.
    dfs[file_name.split('.')[0]] = pd.DataFrame(json_data)

dev_df = dfs['dev']
train_df = dfs['train']
test_df = dfs['test']

# Preview the first rows of each split.
print("Dev DataFrame:")
print(dev_df.head())

print("\nTrain DataFrame:")
print(train_df.head())

print("\nTest DataFrame:")
print(test_df.head())

Dev DataFrame:
   idx                                             tokens  \
0    1  [DevOps, Engineer, (, CI, CD, Cloud, Docker, J...   
1    1  [<ADDRESS>, <ADDRESS>, <LOCATION>, -, <LOCATION>]   
2    1                        [Date, posted:, 2021-04-22]   
3    1                [Likes:, 0, Dislikes:, 0, Love:, 0]   
4    1                                [Job, description:]   

                       tags_skill                  tags_knowledge source  
0  [O, O, O, O, O, O, O, O, O, O]  [O, O, O, O, O, O, O, O, O, O]   tech  
1                 [O, O, O, O, O]                 [O, O, O, O, O]   tech  
2                       [O, O, O]                       [O, O, O]   tech  
3              [O, O, O, O, O, O]              [O, O, O, O, O, O]   tech  
4                          [O, O]                          [O, O]   tech  

Train DataFrame:
   idx                                             tokens  \
0    1  [Senior, QA, Engineer, (, m/f/d, ), <ORGANIZAT...   
1    1  [<ADDRESS>, <ADDRESS

In [18]:
# Record on every row which split it came from, so the origin survives the merge.
dev_df['part'], train_df['part'], test_df['part'] = 'dev', 'train', 'test'

# Stack dev, train and test (in that order) into one frame with a fresh index.
combined_df = pd.concat(objs=[dev_df, train_df, test_df], ignore_index=True)

Unnamed: 0,idx,tokens,tags_skill,tags_knowledge,source,part
0,1,"[DevOps, Engineer, (, CI, CD, Cloud, Docker, J...","[O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O]",tech,dev
1,1,"[<ADDRESS>, <ADDRESS>, <LOCATION>, -, <LOCATION>]","[O, O, O, O, O]","[O, O, O, O, O]",tech,dev
2,1,"[Date, posted:, 2021-04-22]","[O, O, O]","[O, O, O]",tech,dev
3,1,"[Likes:, 0, Dislikes:, 0, Love:, 0]","[O, O, O, O, O, O]","[O, O, O, O, O, O]",tech,dev
4,1,"[Job, description:]","[O, O]","[O, O]",tech,dev


In [5]:
def bio_tags_to_spans(bio_tags):
    """
    Turn a sequence of BIO tags into the list of entity spans it encodes.

    A span opens at a tag starting with 'B' and is extended by each directly
    following tag starting with 'I'. Any other tag (or a new 'B') closes it.
    An 'I' tag that does not follow an open span is ignored.

    Parameters:
    - bio_tags: A list of strings, one BIO tag per token.

    Returns:
    - A list of (start, end) tuples of token indices, in order of appearance.
      Both indices are inclusive; a single-token entity has start == end.
    """
    found = []
    span_start = None  # index of the 'B' tag of the span being built, if any
    span_end = None    # index of the last token known to belong to that span

    for position, label in enumerate(bio_tags):
        opens_span = label.startswith('B')

        # An 'I' tag only counts while a span is open.
        if span_start is not None and not opens_span and label.startswith('I'):
            span_end = position
            continue

        # Every other tag terminates whatever span was open.
        if span_start is not None:
            found.append((span_start, span_end))
            span_start = span_end = None

        if opens_span:
            span_start = span_end = position

    # A span still open at the end of the sequence runs to its last token.
    if span_start is not None:
        found.append((span_start, span_end))

    return found


In [62]:
def process_entity(index, row, entity_label, accumulator):
    """
    Build one record for a row and entity type, and append it to ``accumulator``.

    Parameters:
    - index: Row identifier; becomes the first part of ``qas_id``.
    - row: Mapping-like object providing 'tokens', 'tags_skill',
      'tags_knowledge', 'source' and 'part'.
    - entity_label: "SKL" selects the 'tags_skill' column and qas_id suffix 0;
      any other value selects 'tags_knowledge' and suffix 1.
    - accumulator: List that receives the new record (mutated in place).

    Returns:
    - None. The record is a dict with the tokens joined by spaces as
      'context', parallel 'start_position' / 'end_position' lists, and
      'span_position' strings of the form "start;end".
    """
    is_skill = entity_label == "SKL"
    context = ' '.join(row['tokens'])

    # Spans are (start, end) token-index pairs taken from the matching tag column.
    spans = bio_tags_to_spans(row["tags_skill" if is_skill else "tags_knowledge"])

    accumulator.append({
        "context": context,
        "entity_label": entity_label,
        "qas_id": f"{index}.{0 if is_skill else 1}",
        "end_position": [last for _, last in spans],
        "span_position": [f"{first};{last}" for first, last in spans],
        "start_position": [first for first, _ in spans],
        "source": row["source"],
        "part": row["part"],
    })

def process_row(index, row, accumulator):
    """Append two records for ``row`` to ``accumulator``: "SKL" first, then "KNG"."""
    for entity_label in ("SKL", "KNG"):
        process_entity(index, row, entity_label, accumulator)
    
def process_dataframe(df):
    """
    Run ``process_row`` over every row of ``df`` and collect the records.

    Parameters:
    - df: DataFrame iterated with ``iterrows()``; each row's index label is
      passed along as the record index.

    Returns:
    - A list with the records appended by ``process_row``, in row order.
    """
    records = []

    for row_label, row in df.iterrows():
        process_row(row_label, row, records)

    return records

In [66]:
# Convert every row of the merged frame into its records.
mrc_list = process_dataframe(df=combined_df)

len(mrc_list)

23086

In [68]:
# Training set: every record whose part is anything other than "test".
mrc_train = list(filter(lambda record: record.get("part") != "test", mrc_list))

len(mrc_train)

15948

In [69]:
# Test set: only the records whose part is "test".
mrc_test = list(filter(lambda record: record.get("part") == "test", mrc_list))

len(mrc_test)

7138

In [81]:
def save_file(path, json_data):
    """
    Serialize ``json_data`` as JSON and write it to ``path``.

    Missing parent directories of ``path`` are created first, so the call
    also works when the output folder does not exist yet.

    Parameters:
    - path: Destination file path; an existing file is overwritten.
    - json_data: Object to serialize with ``json.dump``.
    """
    import os  # local import: only needed here for directory handling

    parent_dir = os.path.dirname(path)
    # dirname is empty for a bare file name, and os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as file:
        json.dump(json_data, file)

In [82]:
# Persist the full test set.
save_file(path='./data/merged/mrc-ner.test.all', json_data=mrc_test)

In [83]:
# Persist the full training set.
save_file(path='./data/merged/mrc-ner.train.all', json_data=mrc_train)

In [88]:
# Non-test records whose source is "tech".
mrc_train_tech = [
    record
    for record in mrc_list
    if record.get("part") != "test" and record.get("source") == "tech"
]

len(mrc_train_tech)

10578

In [89]:
# Test records whose source is "tech".
mrc_test_tech = [
    record
    for record in mrc_list
    if record.get("part") == "test" and record.get("source") == "tech"
]

len(mrc_test_tech)

4572

In [90]:
# Persist the "tech" test subset.
save_file(path='./data/merged/mrc-ner.test.tech', json_data=mrc_test_tech)

In [91]:
# Persist the "tech" training subset.
save_file(path='./data/merged/mrc-ner.train.tech', json_data=mrc_train_tech)

In [106]:
# Non-test records whose source is "house".
mrc_train_house = list(
    filter(
        lambda record: record.get("part") != "test" and record.get("source") == "house",
        mrc_list,
    )
)

len(mrc_train_house)

5370

In [108]:
# Test records whose source is "house".
mrc_test_house = list(
    filter(
        lambda record: record.get("part") == "test" and record.get("source") == "house",
        mrc_list,
    )
)

len(mrc_test_house)

2566

In [109]:
# Persist the "house" test subset.
save_file(path='./data/merged/mrc-ner.test.house', json_data=mrc_test_house)

In [110]:
# Persist the "house" training subset.
save_file(path='./data/merged/mrc-ner.train.house', json_data=mrc_train_house)

In [100]:
# Keep only training records with a non-empty "span_position" value.
train_items_with_spans = list(
    filter(lambda record: record.get("span_position"), mrc_train)
)

len(train_items_with_spans)

2993

In [101]:
# Keep only test records with a non-empty "span_position" value.
test_items_with_spans = list(
    filter(lambda record: record.get("span_position"), mrc_test)
)

len(test_items_with_spans)

1095

In [104]:
import random

# Fixed seed so the sampled subsets come out the same on every run.
RANDOM_SEED = 42
# Dedicated generator: seeding it leaves the global `random` state untouched.
_sample_rng = random.Random(RANDOM_SEED)

# Cap each sample size by the length of the list actually being sampled;
# sample() raises ValueError when asked for more items than the list holds.
random_train = _sample_rng.sample(
    train_items_with_spans, min(len(train_items_with_spans), 100)
)
random_test = _sample_rng.sample(
    test_items_with_spans, min(len(test_items_with_spans), 20)
)

In [113]:
# Persist the randomly sampled test records.
save_file(path='./data/merged/mrc-ner.test.random', json_data=random_test)

In [112]:
# Persist the randomly sampled training records.
save_file(path='./data/merged/mrc-ner.train.random', json_data=random_train)