In [None]:
# Toggle: True when the project lives on Google Drive (Colab run),
# False when the notebook is run from a local checkout.
USE_DRIVE = True
if not USE_DRIVE:
    # Local run: the project root is one level above the working directory.
    project_path  = "../"
else:
    # Colab run: the project root is a folder inside the mounted Drive.
    project_path  = "drive/MyDrive/EFREI_CAMP/"
    from google.colab import drive
    # force_remount=True re-mounts the Drive even if it is already mounted.
    drive.mount('/content/drive', force_remount=True)

In [None]:
import requests
import pandas as pd
import datetime
import os
from sklearn.model_selection import train_test_split
from difflib import SequenceMatcher
import json

## Edit `Notebooks/labelstudio_auth.json` (set `URL`, `PROJECT_ID` and `TOKEN`) before running the cells below

In [None]:
# Load the Label Studio connection settings from the JSON file edited by the user.
with open(os.path.join(project_path, 'Notebooks/labelstudio_auth.json')) as json_file:
    labelstudio_auth = json.load(json_file)
# Echo the loaded settings as a sanity check, but leave the API token out:
# cell outputs are stored with the notebook and are easily shared by accident.
print({key: value for key, value in labelstudio_auth.items() if key != "TOKEN"})

# Label Studio project config
URL = labelstudio_auth["URL"]
PROJECT_ID = labelstudio_auth["PROJECT_ID"]
TOKEN = labelstudio_auth["TOKEN"]

# Export endpoint returning the project's tasks as JSON.
# Built with string formatting rather than os.path.join: a URL is not a
# filesystem path (os.path.join uses "\" on Windows and rejects a numeric
# PROJECT_ID). A trailing "/" on URL is dropped to avoid a "//" in the result.
URL_TASKS_GET = "{}/api/projects/{}/export?exportType=JSON".format(URL.rstrip("/"), PROJECT_ID)

# One sub-folder per run, named after the current date and time (ddmmYYYY_HHMM)
timestamp_folder = datetime.datetime.now().strftime('%d%m%Y_%H%M')

# Path used to save data for training
training_path = os.path.join(project_path, "training/data/labeled", timestamp_folder)

In [None]:
def GET_POST_(TOKEN=None, URL=None, Type="GET", Json=None):
    """
    Send an authenticated HTTP request with the ``requests`` library.

    params:
        - TOKEN (str): Auth. token, sent as the ``Authorization: Token <TOKEN>`` header
        - URL (str): full URL of the endpoint to call
        - Type (str): GET, POST or PATCH
        - Json: optional JSON-serialisable payload sent as the body of a POST
          or PATCH request; ignored for GET. A falsy value means "no body".
    returns:
        - the decoded JSON body for GET; the raw response object for POST and PATCH
    raises:
        - ValueError: if Type is not one of GET, POST, PATCH
        - requests.HTTPError: if a GET request is answered with an error status
    """
    headers = {
        "Authorization": "Token {}".format(TOKEN),
    }
    # Keep the historical rule that an empty payload sends no body at all.
    payload = Json if Json else None

    if Type == "GET":
        response = requests.get(URL, headers=headers)
        # Fail loudly instead of returning an error document as if it were data.
        response.raise_for_status()
        return response.json()
    if Type == "POST":
        return requests.post(URL, headers=headers, json=payload)
    if Type == "PATCH":
        # The payload used to be dropped here, which made PATCH calls no-ops.
        return requests.patch(URL, headers=headers, json=payload)
    # Previously an unknown Type silently returned None.
    raise ValueError(
        "Unsupported request type {!r}: expected 'GET', 'POST' or 'PATCH'".format(Type)
    )

def load_data_from_json(data=None):
    """
    Load your labeled data exported using Label Studio in json format
    Convert the Label Studio JSON format to Spacy format so that
    the same can be fed into Spacy / Flair NER models

    params:
        - data (list): exported tasks; each task is a dict holding the text
          under ``task['data']['text']`` and its annotations under
          ``task['annotations']``. ``None`` is treated as an empty list.
    returns:
        - list of ``(text, {'entities': [(start, end, label), ...]})`` tuples,
          one per annotated task

    Only the first annotation of each task is used. Result items without a
    ``start`` offset in their ``value`` are ignored. Tasks without any
    annotation are skipped: they carry no labels, and indexing their empty
    annotation list used to raise an IndexError.
    """
    TRAIN_DATA_formulaire = []
    for task in data or []:
        annotations = task.get('annotations')
        if not annotations:
            continue
        text = task['data']['text']
        entities = []
        for region in annotations[0]['result']:
            value = region['value']
            if 'start' in value:
                # Only the first label of a region is kept.
                entities.append((value['start'], value['end'], value['labels'][0]))
        TRAIN_DATA_formulaire.append((text, {'entities': entities}))
    return TRAIN_DATA_formulaire

def create_df_from_data(TRAIN_DATA=None):
    """
    Turn ``(text, {'entities': [(start, end, label), ...]})`` items into a DataFrame.

    returns:
        - DataFrame with one row per item and two columns:
          ``text`` (the sentence) and ``annotation`` (a list of
          ``(entity_text, label)`` pairs, where ``entity_text`` is the slice
          ``text[start:end]``)
    """
    sentences = []
    labelled_spans = []
    for row in TRAIN_DATA:
        sentence = row[0]
        pairs = [
            (sentence[span[0]:span[1]], span[2])
            for span in row[1]["entities"]
        ]
        sentences.append(sentence)
        labelled_spans.append(pairs)

    return pd.DataFrame({'text': sentences, 'annotation': labelled_spans})

def matcher(string, pattern):
    '''
    Locate the first exact occurrence of ``pattern`` in ``string``.

    Leading and trailing whitespace of ``pattern`` is ignored.

    Returns a ``(match_list, string)`` pair:
        - match_list: ``[(start, end)]`` for the first occurrence (``end`` is
          exclusive), or ``[]`` when the pattern is absent
        - string: the input with that occurrence replaced by ``"X"``
          characters of the same length (so offsets are preserved and a
          second call finds the next occurrence), or unchanged if absent
    '''
    match_list = []
    pattern = pattern.strip()
    # An exact-substring lookup: str.find gives the same (earliest) position
    # as a longest-match search requiring a full-length match, in linear time.
    start = string.find(pattern)
    if start != -1:
        end = start + len(pattern)
        match_list.append((start, end))
        string = string[:start] + "X" * len(pattern) + string[end:]
    return match_list, string

def mark_sentence(s, match_list):
    '''
    Marks all the entities in the sentence as per the BIO scheme.

    params:
        - s (str): the sentence; tokens are its whitespace-separated words
        - match_list: iterable of ``(start, end, e_type)`` character spans
          (``end`` exclusive) with the entity type of each span
    returns:
        - dict mapping each token to its tag: ``'B-<type>'`` for the first
          token touched by a span, ``'I-<type>'`` for the following tokens of
          the same span, ``'O'`` for tokens outside every span

    A token is tagged when it overlaps a span, so a span covering only part of
    a token (e.g. ``Paris`` inside ``Paris,``) tags that token instead of
    adding a key that is not a token of the sentence. Empty spans tag nothing.

    Because the result is keyed by token text, a word occurring several times
    in the sentence has a single entry (the last tag written wins).
    '''
    # Whitespace tokens together with their character offsets in s.
    tokens = []
    cursor = 0
    for word in s.split():
        begin = s.index(word, cursor)
        cursor = begin + len(word)
        tokens.append((begin, cursor, word))

    word_dict = {word: 'O' for _, _, word in tokens}
    for start, end, e_type in match_list:
        prefix = 'B-'
        for begin, stop, word in tokens:
            if begin < end and stop > start:
                word_dict[word] = prefix + e_type
                prefix = 'I-'
    return word_dict


def create_data(df, filepath):
    '''
    Write the rows of ``df`` to ``filepath`` as BIO-tagged text.

    params:
        - df: object exposing ``text`` and ``annotation`` columns; each
          annotation is a list of ``(entity_text, label)`` pairs
        - filepath (str): destination file, overwritten if it exists

    For every row, each entity text is located in the sentence with
    ``matcher``, the sentence is tagged with ``mark_sentence``, and one
    ``"<token> <tag>"`` line is written per entry of the returned mapping.
    A blank line separates consecutive sentences.
    '''
    # UTF-8 is explicit (as in save_json) so non-ASCII text does not depend on
    # the platform's default encoding.
    with open(filepath, 'w', encoding='utf-8') as f:
        for text, annotation in zip(df.text, df.annotation):
            # Working copy in which matcher masks each entity once located, so
            # an entity text appearing twice resolves to its next occurrence
            # instead of the first one again.
            masked_text = text
            match_list = []
            for entity in annotation:
                spans, masked_text = matcher(masked_text, entity[0])
                if not spans:
                    # Hidden by an earlier mask (e.g. nested entities): fall
                    # back to searching the untouched sentence.
                    spans, _ = matcher(text, entity[0])
                if not spans:
                    # Entity text absent from the sentence: nothing to tag.
                    continue
                match_list.append((spans[0][0], spans[0][1], entity[1]))
            d = mark_sentence(text, match_list)
            for token, tag in d.items():
                f.write(token + ' ' + tag + '\n')
            f.write('\n')

            
def save_json(data=None ,filepath=None):
    """
    Serialise ``data`` as JSON into ``filepath`` (UTF-8, overwriting the file).

    Non-ASCII characters are written as-is rather than escaped, and the
    output is indented with 4 spaces.
    """
    serialised = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write(serialised)


#### Expected output (in any order): `['train.json', 'dev.json', 'test.json', 'all.json', 'all_data.txt', 'train.txt', 'dev.txt', 'test.txt']`

In [None]:
# Download the project's exported tasks from Label Studio
JSON_data = GET_POST_(TOKEN=TOKEN, URL=URL_TASKS_GET, Type="GET")

# Split into train 80% / dev 10% / test 10%:
# 10% of the data is held out for test first, then 0.1/0.9 of the remaining
# 90% (i.e. 10% of the whole) is held out for dev.
# https://datascience.stackexchange.com/a/15136
train_json, test_json = train_test_split(JSON_data, test_size=0.1, random_state=1)
train_json, dev_json = train_test_split(train_json, test_size=0.1/0.9, random_state=1)

# Save files in json and text format
os.makedirs(training_path, exist_ok=True)

_json_outputs = {
    'train.json': train_json,
    'dev.json': dev_json,
    'test.json': test_json,
    'all.json': JSON_data,
}
for _file_name, _tasks in _json_outputs.items():
    save_json(data=_tasks, filepath=os.path.join(training_path, _file_name))

df = create_df_from_data(load_data_from_json(data=JSON_data))
train = create_df_from_data(load_data_from_json(data=train_json))
dev = create_df_from_data(load_data_from_json(data=dev_json))
test = create_df_from_data(load_data_from_json(data=test_json))

_text_outputs = {
    'all_data.txt': df,
    'train.txt': train,
    'dev.txt': dev,
    'test.txt': test,
}
for _file_name, _frame in _text_outputs.items():
    create_data(_frame, os.path.join(training_path, _file_name))

In [None]:
# Display the names of the files now present in the training folder
os.listdir(training_path)