In [7]:
# Do not rely on cached imports: with autoreload mode 2, edited modules
# (e.g. the project code under modules/) are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# built-in
import sys
import pickle

from pandas import DataFrame, concat
from pathlib import Path
from datasets import Dataset, DatasetDict

In [9]:
# Make the parent directory importable so that `modules.ner` can be found.
# The path is relative to the kernel's working directory.
# Guarded so that re-running this cell does not keep appending duplicates.
if '../' not in sys.path:
    sys.path.append('../')

In [10]:
from modules.ner import NamedEntities

In [11]:
# Project helper from modules.ner; its initialize_labels() and build_dataset()
# methods are used in the cells below.
ner = NamedEntities()

# Initialize every word with the label 0 for subsequent manual annotation

## One-Column PDF structure

In [215]:
# Source folder for the one-column papers and destination folder for the
# initialized (all-zero label) annotation files.
# NOTE(review): unlike src_path_two_column, this source path has no trailing
# slash — confirm that initialize_labels handles both forms.
src_path_one_column = '../corpus/txt/one_column_structure'
dst_path_one_column = '../corpus/annotation/initialized/'

In [216]:
# Run label initialization for the one-column papers; the call returns 'Done'
# (shown as the cell output). Presumably it writes 0-labelled tokens into the
# destination folder, per the section heading — confirm in modules/ner.
ner.initialize_labels(src_path_one_column, dst_path_one_column)

'Done'

## Two-Column PDF structure

In [217]:
# Source folder for the two-column papers. The destination is the same folder
# that the one-column run writes to.
# NOTE(review): both runs share one output folder — verify that file names
# cannot collide between the two sources.
src_path_two_column = '../corpus/txt/two_column_structure/'
dst_path_two_column = '../corpus/annotation/initialized/'

In [218]:
# Same initialization step as for the one-column papers, applied to the
# two-column source folder; returns 'Done' (shown as the cell output).
ner.initialize_labels(src_path_two_column, dst_path_two_column)

'Done'

# Build dataset from annotated .txt files

In [14]:
# Folder holding the annotated .txt files that the dataset is built from.
src_path_annotated_txt = '../corpus/annotation/done'

In [15]:
# Build the dataset from the annotated files. Later cells index the result as
# dataset[<field>][<paper index>][<sentence index>], where <field> is one of
# 'papers_tokens', 'papers_labels' or 'papers_tags'.
dataset = ner.build_dataset(src_path_annotated_txt)

The dataset has been successfully built!


In [19]:
# Choose a field ('papers_tokens', 'papers_labels' or 'papers_tags'), then a paper and a sentence of that paper.
# Here we take the tokens of the first paper (index 0) at sentence index 199, i.e. its 200th sentence.

dataset['papers_tokens'][0][199]

['5.4',
 'Ensemble',
 'of',
 'trees',
 'using',
 'LPBoost',
 'Boosting',
 'is',
 'another',
 'ensemble',
 'method',
 'used',
 'to',
 'enhance',
 'the',
 'performance',
 'of',
 'weak',
 'learners',
 'i.e.',
 'trees',
 '.']

In [267]:
# Integer labels of the same sentence (paper 0, sentence index 199).
# The output has one label per token of that sentence.
dataset['papers_labels'][0][199]

[0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# Save the dataset as a simple python dict

In [365]:
# Destination of the pickled plain-dict version of the dataset.
dataset_dst_path = '../dataset/few_shot_learning_dataset_py_dict.pkl'

In [366]:
# Serialize the plain-dict dataset with pickle.
# Create the destination folder first so that a fresh checkout without the
# ../dataset/ folder does not fail with FileNotFoundError.
Path(dataset_dst_path).parent.mkdir(parents=True, exist_ok=True)

with open(dataset_dst_path, 'wb') as file:
    pickle.dump(dataset, file)

# Transform the dataset to pandas

1. Convert each paper into a separate dictionary
2. Build a DataFrame for each paper
3. Build three DatasetDicts:
   2,3: train, 1: test
   1,3: train, 2: test
   1,2: train, 3: test

In [304]:
# Flatten the per-paper lists into one long DataFrame with one row per
# sentence; the 'paper' column records which paper each row came from.
#
# The per-paper frames are collected in a list and concatenated once.
# Calling concat inside the loop re-copies all accumulated rows on every
# iteration, and concatenating onto an empty DataFrame relies on dtype
# handling that recent pandas versions deprecate.
per_paper_frames = []

for idx_paper, paper_tokens in enumerate(dataset['papers_tokens']):

    paper_dict = {
        'paper': [idx_paper] * len(paper_tokens),
        'tokens': paper_tokens,
        'labels': dataset['papers_labels'][idx_paper],
        'tags': dataset['papers_tags'][idx_paper]
    }

    paper_df = DataFrame(paper_dict)
    per_paper_frames.append(paper_df)

# concat() raises on an empty list, so keep the original result for an empty
# dataset: an empty DataFrame.
if per_paper_frames:
    all_papers_df = concat(per_paper_frames, ignore_index=True)
else:
    all_papers_df = DataFrame()

In [None]:
# TODO
# Do everything in a loop: iterate over all_papers_df['paper'] == 0, 1, 2.
# Keep the DatasetDicts in a list/dict instead of saving every dataset in a separate variable.

In [325]:
# Each paper to pandas
# Split the combined frame back into one frame per paper, each with a fresh
# 0-based index, then print the number of rows in each.
# (The dangling `for paper in` line was an incomplete statement that made the
# whole cell a SyntaxError; it is removed.)
first_paper_df, second_paper_df, third_paper_df = (
    all_papers_df.loc[all_papers_df['paper'] == idx_paper].reset_index(drop=True)
    for idx_paper in range(3)
)

for single_paper_df in (first_paper_df, second_paper_df, third_paper_df):
    print(len(single_paper_df))

315
802
320


# Transform to DatasetDict

## First of all, build DatasetDict with all papers as train data

In [388]:
# All papers together as a single 'train' split (no test split).
all_papers_dd = DatasetDict({'train': Dataset.from_pandas(all_papers_df)})

## Transform each paper to a Dataset, so that the three papers can later be combined into train and test splits

In [389]:
# One Dataset per paper; each one serves as the test split of one fold below.
frst_paper_test_ds, scnd_paper_test_ds, thrd_paper_test_ds = (
    Dataset.from_pandas(held_out_df)
    for held_out_df in (first_paper_df, second_paper_df, third_paper_df)
)

## Train: 2,3
## Test: 1

In [390]:
# Training data for the fold that holds out paper 1: papers 2 and 3 stacked
# with a fresh index, then converted to a Dataset.
scnd_thrd_papers_train_df = concat(
    (second_paper_df, third_paper_df),
    ignore_index=True,
)
scnd_thrd_papers_train_ds = Dataset.from_pandas(scnd_thrd_papers_train_df)

In [391]:
# Fold: train on papers 2 and 3, test on paper 1.
scnd_thrd_train_frst_test_dd = DatasetDict(
    {
        'train': scnd_thrd_papers_train_ds,
        'test': frst_paper_test_ds,
    }
)
# Bare last expression so the notebook displays the split sizes.
scnd_thrd_train_frst_test_dd

DatasetDict({
    train: Dataset({
        features: ['paper', 'tokens', 'labels', 'tags'],
        num_rows: 1122
    })
    test: Dataset({
        features: ['paper', 'tokens', 'labels', 'tags'],
        num_rows: 315
    })
})

## Train: 1,3
## Test: 2

In [392]:
# Training data for the fold that holds out paper 2: papers 1 and 3 stacked
# with a fresh index, then converted to a Dataset.
frst_thrd_papers_train_df = concat(
    (first_paper_df, third_paper_df),
    ignore_index=True,
)
frst_thrd_papers_train_ds = Dataset.from_pandas(frst_thrd_papers_train_df)

In [393]:
# Fold: train on papers 1 and 3, test on paper 2.
frst_thrd_train_scnd_test_dd = DatasetDict(
    {
        'train': frst_thrd_papers_train_ds,
        'test': scnd_paper_test_ds,
    }
)
# Bare last expression so the notebook displays the split sizes.
frst_thrd_train_scnd_test_dd

DatasetDict({
    train: Dataset({
        features: ['paper', 'tokens', 'labels', 'tags'],
        num_rows: 635
    })
    test: Dataset({
        features: ['paper', 'tokens', 'labels', 'tags'],
        num_rows: 802
    })
})

## Train: 1,2
## Test: 3

In [394]:
# Training data for the fold that holds out paper 3: papers 1 and 2 stacked
# with a fresh index, then converted to a Dataset.
frst_scnd_papers_train_df = concat(
    (first_paper_df, second_paper_df),
    ignore_index=True,
)
frst_scnd_papers_train_ds = Dataset.from_pandas(frst_scnd_papers_train_df)

In [395]:
# Fold: train on papers 1 and 2, test on paper 3.
frst_scnd_train_thrd_test_dd = DatasetDict(
    {
        'train': frst_scnd_papers_train_ds,
        'test': thrd_paper_test_ds,
    }
)
# Bare last expression so the notebook displays the split sizes.
frst_scnd_train_thrd_test_dd

DatasetDict({
    train: Dataset({
        features: ['paper', 'tokens', 'labels', 'tags'],
        num_rows: 1117
    })
    test: Dataset({
        features: ['paper', 'tokens', 'labels', 'tags'],
        num_rows: 320
    })
})

In [396]:
# Every DatasetDict to be saved. The order matters: entries are paired
# positionally with dd_dst_paths when they are written to disk.
all_dd = [
    scnd_thrd_train_frst_test_dd,
    frst_thrd_train_scnd_test_dd,
    frst_scnd_train_thrd_test_dd,
    all_papers_dd,
]

# Save DatasetDict-s

In [382]:
# TO-DO
# 1. Do it in loop

In [385]:
# Destination pickle files, listed in the same order as the entries of all_dd.
dd_dst_paths = [
    '../dataset/scnd_thrd_train_frst_test_dd.pkl',
    '../dataset/frst_thrd_train_scnd_test_dd.pkl',
    '../dataset/frst_scnd_train_thrd_test_dd.pkl',
    '../dataset/all_papers_train_dd.pkl',
]

In [386]:
# Pickle every DatasetDict to its matching destination path.
# zip() silently stops at the shorter input, so a path/dataset mismatch would
# drop items without any error; check the pairing up front.
if len(dd_dst_paths) != len(all_dd):
    raise ValueError(
        f'Expected one destination path per DatasetDict, '
        f'got {len(dd_dst_paths)} paths for {len(all_dd)} datasets'
    )

for dst_path, dd in zip(dd_dst_paths, all_dd):
    # Create the destination folder on demand so a fresh checkout does not fail.
    Path(dst_path).parent.mkdir(parents=True, exist_ok=True)
    with open(dst_path, 'wb') as file:
        pickle.dump(dd, file)