In [1]:
from datasets import concatenate_datasets, load_from_disk, DatasetDict



In [2]:
# Load each task dataset from disk and shuffle it with a fixed seed so the
# row order is reproducible across runs.
article_classification_task_dataset = (
    load_from_disk('datasets/tasks/article_classification_task_dataset/')
    .shuffle(seed=42)
)
institution_classification_task_dataset = (
    load_from_disk('datasets/tasks/institution_classification_task_dataset/')
    .shuffle(seed=42)
)
keyword_generation_task_dataset = (
    load_from_disk('datasets/tasks/keyword_generation_task_dataset/')
    .shuffle(seed=42)
)
people_classification_task_dataset = (
    load_from_disk('datasets/tasks/people_classification_task_dataset/')
    .shuffle(seed=42)
)

In [3]:
# Display the splits, columns and row counts of the article classification data.
article_classification_task_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'theme', 'type'],
        num_rows: 43060
    })
    validation: Dataset({
        features: ['text', 'label', 'theme', 'type'],
        num_rows: 6598
    })
    test: Dataset({
        features: ['text', 'label', 'theme', 'type'],
        num_rows: 3618
    })
})

In [4]:
def get_pandas(dataset):
    """Convert every split of ``dataset`` with its ``to_pandas()`` method.

    Args:
        dataset: Mapping-like object that exposes ``keys()`` and item access
            by split name; each value must provide ``to_pandas()``.

    Returns:
        dict: Split name -> result of that split's ``to_pandas()`` call,
        in the order produced by ``dataset.keys()``.
    """
    frames_by_split = {}
    for split_name in dataset.keys():
        frames_by_split[split_name] = dataset[split_name].to_pandas()
    return frames_by_split

In [5]:
# Build a {split: pandas frame} dict for each of the four task datasets,
# converting them in the same order as they were loaded.
(
    pandas_article_classification_task_dataset,
    pandas_institution_classification_task_dataset,
    pandas_keyword_generation_task_dataset,
    pandas_people_classification_task_dataset,
) = (
    get_pandas(task_dataset)
    for task_dataset in (
        article_classification_task_dataset,
        institution_classification_task_dataset,
        keyword_generation_task_dataset,
        people_classification_task_dataset,
    )
)

In [6]:
# Print the number of rows per `label` value for every split of the article
# classification data. The pandas dict was built from the dataset's own keys,
# so iterating it visits the same splits in the same order.
for split, split_frame in pandas_article_classification_task_dataset.items():
    label_counts = split_frame.groupby(['label']).size()
    print(split, label_counts, sep='\n', end='\n\n')

train
label
0    16399
1    26661
dtype: int64

validation
label
0    3188
1    3410
dtype: int64

test
label
0    1898
1    1720
dtype: int64



In [7]:
# Display the splits, columns and row counts of the institution classification data.
institution_classification_task_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'type'],
        num_rows: 7189
    })
    validation: Dataset({
        features: ['text', 'type'],
        num_rows: 920
    })
    test: Dataset({
        features: ['text', 'type'],
        num_rows: 492
    })
})

In [8]:
# Display the splits, columns and row counts of the keyword generation data.
keyword_generation_task_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'type'],
        num_rows: 26661
    })
    validation: Dataset({
        features: ['text', 'type'],
        num_rows: 3410
    })
    test: Dataset({
        features: ['text', 'type'],
        num_rows: 1720
    })
})

In [9]:
# Display the splits, columns and row counts of the people classification data.
people_classification_task_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'type'],
        num_rows: 11372
    })
    validation: Dataset({
        features: ['text', 'type'],
        num_rows: 1446
    })
    test: Dataset({
        features: ['text', 'type'],
        num_rows: 748
    })
})

In [10]:
# Empty container; the next cell fills in one entry per split.
combined_tasks = DatasetDict()

In [11]:
# For every split take the first `size` rows with label == 1 followed by the
# first `size // 2` rows with label == 0 (in the already-shuffled order), so
# each split ends up with 1.5 * size rows.
label_one_rows_per_split = {'train': 16000, 'validation': 3000, 'test': 1500}
for split, size in label_one_rows_per_split.items():
    split_articles = article_classification_task_dataset[split]
    label_one_rows = split_articles.filter(lambda article: article['label'] == 1).select(range(size))
    label_zero_rows = split_articles.filter(lambda article: article['label'] == 0).select(range(size // 2))
    combined_tasks[split] = concatenate_datasets([label_one_rows, label_zero_rows])

# Only 'text' and 'theme' are kept in the combined dataset.
combined_tasks = combined_tasks.remove_columns(['label', 'type'])

# Disabled: appending the people / institution / keyword-generation tasks.
# Kept for reference; re-enable to mix those tasks into the combined dataset.
#for split, size in zip(['train', 'validation', 'test'], [10000, 1400, 740]):
#    combined_tasks[split] = concatenate_datasets([combined_tasks[split], people_classification_task_dataset[split].select(range(size)).remove_columns(['type'])])
#for split, size in zip(['train', 'validation', 'test'], [7000, 900, 490]):
#    combined_tasks[split] = concatenate_datasets([combined_tasks[split], institution_classification_task_dataset[split].select(range(size)).remove_columns(['type'])])
#for split, size in zip(['train', 'validation', 'test'], [10000, 3000, 1500]):
#    combined_tasks[split] = concatenate_datasets([combined_tasks[split], keyword_generation_task_dataset[split].select(range(size)).remove_columns(['type'])])

In [12]:
# Shuffle so the two label groups are interleaved, then write the result to
# disk. `combined_tasks` itself is not reassigned here.
(
    combined_tasks
    .shuffle(seed=42)
    .save_to_disk('datasets/tasks/combined_task_dataset')
)

Saving the dataset (0/1 shards):   0%|          | 0/24000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2250 [00:00<?, ? examples/s]

In [13]:
# Spot-check: print the text of the first train example after the same seeded shuffle.
first_shuffled_train_example = combined_tasks.shuffle(seed=42)['train'][0]
print(first_shuffled_train_example['text'])

[korrupció klasszifikáció]
Brüsszel is keresi az Alstom megmentésének módját

Mario Monti, az Európai Bizottság versenyügyekben illetékes tagja pénteken elhalasztotta olaszországi utazását, hogy az Alstom francia nagyvállalat mentőcsomagjával foglalkozzék jelentette be a bizottság pénteken.

cimkék: archívum

###

téma: egyéb



In [14]:
# Rebind `combined_tasks` to the copy that was saved to disk. That copy was
# shuffled before saving, so from here on the row order differs from the
# in-memory object built earlier in the notebook.
combined_tasks = load_from_disk('datasets/tasks/combined_task_dataset/')

In [15]:
# Display the splits, columns and row counts of the reloaded combined dataset.
combined_tasks

DatasetDict({
    train: Dataset({
        features: ['text', 'theme'],
        num_rows: 24000
    })
    validation: Dataset({
        features: ['text', 'theme'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['text', 'theme'],
        num_rows: 2250
    })
})

In [16]:
def _split_at_last_colon(task):
    """Split ``task['text']`` into a prompt and a completion at its last ':'.

    Everything up to and including the last colon becomes ``input``; whatever
    follows it becomes ``output``. When the text has no colon, ``rfind``
    returns -1, so ``input`` is empty and ``output`` is the whole text.
    """
    text = task['text']
    cut = text.rfind(':') + 1
    return {
        'input': text[:cut],
        'output': text[cut:],
    }


# Replace the 'text' column with separate 'input' / 'output' columns.
combined_tasks_io = combined_tasks.map(_split_at_last_colon, remove_columns='text')

In [17]:
# Spot-check: print the prompt part of the second train example.
second_train_example = combined_tasks_io['train'][1]
print(second_train_example['input'])

[korrupció klasszifikáció]
Hűtlen kezelés a Magyar Közútnál?

Feljelentést tett az ügyészségen a Magyar Közút Nonprofit Zrt. vezérigazgatója két informatikai beszerzés ügyében, miután a társaságnál egy belső vizsgálat olyan eljárási hibákat tárt fel, amelyek a cég megítélése szerint megalapozzák a hűtlen kezelés gyanúját – közölte tegnap a társaság, amely ezért ismeretlen tettes ellen tett ...

cimkék: 

###

téma:


In [18]:
# Preview (result is not stored): the examples whose prompt does not contain
# the 'korrupciós címkék:' marker.
combined_tasks_io.filter(
    lambda example: not ('korrupciós címkék:' in example['input'])
)

DatasetDict({
    train: Dataset({
        features: ['theme', 'input', 'output'],
        num_rows: 24000
    })
    validation: Dataset({
        features: ['theme', 'input', 'output'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['theme', 'input', 'output'],
        num_rows: 2250
    })
})

In [19]:
#combined_tasks_io.filter(lambda task: 'korrupciós címkék:' not in task['input']).save_to_disk('datasets/tasks/combined_task_dataset_io_notags')

In [20]:
# Keep only examples whose prompt contains the '[korrupció klasszifikáció]'
# task header and write that subset to disk.
(
    combined_tasks_io
    .filter(lambda example: '[korrupció klasszifikáció]' in example['input'])
    .save_to_disk('datasets/tasks/combined_task_dataset_io_notags')
)

Saving the dataset (0/1 shards):   0%|          | 0/24000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2250 [00:00<?, ? examples/s]

In [21]:
# Repeats the earlier preview of examples whose prompt lacks the
# 'korrupciós címkék:' marker; the result is displayed but not stored.
combined_tasks_io.filter(lambda task: 'korrupciós címkék:' not in task['input'])

DatasetDict({
    train: Dataset({
        features: ['theme', 'input', 'output'],
        num_rows: 24000
    })
    validation: Dataset({
        features: ['theme', 'input', 'output'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['theme', 'input', 'output'],
        num_rows: 2250
    })
})