In [1]:
"""Build vocabularies of amino acids and classe from datasets"""

import argparse
import  easydict 
from easydict import EasyDict 
from collections import Counter
import json
import os

In [2]:
""" 
parser = argparse.ArgumentParser()
parser.add_argument('--min_count_word', default=1, help="Minimum count for amino acids in the dataset", type=int)
parser.add_argument('--min_count_tag', default=1, help="Minimum count for labels in the dataset", type=int)
parser.add_argument('--data_dir', default='data', help="Directory containing the dataset")
"""



# Hyper parameters for the vocab
PAD_WORD = '<pad>'
PAD_TAG = 'O'
UNK_WORD = 'UNK'

In [3]:
def save_vocab_to_txt_file(vocab, txt_path):
    with open(txt_path, "w") as f:
        for token in vocab:
            #print ("save_vocab_to_text ", token, vocab )
            f.write(token + '\n')

In [4]:
def save_dict_to_json(d, json_path):
    """Saves dict to json file

    Args:
        d: (dict)
        json_path: (string) path to json file
    """
    with open(json_path, 'w') as f:
        #This is a dictionary. A dictionary comprises of two things : Keys and Values.
        #‘k:v’ is basically the action to be performed in the for loop.
        
        d = {k: v for k, v in d.items()}
        # json.dump(data, write_file, intendation)
        print("data " , d)
        print("writefile" ,f)
        json.dump(d, f, indent=4)

In [5]:
def update_vocab(txt_path, vocab):
   
    with open(txt_path) as f:
        for i, line in enumerate(f):
            #print ("update vocab ", i, line )
            vocab.update(line.strip())
            #print("vocab ", vocab )
    return i + 1

In [6]:
def update_label(txt_path, vocab):
    with open(txt_path) as f:
        for i, line in enumerate(f):
            #print ("update label ", i, line )
            vocab.update(line.strip().split())
    return i + 1

In [7]:
if __name__ == '__main__':
    #args = parser.parse_args()
   
    args = easydict.EasyDict({
    "min_count_word": 1,
    "min_count_tag": 1,
    "data_dir": "data"
})
    

    # Build word vocab with train and test datasets
    print("Building word vocabulary...")
    chars = Counter()
    print("chars", chars)
    size_train_sentences = update_vocab(os.path.join(args.data_dir, 'train/sentences.txt'), chars)
    
    size_dev_sentences = update_vocab(os.path.join(args.data_dir, 'val/sentences.txt'), chars)
    
    size_test_sentences = update_vocab(os.path.join(args.data_dir, 'test/sentences.txt'), chars)
    
    print("-> done. Train , Validation, Test size  : ", size_train_sentences, size_dev_sentences,size_test_sentences )

    # Build tag vocab with train and test datasets
    print("Building tag vocabulary...")
    classes = Counter()
    size_train_tags = update_label(os.path.join(args.data_dir, 'train/labels.txt'), classes)
    size_dev_tags = update_label(os.path.join(args.data_dir, 'val/labels.txt'), classes)
    size_test_tags = update_label(os.path.join(args.data_dir, 'test/labels.txt'), classes)
    print("->done Train , Validation, Test label " ,size_train_tags,  size_dev_tags,size_test_tags )

    # Assert same number of examples in datasets
    assert size_train_sentences == size_train_tags
    assert size_dev_sentences == size_dev_tags
    assert size_test_sentences == size_test_tags

    # Only keep most frequent tokens
    chars = [tok for tok, count in chars.items() if count >= args.min_count_word]
    classes = [tok for tok, count in classes.items() if count >= args.min_count_tag]
    
    print("Most Frequent token : ", chars, classes)

    # Add pad tokens
    if PAD_WORD not in chars: chars.append(PAD_WORD)
        
    print("After pad ", chars)

    # Save vocabularies to file
    print("Saving vocabularies to file...")
    save_vocab_to_txt_file(chars, os.path.join(args.data_dir, 'chars.txt'))
    save_vocab_to_txt_file(classes, os.path.join(args.data_dir, 'classes.txt'))
    print("- done.")

    # Save datasets properties in json file
    sizes = {
        'train_size': size_train_sentences,
        'dev_size': size_dev_sentences,
        'test_size': size_test_sentences,
        'vocab_size': len(chars),
        'number_of_classes': len(classes),
        'pad_word': PAD_WORD,
    }
    
    #we use JSON to store and exchange data
    # It’s nothing more than a standardized format the community uses to pass data around
    save_dict_to_json(sizes, os.path.join(args.data_dir, 'dataset_params.json'))

    # Logging sizes
    to_print = "\n".join("- {}: {}".format(k, v) for k, v in sizes.items())
    
    print("Characteristics of the dataset:\n{}".format(to_print))


Building word vocabulary...
chars Counter()
-> done. Train , Validation, Test size  :  304 65 66
Building tag vocabulary...
->done Train , Validation, Test label  304 65 66
Most Frequent token :  ['M', 'H', 'I', 'N', 'E', 'T', 'D', 'W', 'L', 'V', 'K', 'P', 'A', 'S', 'F', 'R', 'G', 'Q', 'Y', 'C'] ['Endoplasmic.reticulum', 'Golgi.apparatus', 'Cell.membrane', 'Extracellular', 'Cytoplasm', 'Nucleus', 'Plastid', 'Peroxisome', 'Lysosome/Vacuole', 'Mitochondrion']
After pad  ['M', 'H', 'I', 'N', 'E', 'T', 'D', 'W', 'L', 'V', 'K', 'P', 'A', 'S', 'F', 'R', 'G', 'Q', 'Y', 'C', '<pad>']
Saving vocabularies to file...
- done.
data  {'train_size': 304, 'dev_size': 65, 'test_size': 66, 'vocab_size': 21, 'number_of_classes': 10, 'pad_word': '<pad>'}
writefile <_io.TextIOWrapper name='data\\dataset_params.json' mode='w' encoding='cp1252'>
Characteristics of the dataset:
- train_size: 304
- dev_size: 65
- test_size: 66
- vocab_size: 21
- number_of_classes: 10
- pad_word: <pad>
