In [10]:
import json
import os.path
import subprocess

from collections import Counter

#### open settings

In [11]:
# Load the notebook configuration from the JSON file in the working directory.
with open('./settings.json') as config_handle:
    settings = json.load(config_handle)

# Show which configuration keys are available (rendered as the cell output).
print('keys in settings file:')
[*settings.keys()]

keys in settings file:


['data_dir',
 'data_file',
 'top_tags_count',
 'top_tags_file',
 'filtered_tmp_file',
 'train_size',
 'train_file',
 'train_labels_file',
 'test_file',
 'test_labels_file']

#### collect top tags

In [12]:
# Both the raw data file and the top-tags file live inside the configured
# data directory; resolve their full paths in one go.
data_filepath, top_tags_filepath = (
    os.path.join(settings['data_dir'], settings[name_key])
    for name_key in ('data_file', 'top_tags_file')
)

In [13]:
# Count how often every tag occurs in the raw data file. Each line is parsed
# as "<sentence>\t<tags>", where <tags> is a whitespace-separated list.
tags_counter = Counter()

try:
    with open(data_filepath, 'r') as data_file:
        for line in data_file:
            _, tags = line.strip().split('\t')
            # split() with no argument collapses runs of whitespace, so two
            # adjacent spaces never yield an empty-string "tag". This is the
            # same tokenisation the filtering cell applies to the tag column.
            tags_counter.update(tags.split())
except OSError:
    print('Error opening file {}'.format(data_filepath))
    # Do not carry on with an empty/partial counter: that would overwrite the
    # top-tags file below with incomplete results.
    raise

# Keep only the names of the `top_tags_count` most frequent tags.
top_tags = [tag for tag, _count in tags_counter.most_common(settings['top_tags_count'])]

# Persist the selected tags, one per line.
try:
    with open(top_tags_filepath, 'w') as top_tags_file:
        for tag in top_tags:
            top_tags_file.write(tag + '\n')
except OSError:
    print('Error opening file {}'.format(top_tags_filepath))
    raise

#### make filtered data

In [14]:
# Path of the temporary filtered dataset, placed inside the data directory.
filtered_data_filepath = os.path.join(
    *(settings[key] for key in ('data_dir', 'filtered_tmp_file'))
)

In [15]:
# Write a filtered copy of the data in which every line keeps only its top
# tags; lines left without any tag are dropped. `total_lines` counts the
# lines written and is used afterwards to size the train/test split.
total_lines = 0

# Build the membership set once instead of once per input line.
top_tags_set = set(top_tags)

try:
    with open(data_filepath, 'r') as data_file, open(filtered_data_filepath, 'w') as f_data_file:
        for line in data_file:
            sentence, tags = line.strip().split('\t')

            # dict.fromkeys() removes duplicate tags while keeping their
            # first-seen order, so the written file is identical on every run
            # (the order of a set intersection of strings is not stable
            # between interpreter sessions).
            filtered_tags = [tag for tag in dict.fromkeys(tags.split()) if tag in top_tags_set]
            if not filtered_tags:
                continue

            total_lines += 1
            f_data_file.write('{}\t{}\n'.format(sentence, ' '.join(filtered_tags)))
except OSError as error:
    # Either the input or the output file can fail; report the path attached
    # to the exception and fall back to the input path when none is set.
    print('Error opening file {}'.format(error.filename or data_filepath))
    # Stop here rather than let later cells run on a missing or partial file.
    raise

#### make train and test datafiles

In [16]:
# Resolve the four output files of the train/test split (sentences and labels
# for each part) inside the configured data directory.
(
    train_filepath,
    train_labels_filepath,
    test_filepath,
    test_labels_filepath,
) = (
    os.path.join(settings['data_dir'], settings[name_key])
    for name_key in ('train_file', 'train_labels_file', 'test_file', 'test_labels_file')
)

In [17]:
# Split the filtered data into a train part and a test part. Sentences and
# their tags go to separate files that stay aligned line by line.
line_number = 0
# Number of leading lines that belong to the training set.
border_index = int(float(settings['train_size']) * total_lines)

try:
    with open(filtered_data_filepath, 'r') as f_data_file, \
            open(train_filepath, 'w') as train_file, \
            open(train_labels_filepath, 'w') as train_labels_file, \
            open(test_filepath, 'w') as test_file, \
            open(test_labels_filepath, 'w') as test_labels_file:

        for line in f_data_file:
            sentence, tags = line.strip().split('\t')

            # Strict comparison: lines 0 .. border_index - 1 are exactly
            # `border_index` training lines. Using `<=` would move one extra
            # line into the training set.
            if line_number < border_index:
                train_file.write(sentence + '\n')
                train_labels_file.write(tags + '\n')
            else:
                test_file.write(sentence + '\n')
                test_labels_file.write(tags + '\n')

            line_number += 1

except OSError as error:
    # Five files are opened here; report the path attached to the exception
    # and fall back to the filtered input file when none is set.
    print('Error opening file {}'.format(error.filename or filtered_data_filepath))
    # Surface the failure instead of leaving partial train/test files behind.
    raise

#### remove filtered data

In [18]:
# The filtered dataset was only an intermediate artefact; delete it now that
# the train/test files have been written.
os.unlink(filtered_data_filepath)