Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
158 lines (125 sloc) 5.7 KB

四、文本序列到 TFRecords

大家好! 在本教程中,我将向你展示如何将原始文本数据解析为 TFRecords。 我知道很多人都卡在输入处理流水线,尤其是当你开始着手自己的个人项目时。 所以我真的希望它对你们任何人都有用!

教程的流程图

虚拟的IMDB文本数据

在实践中,我从斯坦福大学提供的大型电影评论数据集中选择了一些数据样本。

在这里导入有用的库

from nltk.tokenize import word_tokenize
import tensorflow as tf
import pandas as pd
import pickle
import random
import glob
import nltk
import re

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

将数据解析为 TFRecords

def imdb2tfrecords(path_data='datasets/dummy_text/', min_word_frequency=5,
                   max_words_review=700):
    '''
    这个脚本处理数据
    并将其保存为默认的 TensorFlow 文件格式:tfrecords。
    
    Args:
        path_data: the path where the imdb data is stored.
        min_word_frequency: the minimum frequency of a word, to keep it
                            in the vocabulary.
        max_words_review: the maximum number of words allowed in a review.
    '''
    # 获取正面/负面评论的文件名 
    pos_files = glob.glob(path_data + 'pos/*')
    neg_files = glob.glob(path_data + 'neg/*')

    # 连接正负评论的文件名
    filenames = pos_files + neg_files
    
    # 列出数据集中的所有评论
    reviews = [open(filenames[i],'r').read() for i in range(len(filenames))]
    
    # 移除 HTML 标签
    reviews = [re.sub(r'<[^>]+>', ' ', review) for review in reviews]
        
    # 将每个评论分词
    reviews = [word_tokenize(review) for review in reviews]
    
    # 计算每个评论的的长度
    len_reviews = [len(review) for review in reviews]

    # 展开嵌套列表
    reviews = [word for review in reviews for word in review]
    
    # 计算每个单词的频率
    word_frequency = pd.value_counts(reviews)
    
    # 仅仅保留频率高于最小值的单词
    vocabulary = word_frequency[word_frequency>=min_word_frequency].index.tolist()
    
    # 添加未知,起始和终止记号
    extra_tokens = ['Unknown_token', 'End_token']
    vocabulary += extra_tokens
    
    # 创建 word2idx 词典
    word2idx = {vocabulary[i]: i for i in range(len(vocabulary))}
    
    # 将单词的词汇表写到磁盘
    pickle.dump(word2idx, open(path_data + 'word2idx.pkl', 'wb'))
        
    def text2tfrecords(filenames, writer, vocabulary, word2idx,
                       max_words_review):
        '''
        用于将每个评论解析为部分,并作为 tfrecord 写入磁盘的函数。
        
        Args:
            filenames: the paths of the review files.
            writer: the writer object for tfrecords.
            vocabulary: list with all the words included in the vocabulary.
            word2idx: dictionary of words and their corresponding indexes.
        '''
        # 打乱 filenames
        random.shuffle(filenames)
        for filename in filenames:
            review = open(filename, 'r').read()
            review = re.sub(r'<[^>]+>', ' ', review)
            review = word_tokenize(review)
            # 将 review 归约为最大单词
            review = review[-max_words_review:]
            # 将单词替换为来自 word2idx 的等效索引
            review = [word2idx[word] if word in vocabulary else 
                      word2idx['Unknown_token'] for word in review]
            indexed_review = review + [word2idx['End_token']]
            sequence_length = len(indexed_review)
            target = 1 if filename.split('/')[-2]=='pos' else 0
            # Create a Sequence Example to store our data in
            ex = tf.train.SequenceExample()
            # 向我们的示例添加非顺序特性
            ex.context.feature['sequence_length'].int64_list.value.append(sequence_length)
            ex.context.feature['target'].int64_list.value.append(target)
            # 添加顺序特征
            token_indexes = ex.feature_lists.feature_list['token_indexes']
            for token_index in indexed_review:
                token_indexes.feature.add().int64_list.value.append(token_index)
            writer.write(ex.SerializeToString())
    
    ##########################################################################     
    # Write data to tfrecords.This might take a while.
    ##########################################################################
    writer = tf.python_io.TFRecordWriter(path_data + 'dummy.tfrecords')
    text2tfrecords(filenames, writer, vocabulary, word2idx, 
                   max_words_review)

imdb2tfrecords(path_data='datasets/dummy_text/')

将 TFRecords 解析为 TF 张量

def parse_imdb_sequence(record):
    '''
    解析 imdb tfrecords 的脚本
    
    Returns:
        token_indexes: sequence of token indexes present in the review.
        target: the target of the movie review.
        sequence_length: the length of the sequence.
    '''
    context_features = {
        'sequence_length': tf.FixedLenFeature([], dtype=tf.int64),
        'target': tf.FixedLenFeature([], dtype=tf.int64),
        }
    sequence_features = {
        'token_indexes': tf.FixedLenSequenceFeature([], dtype=tf.int64),
        }
    context_parsed, sequence_parsed = tf.parse_single_sequence_example(record, 
        context_features=context_features, sequence_features=sequence_features)
        
    return (sequence_parsed['token_indexes'], context_parsed['target'],
            context_parsed['sequence_length'])

如果你希望我在本教程中添加任何内容,请告诉我,我将很乐意进一步改善它。

You can’t perform that action at this time.